[Likwid-commit] [likwid] 01/04: Imported Upstream version 4.1.0+dfsg1
Christoph Martin
chrism at debian.org
Tue Jun 21 10:34:18 UTC 2016
This is an automated email from the git hooks/post-receive script.
chrism pushed a commit to branch upstream
in repository likwid.
commit 2d3370410ca0bd86b781267242f8d33efc2a9da8
Author: Christoph Martin <martin at uni-mainz.de>
Date: Wed Jun 15 18:18:09 2016 +0200
Imported Upstream version 4.1.0+dfsg1
---
.travis.yml | 15 +
CHANGELOG | 59 +
INSTALL | 175 +-
Makefile | 690 ++-
README | 29 -
README.md | 58 +
bench/Makefile | 157 +
bench/includes/allocator.h | 50 +
bench/includes/allocator_types.h | 46 +
bench/includes/barrier.h | 58 +
bench/includes/barrier_types.h | 49 +
bench/includes/bstrlib.h | 1 +
bench/includes/likwid.h | 1 +
bench/includes/strUtil.h | 60 +
bench/includes/test_types.h | 113 +
bench/includes/threads.h | 114 +
bench/includes/threads_types.h | 56 +
bench/likwid-bench.c | 521 ++
bench/perl/AsmGen.pl | 284 ++
{perl => bench/perl}/Parse/RecDescent.pm | 0
{perl => bench/perl}/Template.pm | 0
{perl => bench/perl}/Template/Base.pm | 0
{perl => bench/perl}/Template/Config.pm | 0
{perl => bench/perl}/Template/Constants.pm | 0
{perl => bench/perl}/Template/Context.pm | 0
{perl => bench/perl}/Template/Directive.pm | 0
{perl => bench/perl}/Template/Document.pm | 0
{perl => bench/perl}/Template/Exception.pm | 0
{perl => bench/perl}/Template/Filters.pm | 0
{perl => bench/perl}/Template/Grammar.pm | 0
{perl => bench/perl}/Template/Iterator.pm | 0
.../perl}/Template/Namespace/Constants.pm | 0
{perl => bench/perl}/Template/Parser.pm | 0
{perl => bench/perl}/Template/Plugin.pm | 0
{perl => bench/perl}/Template/Plugin/Assert.pm | 0
{perl => bench/perl}/Template/Plugin/CGI.pm | 0
{perl => bench/perl}/Template/Plugin/Datafile.pm | 0
{perl => bench/perl}/Template/Plugin/Date.pm | 0
{perl => bench/perl}/Template/Plugin/Directory.pm | 0
{perl => bench/perl}/Template/Plugin/Dumper.pm | 0
{perl => bench/perl}/Template/Plugin/File.pm | 0
{perl => bench/perl}/Template/Plugin/Filter.pm | 0
{perl => bench/perl}/Template/Plugin/Format.pm | 0
{perl => bench/perl}/Template/Plugin/HTML.pm | 0
{perl => bench/perl}/Template/Plugin/Image.pm | 0
{perl => bench/perl}/Template/Plugin/Iterator.pm | 0
{perl => bench/perl}/Template/Plugin/Math.pm | 0
{perl => bench/perl}/Template/Plugin/Pod.pm | 0
{perl => bench/perl}/Template/Plugin/Procedural.pm | 0
{perl => bench/perl}/Template/Plugin/Scalar.pm | 0
{perl => bench/perl}/Template/Plugin/String.pm | 0
{perl => bench/perl}/Template/Plugin/Table.pm | 0
{perl => bench/perl}/Template/Plugin/URL.pm | 0
{perl => bench/perl}/Template/Plugin/View.pm | 0
{perl => bench/perl}/Template/Plugin/Wrap.pm | 0
{perl => bench/perl}/Template/Plugins.pm | 0
{perl => bench/perl}/Template/Provider.pm | 0
{perl => bench/perl}/Template/Service.pm | 0
{perl => bench/perl}/Template/Stash.pm | 0
{perl => bench/perl}/Template/Stash/Context.pm | 0
{perl => bench/perl}/Template/Stash/XS.pm | 0
{perl => bench/perl}/Template/Test.pm | 0
{perl => bench/perl}/Template/VMethods.pm | 0
{perl => bench/perl}/Template/View.pm | 0
bench/perl/gas.pm | 211 +
bench/perl/generatePas.pl | 198 +
{perl => bench/perl}/isax86.pm | 0
{perl => bench/perl}/isax86_64.pm | 0
{perl => bench/perl}/templates/bench.tt | 0
bench/perl/templates/group.tt | 157 +
{perl => bench/perl}/templates/group_types.tt | 0
bench/perl/templates/testcases.tt | 19 +
bench/phi/store.ptt | 8 +-
bench/phi/store_mem.ptt | 8 +-
bench/src/allocator.c | 209 +
bench/src/barrier.c | 167 +
bench/src/bench.c | 770 +++
bench/src/bstrlib.c | 2955 +++++++++++
bench/src/strUtil.c | 319 ++
bench/src/threads.c | 293 ++
bench/x86-64/branch.ptt | 36 -
bench/x86-64/clcopy.ptt | 6 +
bench/x86-64/clload.ptt | 6 +
bench/x86-64/clstore.ptt | 14 +-
bench/x86-64/copy.ptt | 24 +-
bench/x86-64/copy_avx.ptt | 6 +
bench/x86-64/copy_mem.ptt | 24 +-
bench/x86-64/copy_mem_avx.ptt | 8 +-
bench/x86-64/copy_mem_sse.ptt | 6 +
bench/x86-64/copy_plain.ptt | 16 -
bench/x86-64/copy_sse.ptt | 6 +
bench/x86-64/daxpy.ptt | 28 +
bench/x86-64/daxpy_avx.ptt | 31 +
bench/x86-64/daxpy_avx_fma.ptt | 25 +
bench/x86-64/daxpy_mem_avx.ptt | 30 +
bench/x86-64/daxpy_mem_avx_fma.ptt | 24 +
bench/x86-64/daxpy_mem_sse.ptt | 28 +
bench/x86-64/daxpy_mem_sse_fma.ptt | 24 +
bench/x86-64/daxpy_sp.ptt | 44 +
bench/x86-64/daxpy_sp_avx.ptt | 19 +
bench/x86-64/daxpy_sp_avx_fma.ptt | 25 +
bench/x86-64/daxpy_sp_mem_avx.ptt | 19 +
bench/x86-64/daxpy_sp_mem_avx_fma.ptt | 25 +
bench/x86-64/daxpy_sp_mem_sse.ptt | 20 +
bench/x86-64/daxpy_sp_mem_sse_fma.ptt | 24 +
bench/x86-64/daxpy_sp_sse.ptt | 28 +
bench/x86-64/daxpy_sp_sse_fma.ptt | 24 +
bench/x86-64/daxpy_sse.ptt | 28 +
bench/x86-64/daxpy_sse_fma.ptt | 24 +
bench/x86-64/ddot.ptt | 27 +
bench/x86-64/ddot_avx.ptt | 27 +
bench/x86-64/ddot_sp.ptt | 27 +
bench/x86-64/ddot_sp_avx.ptt | 19 +
bench/x86-64/ddot_sp_sse.ptt | 19 +
bench/x86-64/ddot_sse.ptt | 27 +
bench/x86-64/load.ptt | 20 +-
bench/x86-64/load_avx.ptt | 8 +-
bench/x86-64/load_mem.ptt | 15 +
bench/x86-64/load_plain.ptt | 12 -
bench/x86-64/load_sse.ptt | 8 +-
bench/x86-64/peak.ptt | 49 -
bench/x86-64/peak_avx.ptt | 49 -
bench/x86-64/peak_sse.ptt | 49 -
bench/x86-64/peakflops.ptt | 37 -
bench/x86-64/peakflops_avx.ptt | 37 -
bench/x86-64/peakflops_sse.ptt | 37 -
bench/x86-64/store.ptt | 25 +-
bench/x86-64/store_avx.ptt | 15 +-
bench/x86-64/store_mem.ptt | 14 +-
bench/x86-64/store_mem_avx.ptt | 14 +-
bench/x86-64/store_mem_sse.ptt | 14 +-
bench/x86-64/store_plain.ptt | 15 -
bench/x86-64/store_sse.ptt | 15 +-
bench/x86-64/stream.ptt | 42 +-
bench/x86-64/stream_avx.ptt | 49 +-
bench/x86-64/stream_avx_fma.ptt | 24 +
bench/x86-64/stream_mem.ptt | 31 +-
bench/x86-64/stream_mem_avx.ptt | 17 +
bench/x86-64/stream_mem_avx_fma.ptt | 24 +
bench/x86-64/stream_mem_sse.ptt | 17 +
bench/x86-64/stream_mem_sse_fma.ptt | 15 +
bench/x86-64/stream_sp.ptt | 45 +
bench/x86-64/stream_sp_avx.ptt | 28 +
bench/x86-64/stream_sp_avx_fma.ptt | 24 +
bench/x86-64/stream_sp_mem_avx.ptt | 28 +
bench/x86-64/stream_sp_mem_avx_fma.ptt | 24 +
bench/x86-64/stream_sp_mem_sse.ptt | 16 +
bench/x86-64/stream_sp_mem_sse_fma.ptt | 15 +
bench/x86-64/stream_sp_sse.ptt | 16 +
bench/x86-64/stream_sp_sse_fma.ptt | 15 +
bench/x86-64/stream_sse.ptt | 29 +
bench/x86-64/stream_sse_fma.ptt | 24 +
bench/x86-64/striad_avx.ptt | 23 -
bench/x86-64/striad_mem_avx.ptt | 11 -
bench/x86-64/striad_mem_sse.ptt | 11 -
bench/x86-64/striad_plain.ptt | 23 -
bench/x86-64/striad_sse.ptt | 23 -
bench/x86-64/sum.ptt | 44 +-
bench/x86-64/sum_avx.ptt | 36 +-
bench/x86-64/sum_plain.ptt | 15 -
bench/x86-64/sum_sp.ptt | 21 +
bench/x86-64/sum_sp_avx.ptt | 20 +
bench/x86-64/sum_sp_sse.ptt | 29 +
bench/x86-64/sum_sse.ptt | 6 +
bench/x86-64/triad.ptt | 40 +-
bench/x86-64/triad_avx.ptt | 32 +-
bench/x86-64/triad_avx_fma.ptt | 27 +
bench/x86-64/triad_mem.ptt | 10 -
bench/x86-64/triad_mem_avx.ptt | 18 +
bench/x86-64/triad_mem_avx_fma.ptt | 20 +
bench/x86-64/triad_mem_sse.ptt | 27 +
bench/x86-64/triad_mem_sse_fma.ptt | 27 +
bench/x86-64/triad_sp.ptt | 43 +
bench/x86-64/triad_sp_avx.ptt | 18 +
bench/x86-64/triad_sp_avx_fma.ptt | 16 +
bench/x86-64/triad_sp_mem_avx.ptt | 16 +
bench/x86-64/triad_sp_mem_avx_fma.ptt | 16 +
bench/x86-64/triad_sp_mem_sse.ptt | 27 +
bench/x86-64/triad_sp_mem_sse_fma.ptt | 27 +
bench/x86-64/triad_sp_sse.ptt | 27 +
bench/x86-64/triad_sp_sse_fma.ptt | 27 +
bench/x86-64/triad_split.ptt | 30 -
bench/x86-64/triad_sse.ptt | 28 +
bench/x86-64/triad_sse_fma.ptt | 27 +
bench/x86-64/update.ptt | 24 +-
bench/x86-64/update_avx.ptt | 6 +
bench/x86-64/update_plain.ptt | 15 -
bench/x86-64/update_sse.ptt | 6 +
bench/x86-64/vtriad_avx.ptt | 22 -
bench/x86-64/vtriad_mem_avx.ptt | 10 -
bench/x86-64/vtriad_mem_sse.ptt | 10 -
bench/x86-64/vtriad_plain.ptt | 22 -
bench/x86-64/vtriad_sse.ptt | 22 -
bench/x86/copy.ptt | 24 +-
bench/x86/load.ptt | 19 +-
bench/x86/store.ptt | 24 +-
bench/x86/stream.ptt | 42 +-
config.mk | 59 +-
doc/Doxyfile | 1781 +++++++
doc/applications/likwid-accessD.md | 55 +
doc/applications/likwid-agent.md | 94 +
doc/applications/likwid-bench.md | 93 +
doc/applications/likwid-genTopoCfg.md | 29 +
doc/applications/likwid-memsweeper.md | 34 +
doc/applications/likwid-mpirun.md | 83 +
doc/applications/likwid-perfctr.md | 260 +
doc/applications/likwid-perfscope.md | 107 +
doc/applications/likwid-pin.md | 170 +
doc/applications/likwid-powermeter.md | 75 +
doc/applications/likwid-setFreq.md | 13 +
doc/applications/likwid-setFrequencies.md | 50 +
doc/applications/likwid-topology.md | 68 +
doc/archs/atom.md | 104 +
doc/archs/broadwell.md | 203 +
doc/archs/core2.md | 103 +
doc/archs/haswell.md | 203 +
doc/archs/haswellep.md | 896 ++++
doc/archs/interlagos.md | 107 +
doc/archs/ivybridge.md | 190 +
doc/archs/ivybridgeep.md | 790 +++
doc/archs/k10.md | 68 +
doc/archs/k8.md | 68 +
doc/archs/kabini.md | 162 +
doc/archs/nehalem.md | 237 +
doc/archs/nehalemex.md | 554 +++
doc/archs/pentiumm.md | 63 +
doc/archs/phi.md | 78 +
doc/archs/sandybridge.md | 189 +
doc/archs/sandybridgeep.md | 775 +++
doc/archs/silvermont.md | 175 +
doc/archs/westmere.md | 239 +
doc/archs/westmereex.md | 555 +++
doc/bstrlib.txt | 3201 ++++++++++++
doc/likwid-accessD.1 | 10 +-
doc/likwid-agent.1 | 94 +
doc/likwid-bench.1 | 145 +-
doc/likwid-doxygen.md | 262 +
doc/likwid-features.1 | 54 +-
doc/likwid-genCfg.1 | 30 -
doc/likwid-genTopoCfg.1 | 30 +
doc/likwid-lua.1 | 111 +
doc/likwid-memsweeper.1 | 16 +-
doc/likwid-mpirun.1 | 77 +-
doc/likwid-perfctr.1 | 260 +-
doc/likwid-perfscope.1 | 178 +-
doc/likwid-pin.1 | 226 +-
doc/likwid-powermeter.1 | 69 +-
doc/likwid-setFreq.1 | 10 +-
doc/likwid-setFrequencies.1 | 40 +-
doc/likwid-topology.1 | 46 +-
doc/likwid.cfg.md | 38 +
doc/logo.png | Bin 0 -> 6776 bytes
doc/lua-doxygen.md | 2592 ++++++++++
examples/C-internalMarkerAPI.c | 152 +
examples/C-likwidAPI.c | 149 +
examples/C-markerAPI.c | 87 +
examples/F-markerAPI.F90 | 79 +
examples/Lua-likwidAPI.lua | 93 +
examples/Makefile | 64 +
examples/monitoring.c | 118 +
ext/hwloc/AUTHORS | 8 +
ext/hwloc/COPYING | 28 +
ext/hwloc/Makefile | 73 +
ext/hwloc/hwloc/base64.c | 306 ++
ext/hwloc/hwloc/bind.c | 781 +++
ext/hwloc/hwloc/bitmap.c | 1492 ++++++
ext/hwloc/hwloc/components.c | 792 +++
ext/hwloc/hwloc/diff.c | 426 ++
ext/hwloc/hwloc/distances.c | 995 ++++
ext/hwloc/hwloc/dolib.c | 47 +
ext/hwloc/hwloc/misc.c | 166 +
ext/hwloc/hwloc/pci-common.c | 482 ++
ext/hwloc/hwloc/topology-bgq.cb | 246 +
ext/hwloc/hwloc/topology-darwin.cb | 307 ++
ext/hwloc/hwloc/topology-fake.c | 61 +
ext/hwloc/hwloc/topology-freebsd.cb | 255 +
ext/hwloc/hwloc/topology-linux.c | 5133 ++++++++++++++++++++
ext/hwloc/hwloc/topology-noos.c | 58 +
ext/hwloc/hwloc/topology-opencl.cb | 346 ++
ext/hwloc/hwloc/topology-osf.cb | 392 ++
ext/hwloc/hwloc/topology-synthetic.c | 1128 +++++
ext/hwloc/hwloc/topology-x86.c | 1386 ++++++
ext/hwloc/hwloc/topology.c | 3436 +++++++++++++
ext/hwloc/hwloc/traversal.c | 701 +++
ext/hwloc/include/hwloc.h | 2206 +++++++++
ext/hwloc/include/hwloc/autogen/config.h | 202 +
ext/hwloc/include/hwloc/autogen/config.h.in | 201 +
ext/hwloc/include/hwloc/autogen/stamp-h2 | 1 +
ext/hwloc/include/hwloc/bitmap.h | 359 ++
ext/hwloc/include/hwloc/cuda.h | 224 +
ext/hwloc/include/hwloc/cudart.h | 184 +
ext/hwloc/include/hwloc/deprecated.h | 114 +
ext/hwloc/include/hwloc/diff.h | 299 ++
ext/hwloc/include/hwloc/export.h | 221 +
ext/hwloc/include/hwloc/gl.h | 135 +
ext/hwloc/include/hwloc/glibc-sched.h | 125 +
ext/hwloc/include/hwloc/helper.h | 1249 +++++
ext/hwloc/include/hwloc/inlines.h | 154 +
ext/hwloc/include/hwloc/intel-mic.h | 143 +
ext/hwloc/include/hwloc/linux-libnuma.h | 273 ++
ext/hwloc/include/hwloc/linux.h | 77 +
ext/hwloc/include/hwloc/myriexpress.h | 127 +
ext/hwloc/include/hwloc/nvml.h | 176 +
ext/hwloc/include/hwloc/opencl.h | 199 +
ext/hwloc/include/hwloc/openfabrics-verbs.h | 155 +
ext/hwloc/include/hwloc/plugins.h | 433 ++
ext/hwloc/include/hwloc/rename.h | 651 +++
ext/hwloc/include/numa.h | 468 ++
ext/hwloc/include/pci/config.h | 16 +
ext/hwloc/include/pci/header.h | 1195 +++++
ext/hwloc/include/pci/pci.h | 240 +
ext/hwloc/include/pci/types.h | 65 +
ext/hwloc/include/private/autogen/README.txt | 3 +
ext/hwloc/include/private/autogen/config.h | 772 +++
ext/hwloc/include/private/components.h | 40 +
ext/hwloc/include/private/cpuid-x86.h | 89 +
ext/hwloc/include/private/cpuid.h | 80 +
ext/hwloc/include/private/debug.h | 57 +
ext/hwloc/include/private/map.h | 110 +
ext/hwloc/include/private/misc.h | 382 ++
ext/hwloc/include/private/private.h | 335 ++
ext/hwloc/include/private/solaris-chiptype.h | 59 +
ext/hwloc/include/private/xml.h | 98 +
ext/hwloc/include/static-components.h | 17 +
filters/csv | 114 -
filters/xml | 184 +-
groups/atom/BRANCH.txt | 16 +-
groups/atom/DATA.txt | 12 +-
groups/atom/FLOPS_DP.txt | 6 +-
groups/atom/FLOPS_SP.txt | 6 +-
groups/atom/FLOPS_X87.txt | 6 +-
groups/atom/MEM.txt | 12 +-
groups/atom/TLB.txt | 3 +-
groups/broadwell/BRANCH.txt | 31 +
groups/broadwell/CLOCK.txt | 23 +
groups/broadwell/DATA.txt | 22 +
groups/broadwell/ENERGY.txt | 39 +
groups/broadwell/FALSE_SHARE.txt | 25 +
groups/broadwell/FLOPS_AVX.txt | 24 +
groups/broadwell/FLOPS_DP.txt | 29 +
groups/broadwell/FLOPS_SP.txt | 29 +
groups/broadwell/ICACHE.txt | 25 +
groups/broadwell/L2.txt | 37 +
groups/broadwell/L2CACHE.txt | 34 +
groups/broadwell/L3.txt | 36 +
groups/broadwell/L3CACHE.txt | 35 +
groups/broadwell/RECOVERY.txt | 22 +
groups/broadwell/TLB_DATA.txt | 35 +
groups/broadwell/TLB_INSTR.txt | 28 +
groups/broadwellD/BRANCH.txt | 31 +
groups/broadwellD/CACHES.txt | 123 +
groups/broadwellD/CLOCK.txt | 23 +
groups/broadwellD/DATA.txt | 22 +
groups/broadwellD/ENERGY.txt | 39 +
groups/broadwellD/FALSE_SHARE.txt | 25 +
groups/broadwellD/FLOPS_AVX.txt | 24 +
groups/broadwellD/FLOPS_DP.txt | 29 +
groups/broadwellD/FLOPS_SP.txt | 29 +
groups/broadwellD/HA.txt | 40 +
groups/broadwellD/ICACHE.txt | 25 +
groups/broadwellD/L2.txt | 37 +
groups/broadwellD/L2CACHE.txt | 34 +
groups/broadwellD/L3.txt | 36 +
groups/broadwellD/L3CACHE.txt | 35 +
groups/broadwellD/MEM.txt | 52 +
groups/broadwellD/MEM_DP.txt | 66 +
groups/broadwellD/MEM_SP.txt | 68 +
groups/broadwellD/RECOVERY.txt | 22 +
groups/broadwellD/TLB_DATA.txt | 35 +
groups/broadwellD/TLB_INSTR.txt | 28 +
groups/broadwellEP/BRANCH.txt | 31 +
groups/broadwellEP/CACHES.txt | 123 +
groups/broadwellEP/CLOCK.txt | 23 +
groups/broadwellEP/DATA.txt | 22 +
groups/broadwellEP/ENERGY.txt | 35 +
groups/broadwellEP/FALSE_SHARE.txt | 29 +
groups/broadwellEP/FLOPS_AVX.txt | 24 +
groups/broadwellEP/FLOPS_DP.txt | 29 +
groups/broadwellEP/FLOPS_SP.txt | 29 +
groups/broadwellEP/HA.txt | 40 +
groups/broadwellEP/ICACHE.txt | 25 +
groups/broadwellEP/L2.txt | 37 +
groups/broadwellEP/L2CACHE.txt | 34 +
groups/broadwellEP/L3.txt | 36 +
groups/broadwellEP/L3CACHE.txt | 35 +
groups/broadwellEP/MEM.txt | 52 +
groups/broadwellEP/MEM_DP.txt | 66 +
groups/broadwellEP/MEM_SP.txt | 68 +
groups/broadwellEP/NUMA.txt | 41 +
groups/broadwellEP/QPI.txt | 49 +
groups/broadwellEP/TLB_DATA.txt | 35 +
groups/broadwellEP/TLB_INSTR.txt | 28 +
groups/core2/BRANCH.txt | 16 +-
groups/core2/CACHE.txt | 29 +-
groups/core2/CLOCK.txt | 19 +
groups/core2/DATA.txt | 6 +-
groups/core2/FLOPS_DP.txt | 9 +-
groups/core2/FLOPS_SP.txt | 9 +-
groups/core2/FLOPS_X87.txt | 9 +-
groups/core2/L2.txt | 19 +-
groups/core2/L2CACHE.txt | 13 +-
groups/core2/MEM.txt | 9 +-
groups/core2/TLB.txt | 9 +-
groups/core2/UOPS.txt | 22 +
groups/core2/UOPS_RETIRE.txt | 25 +
groups/haswell/BRANCH.txt | 14 +-
groups/haswell/CACHES.txt | 71 +
groups/haswell/CLOCK.txt | 2 +-
groups/haswell/DATA.txt | 13 +-
groups/haswell/ENERGY.txt | 11 +-
groups/haswell/FALSE_SHARE.txt | 28 +
groups/haswell/FLOPS_AVX.txt | 28 +
groups/haswell/ICACHE.txt | 14 +-
groups/haswell/L2.txt | 20 +-
groups/haswell/L2CACHE.txt | 21 +-
groups/haswell/L3.txt | 22 +-
groups/haswell/L3CACHE.txt | 24 +-
groups/haswell/RECOVERY.txt | 22 +
groups/haswell/TLB_DATA.txt | 20 +-
groups/haswell/TLB_INSTR.txt | 10 +-
groups/haswell/UOPS.txt | 35 +
groups/haswell/UOPS_EXEC.txt | 31 +
groups/haswell/UOPS_ISSUE.txt | 31 +
groups/haswell/UOPS_RETIRE.txt | 31 +
groups/haswellEP/BRANCH.txt | 31 +
groups/haswellEP/CACHES.txt | 123 +
groups/haswellEP/CBOX.txt | 61 +
groups/haswellEP/CLOCK.txt | 23 +
groups/haswellEP/DATA.txt | 22 +
groups/haswellEP/ENERGY.txt | 35 +
groups/haswellEP/FALSE_SHARE.txt | 34 +
groups/haswellEP/FLOPS_AVX.txt | 28 +
groups/haswellEP/HA.txt | 40 +
groups/haswellEP/ICACHE.txt | 33 +
groups/haswellEP/L2.txt | 37 +
groups/haswellEP/L2CACHE.txt | 34 +
groups/haswellEP/L3.txt | 36 +
groups/haswellEP/L3CACHE.txt | 35 +
groups/haswellEP/MEM.txt | 52 +
groups/haswellEP/NUMA.txt | 33 +
groups/haswellEP/QPI.txt | 49 +
groups/haswellEP/RECOVERY.txt | 22 +
groups/haswellEP/SBOX.txt | 28 +
groups/haswellEP/TLB_DATA.txt | 35 +
groups/haswellEP/TLB_INSTR.txt | 28 +
groups/haswellEP/UOPS.txt | 35 +
groups/haswellEP/UOPS_EXEC.txt | 31 +
groups/haswellEP/UOPS_ISSUE.txt | 31 +
groups/haswellEP/UOPS_RETIRE.txt | 31 +
groups/interlagos/BRANCH.txt | 18 +-
groups/interlagos/CACHE.txt | 30 +-
groups/interlagos/CPI.txt | 5 +
groups/interlagos/DATA.txt | 4 +-
groups/interlagos/FLOPS_DP.txt | 8 +-
groups/interlagos/FLOPS_SP.txt | 8 +-
groups/interlagos/FPU_EXCEPTION.txt | 2 +-
groups/interlagos/ICACHE.txt | 16 +-
groups/interlagos/L2.txt | 12 +-
groups/interlagos/L2CACHE.txt | 18 +-
groups/interlagos/L3.txt | 23 +-
groups/interlagos/L3CACHE.txt | 18 +-
groups/interlagos/LINKS.txt | 4 +-
groups/interlagos/MEM.txt | 2 +-
groups/interlagos/NUMA.txt | 4 +-
groups/interlagos/NUMA_0_3.txt | 28 +
groups/interlagos/NUMA_4_7.txt | 28 +
groups/ivybridge/BRANCH.txt | 14 +-
groups/ivybridge/CLOCK.txt | 2 +-
groups/ivybridge/DATA.txt | 8 +-
groups/ivybridge/ENERGY.txt | 12 +-
groups/ivybridge/FALSE_SHARE.txt | 25 +
groups/ivybridge/FLOPS_AVX.txt | 12 +-
groups/ivybridge/FLOPS_DP.txt | 17 +-
groups/ivybridge/FLOPS_SP.txt | 18 +-
groups/ivybridge/ICACHE.txt | 14 +-
groups/ivybridge/L2.txt | 28 +-
groups/ivybridge/L2CACHE.txt | 19 +-
groups/ivybridge/L3.txt | 22 +-
groups/ivybridge/L3CACHE.txt | 25 +-
groups/ivybridge/MEM.txt | 32 -
groups/ivybridge/MEM_DP.txt | 57 -
groups/ivybridge/MEM_SP.txt | 57 -
groups/ivybridge/RECOVERY.txt | 22 +
groups/ivybridge/TLB_DATA.txt | 20 +-
groups/ivybridge/TLB_INSTR.txt | 10 +-
groups/ivybridge/UOPS.txt | 35 +
groups/ivybridge/UOPS_EXEC.txt | 31 +
groups/ivybridge/UOPS_ISSUE.txt | 31 +
groups/ivybridge/UOPS_RETIRE.txt | 31 +
groups/ivybridgeEP/BRANCH.txt | 31 +
groups/ivybridgeEP/CACHES.txt | 121 +
groups/ivybridgeEP/CBOX.txt | 55 +
groups/ivybridgeEP/CLOCK.txt | 23 +
groups/ivybridgeEP/DATA.txt | 22 +
groups/ivybridgeEP/ENERGY.txt | 33 +
groups/ivybridgeEP/FALSE_SHARE.txt | 32 +
groups/ivybridgeEP/FLOPS_AVX.txt | 26 +
groups/ivybridgeEP/FLOPS_DP.txt | 31 +
groups/ivybridgeEP/FLOPS_SP.txt | 31 +
groups/ivybridgeEP/ICACHE.txt | 33 +
groups/ivybridgeEP/L2.txt | 38 +
groups/ivybridgeEP/L2CACHE.txt | 34 +
groups/ivybridgeEP/L3.txt | 36 +
groups/ivybridgeEP/L3CACHE.txt | 36 +
groups/ivybridgeEP/MEM.txt | 49 +
groups/ivybridgeEP/MEM_DP.txt | 68 +
groups/ivybridgeEP/MEM_SP.txt | 70 +
groups/ivybridgeEP/NUMA.txt | 33 +
groups/ivybridgeEP/QPI.txt | 52 +
groups/ivybridgeEP/RECOVERY.txt | 22 +
groups/ivybridgeEP/TLB_DATA.txt | 35 +
groups/ivybridgeEP/TLB_INSTR.txt | 28 +
groups/ivybridgeEP/UNCORECLOCK.txt | 84 +
groups/ivybridgeEP/UOPS.txt | 35 +
groups/ivybridgeEP/UOPS_EXEC.txt | 31 +
groups/ivybridgeEP/UOPS_ISSUE.txt | 31 +
groups/ivybridgeEP/UOPS_RETIRE.txt | 31 +
groups/k10/BRANCH.txt | 20 +-
groups/k10/CACHE.txt | 30 +-
groups/k10/CPI.txt | 5 +
groups/k10/FLOPS_DP.txt | 16 +-
groups/k10/FLOPS_SP.txt | 16 +-
groups/k10/FLOPS_X87.txt | 18 +-
groups/k10/FPU_EXCEPTION.txt | 2 +-
groups/k10/ICACHE.txt | 16 +-
groups/k10/L2.txt | 20 +-
groups/k10/L2CACHE.txt | 12 +-
groups/k10/L3CACHE.txt | 20 +-
groups/k10/MEM.txt | 19 +-
groups/k10/NUMA.txt | 25 -
groups/k10/NUMA2.txt | 24 -
groups/k10/NUMA_0_3.txt | 27 +
groups/k10/NUMA_4_7.txt | 27 +
groups/k10/TLB.txt | 6 +-
groups/k8/BRANCH.txt | 20 +-
groups/k8/CACHE.txt | 30 +-
groups/k8/CPI.txt | 5 +
groups/k8/ICACHE.txt | 16 +-
groups/k8/L2.txt | 4 +-
groups/kabini/BRANCH.txt | 18 +-
groups/kabini/CACHE.txt | 30 +-
groups/kabini/CPI.txt | 5 +
groups/kabini/DATA.txt | 4 +-
groups/kabini/FLOPS_DP.txt | 11 +-
groups/kabini/FLOPS_SP.txt | 11 +-
groups/kabini/FPU_EXCEPTION.txt | 2 +-
groups/kabini/ICACHE.txt | 16 +-
groups/kabini/L2.txt | 20 +-
groups/kabini/MEM.txt | 2 +-
groups/kabini/NUMA.txt | 28 -
groups/kabini/NUMA2.txt | 28 -
groups/kabini/NUMA_0_3.txt | 28 +
groups/kabini/NUMA_4_7.txt | 28 +
groups/kabini/TLB.txt | 9 +-
groups/nehalem/BRANCH.txt | 14 +-
groups/nehalem/CACHE.txt | 29 +-
groups/nehalem/DATA.txt | 6 +-
groups/nehalem/FLOPS_DP.txt | 16 +-
groups/nehalem/FLOPS_SP.txt | 16 +-
groups/nehalem/FLOPS_X87.txt | 6 +-
groups/nehalem/ICACHE.txt | 25 +
groups/nehalem/L2.txt | 30 +-
groups/nehalem/L2CACHE.txt | 20 +-
groups/nehalem/L3.txt | 18 +-
groups/nehalem/L3CACHE.txt | 30 +-
groups/nehalem/MEM.txt | 49 +-
groups/nehalem/SCHEDULER.txt | 8 +-
groups/nehalem/TLB.txt | 10 +-
groups/nehalem/VIEW.txt | 50 -
groups/nehalemEX/BRANCH.txt | 14 +-
groups/nehalemEX/CACHE.txt | 29 +-
groups/nehalemEX/DATA.txt | 6 +-
groups/nehalemEX/FLOPS_DP.txt | 16 +-
groups/nehalemEX/FLOPS_SP.txt | 16 +-
groups/nehalemEX/FLOPS_X87.txt | 6 +-
groups/nehalemEX/ICACHE.txt | 25 +
groups/nehalemEX/L2.txt | 31 +-
groups/nehalemEX/L2CACHE.txt | 21 +-
groups/nehalemEX/L3.txt | 37 +
groups/nehalemEX/L3CACHE.txt | 48 +
groups/nehalemEX/MEM.txt | 53 +-
groups/nehalemEX/SCHEDULER.txt | 8 +-
groups/nehalemEX/TLB.txt | 8 +-
groups/pentiumm/BRANCH.txt | 17 +
groups/pentiumm/CPI.txt | 22 +
groups/pentiumm/FLOPS_DP.txt | 20 +
groups/pentiumm/FLOPS_SP.txt | 18 +
groups/pentiumm/L3.txt | 30 +
groups/phi/CACHE.txt | 15 +-
groups/phi/COMPUTE_TO_DATA_RATIO.txt | 22 +
groups/phi/CPI.txt | 4 +
groups/phi/L2CACHE.txt | 19 -
groups/phi/MEM.txt | 18 +
groups/phi/MEM1.txt | 13 +-
groups/phi/MEM2.txt | 12 +-
groups/phi/MEM3.txt | 10 +-
groups/phi/MEM4.txt | 12 +-
groups/phi/MEM5.txt | 14 +-
groups/phi/MEM6.txt | 12 +-
groups/phi/MEM_READ.txt | 20 +
groups/phi/MEM_WRITE.txt | 20 +
groups/phi/PAIRING.txt | 14 +-
groups/phi/READ_MISS_RATIO.txt | 9 +-
groups/phi/TLB.txt | 23 +
groups/phi/TLB_L1.txt | 23 +
groups/phi/TLB_L2.txt | 21 +
groups/phi/VECTOR.txt | 10 +-
groups/phi/VECTOR2.txt | 10 +-
groups/phi/VPU_FILL_RATIO_DBL.txt | 12 +-
groups/phi/VPU_PAIRING.txt | 15 +-
groups/phi/VPU_READ_MISS_RATIO.txt | 10 +-
groups/phi/VPU_WRITE_MISS_RATIO.txt | 10 +-
groups/phi/WRITE_MISS_RATIO.txt | 9 +-
groups/sandybridge/BRANCH.txt | 14 +-
groups/sandybridge/CLOCK.txt | 2 +-
groups/sandybridge/DATA.txt | 8 +-
groups/sandybridge/ENERGY.txt | 14 +-
groups/sandybridge/FALSE_SHARE.txt | 25 +
groups/sandybridge/FLOPS_AVX.txt | 13 +-
groups/sandybridge/FLOPS_DP.txt | 18 +-
groups/sandybridge/FLOPS_SP.txt | 18 +-
groups/sandybridge/ICACHE.txt | 33 +
groups/sandybridge/L2.txt | 28 +-
groups/sandybridge/L2CACHE.txt | 19 +-
groups/sandybridge/L3.txt | 22 +-
groups/sandybridge/L3CACHE.txt | 24 +-
groups/sandybridge/MEM.txt | 32 -
groups/sandybridge/MEM_DP.txt | 55 -
groups/sandybridge/MEM_SP.txt | 56 -
groups/sandybridge/RECOVERY.txt | 22 +
groups/sandybridge/TLB_DATA.txt | 20 +-
groups/sandybridge/TLB_INSTR.txt | 10 +-
groups/sandybridge/UOPS.txt | 35 +
groups/sandybridge/UOPS_EXEC.txt | 31 +
groups/sandybridge/UOPS_ISSUE.txt | 31 +
groups/sandybridge/UOPS_RETIRE.txt | 31 +
groups/sandybridgeEP/BRANCH.txt | 31 +
groups/sandybridgeEP/CACHES.txt | 97 +
groups/sandybridgeEP/CLOCK.txt | 27 +
groups/sandybridgeEP/DATA.txt | 22 +
groups/sandybridgeEP/ENERGY.txt | 33 +
groups/sandybridgeEP/FALSE_SHARE.txt | 27 +
groups/sandybridgeEP/FLOPS_AVX.txt | 26 +
groups/sandybridgeEP/FLOPS_DP.txt | 31 +
groups/sandybridgeEP/FLOPS_SP.txt | 31 +
groups/sandybridgeEP/ICACHE.txt | 33 +
groups/sandybridgeEP/L2.txt | 38 +
groups/sandybridgeEP/L2CACHE.txt | 34 +
groups/sandybridgeEP/L3.txt | 36 +
groups/sandybridgeEP/L3CACHE.txt | 36 +
groups/sandybridgeEP/MEM.txt | 40 +
groups/sandybridgeEP/MEM_DP.txt | 59 +
groups/sandybridgeEP/MEM_SP.txt | 61 +
groups/sandybridgeEP/NUMA.txt | 33 +
groups/sandybridgeEP/QPI.txt | 35 +
groups/sandybridgeEP/RECOVERY.txt | 22 +
groups/sandybridgeEP/TLB_DATA.txt | 35 +
groups/sandybridgeEP/TLB_INSTR.txt | 28 +
groups/sandybridgeEP/UOPS.txt | 35 +
groups/sandybridgeEP/UOPS_EXEC.txt | 31 +
groups/sandybridgeEP/UOPS_ISSUE.txt | 31 +
groups/sandybridgeEP/UOPS_RETIRE.txt | 31 +
groups/silvermont/BRANCH.txt | 14 +-
groups/silvermont/CLOCK.txt | 23 +
groups/silvermont/DATA.txt | 22 +
groups/silvermont/ENERGY.txt | 6 +-
groups/silvermont/ICACHE.txt | 6 +-
groups/silvermont/L1TOL2.txt | 28 -
groups/silvermont/L2CACHE.txt | 34 +
groups/silvermont/L2TOMEM.txt | 26 -
groups/silvermont/MEM.txt | 37 +
groups/silvermont/MEM_LAT.txt | 23 +
groups/silvermont/TLB_DATA.txt | 27 +
groups/silvermont/TLB_INSTR.txt | 27 +
groups/skylake/BRANCH.txt | 31 +
groups/skylake/CLOCK.txt | 27 +
groups/skylake/DATA.txt | 22 +
groups/skylake/ENERGY.txt | 39 +
groups/skylake/FALSE_SHARE.txt | 25 +
groups/skylake/FLOPS_AVX.txt | 24 +
groups/skylake/FLOPS_DP.txt | 29 +
groups/skylake/FLOPS_SP.txt | 29 +
groups/skylake/ICACHE.txt | 30 +
groups/skylake/L2.txt | 38 +
groups/skylake/L2CACHE.txt | 34 +
groups/skylake/L3.txt | 36 +
groups/skylake/L3CACHE.txt | 35 +
groups/skylake/RECOVERY.txt | 22 +
groups/skylake/TLB_DATA.txt | 35 +
groups/skylake/TLB_INSTR.txt | 28 +
groups/skylake/UOPS.txt | 29 +
groups/skylake/UOPS_EXEC.txt | 31 +
groups/skylake/UOPS_ISSUE.txt | 31 +
groups/skylake/UOPS_RETIRE.txt | 31 +
groups/westmere/BRANCH.txt | 16 +-
groups/westmere/CACHE.txt | 13 +-
groups/westmere/CLOCK.txt | 18 +
groups/westmere/DATA.txt | 6 +-
groups/westmere/FLOPS_DP.txt | 14 +-
groups/westmere/FLOPS_SP.txt | 14 +-
groups/westmere/FLOPS_X87.txt | 6 +-
groups/westmere/ICACHE.txt | 25 +
groups/westmere/L2.txt | 28 +-
groups/westmere/L2CACHE.txt | 21 +-
groups/westmere/L3.txt | 23 +-
groups/westmere/L3CACHE.txt | 26 +-
groups/westmere/MEM.txt | 53 +-
groups/westmere/TLB.txt | 22 -
groups/westmere/TLB_DATA.txt | 35 +
groups/westmere/TLB_INSTR.txt | 27 +
groups/westmere/UOPS.txt | 35 +
groups/westmere/VIEW.txt | 14 +-
groups/westmereEX/BRANCH.txt | 16 +-
groups/westmereEX/CACHE.txt | 11 +-
groups/westmereEX/DATA.txt | 6 +-
groups/westmereEX/FLOPS_DP.txt | 16 +-
groups/westmereEX/FLOPS_SP.txt | 16 +-
groups/westmereEX/FLOPS_X87.txt | 6 +-
groups/westmereEX/ICACHE.txt | 25 +
groups/westmereEX/L2.txt | 28 +-
groups/westmereEX/L2CACHE.txt | 21 +-
groups/westmereEX/L3.txt | 20 +-
groups/westmereEX/L3CACHE.txt | 52 +
groups/westmereEX/MEM.txt | 49 +-
groups/westmereEX/NUMA.txt | 33 +
groups/westmereEX/TLB.txt | 22 -
groups/westmereEX/TLB_DATA.txt | 35 +
groups/westmereEX/TLB_INSTR.txt | 27 +
groups/westmereEX/UOPS.txt | 35 +
kernel/Makefile | 3 +-
kernel/README | 3 +
make/config_checks.mk | 49 +
make/config_defines.mk | 117 +
make/include_CLANG.mk | 28 +
make/include_GCC.mk | 15 +-
make/include_GCCX86.mk | 22 +-
make/include_ICC.mk | 14 +-
make/include_MIC.mk | 22 +-
monitoring/README.agent | 66 +
monitoring/groups/atom/BW_MEM.txt | 10 +
monitoring/groups/atom/FLOPS_DP.txt | 13 +
monitoring/groups/atom/FLOPS_SP.txt | 12 +
monitoring/groups/broadwell/BW.txt | 13 +
monitoring/groups/broadwell/ENERGY.txt | 18 +
monitoring/groups/broadwell/FLOPS_DP.txt | 22 +
monitoring/groups/broadwell/FLOPS_SP.txt | 22 +
monitoring/groups/broadwellEP/BW.txt | 13 +
monitoring/groups/broadwellEP/ENERGY.txt | 18 +
monitoring/groups/core2/BW_L2.txt | 11 +
monitoring/groups/core2/BW_MEM.txt | 10 +
monitoring/groups/haswell/BW.txt | 13 +
monitoring/groups/haswell/ENERGY.txt | 18 +
monitoring/groups/haswellEP/BW.txt | 32 +
monitoring/groups/haswellEP/ENERGY.txt | 18 +
monitoring/groups/interlagos/BW.txt | 16 +
monitoring/groups/interlagos/CPI.txt | 19 +
monitoring/groups/interlagos/FLOPS.txt | 18 +
monitoring/groups/ivybridge/BW.txt | 13 +
monitoring/groups/ivybridge/ENERGY.txt | 18 +
monitoring/groups/ivybridge/FLOPS_DP.txt | 23 +
monitoring/groups/ivybridge/FLOPS_SP.txt | 24 +
monitoring/groups/ivybridgeEP/BW.txt | 32 +
monitoring/groups/ivybridgeEP/ENERGY.txt | 18 +
monitoring/groups/ivybridgeEP/FLOPS_DP.txt | 23 +
monitoring/groups/ivybridgeEP/FLOPS_SP.txt | 24 +
monitoring/groups/kabini/BW.txt | 14 +
monitoring/groups/kabini/CPI.txt | 19 +
monitoring/groups/kabini/FLOPS.txt | 14 +
monitoring/groups/nehalem/BW.txt | 20 +
monitoring/groups/nehalem/CPI.txt | 14 +
monitoring/groups/nehalem/FLOPS.txt | 20 +
monitoring/groups/nehalemEX/BW.txt | 29 +
monitoring/groups/nehalemEX/CPI.txt | 12 +
monitoring/groups/nehalemEX/FLOPS.txt | 20 +
monitoring/groups/pentiumm/BW.txt | 12 +
monitoring/groups/pentiumm/CPI.txt | 17 +
monitoring/groups/phi/CPI.txt | 17 +
monitoring/groups/sandybridge/BW.txt | 13 +
monitoring/groups/sandybridge/ENERGY.txt | 18 +
monitoring/groups/sandybridge/FLOPS_DP.txt | 24 +
monitoring/groups/sandybridge/FLOPS_SP.txt | 24 +
monitoring/groups/sandybridgeEP/BW.txt | 24 +
monitoring/groups/sandybridgeEP/ENERGY.txt | 18 +
monitoring/groups/sandybridgeEP/FLOPS_DP.txt | 24 +
monitoring/groups/sandybridgeEP/FLOPS_SP.txt | 24 +
monitoring/groups/silvermont/BW.txt | 12 +
monitoring/groups/silvermont/CPI.txt | 14 +
monitoring/groups/silvermont/ENERGY.txt | 16 +
monitoring/groups/westmere/BW.txt | 19 +
monitoring/groups/westmere/CPI.txt | 14 +
monitoring/groups/westmere/FLOPS.txt | 20 +
monitoring/groups/westmereEX/BW.txt | 20 +
monitoring/groups/westmereEX/CPI.txt | 14 +
monitoring/groups/westmereEX/FLOPS.txt | 20 +
monitoring/likwid-agent.conf | 52 +
perl/AsmGen.pl | 284 --
perl/feedGnuplot | 1543 ++++--
perl/gas.pm | 211 -
perl/gen_events.pl | 77 +-
perl/generateGroups.pl | 142 -
perl/generatePas.pl | 163 -
perl/likwid-mpirun | 456 --
perl/likwid-perfscope | 110 -
perl/likwid-setFrequencies | 185 -
perl/set_license.pl | 226 +-
perl/templates/group.tt | 208 -
perl/templates/testcases.tt | 19 -
src/access-daemon/Makefile | 20 +-
src/access-daemon/accessDaemon.c | 908 ++--
src/access-daemon/setFreq.c | 291 +-
src/access.c | 221 +
src/accessClient.c | 257 -
src/access_client.c | 343 ++
src/access_x86.c | 91 +
src/access_x86_msr.c | 288 ++
src/access_x86_pci.c | 313 ++
src/affinity.c | 335 +-
src/allocator.c | 199 -
src/applications/likwid-agent.lua | 559 +++
src/applications/likwid-bench.c | 536 --
src/applications/likwid-features.c | 191 -
src/applications/likwid-features.lua | 191 +
src/applications/likwid-genCfg.c | 122 -
src/applications/likwid-genTopoCfg.lua | 153 +
src/applications/likwid-memsweeper.c | 138 -
src/applications/likwid-memsweeper.lua | 89 +
src/applications/likwid-mpirun.lua | 1967 ++++++++
src/applications/likwid-perfctr.c | 528 --
src/applications/likwid-perfctr.lua | 775 +++
src/applications/likwid-perfscope.lua | 560 +++
src/applications/likwid-pin.c | 346 --
src/applications/likwid-pin.lua | 275 ++
src/applications/likwid-powermeter.c | 507 --
src/applications/likwid-powermeter.lua | 388 ++
src/applications/likwid-setFrequencies.lua | 396 ++
src/applications/likwid-topology.c | 509 --
src/applications/likwid-topology.lua | 394 ++
src/applications/likwid.lua | 1142 +++++
src/asciiBoxes.c | 256 -
src/asciiTable.c | 236 -
src/barrier.c | 155 -
src/bench.c | 537 --
src/bitUtil.c | 14 +-
src/bstrlib.c | 3072 ++++++------
src/calculator.c | 926 ++++
src/calculator_stack.c | 77 +
src/configuration.c | 339 ++
src/cpuFeatures.c | 659 ++-
src/cpuid.c | 1244 -----
src/cpustring.c | 577 +++
src/daemon.c | 123 -
src/ghash.c | 52 +-
src/hashTable.c | 94 +-
src/includes/access.h | 44 +
src/includes/accessClient.h | 55 -
src/includes/accessClient_types.h | 87 -
src/includes/access_client.h | 11 +
src/includes/access_client_types.h | 65 +
src/includes/access_x86.h | 13 +
src/includes/access_x86_msr.h | 12 +
src/includes/access_x86_pci.h | 12 +
src/includes/affinity.h | 24 +-
src/includes/affinity_types.h | 42 -
src/includes/allocator.h | 48 -
src/includes/asciiBoxes.h | 42 -
src/includes/asciiBoxes_types.h | 47 -
src/includes/asciiTable.h | 45 -
src/includes/asciiTable_types.h | 48 -
src/includes/barrier.h | 62 -
src/includes/barrier_types.h | 49 -
src/includes/bitUtil.h | 8 +-
src/includes/bstrlib.h | 46 +-
src/includes/calculator.h | 38 +
src/includes/calculator_stack.h | 48 +
src/includes/configuration.h | 46 +
src/includes/cpuFeatures.h | 8 +-
src/includes/cpuFeatures_types.h | 42 +-
src/includes/cpuid.h | 141 +-
src/includes/cpuid_types.h | 115 -
src/includes/daemon.h | 42 -
src/includes/error.h | 70 +-
src/includes/ghash.h | 42 +-
src/includes/hashTable.h | 13 +-
src/includes/libperfctr_types.h | 15 +-
src/includes/likwid.h | 1389 +++++-
src/includes/lock.h | 8 +-
src/includes/memsweep.h | 15 +-
src/includes/msr.h | 47 -
src/includes/multiplex.h | 40 -
src/includes/multiplex_types.h | 42 -
src/includes/numa.h | 43 +-
src/includes/numa_hwloc.h | 40 +
src/includes/numa_proc.h | 39 +
src/includes/numa_types.h | 52 -
src/includes/pci.h | 49 -
src/includes/pci_hwloc.h | 37 +
src/includes/pci_proc.h | 37 +
src/includes/pci_types.h | 69 +-
src/includes/perfgroup.h | 94 +
src/includes/perfmon.h | 88 +-
src/includes/perfmon_atom.h | 11 +-
src/includes/perfmon_atom_events.txt | 17 +-
src/includes/perfmon_broadwell.h | 1793 +++++++
src/includes/perfmon_broadwellEP_counters.h | 362 ++
src/includes/perfmon_broadwellEP_events.txt | 2569 ++++++++++
src/includes/perfmon_broadwell_counters.h | 83 +
src/includes/perfmon_broadwell_events.txt | 665 +++
src/includes/perfmon_broadwelld_counters.h | 252 +
src/includes/perfmon_broadwelld_events.txt | 1984 ++++++++
src/includes/perfmon_core2.h | 341 +-
src/includes/perfmon_core2_counters.h | 31 +-
src/includes/perfmon_core2_events.txt | 217 +-
src/includes/perfmon_haswell.h | 1973 +++++++-
src/includes/perfmon_haswellEP_counters.h | 330 ++
src/includes/perfmon_haswellEP_events.txt | 2616 ++++++++++
src/includes/perfmon_haswell_counters.h | 71 +-
src/includes/perfmon_haswell_events.txt | 651 ++-
src/includes/perfmon_interlagos.h | 335 +-
src/includes/perfmon_interlagos_counters.h | 35 +-
src/includes/perfmon_interlagos_events.txt | 130 +-
src/includes/perfmon_ivybridge.h | 1892 +++++---
src/includes/perfmon_ivybridgeEP_counters.h | 316 ++
src/includes/perfmon_ivybridgeEP_events.txt | 2072 ++++++++
src/includes/perfmon_ivybridge_counters.h | 90 +-
src/includes/perfmon_ivybridge_events.txt | 821 ++--
src/includes/perfmon_k10.h | 231 +-
src/includes/perfmon_k10_counters.h | 26 +-
src/includes/perfmon_k10_events.txt | 53 +-
src/includes/perfmon_k8.h | 17 +-
src/includes/perfmon_k8_events.txt | 42 +-
src/includes/perfmon_kabini.h | 398 +-
src/includes/perfmon_kabini_counters.h | 39 +-
src/includes/perfmon_kabini_events.txt | 30 +-
src/includes/perfmon_nehalem.h | 698 ++-
src/includes/perfmon_nehalemEX.h | 1828 ++++---
src/includes/perfmon_nehalemEX_counters.h | 185 +
src/includes/perfmon_nehalemEX_events.txt | 425 +-
src/includes/perfmon_nehalemEX_westmereEX_common.h | 94 +
src/includes/perfmon_nehalem_counters.h | 58 +-
src/includes/perfmon_nehalem_events.txt | 33 +-
src/includes/perfmon_p6_events.txt | 19 +-
src/includes/perfmon_perf.h | 60 +
src/includes/perfmon_phi.h | 241 +-
src/includes/perfmon_phi_counters.h | 23 +-
src/includes/perfmon_phi_events.txt | 17 +-
src/includes/perfmon_pm.h | 249 +-
src/includes/perfmon_pm_counters.h | 22 +-
src/includes/perfmon_pm_events.txt | 36 +-
src/includes/perfmon_sandybridge.h | 2129 ++++++--
src/includes/perfmon_sandybridgeEP_counters.h | 214 +
src/includes/perfmon_sandybridgeEP_events.txt | 1342 +++++
src/includes/perfmon_sandybridge_counters.h | 95 +-
src/includes/perfmon_sandybridge_events.txt | 652 +--
src/includes/perfmon_silvermont.h | 527 +-
src/includes/perfmon_silvermont_counters.h | 37 +-
src/includes/perfmon_silvermont_events.txt | 424 +-
src/includes/perfmon_skylake.h | 753 +++
src/includes/perfmon_skylake_counters.h | 84 +
src/includes/perfmon_skylake_events.txt | 599 +++
src/includes/perfmon_types.h | 314 +-
src/includes/perfmon_westmere.h | 13 +-
src/includes/perfmon_westmereEX.h | 1943 +++++---
src/includes/perfmon_westmereEX_counters.h | 274 +-
src/includes/perfmon_westmereEX_events.txt | 405 +-
src/includes/perfmon_westmere_events.txt | 168 +-
src/includes/power.h | 179 +-
src/includes/power_types.h | 39 +-
src/includes/registers.h | 554 ++-
src/includes/registers_types.h | 209 +
src/includes/strUtil.h | 55 -
src/includes/strUtil_types.h | 61 -
src/includes/test_types.h | 108 -
src/includes/textcolor.h | 8 +-
src/includes/thermal.h | 51 +-
src/includes/thermal_types.h | 18 +-
src/includes/threads.h | 107 -
src/includes/threads_types.h | 57 -
src/includes/timer.h | 76 +-
src/includes/timer_types.h | 8 +-
src/includes/tlb-info.h | 89 +
src/includes/topology.h | 144 +
src/includes/topology_cpuid.h | 43 +
src/includes/topology_hwloc.h | 52 +
src/includes/topology_proc.h | 51 +
src/includes/topology_types.h | 73 +
src/includes/tree.h | 9 +-
src/includes/tree_types.h | 34 +-
src/includes/types.h | 30 +-
src/libperfctr.c | 816 ++--
src/likwid.f90 | 102 +-
src/likwid_f90_interface.c | 57 +-
src/loadData.S | 44 +
src/loadData.s | 22 -
src/loadData.s.tmp | 0
src/luawid.c | 2334 +++++++++
src/memsweep.c | 59 +-
src/msr.c | 307 --
src/multiplex.c | 165 -
src/numa.c | 424 +-
src/numa_hwloc.c | 415 ++
src/numa_proc.c | 383 ++
src/pci.c | 398 --
src/pci_hwloc.c | 81 +
src/pci_proc.c | 125 +
src/perfgroup.c | 1285 +++++
src/perfmon.c | 3639 +++++++++-----
src/perfmon_perf.c | 260 +
src/power.c | 507 +-
src/pthread-overload/Makefile | 25 +-
src/pthread-overload/pthread-overload.c | 108 +-
src/strUtil.c | 975 ----
src/thermal.c | 22 +-
src/threads.c | 217 -
src/timer.c | 399 +-
src/topology.c | 1041 ++++
src/topology_cpuid.c | 939 ++++
src/topology_hwloc.c | 327 ++
src/topology_proc.c | 626 +++
src/tree.c | 116 +-
test/MPI_pin_test.c | 53 +-
test/Makefile | 73 +-
test/accuracy/Makefile | 39 +-
test/accuracy/README | 7 +-
test/accuracy/TESTS/BRANCH.txt | 42 +
test/accuracy/TESTS/CLOCK.txt | 53 +
test/accuracy/TESTS/DATA.txt | 34 +
test/accuracy/TESTS/FLOPS_AVX.txt | 25 +-
test/accuracy/TESTS/FLOPS_DP.txt | 105 +-
test/accuracy/TESTS/FLOPS_SP.txt | 92 +-
test/accuracy/TESTS/HA.txt | 58 +
test/accuracy/TESTS/L2.txt | 62 +-
test/accuracy/TESTS/L3.txt | 62 +-
test/accuracy/TESTS/MEM.txt | 62 +-
test/accuracy/TESTS/UOPS.txt | 30 +
test/accuracy/likwid-accuracy.py | 540 +-
test/accuracy/likwid-adjust-test-sizes.py | 105 +
test/accuracy/likwid-tester | 220 -
test/accuracy/likwid-tester-plot | 78 -
test/executable_tests/Makefile | 14 +-
test/executable_tests/README | 3 +
test/executable_tests/likwid-bench.txt | 39 +-
test/executable_tests/likwid-features.txt | 9 -
test/executable_tests/likwid-genCfg.txt | 5 -
test/executable_tests/likwid-genTopoCfg.txt | 5 +
test/executable_tests/likwid-memsweeper.txt | 8 +-
test/executable_tests/likwid-mpirun.txt | 39 +
test/executable_tests/likwid-perfctr.txt | 73 +-
test/executable_tests/likwid-pin.txt | 12 +-
test/executable_tests/likwid-powermeter.txt | 28 +-
test/executable_tests/likwid-setFreq.txt | 6 -
test/executable_tests/likwid-setFrequencies.txt | 14 +
test/executable_tests/likwid-topology.txt | 9 +-
test/executable_tests/tester.sh | 24 +-
test/serial.c | 43 +
test/stream.c | 423 --
test/test-likwidAPI.c | 2099 ++++++++
test/test-msr-access.c | 101 +
test/testTBB.cc | 67 +
1058 files changed, 129398 insertions(+), 27430 deletions(-)
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..3877f0e
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,15 @@
+language: c
+compiler: gcc
+install: true
+dist: trusty
+sudo: required
+script:
+ - make && sudo make install
+ - /usr/local/bin/likwid-topology -c -C -g
+ - /usr/local/bin/likwid-pin -p
+ - sudo modprobe msr
+ - ls -la /dev/cpu/*
+ - ls -la /usr/local/sbin/*
+ - make -C test streamGCC
+ - /usr/local/bin/likwid-perfctr -i
+ - /usr/local/bin/likwid-bench -t copy -w N:100MB:2
diff --git a/CHANGELOG b/CHANGELOG
new file mode 100644
index 0000000..6c84df0
--- /dev/null
+++ b/CHANGELOG
@@ -0,0 +1,59 @@
+
+
+# Changelog 4.1.0
+- Support for Intel Skylake (Core + Uncore)
+- Support for Intel Broadwell (Core + Uncore)
+- Support for Intel Broadwell D (Core + Uncore)
+- Support for Intel Broadwell EP/EN/EX (Core + Uncore)
+- Support for Intel Airmont (Core)
+- Uncore support for Intel SandyBridge, IvyBridge and Haswell
+- Performance group and event set handling in library
+- Internal calculator for derived metrics
+- Improvement of Marker API
+- Get results/metrics of last measurement cycle
+- Fixed most memory leaks
+- Respect 'Intel PMU sharing guide'
+- Update of internal Lua to 5.3
+- More examples (C++11 threads, Cilk+, TBB)
+- Test suite for executables and library
+- Accuracy checker supports multiple CPUs
+- Security checked access daemon
+- Likwid-bench supports Integer benchmarks
+- Likwid-bench selects the iteration count automatically
+- Likwid-bench has new FMA related benchmarks
+- Likwid-mpirun supports SLURM job scheduler
+- New tool likwid-features
+
+# Changelog 4.0.1
+- likwid-bench: Iteration determination is done serially
+- likwid-bench: Manual selection of iterations possible
+- likwid-perfctr: Set cpuset to all CPUs not only the first
+- likwid-pin: Set cpuset to all CPUs not only the first
+- likwid-accuracy.py: Enhanced plotting functions, use only instrumented likwid-bench
+- likwid-accessD: Check for allowed register for PCI accesses
+- Add models HASWELL_M1 (0x45) and HASWELL_M2 (0x46) to likwid-powermeter and likwid-accessD
+- New test application using Cilk and Marker API
+- New test application using C++11 threads and Marker API
+- likwid-agent: gmetric version check for --group option and s/\s*/_/ in metric names
+- likwid-powermeter: Print RAPL domain name
+- Marker API: Initialize access already at likwid_markerInit()
+- Marker API: likwid_markerThreadInit() only pins if not already pinned
+
+# Changelog 4.0.0
+
+- Support for Intel Broadwell
+- Uncore support for all Uncore-aware architectures
+ - Nehalem (EX)
+ - Westmere (EX)
+ - SandyBridge EP
+ - IvyBridge EP
+ - Haswell EP
+- Measure multiple event sets in a round-robin fashion (no multiplexing!)
+- Event options to filter the counter increments
+- Whole LIKWID functionality is exposed as API for C/C++ and Lua
+- New functions in the Marker API to switch event sets and get intermediate results
+- Topology code relies on hwloc. CPUID is still included but only as fallback
+- Most LIKWID applications are written in Lua (only exception likwid-bench)
+- Monitoring daemon likwid-agent with multiple output backends
+- More performance groups
+
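
Several of the changelog entries above concern the Marker API (access initialization in likwid_markerInit(), switching event sets, reading intermediate results). As a minimal sketch of what an instrumented region looks like in C, assuming the LIKWID_MARKER_* convenience macros of the 4.x likwid.h (the examples/ directory added by this commit is the authoritative reference):

    #include <stdio.h>
    #include <likwid.h>   /* assumes a 4.x likwid.h providing the LIKWID_MARKER_* macros */

    int main(void)
    {
        double sum = 0.0;
        int i;

        LIKWID_MARKER_INIT;        /* picks up the environment prepared by likwid-perfctr -m */
        LIKWID_MARKER_THREADINIT;  /* registers the calling thread (and pins it if not already pinned) */

        LIKWID_MARKER_START("accumulate");
        for (i = 1; i <= 100000000; i++)
            sum += 1.0 / (double)i;
        LIKWID_MARKER_STOP("accumulate");

        LIKWID_MARKER_CLOSE;       /* writes the region results for likwid-perfctr to report */
        printf("sum = %f\n", sum);
        return 0;
    }

Built with something like 'gcc -DLIKWID_PERFMON marker.c -llikwid' and run under 'likwid-perfctr -C 0 -g FLOPS_DP -m ./a.out', the region appears as a separate measurement block in the output; without -DLIKWID_PERFMON the macros expand to nothing, so the instrumentation costs nothing in normal builds.
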
diff --git a/INSTALL b/INSTALL
index 5939aa9..c4bfb05 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,106 +1,140 @@
== Basic build ==
1. Edit config.mk. Follow the comments there.
- Optionally you can change compiler settings in include_[GCC|ICC|GCCX86].mk.
- Please note that only the default compiler flags are supported and tested.
-2. make
-3. make install (required)
-4. setup access to the msr device files (see end of this document)
+ Optionally you can change compiler settings in include_[GCC|CLANG|ICC|MIC].mk.
+ Please note that only the default GCC compiler flags are supported and tested.
+2. make (Builds hwloc, lua, Likwid libraries, access daemons and likwid-bench)
+3. make install (this is required for likwid-pin and if you use the accessDaemon)
Only the default flag set is tested. As it is not possible to test all
-compiler setting variants the Intel icc compiler is only build tested. A basic
-function test is done for the icc binary. The only variant fully tested is gcc
-with default compiler flags. It is therefore recommended to use gcc with the
-default flags. If you want to use and build the Fortran interface you can mix
-GCC with the Intel Fortran Compiler. More information on this can be found in
-the WIKI. On 32bit systems you have to pick the GCCX86 compiler target.
+compiler setting variants, the Intel icc compiler and Clang are only build tested.
+A basic function test is done for the icc binary. The only variant fully tested
+is gcc with default compiler flags. It is therefore recommended to use gcc with
+the default flags. If you want to use and build the Fortran interface you can mix
+GCC with the Intel Fortran Compiler (default setup). You can change the Fortran
+compiler in make/include_[GCC|CLANG|ICC|MIC].mk.
*NOTICE*
-All generated files are located in the [GCC|ICC|GCCX86] build directory.
-This includes the dependency files, object files and also the
-generated source files and the pas and assembly files for likwid-bench.
+All generated files are located in the [GCC|ICC|CLANG|MIC] build directory.
+This includes the dependency files and object files. The
+generated source files and the pas and assembly files for likwid-bench are built
+in bench/[GCC|ICC|CLANG|MIC].
If you debug your likwid-bench benchmarks you can look at all
intermediate build files and also the final assembly code.
+== Build on Xeon Phi ==
+For builds for the Xeon Phi coprocessor, the accessDaemon and the frequency
+daemon are disabled. Moreover, the access mode is set to 'direct'. This was done
+because it is important to run as few processes as possible on the Xeon Phi and
+the accessDaemon would start one process per hardware thread.
+In order to build Likwid for the Xeon Phi processor, you have to change the
+RPATHS variable in make/include_MIC.mk to point to the folder with the Intel
+libraries like libimf.so. This is crucial because when using a suid-root
+executable, the LD_LIBRARY_PATH gets lost but Likwid still needs to know where
+the Intel libraries reside.
+After installation change the owner of likwid-lua to root and set the suid-root
+bit for likwid-lua:
+chown root <BINPATH>/likwid-lua
+chmod u+s <BINPATH>/likwid-lua
+Afterwards Likwid can be used as on any other system.
+
== Known problems ==
On very old systems with old kernels (< 2.6.7) or old glibc versions likwid
is built with reduced functionality. This includes missing support for NUMA
and pinning.
+likwid-setFrequencies can only be used if the acpi_cpufreq module is loaded. It
+is not possible to fix the frequency with the intel_pstate module.
== Additional Targets ==
make clean - clean the object directory
make distclean - clean also the executables/libraries
make uninstall - delete installed files
+make docs - generate html documentation using doxygen
+make local - set paths in Lua files to work from current directory
+ (for testing only! Uses already installed access daemons and
+ libraries. Often you have to set the LD_LIBRARY_PATH to
+ contain the current folder)
-== Build accessDaemon ==
+== Dependencies ==
+Most parts of the Likwid suite do not have external dependencies that need to be
+installed before you can build Likwid. If external libraries are used, they are
+shipped with Likwid.
-To build the accessDaemon:
+Included dependencies:
+- hwloc
+- Lua
+- Perl Template toolkit
-1. Set the desired default ACCESSMODE. You can overwrite this on the command line.
-2. make will also build the accessDaemon
-3. Install with
- make install
+Build dependencies:
+- C compiler (commonly gcc, but clang and icc are also possible)
+- make
+- Perl
-With the standard make install target the daemon will also be installed in
-${PREFIX}/sbin . Don't forget to copy the dameon if you configured a different
-path in ACCESSDAEMON.
+Runtime dependencies for likwid-perfscope:
+- gnuplot
-== Setup of msr module ==
+Runtime dependencies for likwid-agent (if enabled in configfile):
+- gmetric (Output to Ganglia Monitoring System)
+- rrdtool (Output to RRDs)
+- logger (Output to syslog)
-likwid-perfctr, likwid-powermeter and likwid-features require the Linux msr kernel module. This module
-is part of most standard distro kernels. You have to be root to do the initial setup.
+For the HTML documentation you further need doxygen.
-Check if msr device files are there with 'ls /dev/cpu/0/'. If msr device files are not there try:
+== Build accessDaemon ==
-1. Check if the msr module is loaded with 'lsmod | grep msr' . There should be an output.
-2. It the module is not loaded load it with 'modprobe msr' . For automatic loading at startup
-consult your distros documentation how to do so.
+To configure the path and build the accessDaemon:
-Once you have the msr device files avilable:
-3. Adopt access rights on the msr device files for normal user. To allow everybody access you can
-use 'chmod o+rw /dev/cpu/*/msr' . This is only recommended on save single user desktop systems.
+1. Edit config.mk and configure the path in the ACCESSDAEMON variable. You can
+ overwrite it later in likwid.cfg.
+2. Set the desired default ACCESSMODE. You can overwrite this on the command
+ line or in likwid.cfg.
+3. make will also build the accessDaemon
+4. Install with (sudo) make install
+
+With the standard make install target the daemon will also be installed
+into the path set in $ACCESSDAEMON. It also sets the owner to root and the suid bit.
+
+== Setup of msr module ==
+
+likwid-perfctr, likwid-powermeter and likwid-agent require the Linux msr kernel
+module. This module is part of most standard distro kernels. You have to be root
+to do the initial setup.
+
+1. Check if the msr module is loaded with 'lsmod | grep msr'.
+ There should be an output.
+2. If the module is not loaded, load it with 'modprobe msr'. For automatic
+ loading at startup consult your distro's documentation on how to do so, commonly
+ by adding 'msr' to /etc/modules.
+3. Adapt the access rights on the msr device files for normal users. To allow
+ everybody access you can use 'chmod o+rw /dev/cpu/*/msr'.
+ This is only recommended on safe single-user desktop systems and might not be
+ enough to grant access to everybody because of POSIX capabilities or other
+ security features of your distro.
As general access to the msr registers is not desired on security-sensitive
-systems you can either implement a more sophisticated access rights settings
+systems, you can implement more sophisticated access rights settings
with e.g. setgid. A common solution used on many other device files, e.g. for
audio, is to introduce a group and make a chown on the msr device files to that
-group. Now if you execute likwid-perfctr with setgid on that group the
-executing user can use the tool but cannot directly write or read the msr
-device files.
+group or use dbus rules. Now if you execute likwid-perfctr with setgid on that
+group the executing user can use the tool but cannot directly write or read the
+msr device files.
A secure solution is to use the accessDaemon, which encapsulates the access to
-the msr device files and performs a address check for allowed registers. For
-more information how to setup and use this solution have a look at the WIKI
-page:
-
-http://code.google.com/p/likwid/wiki/MSRDaemon
+the msr device files and performs an address check for allowed registers. For
+more information on how to set this up, look at the HTML documentation.
-A common solution to give access is to use the likwid-accessD and make it suid root.
-Starting with version 3.1.3 make install will do those steps. Of course this will only
-work as long as you are root while calling make install.
-
-If for you are not root and someone else needs to install the daemon the
-following steps need to be carried out:
-
-1. Go to the directory where you installed the likwid tools.
-2. Change to the sbin directory there.
-3. Execute (as root): chown root.<some user group> likwid-accessD
-4. Execute (as root): chmod u+s likwid-accessD
-
-
-This should be sufficient on many machines.
-You need to perform the same procedure for likwid-setFreq.
-
-=== THIS IS USUALLY NOT NECESSARY ANYMORE ==
A demo for a root exploit involving the msr device files was published. As
a consequence the security settings for access to the msr device files are
-tightened in recent kernels.
+tightened in recent kernels. The exploit used a specific register to redirect the
+entry point of the current process to malware. The daemon grants access only
+to hardware performance counter related registers.
+
Just setting the file access rights or using suid root on the access daemon is
-not sufficient anymore. You have to register your binary now to get access.
-This is only necessary if above setup dos not work.
+not sufficient anymore for some distros. You have to register your binary with
+libcap now to get access. This is only necessary if the above setup does not work.
You register the necessary capability by calling
@@ -108,27 +142,10 @@ sudo setcap cap_sys_rawio+ep EXECUTABLE
on the executables. This is only possible on local file systems.
The only feasible way is to register likwid-accessD and proxy all access through it.
-=== SNIP ==
If you still have problems, please let me know on the likwid mailing list:
-
http://groups.google.com/group/likwid-users
-== NOTICE for Intel Xeon Phi (KNC) ==
-
-If you want to use LIKWID on a Xeon Phi you have to use set MIC as COMPILER in
-config.mk. This build of LIKWID won't be binary compatible with other X86
-processors. It is required to set the default access mode to direct in
-and disable the build of likwid-accessD in config.mk.
-
-To use LIKWID you have to turn of power management on the MIC. LIKWID relies on
-RDTSC being used for wallclock time. On the MIC this is only given if power
-management is turned off. This can be configured in
-/etc/sysconfig/mic/default.conf.
-
-At the end of this file the power management is configured. The following configuration worked:
-
- PowerManagement "cpufreq_off;corec6_off;pc3_off;pc6_off"
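
Once msr (or PCI) access is set up by one of the routes described above, counters can also be programmed without any LIKWID tool through the C API that this release exports. A minimal sketch, assuming the 4.x names declared in likwid.h (topology_init, perfmon_init, perfmon_addEventSet, perfmon_getResult; check the installed header for the exact signatures):

    #include <stdio.h>
    #include <likwid.h>

    int main(void)
    {
        int cpus[1] = {0};                        /* measure hardware thread 0 only */
        char estr[] = "INSTR_RETIRED_ANY:FIXC0";  /* same EVENT:COUNTER syntax as likwid-perfctr -g */
        int gid;

        topology_init();                          /* topology module is needed before perfmon_init() */
        perfmon_init(1, cpus);

        gid = perfmon_addEventSet(estr);
        perfmon_setupCounters(gid);

        perfmon_startCounters();
        /* ... code to be measured ... */
        perfmon_stopCounters();

        printf("retired instructions on CPU 0: %f\n",
               perfmon_getResult(gid, 0, 0));     /* group id, event index, thread index */

        perfmon_finalize();
        topology_finalize();
        return 0;
    }

The fixed counter FIXC0 used here only exists on Intel CPUs; examples/C-likwidAPI.c added by this commit covers the same pattern in more detail.
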
diff --git a/Makefile b/Makefile
index eecd4e9..e1f959b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,16 +1,17 @@
+#
# =======================================================================================
#
# Filename: Makefile
#
# Description: Central Makefile
#
-# Version: 3.1.3
-# Released: 4.11.2014
+# Version: <VERSION>
+# Released: <DATE>
#
# Author: Jan Treibig (jt), jan.treibig at gmail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2013 Jan Treibig
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -32,25 +33,8 @@ GROUP_DIR = ./groups
FILTER_DIR = ./filters
MAKE_DIR = ./make
-#DO NOT EDIT BELOW
-
-# determine kernel Version
-KERNEL_VERSION_MAJOR := $(shell uname -r | awk '{split($$1,a,"."); print a[1]}' | cut -d '-' -f1)
-KERNEL_VERSION := $(shell uname -r | awk '{split($$1,a,"."); print a[2]}' | cut -d '-' -f1)
-KERNEL_VERSION_MINOR := $(shell uname -r | awk '{split($$1,a,"."); print a[3]}' | cut -d '-' -f1)
-
-HAS_MEMPOLICY = $(shell if [ $(KERNEL_VERSION) -lt 7 -a $(KERNEL_VERSION_MAJOR) -lt 3 -a $(KERNEL_VERSION_MINOR) -lt 7 ]; then \
- echo 0; else echo 1; \
- fi; )
-HAS_RDTSCP = $(shell /bin/bash -c "cat /proc/cpuinfo | grep -c rdtscp")
-
-# determine glibc Version
-GLIBC_VERSION := $(shell ldd --version | grep ldd | awk '{ print $$NF }' | awk -F. '{ print $$2 }')
-
-HAS_SCHEDAFFINITY = $(shell if [ $(GLIBC_VERSION) -lt 4 ]; then \
- echo 0; else echo 1; \
- fi; )
+#DO NOT EDIT BELOW
# Dependency chains:
# *.[ch] -> *.o -> executables
@@ -59,161 +43,105 @@ HAS_SCHEDAFFINITY = $(shell if [ $(GLIBC_VERSION) -lt 4 ]; then \
include ./config.mk
include $(MAKE_DIR)/include_$(COMPILER).mk
-INCLUDES += -I./src/includes -I$(BUILD_DIR)
-LIBS +=
-DEFINES += -DVERSION=$(VERSION) \
- -DRELEASE=$(RELEASE) \
- -DCFGFILE=$(CFG_FILE_PATH) \
- -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) \
- -DMAX_NUM_NODES=$(MAX_NUM_NODES) \
- -DHASH_TABLE_SIZE=$(HASH_TABLE_SIZE) \
- -DLIBLIKWIDPIN=$(LIBLIKWIDPIN) \
- -DLIKWIDFILTERPATH=$(LIKWIDFILTERPATH)
+include $(MAKE_DIR)/config_checks.mk
+include $(MAKE_DIR)/config_defines.mk
+
+INCLUDES += -I./src/includes -I$(LUA_FOLDER)/includes -I$(HWLOC_FOLDER)/include -I$(BUILD_DIR)
+LIBS += -ldl
#CONFIGURE BUILD SYSTEM
BUILD_DIR = ./$(COMPILER)
Q ?= @
GENGROUPLOCK = .gengroup
-ifeq ($(COMPILER),MIC)
-BENCH_DIR = ./bench/phi
-else
-ifeq ($(COMPILER),GCCX86)
-BENCH_DIR = ./bench/x86
-else
-BENCH_DIR = ./bench/x86-64
-endif
-endif
-
-LIKWID_LIB = liblikwid
-ifeq ($(SHARED_LIBRARY),true)
-CFLAGS += $(SHARED_CFLAGS) -ggdb
-DYNAMIC_TARGET_LIB := $(LIKWID_LIB).so
-TARGET_LIB := $(DYNAMIC_TARGET_LIB)
-LIBS += -L. -llikwid
-SHARED_LFLAGS += -lm -lpthread
-else
-STATIC_TARGET_LIB := $(LIKWID_LIB).a
-TARGET_LIB := $(STATIC_TARGET_LIB)
-endif
-
-ifneq ($(COLOR),NONE)
-DEFINES += -DCOLOR=$(COLOR)
-endif
-
-ifneq ($(COMPILER),MIC)
- DAEMON_TARGET = likwid-accessD
-else
- $(info Info: Compiling for Xeon Phi. Disabling build of likwid-accessD.);
-endif
-
-ifeq ($(INSTRUMENT_BENCH),true)
-DEFINES += -DPERFMON
-endif
-
-ifeq ($(HAS_MEMPOLICY),1)
-DEFINES += -DHAS_MEMPOLICY
-else
-$(info Kernel $(KERNEL_VERSION_MAJOR).$(KERNEL_VERSION).$(KERNEL_VERSION_MINOR) has no mempolicy support! First Linux kernel with memory policies has version 2.6.7);
-endif
-
-ifeq ($(HAS_RDTSCP),0)
-$(info Building without RDTSCP timing support!);
-else
-ifneq ($(COMPILER),MIC)
-DEFINES += -DHAS_RDTSCP
-else
- $(info Info: Compiling for Xeon Phi. Disabling RDTSCP support.);
-endif
-endif
-
-ifeq ($(HAS_SCHEDAFFINITY),1)
-DEFINES += -DHAS_SCHEDAFFINITY
-PINLIB = liblikwidpin.so
-else
-$(info GLIBC version 2.$(GLIBC_VERSION) has no pthread_setaffinity_np support!);
-PINLIB =
-endif
-
-DEFINES += -DACCESSDAEMON=$(ACCESSDAEMON)
-
-ifeq ($(ACCESSMODE),accessdaemon)
-ifneq ($(COMPILER),MIC)
- DEFINES += -DACCESSMODE=1
-else
- $(info Info: Compiling for Xeon Phi. Set accessmode to direct.);
- DEFINES += -DACCESSMODE=0
-endif
-else
- DEFINES += -DACCESSMODE=0
-endif
-
-SETFREQ_TARGET = likwid-setFreq
-
VPATH = $(SRC_DIR)
OBJ = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c))
-OBJ += $(patsubst $(SRC_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.s))
OBJ += $(patsubst $(SRC_DIR)/%.cc, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cc))
+OBJ += $(patsubst $(SRC_DIR)/%.S, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.S))
+ifeq ($(FILTER_HWLOC_OBJ),yes)
+OBJ := $(filter-out $(BUILD_DIR)/topology_hwloc.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/numa_hwloc.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/pci_hwloc.o,$(OBJ))
+endif
+ifneq ($(FORTRAN_INTERFACE),true)
+OBJ := $(filter-out $(BUILD_DIR)/likwid_f90_interface.o,$(OBJ))
+endif
PERFMONHEADERS = $(patsubst $(SRC_DIR)/includes/%.txt, $(BUILD_DIR)/%.h,$(wildcard $(SRC_DIR)/includes/*.txt))
-OBJ_BENCH = $(patsubst $(BENCH_DIR)/%.ptt, $(BUILD_DIR)/%.o,$(wildcard $(BENCH_DIR)/*.ptt))
-
-APPS = likwid-perfctr \
- likwid-features \
- likwid-powermeter \
- likwid-memsweeper \
- likwid-topology \
- likwid-genCfg \
- likwid-pin \
- likwid-bench
-
-PERL_APPS = likwid-mpirun \
- likwid-setFrequencies \
- likwid-perfscope
-
-DAEMON_APPS = $(SETFREQ_TARGET) \
- $(DAEMON_TARGET)
+OBJ_LUA = $(wildcard ./ext/lua/$(COMPILER)/*.o)
+OBJ_HWLOC = $(wildcard ./ext/hwloc/$(COMPILER)/*.o)
+
+
+L_APPS = likwid-perfctr \
+ likwid-pin \
+ likwid-powermeter \
+ likwid-topology \
+ likwid-memsweeper \
+ likwid-agent \
+ likwid-mpirun \
+ likwid-features \
+ likwid-perfscope \
+ likwid-genTopoCfg
+C_APPS = bench/likwid-bench
+L_HELPER = likwid.lua
+ifeq ($(BUILDFREQ),true)
+ L_APPS += likwid-setFrequencies
+endif
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
-ifneq ($(FORTRAN_INTERFACE),false)
-HAS_FORTRAN_COMPILER = $(shell $(FC) --version 2>/dev/null || echo 'NOFORTRAN' )
-ifeq ($(HAS_FORTRAN_COMPILER),NOFORTRAN)
-FORTRAN_INTERFACE=
-$(info Warning: You have selected the fortran interface in config.mk, but there seems to be no fortran compiler - not compiling it!)
-else
-FORTRAN_INTERFACE = likwid.mod
-FORTRAN_INSTALL = @cp -f likwid.mod $(PREFIX)/include/
-endif
-else
-FORTRAN_INTERFACE =
-FORTRAN_INSTALL =
-endif
-
-all: $(BUILD_DIR) $(GENGROUPLOCK) $(PERFMONHEADERS) $(OBJ) $(OBJ_BENCH) $(TARGET_LIB) $(APPS) $(FORTRAN_INTERFACE) $(PINLIB) $(DAEMON_TARGET) $(SETFREQ_TARGET)
+all: $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_LIB) $(FORTRAN_IF) $(PINLIB) $(L_APPS) $(L_HELPER) $(DAEMON_TARGET) $(FREQ_TARGET) $(BENCH_TARGET)
tags:
@echo "===> GENERATE TAGS"
$(Q)ctags -R
-$(APPS): $(addprefix $(SRC_DIR)/applications/,$(addsuffix .c,$(APPS))) $(BUILD_DIR) $(GENGROUPLOCK) $(OBJ) $(OBJ_BENCH)
- @echo "===> LINKING $@"
- $(Q)${CC} $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) ${LFLAGS} -o $@ $(addprefix $(SRC_DIR)/applications/,$(addsuffix .c,$@)) $(OBJ_BENCH) $(STATIC_TARGET_LIB) $(LIBS)
-
-$(STATIC_TARGET_LIB): $(OBJ)
- @echo "===> CREATE STATIC LIB $(STATIC_TARGET_LIB)"
- $(Q)${AR} -crus $(STATIC_TARGET_LIB) $(OBJ)
-
-$(DYNAMIC_TARGET_LIB): $(OBJ)
- @echo "===> CREATE SHARED LIB $(DYNAMIC_TARGET_LIB)"
- $(Q)${CC} $(SHARED_CFLAGS) -o $(DYNAMIC_TARGET_LIB) $(OBJ) -lm $(SHARED_LFLAGS)
+docs:
+ @echo "===> GENERATE DOXYGEN DOCS"
+ @cp doc/lua-doxygen.md doc/lua-doxygen.md.safe
+ @cp doc/likwid-doxygen.md doc/likwid-doxygen.md.safe
+ @sed -i -e s+'<PREFIX>'+$(PREFIX)+g -e s+'<VERSION>'+$(VERSION)+g -e s+'<DATE>'+'$(DATE)'+g -e s+'<RELEASE>'+$(RELEASE)+g doc/lua-doxygen.md
+ @sed -i -e s+'<PREFIX>'+$(PREFIX)+g -e s+'<VERSION>'+$(VERSION)+g -e s+'<DATE>'+'$(DATE)'+g -e s+'<RELEASE>'+$(RELEASE)+g doc/likwid-doxygen.md
+ $(Q)doxygen doc/Doxyfile
+ @mv doc/lua-doxygen.md.safe doc/lua-doxygen.md
+ @mv doc/likwid-doxygen.md.safe doc/likwid-doxygen.md
+
+$(L_APPS): $(addprefix $(SRC_DIR)/applications/,$(addsuffix .lua,$(L_APPS)))
+ @echo "===> ADJUSTING $@"
+ @if [ "$(ACCESSMODE)" = "direct" ]; then sed -i -e s/"access_mode = 1"/"access_mode = 0"/g $(SRC_DIR)/applications/$@.lua;fi
+ @sed -e s/'<INSTALLED_BINPREFIX>'/$(subst /,\\/,$(INSTALLED_BINPREFIX))/g \
+ -e s/'<INSTALLED_PREFIX>'/$(subst /,\\/,$(INSTALLED_PREFIX))/g \
+ -e s/'<VERSION>'/$(VERSION).$(RELEASE)/g \
+ -e s/'<DATE>'/$(DATE)/g \
+ $(addprefix $(SRC_DIR)/applications/,$(addsuffix .lua,$@)) > $@
+ @if [ "$(ACCESSMODE)" = "direct" ]; then sed -i -e s/"access_mode = 0"/"access_mode = 1"/g $(SRC_DIR)/applications/$@.lua;fi
+
+$(L_HELPER):
+ @echo "===> ADJUSTING $@"
+ @sed -e s/'<PREFIX>'/$(subst /,\\/,$(PREFIX))/g \
+ -e s/'<INSTALLED_LIBPREFIX>'/$(subst /,\\/,$(INSTALLED_LIBPREFIX))/g \
+ -e s/'<INSTALLED_PREFIX>'/$(subst /,\\/,$(INSTALLED_PREFIX))/g \
+ -e s/'<LIKWIDGROUPPATH>'/$(subst /,\\/,$(LIKWIDGROUPPATH))/g \
+ -e s/'<LIBLIKWIDPIN>'/$(subst /,\\/,$(LIBLIKWIDPIN))/g \
+ -e s/'<VERSION>'/$(VERSION)/g \
+ -e s/'<RELEASE>'/$(RELEASE)/g \
+ $(SRC_DIR)/applications/$@ > $@
+
+$(STATIC_TARGET_LIB): $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_HWLOC_LIB) $(TARGET_LUA_LIB)
+ @echo "===> CREATE STATIC LIB $(TARGET_LIB)"
+ $(Q)${AR} -crus $(STATIC_TARGET_LIB) $(OBJ) $(TARGET_HWLOC_LIB) $(TARGET_LUA_LIB)
+
+
+$(DYNAMIC_TARGET_LIB): $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_HWLOC_LIB) $(TARGET_LUA_LIB)
+ @echo "===> CREATE SHARED LIB $(TARGET_LIB)"
+ $(Q)${CC} $(DEBUG_FLAGS) $(SHARED_LFLAGS) -Wl,-soname,$(TARGET_LIB).$(VERSION).$(RELEASE) $(SHARED_CFLAGS) -o $(DYNAMIC_TARGET_LIB) $(OBJ) $(LIBS) $(TARGET_HWLOC_LIB) $(TARGET_LUA_LIB) $(RPATHS)
$(DAEMON_TARGET): $(SRC_DIR)/access-daemon/accessDaemon.c
- @echo "===> Build access daemon $(DAEMON_TARGET)"
- $(Q)$(MAKE) -s -C $(SRC_DIR)/access-daemon $(DAEMON_TARGET)
+ @echo "===> BUILD access daemon likwid-accessD"
+ $(Q)$(MAKE) -s -C $(SRC_DIR)/access-daemon likwid-accessD
-$(SETFREQ_TARGET): $(SRC_DIR)/access-daemon/setFreq.c
- @echo "===> Build frequency daemon $(SETFREQ_TARGET)"
- $(Q)$(MAKE) -s -C $(SRC_DIR)/access-daemon $(SETFREQ_TARGET)
+$(FREQ_TARGET): $(SRC_DIR)/access-daemon/setFreq.c
+ @echo "===> BUILD frequency daemon likwid-setFreq"
+ $(Q)$(MAKE) -s -C $(SRC_DIR)/access-daemon likwid-setFreq
$(BUILD_DIR):
@mkdir $(BUILD_DIR)
@@ -227,135 +155,435 @@ $(GENGROUPLOCK): $(foreach directory,$(shell ls $(GROUP_DIR)), $(wildcard $(GROU
$(Q)$(GEN_GROUPS) ./groups $(BUILD_DIR) ./perl/templates
$(Q)touch $(GENGROUPLOCK)
-$(FORTRAN_INTERFACE): $(SRC_DIR)/likwid.f90
+$(FORTRAN_IF): $(SRC_DIR)/likwid.f90
@echo "===> COMPILE FORTRAN INTERFACE $@"
$(Q)$(FC) -c $(FCFLAGS) $<
@rm -f likwid.o
+$(TARGET_LUA_LIB):
+ @echo "===> ENTER $(LUA_FOLDER)"
+ $(Q)$(MAKE) -s --no-print-directory -C $(LUA_FOLDER) $(MAKECMDGOALS)
+
+$(TARGET_HWLOC_LIB):
+ @echo "===> ENTER $(HWLOC_FOLDER)"
+ $(Q)$(MAKE) -s --no-print-directory -C $(HWLOC_FOLDER) $(MAKECMDGOALS)
+
+$(BENCH_TARGET):
+ @echo "===> ENTER $(BENCH_FOLDER)"
+ $(Q)$(MAKE) -s --no-print-directory -C $(BENCH_FOLDER) $(MAKECMDGOALS)
+
#PATTERN RULES
$(BUILD_DIR)/%.o: %.c
@echo "===> COMPILE $@"
- $(Q)$(CC) -c $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@
- $(Q)$(CC) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
-
-$(BUILD_DIR)/%.o: %.s
- @echo "===> ASSEMBLE $@"
- $(Q)$(AS) $(ASFLAGS) $< -o $@
+ $(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@
+ $(Q)$(CC) $(DEBUG_FLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
$(BUILD_DIR)/%.o: %.cc
@echo "===> COMPILE $@"
- $(Q)$(CXX) -c $(CXXFLAGS) $(CPPFLAGS) $< -o $@
- $(Q)$(CXX) $(CXXFLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
+ $(Q)$(CXX) -c $(DEBUG_FLAGS) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
+ $(Q)$(CXX) $(DEBUG_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
+$(BUILD_DIR)/%.o: %.S
+ @echo "===> COMPILE $@"
+ $(Q)$(CPP) $(CPPFLAGS) $< -o $@.tmp
+ $(Q)$(AS) $(ASFLAGS) $@.tmp -o $@
+ @rm $@.tmp
-$(BUILD_DIR)/%.pas: $(BENCH_DIR)/%.ptt
- @echo "===> GENERATE BENCHMARKS"
- $(Q)$(GEN_PAS) $(BENCH_DIR) $(BUILD_DIR) ./perl/templates
$(BUILD_DIR)/%.h: $(SRC_DIR)/includes/%.txt
@echo "===> GENERATE HEADER $@"
$(Q)$(GEN_PMHEADER) $< $@
-$(BUILD_DIR)/%.o: $(BUILD_DIR)/%.pas
- @echo "===> ASSEMBLE $@"
- $(Q)$(PAS) -i $(PASFLAGS) -o $(BUILD_DIR)/$*.s $< '$(DEFINES)'
- $(Q)$(AS) $(ASFLAGS) $(BUILD_DIR)/$*.s -o $@
ifeq ($(findstring $(MAKECMDGOALS),clean),)
-include $(OBJ:.o=.d)
endif
-.PHONY: clean distclean install uninstall
+.PHONY: clean distclean install uninstall help $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(BENCH_TARGET)
+
.PRECIOUS: $(BUILD_DIR)/%.pas
.NOTPARALLEL:
-clean:
+clean: $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(BENCH_TARGET)
@echo "===> CLEAN"
- @rm -rf $(BUILD_DIR)
- @rm -f $(GENGROUPLOCK)
+ @for APP in $(L_APPS); do \
+ rm -f $$APP; \
+ done
+ @rm -f likwid.lua
+ @rm -f $(STATIC_TARGET_LIB)
+ @rm -f $(DYNAMIC_TARGET_LIB)
+ @rm -f $(PINLIB)
+ @rm -f $(FORTRAN_IF_NAME)
+ @rm -f $(FREQ_TARGET) $(DAEMON_TARGET)
-distclean: clean
+distclean: $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(BENCH_TARGET)
@echo "===> DIST CLEAN"
- @rm -f likwid-*
- @rm -f $(LIKWID_LIB)*
- @rm -f $(FORTRAN_INTERFACE)
+ @for APP in $(L_APPS); do \
+ rm -f $$APP; \
+ done
+ @rm -f likwid.lua
+ @rm -f $(STATIC_TARGET_LIB)
+ @rm -f $(DYNAMIC_TARGET_LIB)
@rm -f $(PINLIB)
+ @rm -f $(FORTRAN_IF_NAME)
+ @rm -f $(FREQ_TARGET) $(DAEMON_TARGET)
+ @rm -rf $(BUILD_DIR)
+ @rm -f $(GENGROUPLOCK)
+ @rm -rf doc/html
@rm -f tags
-install:
- @echo "===> INSTALL applications to $(PREFIX)/bin"
- @mkdir -p $(PREFIX)/bin
- @for app in $(APPS); do \
- cp -f $$app $(PREFIX)/bin; \
- done
- @cp -f perl/feedGnuplot $(PREFIX)/bin
- @for app in $(PERL_APPS); do \
- sed -e "s+<PREFIX>+$(PREFIX)+g" perl/$$app > $(PREFIX)/bin/$$app; \
- done
- @chmod 755 $(PREFIX)/bin/likwid-*
- @echo "===> INSTALL daemon applications to $(PREFIX)/sbin"
+ifeq ($(BUILDDAEMON),true)
+ifneq ($(COMPILER),MIC)
+install_daemon:
+ @echo "===> INSTALL access daemon to $(ACCESSDAEMON)"
+ @mkdir -p `dirname $(ACCESSDAEMON)`
+ @install -m 4775 $(INSTALL_CHOWN) $(DAEMON_TARGET) $(ACCESSDAEMON)
+move_daemon:
+ @echo "===> MOVE access daemon from $(ACCESSDAEMON) to $(INSTALLED_ACCESSDAEMON)"
+ @mkdir -p `dirname $(INSTALLED_ACCESSDAEMON)`
+ @install -m 4775 $(INSTALL_CHOWN) $(ACCESSDAEMON) $(INSTALLED_ACCESSDAEMON)
+uninstall_daemon:
+ @echo "===> REMOVING access daemon from $(ACCESSDAEMON)"
+ @rm -f $(ACCESSDAEMON)
+uninstall_daemon_moved:
+ @echo "===> REMOVING access daemon from $(INSTALLED_ACCESSDAEMON)"
+ @rm -f $(INSTALLED_ACCESSDAEMON)
+else
+install_daemon:
+ @echo "===> No INSTALL of the access daemon"
+move_daemon:
+ @echo "===> No MOVE of the access daemon"
+uninstall_daemon:
+ @echo "===> No UNINSTALL of the access daemon"
+uninstall_daemon_moved:
+ @echo "===> No UNINSTALL of the access daemon"
+endif
+else
+install_daemon:
+ @echo "===> No INSTALL of the access daemon"
+move_daemon:
+ @echo "===> No MOVE of the access daemon"
+uninstall_daemon:
+ @echo "===> No UNINSTALL of the access daemon"
+uninstall_daemon_moved:
+ @echo "===> No UNINSTALL of the access daemon"
+endif
+
+ifeq ($(BUILDFREQ),true)
+ifneq ($(COMPILER),MIC)
+install_freq:
+ @echo "===> INSTALL setFrequencies tool to $(PREFIX)/sbin/$(FREQ_TARGET)"
@mkdir -p $(PREFIX)/sbin
- @for app in $(DAEMON_APPS); do \
- cp -f $$app $(PREFIX)/sbin; \
- if [ $(shell id -u) = "0" ]; then \
- chown root $(PREFIX)/sbin/$$app; \
- chmod 4775 $(PREFIX)/sbin/$$app; \
- else \
- echo "Only root can adjust the privileges of the daemon applications in $(PREFIX)/sbin"; \
- fi; \
+ @install -m 4775 $(INSTALL_CHOWN) $(FREQ_TARGET) $(PREFIX)/sbin/$(FREQ_TARGET)
+move_freq:
+ @echo "===> MOVE setFrequencies tool from $(PREFIX)/sbin/$(FREQ_TARGET) to $(INSTALLED_PREFIX)/sbin/$(FREQ_TARGET)"
+ @mkdir -p $(INSTALLED_PREFIX)/sbin
+ @install -m 4775 $(INSTALL_CHOWN) $(PREFIX)/sbin/$(FREQ_TARGET) $(INSTALLED_PREFIX)/sbin/$(FREQ_TARGET)
+uninstall_freq:
+ @echo "===> REMOVING setFrequencies tool from $(PREFIX)/sbin/$(FREQ_TARGET)"
+ @rm -f $(PREFIX)/sbin/$(FREQ_TARGET)
+uninstall_freq_moved:
+ @echo "===> REMOVING setFrequencies tool from $(INSTALLED_PREFIX)/sbin/$(FREQ_TARGET)"
+ @rm -f $(INSTALLED_PREFIX)/sbin/$(FREQ_TARGET)
+else
+install_freq:
+ @echo "===> No INSTALL of setFrequencies tool"
+move_freq:
+ @echo "===> No MOVE of setFrequencies tool"
+uninstall_freq:
+ @echo "===> No UNINSTALL of setFrequencies tool"
+uninstall_freq_moved:
+ @echo "===> No UNINSTALL of setFrequencies tool"
+endif
+else
+install_freq:
+ @echo "===> No INSTALL of setFrequencies tool"
+move_freq:
+ @echo "===> No MOVE of setFrequencies tool"
+uninstall_freq:
+ @echo "===> No UNINSTALL of setFrequencies tool"
+uninstall_freq_moved:
+ @echo "===> No UNINSTALL of setFrequencies tool"
+endif
+
+install: install_daemon install_freq
+ @echo "===> INSTALL applications to $(BINPREFIX)"
+ @mkdir -p $(BINPREFIX)
+ @chmod 775 $(BINPREFIX)
+ @for APP in $(L_APPS); do \
+ install -m 755 $$APP $(BINPREFIX); \
done
+ @for APP in $(C_APPS); do \
+ install -m 755 $$APP $(BINPREFIX); \
+ done
+ @install -m 755 ext/lua/lua $(BINPREFIX)/likwid-lua
+ @echo "===> INSTALL helper applications to $(BINPREFIX)"
+ @install -m 755 perl/feedGnuplot $(BINPREFIX)
+ @echo "===> INSTALL lua to likwid interface to $(PREFIX)/share/lua"
+ @mkdir -p $(PREFIX)/share/lua
+ @chmod 775 $(PREFIX)/share/lua
+ @install -m 755 likwid.lua $(PREFIX)/share/lua
+ @echo "===> INSTALL libraries to $(LIBPREFIX)"
+ @mkdir -p $(LIBPREFIX)
+ @chmod 775 $(LIBPREFIX)
+ @install -m 755 $(TARGET_LIB) $(LIBPREFIX)/$(TARGET_LIB).$(VERSION).$(RELEASE)
+ @install -m 755 liblikwidpin.so $(LIBPREFIX)/liblikwidpin.so.$(VERSION).$(RELEASE)
+ @install -m 755 $(TARGET_HWLOC_LIB) $(LIBPREFIX)/$(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE)
+ @install -m 755 $(TARGET_LUA_LIB) $(LIBPREFIX)/$(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE)
+ @cd $(LIBPREFIX) && ln -fs $(TARGET_LIB).$(VERSION).$(RELEASE) $(TARGET_LIB)
+ @cd $(LIBPREFIX) && ln -fs $(TARGET_LIB).$(VERSION).$(RELEASE) $(TARGET_LIB).$(VERSION)
+ @cd $(LIBPREFIX) && ln -fs $(PINLIB).$(VERSION).$(RELEASE) $(PINLIB)
+ @cd $(LIBPREFIX) && ln -fs $(PINLIB).$(VERSION).$(RELEASE) $(PINLIB).$(VERSION)
+ @cd $(LIBPREFIX) && ln -fs $(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_HWLOC_LIB))
+ @cd $(LIBPREFIX) && ln -fs $(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_HWLOC_LIB)).$(VERSION)
+ @cd $(LIBPREFIX) && ln -fs $(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_LUA_LIB))
+ @cd $(LIBPREFIX) && ln -fs $(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_LUA_LIB)).$(VERSION)
@echo "===> INSTALL man pages to $(MANPREFIX)/man1"
@mkdir -p $(MANPREFIX)/man1
+ @chmod 775 $(MANPREFIX)/man1
@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-topology.1 > $(MANPREFIX)/man1/likwid-topology.1
- @sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-features.1 > $(MANPREFIX)/man1/likwid-features.1
- @sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-perfctr.1 > $(MANPREFIX)/man1/likwid-perfctr.1
+ @sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" -e "s+<PREFIX>+$(PREFIX)+g" < $(DOC_DIR)/likwid-perfctr.1 > $(MANPREFIX)/man1/likwid-perfctr.1
@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-powermeter.1 > $(MANPREFIX)/man1/likwid-powermeter.1
@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-pin.1 > $(MANPREFIX)/man1/likwid-pin.1
- @sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-setFrequencies.1 > $(MANPREFIX)/man1/likwid-setFrequencies.1
- @sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-bench.1 > $(MANPREFIX)/man1/likwid-bench.1
@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/feedGnuplot.1 > $(MANPREFIX)/man1/feedGnuplot.1
@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-accessD.1 > $(MANPREFIX)/man1/likwid-accessD.1
- @sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-genCfg.1 > $(MANPREFIX)/man1/likwid-genCfg.1
+ @sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-genTopoCfg.1 > $(MANPREFIX)/man1/likwid-genTopoCfg.1
@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-memsweeper.1 > $(MANPREFIX)/man1/likwid-memsweeper.1
@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-mpirun.1 > $(MANPREFIX)/man1/likwid-mpirun.1
@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-perfscope.1 > $(MANPREFIX)/man1/likwid-perfscope.1
@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-setFreq.1 > $(MANPREFIX)/man1/likwid-setFreq.1
+ @sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-features.1 > $(MANPREFIX)/man1/likwid-features.1
+ @sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-bench.1 > $(MANPREFIX)/man1/likwid-bench.1
+ @sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-agent.1 > $(MANPREFIX)/man1/likwid-agent.1
+ @sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-setFrequencies.1 > $(MANPREFIX)/man1/likwid-setFrequencies.1
+ @sed -e "s/.TH LUA/.TH LIKWID-LUA/g" -e "s/lua - Lua interpreter/likwid-lua - Lua interpreter included in LIKWID/g" -e "s/.B lua/.B likwid-lua/g" -e "s/.BR luac (1)//g" $(DOC_DIR)/likwid-lua.1 > $(MANPREFIX)/man1/likwid-lua.1
@chmod 644 $(MANPREFIX)/man1/likwid-*
@echo "===> INSTALL headers to $(PREFIX)/include"
- @mkdir -p $(PREFIX)/include/likwid
- @cp -f src/includes/likwid*.h $(PREFIX)/include/
- @cp -f src/includes/* $(PREFIX)/include/likwid
- @cp -f GCC/perfmon_group_types.h $(PREFIX)/include/likwid
+ @mkdir -p $(PREFIX)/include
+ @chmod 775 $(PREFIX)/include
+ @install -m 644 src/includes/likwid.h $(PREFIX)/include/
+ @install -m 644 src/includes/bstrlib.h $(PREFIX)/include/
$(FORTRAN_INSTALL)
- @echo "===> INSTALL libraries to $(PREFIX)/lib"
- @mkdir -p $(PREFIX)/lib
- @cp -f $(LIKWID_LIB)* $(PREFIX)/lib
- @chmod 755 $(PREFIX)/lib/$(PINLIB)
- @echo "===> INSTALL filters to $(LIKWIDFILTERPATH)"
+ @echo "===> INSTALL groups to $(PREFIX)/share/likwid/perfgroups"
+ @mkdir -p $(PREFIX)/share/likwid/perfgroups
+ @chmod 775 $(PREFIX)/share/likwid
+ @chmod 775 $(PREFIX)/share/likwid/perfgroups
+ @cp -rf groups/* $(PREFIX)/share/likwid/perfgroups
+ @chmod 775 $(PREFIX)/share/likwid/perfgroups/*
+ @find $(PREFIX)/share/likwid/perfgroups -name "*.txt" -exec chmod 644 {} \;
+ @echo "===> INSTALL monitoring groups to $(PREFIX)/share/likwid/mongroups"
+ @mkdir -p $(PREFIX)/share/likwid/mongroups
+ @chmod 775 $(PREFIX)/share/likwid/mongroups
+ @cp -rf monitoring/groups/* $(PREFIX)/share/likwid/mongroups
+ @chmod 775 $(PREFIX)/share/likwid/mongroups/*
+ @find $(PREFIX)/share/likwid/mongroups -name "*.txt" -exec chmod 644 {} \;
+ @mkdir -p $(PREFIX)/share/likwid/docs
+ @chmod 775 $(PREFIX)/share/likwid/docs
+ @install -m 644 doc/bstrlib.txt $(PREFIX)/share/likwid/docs
+ @mkdir -p $(PREFIX)/share/likwid/examples
+ @chmod 775 $(PREFIX)/share/likwid/examples
+ @install -m 644 examples/* $(PREFIX)/share/likwid/examples
+ @echo "===> INSTALL default likwid-agent.conf to $(PREFIX)/share/likwid/mongroups"
+ @sed -e "s+<PREFIX>+$(PREFIX)+g" monitoring/likwid-agent.conf > $(PREFIX)/share/likwid/mongroups/likwid-agent.conf
+ @chmod 644 $(PREFIX)/share/likwid/mongroups/likwid-agent.conf
+ @echo "===> INSTALL filters to $(abspath $(PREFIX)/share/likwid/filter)"
+ @mkdir -p $(abspath $(PREFIX)/share/likwid/filter)
+ @chmod 755 $(abspath $(PREFIX)/share/likwid/filter)
+ @cp -f filters/* $(abspath $(PREFIX)/share/likwid/filter)
+ @chmod 755 $(abspath $(PREFIX)/share/likwid/filter)/*
+
+
+move: move_daemon move_freq
+ @echo "===> MOVE applications from $(BINPREFIX) to $(INSTALLED_BINPREFIX)"
+ @mkdir -p $(INSTALLED_BINPREFIX)
+ @chmod 775 $(INSTALLED_BINPREFIX)
+ @for APP in $(L_APPS); do \
+ install -m 755 $(BINPREFIX)/$$APP $(INSTALLED_BINPREFIX); \
+ done
+ @for APP in $(C_APPS); do \
+ install -m 755 $(BINPREFIX)/`basename $$APP` $(INSTALLED_BINPREFIX); \
+ done
+ @install -m 755 $(BINPREFIX)/likwid-lua $(INSTALLED_BINPREFIX)/likwid-lua
+ @echo "===> MOVE helper applications from $(BINPREFIX) to $(INSTALLED_BINPREFIX)"
+ @install -m 755 $(BINPREFIX)/feedGnuplot $(INSTALLED_BINPREFIX)
+ @echo "===> MOVE lua to likwid interface from $(PREFIX)/share/lua to $(INSTALLED_PREFIX)/share/lua"
+ @mkdir -p $(INSTALLED_PREFIX)/share/lua
+ @chmod 775 $(INSTALLED_PREFIX)/share/lua
+ @install -m 755 $(PREFIX)/share/lua/likwid.lua $(INSTALLED_PREFIX)/share/lua
+ @echo "===> MOVE libraries from $(LIBPREFIX) to $(INSTALLED_LIBPREFIX)"
+ @mkdir -p $(INSTALLED_LIBPREFIX)
+ @chmod 775 $(INSTALLED_LIBPREFIX)
+ @install -m 755 $(LIBPREFIX)/$(TARGET_LIB).$(VERSION).$(RELEASE) $(INSTALLED_LIBPREFIX)/$(TARGET_LIB).$(VERSION).$(RELEASE)
+ @install -m 755 $(LIBPREFIX)/$(PINLIB).$(VERSION).$(RELEASE) $(INSTALLED_LIBPREFIX)/$(PINLIB).$(VERSION).$(RELEASE)
+ @install -m 755 $(LIBPREFIX)/$(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE) $(INSTALLED_LIBPREFIX)/$(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE)
+ @install -m 755 $(LIBPREFIX)/$(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE) $(INSTALLED_LIBPREFIX)/$(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE)
+ @cd $(INSTALLED_LIBPREFIX) && ln -fs $(TARGET_LIB).$(VERSION).$(RELEASE) $(TARGET_LIB)
+ @cd $(INSTALLED_LIBPREFIX) && ln -fs $(TARGET_LIB).$(VERSION).$(RELEASE) $(TARGET_LIB).$(VERSION)
+ @cd $(INSTALLED_LIBPREFIX) && ln -fs $(PINLIB).$(VERSION).$(RELEASE) $(PINLIB)
+ @cd $(INSTALLED_LIBPREFIX) && ln -fs $(PINLIB).$(VERSION).$(RELEASE) $(PINLIB).$(VERSION)
+ @cd $(INSTALLED_LIBPREFIX) && ln -fs $(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_HWLOC_LIB))
+ @cd $(INSTALLED_LIBPREFIX) && ln -fs $(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_HWLOC_LIB)).$(VERSION)
+ @cd $(INSTALLED_LIBPREFIX) && ln -fs $(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_LUA_LIB))
+ @cd $(INSTALLED_LIBPREFIX) && ln -fs $(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_LUA_LIB)).$(VERSION)
+ @echo "===> MOVE man pages from $(MANPREFIX)/man1 to $(INSTALLED_MANPREFIX)/man1"
+ @mkdir -p $(INSTALLED_MANPREFIX)/man1
+ @chmod 775 $(INSTALLED_MANPREFIX)/man1
+ @install -m 644 $(MANPREFIX)/man1/*.1 $(INSTALLED_MANPREFIX)/man1
+ @echo "===> MOVE headers from $(PREFIX)/include to $(INSTALLED_PREFIX)/include"
+ @mkdir -p $(INSTALLED_PREFIX)/include
+ @chmod 775 $(INSTALLED_PREFIX)/include
+ @install -m 644 $(PREFIX)/include/likwid.h $(INSTALLED_PREFIX)/include/likwid.h
+ @install -m 644 $(PREFIX)/include/bstrlib.h $(INSTALLED_PREFIX)/include/bstrlib.h
+ @if [ -e $(PREFIX)/include/likwid.mod ]; then install $(PREFIX)/include/likwid.mod $(INSTALLED_PREFIX)/include/likwid.mod; fi
+ @echo "===> MOVE groups from $(PREFIX)/share/likwid/perfgroups to $(INSTALLED_PREFIX)/share/likwid/perfgroups"
+ @mkdir -p $(INSTALLED_PREFIX)/share/likwid/perfgroups
+ @chmod 775 $(INSTALLED_PREFIX)/share/likwid
+ @chmod 775 $(INSTALLED_PREFIX)/share/likwid/perfgroups
+ @cp -rf $(PREFIX)/share/likwid/perfgroups/* $(INSTALLED_PREFIX)/share/likwid/perfgroups
+ @chmod 775 $(INSTALLED_PREFIX)/share/likwid/perfgroups/*
+ @find $(INSTALLED_PREFIX)/share/likwid/perfgroups -name "*.txt" -exec chmod 644 {} \;
+ @echo "===> MOVE monitoring groups from $(PREFIX)/share/likwid/mongroups to $(INSTALLED_PREFIX)/share/likwid/mongroups"
+ @mkdir -p $(INSTALLED_PREFIX)/share/likwid/mongroups
+ @chmod 775 $(INSTALLED_PREFIX)/share/likwid/mongroups
+ @cp -rf $(PREFIX)/share/likwid/mongroups/* $(INSTALLED_PREFIX)/share/likwid/mongroups
+ @chmod 775 $(INSTALLED_PREFIX)/share/likwid/mongroups/*
+ @find $(INSTALLED_PREFIX)/share/likwid/mongroups -name "*.txt" -exec chmod 644 {} \;
+ @mkdir -p $(INSTALLED_PREFIX)/share/likwid/docs
+ @chmod 775 $(INSTALLED_PREFIX)/share/likwid/docs
+ @install -m 644 $(PREFIX)/share/likwid/docs/bstrlib.txt $(INSTALLED_PREFIX)/share/likwid/docs
+ @mkdir -p $(INSTALLED_PREFIX)/share/likwid/examples
+ @chmod 775 $(INSTALLED_PREFIX)/share/likwid/examples
+ @install -m 644 examples/* $(INSTALLED_PREFIX)/share/likwid/examples
+ @echo "===> MOVE default likwid-agent.conf from $(PREFIX)/share/likwid/mongroups to $(INSTALLED_PREFIX)/share/likwid/mongroups"
+ @install $(PREFIX)/share/likwid/mongroups/likwid-agent.conf $(INSTALLED_PREFIX)/share/likwid/mongroups/likwid-agent.conf
+ @chmod 644 $(INSTALLED_PREFIX)/share/likwid/mongroups/likwid-agent.conf
+ @echo "===> MOVE filters from $(abspath $(PREFIX)/share/likwid/filter) to $(LIKWIDFILTERPATH)"
@mkdir -p $(LIKWIDFILTERPATH)
- @cp -f filters/* $(LIKWIDFILTERPATH)
+ @chmod 755 $(LIKWIDFILTERPATH)
+ @cp -f $(abspath $(PREFIX)/share/likwid/filter)/* $(LIKWIDFILTERPATH)
@chmod 755 $(LIKWIDFILTERPATH)/*
-uninstall:
+
+uninstall: uninstall_daemon uninstall_freq
@echo "===> REMOVING applications from $(PREFIX)/bin"
- @rm -f $(addprefix $(PREFIX)/bin/,$(APPS))
- @rm -f $(addprefix $(PREFIX)/bin/,$(PERL_APPS))
- @rm -f $(PREFIX)/bin/feedGnuplot
- @echo "===> REMOVING daemon applications from $(PREFIX)/sbin"
- @rm -f $(addprefix $(PREFIX)/sbin/,$(DAEMON_APPS))
+ @rm -f $(addprefix $(BINPREFIX)/,$(addsuffix .lua,$(L_APPS)))
+ @for APP in $(L_APPS); do \
+ rm -f $(BINPREFIX)/$$APP; \
+ done
+ @for APP in $(C_APPS); do \
+ rm -f $(BINPREFIX)/$$APP; \
+ done
+ @rm -f $(BINPREFIX)/feedGnuplot
+ @rm -f $(BINPREFIX)/likwid-lua
+ @rm -f $(BINPREFIX)/likwid-bench
+ @echo "===> REMOVING Lua to likwid interface from $(PREFIX)/share/lua"
+ @rm -rf $(PREFIX)/share/lua/likwid.lua
+ @echo "===> REMOVING libs from $(LIBPREFIX)"
+ @rm -f $(LIBPREFIX)/liblikwid*
@echo "===> REMOVING man pages from $(MANPREFIX)/man1"
- @rm -f $(MANPREFIX)/man1/likwid-*
+ @rm -f $(addprefix $(MANPREFIX)/man1/,$(addsuffix .1,$(L_APPS)))
@rm -f $(MANPREFIX)/man1/feedGnuplot.1
- @echo "===> REMOVING headers from $(PREFIX)/include"
- @rm -f $(PREFIX)/include/likwid*.h
- @rm -rf $(PREFIX)/include/likwid
- @echo "===> REMOVING libs from $(PREFIX)/lib"
- @rm -f $(PREFIX)/lib/$(LIKWID_LIB)*
- @echo "===> REMOVING filter from $(PREFIX)/share"
- @rm -rf $(PREFIX)/share/likwid
-
-
-
+ @rm -f $(MANPREFIX)/man1/likwid-setFreq.1
+ @rm -f $(MANPREFIX)/man1/likwid-accessD.1
+ @rm -f $(MANPREFIX)/man1/likwid-lua.1
+ @rm -f $(MANPREFIX)/man1/likwid-bench.1
+ @echo "===> REMOVING header from $(PREFIX)/include"
+ @rm -f $(PREFIX)/include/likwid.h
+ @rm -f $(PREFIX)/include/bstrlib.h
+ $(FORTRAN_REMOVE)
+ @echo "===> REMOVING filter, groups and default configs from $(PREFIX)/share/likwid"
+ @rm -rf $(abspath $(PREFIX)/share/likwid/filter)
+ @rm -rf $(PREFIX)/share/likwid/mongroups
+ @rm -rf $(PREFIX)/share/likwid/perfgroups
+ @rm -rf $(PREFIX)/share/likwid/docs
+ @rm -rf $(PREFIX)/share/likwid/examples
+ @rm -rf $(PREFIX)/share/likwid
+
+uninstall_moved: uninstall_daemon_moved uninstall_freq_moved
+ @echo "===> REMOVING applications from $(INSTALLED_PREFIX)/bin"
+ @rm -f $(addprefix $(INSTALLED_BINPREFIX)/,$(addsuffix .lua,$(L_APPS)))
+ @for APP in $(L_APPS); do \
+ rm -f $(INSTALLED_BINPREFIX)/$$APP; \
+ done
+ @for APP in $(C_APPS); do \
+ rm -f $(INSTALLED_BINPREFIX)/$$APP; \
+ done
+ @rm -f $(INSTALLED_BINPREFIX)/feedGnuplot
+ @rm -f $(INSTALLED_BINPREFIX)/likwid-lua
+ @rm -f $(INSTALLED_BINPREFIX)/likwid-bench
+ @echo "===> REMOVING Lua to likwid interface from $(INSTALLED_PREFIX)/share/lua"
+ @rm -rf $(INSTALLED_PREFIX)/share/lua/likwid.lua
+ @echo "===> REMOVING libs from $(INSTALLED_LIBPREFIX)"
+ @rm -f $(INSTALLED_LIBPREFIX)/liblikwid*
+ @echo "===> REMOVING man pages from $(INSTALLED_MANPREFIX)/man1"
+ @rm -f $(addprefix $(INSTALLED_MANPREFIX)/man1/,$(addsuffix .1,$(L_APPS)))
+ @rm -f $(INSTALLED_MANPREFIX)/man1/feedGnuplot.1
+ @rm -f $(INSTALLED_MANPREFIX)/man1/likwid-setFreq.1
+ @rm -f $(INSTALLED_MANPREFIX)/man1/likwid-accessD.1
+ @rm -f $(INSTALLED_MANPREFIX)/man1/likwid-lua.1
+ @rm -f $(INSTALLED_MANPREFIX)/man1/likwid-bench.1
+ @echo "===> REMOVING header from $(INSTALLED_PREFIX)/include"
+ @rm -f $(INSTALLED_PREFIX)/include/likwid.h
+ @rm -f $(INSTALLED_PREFIX)/include/bstrlib.h
+ $(FORTRAN_REMOVE)
+ @echo "===> REMOVING filter, groups and default configs from $(INSTALLED_PREFIX)/share/likwid"
+ @rm -rf $(LIKWIDFILTERPATH)
+ @rm -rf $(INSTALLED_PREFIX)/share/likwid/mongroups
+ @rm -rf $(INSTALLED_PREFIX)/share/likwid/perfgroups
+ @rm -rf $(INSTALLED_PREFIX)/share/likwid/docs
+ @rm -rf $(INSTALLED_PREFIX)/share/likwid/examples
+ @rm -rf $(INSTALLED_PREFIX)/share/likwid
+
+local: $(L_APPS) likwid.lua
+ @echo "===> Setting Lua scripts to run from current directory"
+ @PWD=$(shell pwd)
+ @for APP in $(L_APPS); do \
+ sed -i -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" -e "s/<RELEASE>/$(RELEASE)/g" -e "s+$(PREFIX)/bin/likwid-lua+$(PWD)/ext/lua/lua+" -e "s+$(PREFIX)/share/lua/?.lua+$(PWD)/?.lua+" $$APP; \
+ chmod +x $$APP; \
+ done
+ @sed -i -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" -e "s/<RELEASE>/$(RELEASE)/g" -e "s+$(PREFIX)/lib+$(PWD)+g" -e "s+$(PREFIX)/share/likwid/perfgroups+$(PWD)/groups+g" likwid.lua;
+ @sed -i -e "s+$(PREFIX)/share/likwid/mongroups+$(PWD)/monitoring/groups+g" likwid-agent
+ @ln -sf liblikwid.so liblikwid.so.$(VERSION)
+ @ln -sf ext/hwloc/liblikwid-hwloc.so liblikwid-hwloc.so.$(VERSION)
+ @ln -sf ext/lua/liblikwid-lua.so liblikwid-lua.so.$(VERSION)
+ @ln -sf liblikwid.so liblikwid.so.$(VERSION).$(RELEASE)
+ @ln -sf ext/hwloc/liblikwid-hwloc.so liblikwid-hwloc.so.$(VERSION).$(RELEASE)
+ @ln -sf ext/lua/liblikwid-lua.so liblikwid-lua.so.$(VERSION).$(RELEASE)
+ @echo "export LD_LIBRARY_PATH=$(PWD):$$LD_LIBRARY_PATH"
+
+testit: test/test-likwidAPI.c
+ make -C test test-likwidAPI
+ test/test-likwidAPI
+ make -C test/executable_tests
+
+help:
+ @echo "Help for building LIKWID:"
+ @echo
+ @echo "Common make targets:"
+ @echo "- make : build anything (integrate already compiled files)"
+ @echo "- make clean : clean library and executables, keep compiled files"
+ @echo "- make distclean : clean anything"
+ @echo "- make docs : Build documentation (requires Doxygen)"
+ @echo "- make install : Copy compiled files to $(PREFIX)"
+ @echo "- make move : Copy files from $(PREFIX) to $(INSTALLED_PREFIX)"
+ @echo "- make uninstall : Delete files from $(PREFIX)"
+ @echo "- make uninstall_moved : Delete files from $(INSTALLED_PREFIX)"
+ @echo
+ @echo "Compiler selection can be done in config.mk at COMPILER:"
+ @echo "- GCC : Use GCC for C code and Intel Fortran compiler for Fortran interface (default)"
+ @echo "- GCCX86 : Use GCC for C code. No Fortran compiler set (only for 32 bit builds)"
+ @echo "- CLANG: Use CLANG for C code and Intel Fortran compiler for Fortran interface (unsupported, may fail)"
+ @echo "- ICC: Use Intel C compiler for C code and Intel Fortran compiler for Fortran interface (unsupported, may fail)"
+ @echo "- MIC: Build for Intel Xeon Phi. Use Intel C compiler for C code and\n Intel Fortran compiler for Fortran interface (unsupported)"
+ @echo
+ @echo "LIKWID runs only in INSTALLED_PREFIX = $(INSTALLED_PREFIX)"
+ @echo "You can change it in config.mk, but it is recommended to keep INSTALLED_PREFIX = PREFIX"
+ @echo "The PREFIX is used for temporary install directories (e.g. for packaging)."
+ @echo "LIKWID will not run in PREFIX, it has to be in INSTALLED_PREFIX."
+ @echo "The common configuration is INSTALLED_PREFIX = PREFIX, so changing PREFIX is enough."
+ @echo "If PREFIX and INSTALLED_PREFIX differ, you have to move anything after 'make install' to"
+ @echo "the INSTALLED_PREFIX. You can also use 'make move' which does the job for you."
+
diff --git a/README b/README
deleted file mode 100644
index f47ac01..0000000
--- a/README
+++ /dev/null
@@ -1,29 +0,0 @@
-Likwid is a simple to install and use toolsuite of command line applications
-for performance oriented programmers. It works for Intel and AMD processors
-on the Linux operating system.
-
-It consists of:
-
-likwid-topology - print thread and cache topology
-likwid-features - view and toggle feature register on Intel processors
-likwid-perfctr - configure and read out hardware performance counters on Intel and AMD processors
-likwid-powermeter - read out RAPL Energy information and get info about Turbo Mode steps
-likwid-setFrequencies - view and set the clock frequency of CPU cores
-likwid-memsweeper - cleans up filled NUMA memory domains and evicts dirty cachelines from the cache hierarchy
-likwid-pin - pin your threaded application (pthread, Intel and gcc OpenMP) to dedicated processors
-likwid-bench - Micro benchmarking platform
-likwid-genCfg - Dumps topology information to a file
-likwid-mpirun - Wrapper to start MPI and Hybrid MPI/OpenMP applications (Supports Intel MPI and OpenMPI)
-likwid-perfscope - Frontend to the timeline mode of likwid-perfctr, plots live graphs of performance metrics
-
-For a detailed documentation on the usage of the tools have a look at the
-likwid wiki pages at:
-
-http://code.google.com/p/likwid/wiki/Introduction
-
-If you have problems or suggestions please let us know on the likwid mailing list:
-
-http://groups.google.com/group/likwid-users
-
-
-
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..838883c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,58 @@
+--------------------------------------------------------------------------------
+Introduction
+--------------------------------------------------------------------------------
+Likwid is a simple to install and use toolsuite of command line applications
+for performance oriented programmers. It works for Intel and AMD processors
+on the Linux operating system.
+
+[![Build Status](https://travis-ci.org/RRZE-HPC/likwid.svg?branch=master)](https://travis-ci.org/RRZE-HPC/likwid)
+
+It consists of:
+
+- likwid-topology: print thread, cache and NUMA topology
+- likwid-perfctr: configure and read out hardware performance counters on Intel and AMD processors
+- likwid-powermeter: read out RAPL Energy information and get info about Turbo mode steps
+- likwid-pin: pin your threaded application (pthread, Intel and gcc OpenMP) to dedicated processors
+- likwid-bench: Micro benchmarking platform
+- likwid-genTopoCfg: Dumps topology information to a file
+- likwid-mpirun: Wrapper to start MPI and Hybrid MPI/OpenMP applications (Supports Intel MPI, OpenMPI and MPICH)
+- likwid-perfscope: Frontend to the timeline mode of likwid-perfctr, plots live graphs of performance metrics using gnuplot
+- likwid-agent: Monitoring agent for hardware performance counters
+- likwid-memsweeper: Sweep memory of NUMA domains and evict cachelines from the last level cache
+- likwid-setFrequencies: Tool to control the CPU frequency
+
+--------------------------------------------------------------------------------
+Download, Build and Install
+--------------------------------------------------------------------------------
+You can get the releases of LIKWID at:
+http://ftp.fau.de/pub/likwid/
+
+For build and installation hints see INSTALL file
+
+--------------------------------------------------------------------------------
+Documentation
+--------------------------------------------------------------------------------
+For detailed documentation on the usage of the tools, have a look at the
+HTML documentation built with Doxygen. Call
+
+make docs
+
+or after installation, look at the man pages.
+
+There is also a wiki at the github page:
+https://github.com/rrze-likwid/likwid/wiki
+
+If you have problems or suggestions please let me know on the likwid mailing list:
+http://groups.google.com/group/likwid-users
+
+or, if it is a bug, open an issue at:
+https://github.com/rrze-likwid/likwid/issues
+
+--------------------------------------------------------------------------------
+Extras
+--------------------------------------------------------------------------------
+- If you want to use the Marker API with Java, you can find the Java module here:
+https://github.com/jlewandowski/likwid-java-api
+- For Python you can find an interface to the LIKWID API here:
+https://github.com/TomTheBear/likwid-python-api
+
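For reference, this is roughly how the Marker API mentioned above is used from C code. It is a minimal sketch, assuming the file is compiled with -DLIKWID_PERFMON and linked against liblikwid; the region name "compute" is only an example:

    #include <likwid.h>

    int main(void)
    {
        LIKWID_MARKER_INIT;               /* set up the marker API */
        LIKWID_MARKER_START("compute");   /* begin the named measurement region */

        /* ... code to be measured ... */

        LIKWID_MARKER_STOP("compute");    /* end the named measurement region */
        LIKWID_MARKER_CLOSE;              /* write out the region results */
        return 0;
    }

The instrumented binary is then started through likwid-perfctr with marker support enabled, so the counters are read around the tagged region only.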
diff --git a/bench/Makefile b/bench/Makefile
new file mode 100644
index 0000000..da883ef
--- /dev/null
+++ b/bench/Makefile
@@ -0,0 +1,157 @@
+#
+# =======================================================================================
+#
+# Filename: Makefile
+#
+# Description: likwid-bench Makefile
+#
+# Version: <VERSION>
+# Released: <DATE>
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Project: likwid
+#
+# Copyright (C) 2013 Jan Treibig
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, either version 3 of the License, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+SRC_DIR = ./src
+MAKE_DIR = ../make
+
+#DO NOT EDIT BELOW
+
+
+# Dependency chains:
+# *.[ch] -> *.o -> executables
+# *.ptt -> *.pas -> *.s -> *.o -> executables
+# *.txt -> *.h (generated)
+
+include ../config.mk
+include $(MAKE_DIR)/include_$(COMPILER).mk
+include $(MAKE_DIR)/config_checks.mk
+include $(MAKE_DIR)/config_defines.mk
+
+#INCLUDES += -I./includes -I../src/includes -I../ext/hwloc/include -I../$(COMPILER) -I$(BUILD_DIR)
+INCLUDES += -I./includes -I$(BUILD_DIR) -I../src/includes
+LIBS +=
+CFLAGS := $(filter-out -fvisibility=hidden, $(CFLAGS))
+
+#CONFIGURE BUILD SYSTEM
+BUILD_DIR = ./$(COMPILER)
+Q ?= @
+
+ifeq ($(COMPILER),MIC)
+BENCH_DIR = ./phi
+else
+ifeq ($(COMPILER),GCCX86)
+BENCH_DIR = ./x86
+else
+BENCH_DIR = ./x86-64
+endif
+endif
+
+SHARED_TARGET_LIB := -L.. -L../ext/hwloc/ -L../ext/lua -llikwid -llikwid-hwloc -llikwid-lua
+STATIC_TARGET_LIB := ../liblikwid.a ../ext/hwloc/liblikwid-hwloc.a ../ext/lua/liblikwid-lua.a
+TARGET_LIB = $(SHARED_TARGET_LIB)
+
+BENCH_LIBS :=
+ifeq ($(INSTRUMENT_BENCH),true)
+ DEFINES += -DLIKWID_PERFMON
+endif
+
+ifeq ($(DEBUG),true)
+DEBUG_FLAGS = -g
+DEFINES += -DDEBUG_LIKWID
+else
+DEBUG_FLAGS =
+endif
+
+
+VPATH = $(SRC_DIR)
+OBJ = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c))
+ifeq ($(SHARED_LIBRARY),false)
+OBJ := $(filter-out $(BUILD_DIR)/bstrlib.o,$(OBJ))
+TARGET_LIB = $(STATIC_TARGET_LIB)
+endif
+OBJ_BENCH = $(patsubst $(BENCH_DIR)/%.ptt, $(BUILD_DIR)/%.o,$(wildcard $(BENCH_DIR)/*.ptt))
+BENCH = $(shell basename $(BENCH_TARGET))
+
+CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
+
+all: $(BUILD_DIR) $(OBJ) $(OBJ_BENCH) $(BENCH_TARGET)
+
+
+$(BENCH_TARGET): $(BENCH)
+$(BENCH): likwid-bench.c $(BUILD_DIR) $(OBJ) $(OBJ_BENCH)
+ @echo "===> LINKING $(BENCH)"
+ $(Q)${CC} $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) ${LFLAGS} likwid-bench.c $(BENCH_LIBS) $(OBJ_BENCH) $(OBJ) -o $(BENCH) $(TARGET_LIB) $(LIBS) $(RPATHS)
+
+
+$(BUILD_DIR):
+ @mkdir $(BUILD_DIR)
+
+
+#PATTERN RULES
+$(BUILD_DIR)/%.o: %.c
+ @echo "===> COMPILE C $@"
+ $(Q)$(CC) -g -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $(TARGET_LIB) $< -o $@
+ $(Q)$(CC) -g $(DEBUG_FLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
+
+
+$(BUILD_DIR)/%.pas: $(BENCH_DIR)/%.ptt
+ @echo "===> GENERATE BENCHMARKS"
+ $(Q)$(GEN_PAS) $(BENCH_DIR) $(BUILD_DIR) ./perl/templates
+
+
+$(BUILD_DIR)/%.o: $(BUILD_DIR)/%.pas
+ @echo "===> ASSEMBLE $@"
+ $(Q)$(PAS) -i $(PASFLAGS) -o $(BUILD_DIR)/$*.s $< '$(DEFINES)'
+ $(Q)$(AS) $(ASFLAGS) $(BUILD_DIR)/$*.s -o $@
+
+ifeq ($(findstring $(MAKECMDGOALS),clean),)
+-include $(OBJ:.o=.d)
+endif
+
+.PHONY: clean distclean install uninstall
+
+
+.PRECIOUS: $(BUILD_DIR)/%.pas
+
+.NOTPARALLEL:
+
+
+clean:
+ @rm -rf likwid-bench
+
+distclean:
+ @rm -rf $(BUILD_DIR)
+ @rm -rf likwid-bench
+
+install:
+ @echo "===> INSTALL applications to $(BINPREFIX)"
+ cp -f likwid-bench $(BINPREFIX)
+ @echo "===> INSTALL man pages to $(MANPREFIX)/man1"
+ @mkdir -p $(MANPREFIX)/man1
+ @sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-bench.1 > $(MANPREFIX)/man1/likwid-bench.1
+
+
+uninstall:
+ @echo "===> REMOVING applications from $(BINPREFIX)"
+ rm -rf $(BINPREFIX)/likwid-bench
+ @echo "===> REMOVING man pages from $(MANPREFIX)/man1"
+ @rm -f $(MANPREFIX)/man1/likwid-bench.1
+
+
+
diff --git a/bench/includes/allocator.h b/bench/includes/allocator.h
new file mode 100644
index 0000000..f7eae06
--- /dev/null
+++ b/bench/includes/allocator.h
@@ -0,0 +1,50 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: allocator.h
+ *
+ * Description: Header file of the allocator module.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Project: none
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef ALLOCATOR_H
+#define ALLOCATOR_H
+
+#include <stdint.h>
+#include <bstrlib.h>
+#include <test_types.h>
+
+#define LLU_CAST (unsigned long long)
+
+extern void allocator_init(int numVectors);
+extern void allocator_finalize();
+extern void allocator_allocateVector(void** ptr,
+ int alignment,
+ uint64_t size,
+ int offset,
+ DataType type,
+ bstring domain);
+
+#endif /*ALLOCATOR_H*/
+
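A minimal usage sketch of this allocator interface as likwid-bench might call it; the domain name "M0", the size of 1 MiB and the assumption that the size argument is a byte count are illustrative only:

    #include <bstrlib.h>
    #include <test_types.h>
    #include <allocator.h>

    void example_alloc(void)
    {
        void* vec = NULL;
        bstring domain = bfromcstr("M0");        /* hypothetical memory domain name */

        allocator_init(1);                       /* bookkeeping for one vector */
        allocator_allocateVector(&vec, 64,       /* 64-byte alignment */
                                 1024 * 1024,    /* requested size */
                                 0,              /* no extra offset */
                                 DOUBLE, domain);

        /* ... run a benchmark kernel on vec ... */

        allocator_finalize();                    /* releases all vectors handed out above */
        bdestroy(domain);
    }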
diff --git a/bench/includes/allocator_types.h b/bench/includes/allocator_types.h
new file mode 100644
index 0000000..43ad3c0
--- /dev/null
+++ b/bench/includes/allocator_types.h
@@ -0,0 +1,46 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: allocator_types.h
+ *
+ * Description: Type definitions for the allocator module.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: none
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef ALLOCATOR_TYPES_H
+#define ALLOCATOR_TYPES_H
+
+#include <stdint.h>
+#include <test_types.h>
+
+typedef struct {
+ void* ptr;
+ size_t size;
+ off_t offset;
+ DataType type;
+} allocation;
+
+
+
+#endif
diff --git a/bench/includes/barrier.h b/bench/includes/barrier.h
new file mode 100644
index 0000000..6427c4a
--- /dev/null
+++ b/bench/includes/barrier.h
@@ -0,0 +1,58 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: barrier.h
+ *
+ * Description: Header File barrier Module
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef BARRIER_H
+#define BARRIER_H
+
+#include <barrier_types.h>
+
+/**
+ * @brief Initialize the barrier module
+ * @param numberOfGroups The number of thread groups sharing the barrier
+ */
+extern void barrier_init(int numberOfGroups);
+
+/**
+ * @brief Register a thread group (barrier_registerGroup) or a single thread (barrier_registerThread) for the barrier
+ * @param numThreads The number of threads in the group to register
+ * @param threadId The id of the thread to register within its group
+ */
+extern int barrier_registerGroup(int numThreads);
+extern void barrier_registerThread(BarrierData* barr, int groupsId, int threadId);
+
+/**
+ * @brief Synchronize threads
+ * @param barr Barrier data of the calling thread
+ */
+extern void barrier_synchronize(BarrierData* barr);
+extern void barrier_destroy(BarrierData* barr);
+
+
+#endif /*BARRIER_H*/
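A sketch of how these calls fit together for one thread group; the group size of 4 is arbitrary, and barrier_registerGroup is assumed to return the id of the newly created group:

    #include <barrier.h>

    void example_barrier(void)
    {
        BarrierData barr;

        barrier_init(1);                            /* one thread group in total */
        int groupId = barrier_registerGroup(4);     /* the group consists of 4 threads */
        barrier_registerThread(&barr, groupId, 0);  /* executed once per thread with its own id */

        barrier_synchronize(&barr);                 /* wait until all threads of the group arrive */

        barrier_destroy(&barr);
    }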
diff --git a/bench/includes/barrier_types.h b/bench/includes/barrier_types.h
new file mode 100644
index 0000000..9fc6e30
--- /dev/null
+++ b/bench/includes/barrier_types.h
@@ -0,0 +1,49 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: barrier_types.h
+ *
+ * Description: Type Definitions for barrier Module
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef BARRIER_TYPES_H
+#define BARRIER_TYPES_H
+
+#include <stdint.h>
+
+typedef struct {
+ int numberOfThreads;
+ int offset;
+ int val;
+ int* index;
+ volatile int* bval;
+} BarrierData;
+
+typedef struct {
+ int* groupBval;
+ int numberOfThreads;
+} BarrierGroup;
+
+#endif /*BARRIER_TYPES_H*/
diff --git a/bench/includes/bstrlib.h b/bench/includes/bstrlib.h
new file mode 120000
index 0000000..daa8a68
--- /dev/null
+++ b/bench/includes/bstrlib.h
@@ -0,0 +1 @@
+../../src/includes/bstrlib.h
\ No newline at end of file
diff --git a/bench/includes/likwid.h b/bench/includes/likwid.h
new file mode 120000
index 0000000..d2020f7
--- /dev/null
+++ b/bench/includes/likwid.h
@@ -0,0 +1 @@
+../../src/includes/likwid.h
\ No newline at end of file
diff --git a/bench/includes/strUtil.h b/bench/includes/strUtil.h
new file mode 100644
index 0000000..a16790c
--- /dev/null
+++ b/bench/includes/strUtil.h
@@ -0,0 +1,60 @@
+/*
+ * =======================================================================================
+ * Filename: strUtil.h
+ *
+ * Description: Some string functions
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#ifndef STRUTIL_H
+#define STRUTIL_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <bstrlib.h>
+#include <likwid.h>
+
+#include <test_types.h>
+
+typedef struct {
+ bstring domain;
+ int offset;
+ void* ptr;
+} Stream;
+
+typedef struct {
+ uint32_t numberOfThreads;
+ int* processorIds;
+ uint64_t size;
+ Stream* streams;
+} Workgroup;
+
+
+extern int bstr_to_workgroup(Workgroup* group, const_bstring str, DataType type, int numberOfStreams);
+extern void workgroups_destroy(Workgroup** groupList, int numberOfGroups, int numberOfStreams);
+
+#endif
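A sketch of parsing a workgroup definition with this helper. The string "S0:1GB:2" follows the domain:size[:threads] syntax of the likwid-bench -w option; treating a non-zero return value as success is an assumption:

    #include <bstrlib.h>
    #include <test_types.h>
    #include <strUtil.h>

    void example_workgroup(void)
    {
        Workgroup group;
        bstring wstr = bfromcstr("S0:1GB:2");            /* socket 0, 1 GB, 2 threads */

        if (bstr_to_workgroup(&group, wstr, DOUBLE, 1))  /* one stream of doubles */
        {
            /* group.numberOfThreads, group.size and group.streams are now filled in */
        }
        bdestroy(wstr);
    }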
diff --git a/bench/includes/test_types.h b/bench/includes/test_types.h
new file mode 100644
index 0000000..18627fc
--- /dev/null
+++ b/bench/includes/test_types.h
@@ -0,0 +1,113 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: test_types.h
+ *
+ * Description: Type definitions for benchmarking framework
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef TEST_TYPES_H
+#define TEST_TYPES_H
+
+#include <stdint.h>
+#include <bstrlib.h>
+
+typedef void (*FuncPrototype)();
+
+typedef enum {
+ SINGLE = 0,
+ DOUBLE,
+ INT} DataType;
+
+typedef enum {
+ STREAM_1 = 1,
+ STREAM_2,
+ STREAM_3,
+ STREAM_4,
+ STREAM_5,
+ STREAM_6,
+ STREAM_7,
+ STREAM_8,
+ STREAM_9,
+ STREAM_10,
+ STREAM_11,
+ STREAM_12,
+ STREAM_13,
+ STREAM_14,
+ STREAM_15,
+ STREAM_16,
+ STREAM_17,
+ STREAM_18,
+ STREAM_19,
+ STREAM_20,
+ STREAM_21,
+ STREAM_22,
+ STREAM_23,
+ STREAM_24,
+ STREAM_25,
+ STREAM_26,
+ STREAM_27,
+ STREAM_28,
+ STREAM_29,
+ STREAM_30,
+ STREAM_31,
+ STREAM_32,
+ STREAM_33,
+ STREAM_34,
+ STREAM_35,
+ STREAM_36,
+ STREAM_37,
+ STREAM_38,
+ MAX_STREAMS} Pattern;
+
+typedef struct {
+ char* name;
+ Pattern streams;
+ DataType type ;
+ int stride;
+ FuncPrototype kernel;
+ int flops;
+ int bytes;
+ char* desc;
+ int loads;
+ int stores;
+ int branches;
+ int instr_const;
+ int instr_loop;
+ int uops;
+} TestCase;
+
+typedef struct {
+ uint64_t size;
+ uint64_t iter;
+ uint32_t min_runtime;
+ const TestCase* test;
+ uint64_t cycles;
+ uint32_t numberOfThreads;
+ int* processors;
+ void** streams;
+} ThreadUserData;
+
+#endif /*TEST_TYPES_H*/
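As an illustration of how this structure is meant to be filled, here is a hypothetical entry for a double-precision copy kernel; the symbol copy_plain and all counts are made up and not taken from the generated testcases.h:

    extern void copy_plain(void);   /* hypothetical kernel generated from a .ptt file */

    static const TestCase copy_test = {
        .name        = "copy_plain",
        .streams     = STREAM_2,    /* destination and source stream */
        .type        = DOUBLE,
        .stride      = 4,           /* elements processed per loop iteration */
        .kernel      = copy_plain,
        .flops       = 0,
        .bytes       = 16,          /* bytes moved per scalar update (one load, one store) */
        .desc        = "Double-precision vector copy",
        .loads       = 1,
        .stores      = 1,
        .branches    = 1,
        .instr_const = 10,
        .instr_loop  = 6,
        .uops        = 8,
    };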
diff --git a/bench/includes/threads.h b/bench/includes/threads.h
new file mode 100644
index 0000000..d92bbc9
--- /dev/null
+++ b/bench/includes/threads.h
@@ -0,0 +1,114 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: threads.h
+ *
+ * Description: Header file of pthread interface module
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef THREADS_H
+#define THREADS_H
+
+#include <pthread.h>
+#include <threads_types.h>
+
+#define THREADS_BARRIER pthread_barrier_wait(&threads_barrier)
+#define MIN_ITERATIONS 10
+
+extern pthread_barrier_t threads_barrier;
+extern ThreadData* threads_data;
+extern ThreadGroup* threads_groups;
+
+
+/**
+ * @brief Test the maximal possible thread count
+ * @return numberOfThreads The number of available threads
+ */
+extern int threads_test(void);
+
+/**
+ * @brief Initialization of the thread module
+ * @param numberOfThreads The total number of threads
+ */
+extern void threads_init(int numberOfThreads);
+
+/**
+ * @brief Create all threads
+ * @param startRoutine thread entry function pointer
+ */
+extern void threads_create(void *(*startRoutine)(void*));
+
+/**
+ * @brief Register User thread data for all threads
+ * @param data Reference to the user data struct
+ * @param func Optional function pointer to copy data
+ */
+extern void threads_registerDataAll(
+ ThreadUserData* data,
+ threads_copyDataFunc func);
+
+/**
+ * @brief Register User thread data for one thread
+ * @param threadId thread Id
+ * @param data Reference to the user data struct
+ * @param func Optional function pointer to copy data
+ */
+extern void threads_registerDataThread(
+ int threadId,
+ ThreadUserData* data,
+ threads_copyDataFunc func);
+
+/**
+ * @brief Register User thread data for a thread group
+ * @param groupId group Id
+ * @param data Reference to the user data struct
+ * @param func Optional function pointer to copy data
+ */
+extern void threads_registerDataGroup(
+ int groupId,
+ ThreadUserData* data,
+ threads_copyDataFunc func);
+
+extern size_t threads_updateIterations(int groupId, size_t demandIter);
+
+/**
+ * @brief Join the threads and free pthread-related data structures
+ */
+extern void threads_join(void);
+
+/**
+ * @brief Free memory of thread data structures
+ * @param numberOfGroups The number of groups to destroy
+ */
+extern void threads_destroy(int numberOfGroups, int numberOfStreams);
+
+/**
+ * @brief Create Thread groups
+ * @param numberOfGroups The number of groups to create
+ */
+extern void threads_createGroups(int numberOfGroups);
+
+#endif /* THREADS_H */
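A condensed sketch of the call sequence expected by this interface; the worker function run_kernel, the thread counts and the exact ordering of group creation and data registration are assumptions for illustration:

    #include <threads.h>

    extern void* run_kernel(void* arg);   /* hypothetical worker, receives its ThreadData */

    void example_threads(ThreadUserData* data)
    {
        threads_init(8);                           /* 8 worker threads in total */
        threads_createGroups(2);                   /* split them into 2 groups */
        threads_registerDataGroup(0, data, NULL);  /* NULL is assumed to mean a plain copy */
        threads_registerDataGroup(1, data, NULL);
        threads_create(run_kernel);                /* start all pthreads */
        threads_join();                            /* wait for them and free pthread state */
        threads_destroy(2, 1);                     /* 2 groups, 1 stream per group */
    }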
diff --git a/bench/includes/threads_types.h b/bench/includes/threads_types.h
new file mode 100644
index 0000000..68f0af3
--- /dev/null
+++ b/bench/includes/threads_types.h
@@ -0,0 +1,56 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: threads_types.h
+ *
+ * Description: Types file for threads module.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef THREADS_TYPES_H
+#define THREADS_TYPES_H
+
+#include <stdint.h>
+#include <test_types.h>
+
+typedef struct {
+ int globalNumberOfThreads;
+ int numberOfThreads;
+ int globalThreadId;
+ int threadId;
+ int numberOfGroups;
+ int groupId;
+ double time;
+ uint64_t cycles;
+ ThreadUserData data;
+} ThreadData;
+
+typedef struct {
+ int numberOfThreads;
+ int* threadIds;
+} ThreadGroup;
+
+typedef void (*threads_copyDataFunc)(ThreadUserData* src,ThreadUserData* dst);
+
+#endif /*THREADS_TYPES_H*/
diff --git a/bench/likwid-bench.c b/bench/likwid-bench.c
new file mode 100644
index 0000000..02d0ced
--- /dev/null
+++ b/bench/likwid-bench.c
@@ -0,0 +1,521 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: likwid-bench.c
+ *
+ * Description: A flexible and extensible benchmarking toolbox
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <inttypes.h>
+
+#include <bstrlib.h>
+#include <errno.h>
+#include <threads.h>
+#include <barrier.h>
+#include <testcases.h>
+#include <strUtil.h>
+#include <allocator.h>
+
+#include <likwid.h>
+
+extern void* runTest(void* arg);
+extern void* getIterSingle(void* arg);
+
+/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
+
+#define HELP_MSG printf("Threaded Memory Hierarchy Benchmark -- Version %d.%d \n\n",VERSION,RELEASE); \
+ printf("\n"); \
+ printf("Supported Options:\n"); \
+ printf("-h\t\t Help message\n"); \
+ printf("-a\t\t List available benchmarks \n"); \
+ printf("-d\t\t Delimiter used for physical core list (default ,) \n"); \
+ printf("-p\t\t List available thread domains\n"); \
+ printf("\t\t or the physical ids of the cores selected by the -c expression \n"); \
+ printf("-s <TIME>\t Seconds to run the test minimally (default 1)\n");\
+ printf("\t\t If resulting iteration count is below 10, it is normalized to 10.\n");\
+ printf("-i <ITERS>\t Specify the number of iterations per thread manually. \n"); \
+ printf("-l <TEST>\t list properties of benchmark \n"); \
+ printf("-t <TEST>\t type of test \n"); \
+ printf("-w\t\t <thread_domain>:<size>[:<num_threads>[:<chunk size>:<stride>]-<streamId>:<domain_id>[:<offset>]\n"); \
+ printf("\t\t <size> in kB, MB or GB (mandatory)\n"); \
+ printf("\n"); \
+ printf("Usage: \n"); \
+ printf("# Run the store benchmark on all CPUs of the system with a vector size of 1 GB\n"); \
+ printf("likwid-bench -t store -w S0:1GB\n"); \
+ printf("# Run the copy benchmark on one CPU at CPU socket 0 with a vector size of 100kB\n"); \
+ printf("likwid-bench -t copy -w S0:100kB:1\n"); \
+ printf("# Run the copy benchmark on one CPU at CPU socket 0 with a vector size of 100MB but place one stream on CPU socket 1\n"); \
+ printf("likwid-bench -t copy -w S0:100MB:1-0:S0,1:S1\n"); \
+
+#define VERSION_MSG \
+ printf("likwid-bench %d.%d \n\n",VERSION,RELEASE)
+
+/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ############ */
+
+ void
+copyThreadData(ThreadUserData* src,ThreadUserData* dst)
+{
+ uint32_t i;
+
+ *dst = *src;
+ dst->processors = (int*) malloc(src->numberOfThreads*sizeof(int));
+ dst->streams = (void**) malloc(src->test->streams*sizeof(void*));
+
+ for (i=0; i< src->test->streams; i++)
+ {
+ dst->streams[i] = src->streams[i];
+ }
+
+ for (i=0; i<src->numberOfThreads; i++)
+ {
+ dst->processors[i] = src->processors[i];
+ }
+}
+
+
+
+/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
+
+int main(int argc, char** argv)
+{
+ uint64_t iter = 100;
+ uint32_t i;
+ uint32_t j;
+ int globalNumberOfThreads = 0;
+ int optPrintDomains = 0;
+ int c;
+ ThreadUserData myData;
+ bstring testcase = bfromcstr("none");
+ uint64_t numberOfWorkgroups = 0;
+ int tmp = 0;
+ double time;
+ double cycPerUp = 0.0;
+ const TestCase* test = NULL;
+ uint64_t realSize = 0;
+ uint64_t realIter = 0;
+ uint64_t maxCycles = 0;
+ uint64_t minCycles = UINT64_MAX;
+ uint64_t cyclesClock = 0;
+ uint64_t demandIter = 0;
+ TimerData itertime;
+ Workgroup* currentWorkgroup = NULL;
+ Workgroup* groups = NULL;
+ uint32_t min_runtime = 1; /* 1s */
+ bstring HLINE = bfromcstr("");
+ binsertch(HLINE, 0, 80, '-');
+ binsertch(HLINE, 80, 1, '\n');
+ int (*ownprintf)(const char *format, ...);
+ ownprintf = &printf;
+
+ /* Handling of command line options */
+ if (argc == 1)
+ {
+ HELP_MSG;
+ exit(EXIT_SUCCESS);
+ }
+
+ while ((c = getopt (argc, argv, "w:t:s:l:aphvi:")) != -1) {
+ switch (c)
+ {
+ case 'h':
+ HELP_MSG;
+ exit (EXIT_SUCCESS);
+ case 'v':
+ VERSION_MSG;
+ exit (EXIT_SUCCESS);
+ case 'a':
+ ownprintf(TESTS"\n");
+ exit (EXIT_SUCCESS);
+ case 'w':
+ numberOfWorkgroups++;
+ break;
+ case 's':
+ min_runtime = atoi(optarg);
+ break;
+ case 'i':
+ demandIter = strtoul(optarg, NULL, 10);
+ if (demandIter <= 0)
+ {
+ fprintf (stderr, "Error: Iterations must be greater than 0\n");
+ return EXIT_FAILURE;
+ }
+ break;
+ case 'l':
+ bdestroy(testcase);
+ testcase = bfromcstr(optarg);
+ for (i=0; i<NUMKERNELS; i++)
+ {
+ if (biseqcstr(testcase, kernels[i].name))
+ {
+ test = kernels+i;
+ break;
+ }
+ }
+
+ if (test == NULL)
+ {
+ fprintf (stderr, "Error: Unknown test case %s\n",optarg);
+ return EXIT_FAILURE;
+ }
+ else
+ {
+ ownprintf("Name: %s\n",test->name);
+ ownprintf("Number of streams: %d\n",test->streams);
+ ownprintf("Loop stride: %d\n",test->stride);
+ ownprintf("Flops: %d\n",test->flops);
+ ownprintf("Bytes: %d\n",test->bytes);
+ switch (test->type)
+ {
+ case INT:
+ ownprintf("Data Type: Integer\n");
+ break;
+ case SINGLE:
+ ownprintf("Data Type: Single precision float\n");
+ break;
+ case DOUBLE:
+ ownprintf("Data Type: Double precision float\n");
+ break;
+ }
+ if (test->loads >= 0)
+ {
+ ownprintf("Load Ops: %d\n",test->loads);
+ }
+ if (test->stores >= 0)
+ {
+ ownprintf("Store Ops: %d\n",test->stores);
+ }
+ if (test->branches >= 0)
+ {
+ ownprintf("Branches: %d\n",test->branches);
+ }
+ if (test->instr_const >= 0)
+ {
+ ownprintf("Constant instructions: %d\n",test->instr_const);
+ }
+ if (test->instr_loop >= 0)
+ {
+ ownprintf("Loop instructions: %d\n",test->instr_loop);
+ }
+ }
+ bdestroy(testcase);
+ exit (EXIT_SUCCESS);
+
+ break;
+ case 'p':
+ optPrintDomains = 1;
+ break;
+ case 'g':
+ numberOfWorkgroups = LLU_CAST atol(optarg);
+
+ tmp = numberOfWorkgroups;
+
+ break;
+ case 't':
+ bdestroy(testcase);
+ testcase = bfromcstr(optarg);
+
+ for (i=0; i<NUMKERNELS; i++)
+ {
+ if (biseqcstr(testcase, kernels[i].name))
+ {
+ test = kernels+i;
+ break;
+ }
+ }
+
+ if (test == NULL)
+ {
+ fprintf (stderr, "Error: Unknown test case %s\n",optarg);
+ return EXIT_FAILURE;
+ }
+ bdestroy(testcase);
+ break;
+ case '?':
+ if (isprint (optopt))
+ fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+ else
+ fprintf (stderr,
+ "Unknown option character `\\x%x'.\n",
+ optopt);
+ return EXIT_FAILURE;
+ default:
+ HELP_MSG;
+ }
+ }
+ if ((numberOfWorkgroups == 0) && (!optPrintDomains))
+ {
+ fprintf(stderr, "Error: At least one workgroup (-w) must be set on commandline\n");
+ exit (EXIT_FAILURE);
+ }
+
+ if (topology_init() != EXIT_SUCCESS)
+ {
+ fprintf(stderr, "Error: Unsupported processor!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ if ((test == NULL) && (!optPrintDomains))
+ {
+ fprintf(stderr, "Unknown test case. Please check likwid-bench -a for available tests\n");
+ fprintf(stderr, "and select one using the -t commandline option\n");
+ exit(EXIT_FAILURE);
+ }
+
+ numa_init();
+ affinity_init();
+ timer_init();
+
+ if (optPrintDomains)
+ {
+ bdestroy(testcase);
+ AffinityDomains_t affinity = get_affinityDomains();
+ ownprintf("Number of Domains %d\n",affinity->numberOfAffinityDomains);
+ for (i=0; i < affinity->numberOfAffinityDomains; i++ )
+ {
+ ownprintf("Domain %d:\n",i);
+ ownprintf("\tTag %s:",bdata(affinity->domains[i].tag));
+
+ for ( uint32_t j=0; j < affinity->domains[i].numberOfProcessors; j++ )
+ {
+ ownprintf(" %d",affinity->domains[i].processorList[j]);
+ }
+ ownprintf("\n");
+ }
+ exit (EXIT_SUCCESS);
+ }
+
+ allocator_init(numberOfWorkgroups * MAX_STREAMS);
+ groups = (Workgroup*) malloc(numberOfWorkgroups*sizeof(Workgroup));
+ tmp = 0;
+
+ optind = 0;
+ while ((c = getopt (argc, argv, "w:t:s:l:i:aphv")) != -1)
+ {
+ switch (c)
+ {
+ case 'w':
+ currentWorkgroup = groups+tmp;
+ bstring groupstr = bfromcstr(optarg);
+ i = bstr_to_workgroup(currentWorkgroup, groupstr, test->type, test->streams);
+ bdestroy(groupstr);
+ if (i == 0)
+ {
+ for (i=0; i< test->streams; i++)
+ {
+ if (currentWorkgroup->streams[i].offset%test->stride)
+ {
+ fprintf (stderr, "Error: Stream %d: offset is not a multiple of stride!\n",i);
+ return EXIT_FAILURE;
+ }
+ allocator_allocateVector(&(currentWorkgroup->streams[i].ptr),
+ PAGE_ALIGNMENT,
+ currentWorkgroup->size,
+ currentWorkgroup->streams[i].offset,
+ test->type,
+ currentWorkgroup->streams[i].domain);
+ }
+ tmp++;
+ }
+ else
+ {
+ exit(EXIT_FAILURE);
+ }
+ break;
+ default:
+ continue;
+ break;
+ }
+ }
+
+ /* :WARNING:05/04/2010 08:58:05 AM:jt: At the moment the thread
+ * module only allows equally sized thread groups*/
+ for (i=0; i<numberOfWorkgroups; i++)
+ {
+ globalNumberOfThreads += groups[i].numberOfThreads;
+ }
+
+ ownprintf(bdata(HLINE));
+ ownprintf("LIKWID MICRO BENCHMARK\n");
+ ownprintf("Test: %s\n",test->name);
+ ownprintf(bdata(HLINE));
+ ownprintf("Using %" PRIu64 " work groups\n",numberOfWorkgroups);
+ ownprintf("Using %d threads\n",globalNumberOfThreads);
+ ownprintf(bdata(HLINE));
+
+
+ threads_init(globalNumberOfThreads);
+ threads_createGroups(numberOfWorkgroups);
+
+ /* we configure global barriers only */
+ barrier_init(1);
+ barrier_registerGroup(globalNumberOfThreads);
+ cyclesClock = timer_getCycleClock();
+
+#ifdef LIKWID_PERFMON
+ if (getenv("LIKWID_FILEPATH") != NULL)
+ {
+ ownprintf("Using Likwid Marker API\n");
+ }
+ LIKWID_MARKER_INIT;
+ ownprintf(bdata(HLINE));
+#endif
+
+
+ /* initialize data structures for threads */
+ for (i=0; i<numberOfWorkgroups; i++)
+ {
+ myData.iter = iter;
+ if (demandIter > 0)
+ {
+ myData.iter = demandIter;
+ }
+ myData.min_runtime = min_runtime;
+ myData.size = groups[i].size;
+ myData.test = test;
+ myData.cycles = 0;
+ myData.numberOfThreads = groups[i].numberOfThreads;
+ myData.processors = (int*) malloc(myData.numberOfThreads * sizeof(int));
+ myData.streams = (void**) malloc(test->streams * sizeof(void*));
+
+ for (j=0; j<groups[i].numberOfThreads; j++)
+ {
+ myData.processors[j] = groups[i].processorIds[j];
+ }
+
+ for (j=0; j< test->streams; j++)
+ {
+ myData.streams[j] = groups[i].streams[j].ptr;
+ }
+
+ threads_registerDataGroup(i, &myData, copyThreadData);
+
+ free(myData.processors);
+ free(myData.streams);
+ }
+
+ if (demandIter == 0)
+ {
+ getIterSingle((void*) &threads_data[0]);
+ for (i=0; i<numberOfWorkgroups; i++)
+ {
+ iter = threads_updateIterations(i, demandIter);
+ }
+ }
+#ifdef DEBUG_LIKWID
+ else
+ {
+ ownprintf("Using manually selected iterations per thread\n");
+ }
+#endif
+
+ threads_create(runTest);
+ threads_join();
+
+ for (int i=0; i<globalNumberOfThreads; i++)
+ {
+ realSize += threads_data[i].data.size;
+ realIter += threads_data[i].data.iter;
+ if (threads_data[i].cycles > maxCycles)
+ {
+ maxCycles = threads_data[i].cycles;
+ }
+ if (threads_data[i].cycles < minCycles)
+ {
+ minCycles = threads_data[i].cycles;
+ }
+ }
+
+
+
+ time = (double) maxCycles / (double) cyclesClock;
+ ownprintf(bdata(HLINE));
+ ownprintf("Cycles:\t\t\t%" PRIu64 "\n", maxCycles);
+ ownprintf("CPU Clock:\t\t%" PRIu64 "\n", timer_getCpuClock());
+ ownprintf("Cycle Clock:\t\t%" PRIu64 "\n", cyclesClock);
+ ownprintf("Time:\t\t\t%e sec\n", time);
+ ownprintf("Iterations:\t\t%" PRIu64 "\n", realIter);
+ ownprintf("Iterations per thread:\t%" PRIu64 "\n",threads_data[0].data.iter);
+ ownprintf("Inner loop executions:\t%.0f\n", ((double)realSize)/((double)test->stride));
+ ownprintf("Size:\t\t\t%" PRIu64 "\n", realSize*test->bytes );
+ ownprintf("Size per thread:\t%" PRIu64 "\n", threads_data[0].data.size*test->bytes);
+ ownprintf("Number of Flops:\t%" PRIu64 "\n", (threads_data[0].data.iter * realSize * test->flops));
+ ownprintf("MFlops/s:\t\t%.2f\n",
+ 1.0E-06 * ((double) threads_data[0].data.iter * realSize * test->flops/ time));
+
+ ownprintf("Data volume (Byte):\t%llu\n", LLU_CAST (threads_data[0].data.iter * realSize * test->bytes));
+ ownprintf("MByte/s:\t\t%.2f\n",
+ 1.0E-06 * ( (double) threads_data[0].data.iter * realSize * test->bytes/ time));
+
+ cycPerUp = ((double) maxCycles / (double) (threads_data[0].data.iter * realSize));
+ ownprintf("Cycles per update:\t%f\n", cycPerUp);
+
+ switch ( test->type )
+ {
+ case INT:
+ case SINGLE:
+ ownprintf("Cycles per cacheline:\t%f\n", (16.0 * cycPerUp));
+ break;
+ case DOUBLE:
+ ownprintf("Cycles per cacheline:\t%f\n", (8.0 * cycPerUp));
+ break;
+ }
+ ownprintf("Loads per update:\t%ld\n", test->loads );
+ ownprintf("Stores per update:\t%ld\n", test->stores );
+ if ((test->loads > 0) && (test->stores > 0))
+ {
+ ownprintf("Load/store ratio:\t%.2f\n", ((double)test->loads)/((double)test->stores) );
+ }
+ if ((test->instr_loop > 0) && (test->instr_const > 0))
+ {
+ ownprintf("Instructions:\t\t%" PRIu64 "\n", LLU_CAST ((double)realSize/test->stride)*test->instr_loop*threads_data[0].data.iter + test->instr_const );
+ }
+ if (test->uops > 0)
+ {
+ ownprintf("UOPs:\t\t\t%" PRIu64 "\n", LLU_CAST ((double)realSize/test->stride)*test->uops*threads_data[0].data.iter);
+ }
+
+ ownprintf(bdata(HLINE));
+ threads_destroy(numberOfWorkgroups, test->streams);
+ allocator_finalize();
+ workgroups_destroy(&groups, numberOfWorkgroups, test->streams);
+
+#ifdef LIKWID_PERFMON
+ if (getenv("LIKWID_FILEPATH") != NULL)
+ {
+ ownprintf("Writing Likwid Marker API results to file %s\n", getenv("LIKWID_FILEPATH"));
+ }
+ LIKWID_MARKER_CLOSE;
+#endif
+
+ bdestroy(HLINE);
+ return EXIT_SUCCESS;
+}
+
diff --git a/bench/perl/AsmGen.pl b/bench/perl/AsmGen.pl
new file mode 100755
index 0000000..7fee506
--- /dev/null
+++ b/bench/perl/AsmGen.pl
@@ -0,0 +1,284 @@
+#!/usr/bin/perl -w
+use strict;
+no strict "refs";
+use warnings;
+use lib './perl';
+use Parse::RecDescent;
+use Data::Dumper;
+use Getopt::Std;
+use Cwd 'abs_path';
+
+use gas;
+
+my $ROOT = abs_path('./');
+my $DEBUG=0;
+my $VERBOSE=0;
+our $ISA = 'x86';
+our $AS = 'gas';
+my $OPT_STRING = 'hpvda:i:o:';
+my %OPT;
+my $INPUTFILE;
+my $OUTPUTFILE;
+my $CPP_ARGS='';
+
+# Enable warnings within the Parse::RecDescent module.
+$::RD_ERRORS = 1; # Make sure the parser dies when it encounters an error
+#$::RD_WARN = 1; # Enable warnings. This will warn on unused rules &c.
+#$::RD_HINT = 1; # Give out hints to help fix problems.
+#$::RD_TRACE = 1; # if defined, also trace parsers' behaviour
+$::RD_AUTOACTION = q { [@item[0..$#item]] };
+
+sub init
+{
+ getopts( "$OPT_STRING", \%OPT ) or usage();
+ if ($OPT{h}) { usage(); };
+ if ($OPT{v}) { $VERBOSE = 1;}
+ if ($OPT{d}) { $DEBUG = 1;}
+
+ if (! $ARGV[0]) {
+ die "ERROR: Please specify a input file!\n\nCall script with argument -h for help.\n";
+ }
+
+ $INPUTFILE = $ARGV[0];
+ $CPP_ARGS = $ARGV[1] if ($ARGV[1]);
+
+ if ($INPUTFILE =~ /\.pas$/) {
+ $INPUTFILE =~ s/\.pas$//;
+ } else {
+ die "ERROR: Input file must have pas ending!\n";
+ }
+ if ($OPT{o}) {
+ $OUTPUTFILE = $OPT{o};
+ }else {
+ $OUTPUTFILE = "$INPUTFILE.s";
+ }
+ if ($OPT{i}) {
+ $ISA = $OPT{i};
+ print "INFO: Using isa $ISA.\n\n" if ($VERBOSE);
+ } else {
+ print "INFO: No isa specified.\n Using default $ISA.\n\n" if ($VERBOSE);
+ }
+ if ($OPT{a}) {
+ $AS = $OPT{a};
+ print "INFO: Using as $AS.\n\n" if ($VERBOSE);
+ } else {
+ print "INFO: No as specified.\n Using default $AS.\n\n" if ($VERBOSE);
+ }
+
+ as::isa_init();
+}
+
+sub usage
+{
+ print <<END;
+usage: $0 [-$OPT_STRING] <INFILE>
+
+Required:
+<INFILE> : Input .pas file
+
+Optional:
+-h : this (help) message
+-v : verbose output
+-d : debug mode: prints out the parse tree
+-p : Print out intermediate preprocessed output
+-o <FILE> : Output file
+-a <ASM> : Specify different assembler (Default: gas)
+-i <ISA> : Specify different ISA (Default: x86)
+
+Example:
+$0 -i x86-64 -a masm -o out.s myfile.pas
+
+END
+
+exit(0);
+}
+
+#=======================================
+# GRAMMAR
+#=======================================
+$main::grammar = <<'_EOGRAMMAR_';
+# Terminals
+FUNC : /func/i
+LOOP : /loop/i
+ALLOCATE : /allocate/i
+FACTOR : /factor/i
+DEFINE : /define/i
+USE : /use/i
+STOP : /stop/i
+START : /start/i
+LOCAL : /local/i
+TIMER : /timer/i
+INCREMENT : /increment/i
+ALIGN : /align/i
+INT : /int/i
+SINGLE : /single/i
+DOUBLE : /double/i
+INUMBER : NUMBER
+UNUMBER : NUMBER
+SNUMBER : NUMBER
+FNUMBER : NUMBER
+OFFSET : /([0-9]+\,){15}[0-9]+/
+NUMBER : /[-+]?[0-9]*\.?[0-9]+/
+SYMBOL : /[.A-Z-a-z_][A-Za-z0-9_]*/
+REG : /GPR[0-9]+/i
+SREG : /GPR[0-9]+/i
+COMMENT : /#.*/
+{'skip'}
+
+type: SINGLE
+ |DOUBLE
+ |INT
+
+align: ALIGN <commit> NUMBER
+{
+{FUNC => 'as::align',
+ ARGS => ["$item{NUMBER}[1]"]}
+}
+
+ASMCODE : /[A-Za-z1-9.:]+.*/
+{
+{FUNC => 'as::emit_code',
+ ARGS => [$item[1]]}
+}
+
+function: FUNC SYMBOL block
+{[
+ {FUNC => 'as::function_entry',
+ ARGS => [$item{SYMBOL}[1],0]},
+ $item{block},
+ {FUNC => 'as::function_exit',
+ ARGS => [$item{SYMBOL}[1]]}
+]}
+
+function_allocate: FUNC SYMBOL ALLOCATE NUMBER block
+{[
+ {FUNC => 'as::function_entry',
+ ARGS => [$item{SYMBOL}[1],$item{NUMBER}[1]]},
+ $item{block},
+ {FUNC => 'as::function_exit',
+ ARGS => [$item{SYMBOL}[1]]}
+]}
+
+loop: LOOP SYMBOL INUMBER SNUMBER block
+{[
+{FUNC => 'as::loop_entry',
+ ARGS => [$item{SYMBOL}[1],$item{SNUMBER}[1][1]]},
+ $item{block},
+{FUNC => 'as::loop_exit',
+ ARGS => [$item{SYMBOL}[1],$item{INUMBER}[1][1]]}
+]}
+| LOOP SYMBOL INUMBER SREG block
+{[
+{FUNC => 'as::loop_entry',
+ ARGS => [$item{SYMBOL}[1],$item{SREG}[1]]},
+ $item{block},
+{FUNC => 'as::loop_exit',
+ ARGS => [$item{SYMBOL}[1],$item{INUMBER}[1][1]]}
+]}
+
+timer: START TIMER
+{
+{FUNC => 'isa::start_timer',
+ ARGS => []}
+}
+| STOP TIMER
+{
+{FUNC => 'isa::stop_timer',
+ ARGS => []}
+}
+
+mode: START LOCAL
+{
+{FUNC => 'as::mode',
+ ARGS => [$item[1][1]]}
+}
+| STOP LOCAL
+{
+{FUNC => 'as::mode',
+ ARGS => [$item[1][1]]}
+}
+
+block: '{' expression(s) '}'
+{ $item[2] }
+
+define_data: DEFINE type SYMBOL OFFSET
+{
+{FUNC => 'as::define_offset',
+ ARGS => [$item{SYMBOL}[1], $item{type}[1][1],"$item{OFFSET}[1]"]}
+}
+
+define_data: DEFINE type SYMBOL NUMBER
+{
+{FUNC => 'as::define_data',
+ ARGS => [$item{SYMBOL}[1], $item{type}[1][1],"$item{NUMBER}[1]"]}
+}
+
+
+expression: align
+ |COMMENT
+ |loop
+ |timer
+ |mode
+ |ASMCODE
+{ $item[1] }
+
+instruction : define_data
+ | align
+ | COMMENT
+ | mode
+ | function
+ | function_allocate
+{ $item[1] }
+
+startrule: instruction(s)
+{ $item[1] }
+
+_EOGRAMMAR_
+
+
+#=======================================
+# MAIN
+#=======================================
+init();
+print "INFO: Calling cpp with arguments $CPP_ARGS.\n" if ($VERBOSE);
+my $text = `cpp -x assembler-with-cpp $CPP_ARGS $INPUTFILE.pas`;
+
+if ($OPT{p}) {
+ open FILE,">$INPUTFILE.Pas";
+ print FILE $text;
+ close FILE;
+}
+
+open STDOUT,">$OUTPUTFILE";
+print "$as::AS->{HEADER}\n";
+
+my $parser = new Parse::RecDescent ($main::grammar) or die "ERROR: Bad grammar!\n";
+my $parse_tree = $parser->startrule($text) or print STDERR "ERROR: Syntax Error\n";
+tree_exec($parse_tree);
+
+if ($DEBUG) {
+ open FILE,'>parse_tree.txt';
+ print FILE Dumper $parse_tree,"\n";
+ close FILE;
+}
+
+print "$as::AS->{FOOTER}\n";
+
+sub tree_exec
+{
+ my $tree = shift;
+
+ foreach my $node (@$tree) {
+ if ($node !~ /^skip|^instruction|^expression|^loop/) {
+ if (ref($node) eq 'ARRAY') {
+ tree_exec($node);
+ }else {
+ if (ref($node) eq 'HASH') {
+ &{$node->{FUNC}}(@{$node->{ARGS}});
+ }
+ }
+ }
+ }
+}
+
+
diff --git a/perl/Parse/RecDescent.pm b/bench/perl/Parse/RecDescent.pm
similarity index 100%
rename from perl/Parse/RecDescent.pm
rename to bench/perl/Parse/RecDescent.pm
diff --git a/perl/Template.pm b/bench/perl/Template.pm
similarity index 100%
rename from perl/Template.pm
rename to bench/perl/Template.pm
diff --git a/perl/Template/Base.pm b/bench/perl/Template/Base.pm
similarity index 100%
rename from perl/Template/Base.pm
rename to bench/perl/Template/Base.pm
diff --git a/perl/Template/Config.pm b/bench/perl/Template/Config.pm
similarity index 100%
rename from perl/Template/Config.pm
rename to bench/perl/Template/Config.pm
diff --git a/perl/Template/Constants.pm b/bench/perl/Template/Constants.pm
similarity index 100%
rename from perl/Template/Constants.pm
rename to bench/perl/Template/Constants.pm
diff --git a/perl/Template/Context.pm b/bench/perl/Template/Context.pm
similarity index 100%
rename from perl/Template/Context.pm
rename to bench/perl/Template/Context.pm
diff --git a/perl/Template/Directive.pm b/bench/perl/Template/Directive.pm
similarity index 100%
rename from perl/Template/Directive.pm
rename to bench/perl/Template/Directive.pm
diff --git a/perl/Template/Document.pm b/bench/perl/Template/Document.pm
similarity index 100%
rename from perl/Template/Document.pm
rename to bench/perl/Template/Document.pm
diff --git a/perl/Template/Exception.pm b/bench/perl/Template/Exception.pm
similarity index 100%
rename from perl/Template/Exception.pm
rename to bench/perl/Template/Exception.pm
diff --git a/perl/Template/Filters.pm b/bench/perl/Template/Filters.pm
similarity index 100%
rename from perl/Template/Filters.pm
rename to bench/perl/Template/Filters.pm
diff --git a/perl/Template/Grammar.pm b/bench/perl/Template/Grammar.pm
similarity index 100%
rename from perl/Template/Grammar.pm
rename to bench/perl/Template/Grammar.pm
diff --git a/perl/Template/Iterator.pm b/bench/perl/Template/Iterator.pm
similarity index 100%
rename from perl/Template/Iterator.pm
rename to bench/perl/Template/Iterator.pm
diff --git a/perl/Template/Namespace/Constants.pm b/bench/perl/Template/Namespace/Constants.pm
similarity index 100%
rename from perl/Template/Namespace/Constants.pm
rename to bench/perl/Template/Namespace/Constants.pm
diff --git a/perl/Template/Parser.pm b/bench/perl/Template/Parser.pm
similarity index 100%
rename from perl/Template/Parser.pm
rename to bench/perl/Template/Parser.pm
diff --git a/perl/Template/Plugin.pm b/bench/perl/Template/Plugin.pm
similarity index 100%
rename from perl/Template/Plugin.pm
rename to bench/perl/Template/Plugin.pm
diff --git a/perl/Template/Plugin/Assert.pm b/bench/perl/Template/Plugin/Assert.pm
similarity index 100%
rename from perl/Template/Plugin/Assert.pm
rename to bench/perl/Template/Plugin/Assert.pm
diff --git a/perl/Template/Plugin/CGI.pm b/bench/perl/Template/Plugin/CGI.pm
similarity index 100%
rename from perl/Template/Plugin/CGI.pm
rename to bench/perl/Template/Plugin/CGI.pm
diff --git a/perl/Template/Plugin/Datafile.pm b/bench/perl/Template/Plugin/Datafile.pm
similarity index 100%
rename from perl/Template/Plugin/Datafile.pm
rename to bench/perl/Template/Plugin/Datafile.pm
diff --git a/perl/Template/Plugin/Date.pm b/bench/perl/Template/Plugin/Date.pm
similarity index 100%
rename from perl/Template/Plugin/Date.pm
rename to bench/perl/Template/Plugin/Date.pm
diff --git a/perl/Template/Plugin/Directory.pm b/bench/perl/Template/Plugin/Directory.pm
similarity index 100%
rename from perl/Template/Plugin/Directory.pm
rename to bench/perl/Template/Plugin/Directory.pm
diff --git a/perl/Template/Plugin/Dumper.pm b/bench/perl/Template/Plugin/Dumper.pm
similarity index 100%
rename from perl/Template/Plugin/Dumper.pm
rename to bench/perl/Template/Plugin/Dumper.pm
diff --git a/perl/Template/Plugin/File.pm b/bench/perl/Template/Plugin/File.pm
similarity index 100%
rename from perl/Template/Plugin/File.pm
rename to bench/perl/Template/Plugin/File.pm
diff --git a/perl/Template/Plugin/Filter.pm b/bench/perl/Template/Plugin/Filter.pm
similarity index 100%
rename from perl/Template/Plugin/Filter.pm
rename to bench/perl/Template/Plugin/Filter.pm
diff --git a/perl/Template/Plugin/Format.pm b/bench/perl/Template/Plugin/Format.pm
similarity index 100%
rename from perl/Template/Plugin/Format.pm
rename to bench/perl/Template/Plugin/Format.pm
diff --git a/perl/Template/Plugin/HTML.pm b/bench/perl/Template/Plugin/HTML.pm
similarity index 100%
rename from perl/Template/Plugin/HTML.pm
rename to bench/perl/Template/Plugin/HTML.pm
diff --git a/perl/Template/Plugin/Image.pm b/bench/perl/Template/Plugin/Image.pm
similarity index 100%
rename from perl/Template/Plugin/Image.pm
rename to bench/perl/Template/Plugin/Image.pm
diff --git a/perl/Template/Plugin/Iterator.pm b/bench/perl/Template/Plugin/Iterator.pm
similarity index 100%
rename from perl/Template/Plugin/Iterator.pm
rename to bench/perl/Template/Plugin/Iterator.pm
diff --git a/perl/Template/Plugin/Math.pm b/bench/perl/Template/Plugin/Math.pm
similarity index 100%
rename from perl/Template/Plugin/Math.pm
rename to bench/perl/Template/Plugin/Math.pm
diff --git a/perl/Template/Plugin/Pod.pm b/bench/perl/Template/Plugin/Pod.pm
similarity index 100%
rename from perl/Template/Plugin/Pod.pm
rename to bench/perl/Template/Plugin/Pod.pm
diff --git a/perl/Template/Plugin/Procedural.pm b/bench/perl/Template/Plugin/Procedural.pm
similarity index 100%
rename from perl/Template/Plugin/Procedural.pm
rename to bench/perl/Template/Plugin/Procedural.pm
diff --git a/perl/Template/Plugin/Scalar.pm b/bench/perl/Template/Plugin/Scalar.pm
similarity index 100%
rename from perl/Template/Plugin/Scalar.pm
rename to bench/perl/Template/Plugin/Scalar.pm
diff --git a/perl/Template/Plugin/String.pm b/bench/perl/Template/Plugin/String.pm
similarity index 100%
rename from perl/Template/Plugin/String.pm
rename to bench/perl/Template/Plugin/String.pm
diff --git a/perl/Template/Plugin/Table.pm b/bench/perl/Template/Plugin/Table.pm
similarity index 100%
rename from perl/Template/Plugin/Table.pm
rename to bench/perl/Template/Plugin/Table.pm
diff --git a/perl/Template/Plugin/URL.pm b/bench/perl/Template/Plugin/URL.pm
similarity index 100%
rename from perl/Template/Plugin/URL.pm
rename to bench/perl/Template/Plugin/URL.pm
diff --git a/perl/Template/Plugin/View.pm b/bench/perl/Template/Plugin/View.pm
similarity index 100%
rename from perl/Template/Plugin/View.pm
rename to bench/perl/Template/Plugin/View.pm
diff --git a/perl/Template/Plugin/Wrap.pm b/bench/perl/Template/Plugin/Wrap.pm
similarity index 100%
rename from perl/Template/Plugin/Wrap.pm
rename to bench/perl/Template/Plugin/Wrap.pm
diff --git a/perl/Template/Plugins.pm b/bench/perl/Template/Plugins.pm
similarity index 100%
rename from perl/Template/Plugins.pm
rename to bench/perl/Template/Plugins.pm
diff --git a/perl/Template/Provider.pm b/bench/perl/Template/Provider.pm
similarity index 100%
rename from perl/Template/Provider.pm
rename to bench/perl/Template/Provider.pm
diff --git a/perl/Template/Service.pm b/bench/perl/Template/Service.pm
similarity index 100%
rename from perl/Template/Service.pm
rename to bench/perl/Template/Service.pm
diff --git a/perl/Template/Stash.pm b/bench/perl/Template/Stash.pm
similarity index 100%
rename from perl/Template/Stash.pm
rename to bench/perl/Template/Stash.pm
diff --git a/perl/Template/Stash/Context.pm b/bench/perl/Template/Stash/Context.pm
similarity index 100%
rename from perl/Template/Stash/Context.pm
rename to bench/perl/Template/Stash/Context.pm
diff --git a/perl/Template/Stash/XS.pm b/bench/perl/Template/Stash/XS.pm
similarity index 100%
rename from perl/Template/Stash/XS.pm
rename to bench/perl/Template/Stash/XS.pm
diff --git a/perl/Template/Test.pm b/bench/perl/Template/Test.pm
similarity index 100%
rename from perl/Template/Test.pm
rename to bench/perl/Template/Test.pm
diff --git a/perl/Template/VMethods.pm b/bench/perl/Template/VMethods.pm
similarity index 100%
rename from perl/Template/VMethods.pm
rename to bench/perl/Template/VMethods.pm
diff --git a/perl/Template/View.pm b/bench/perl/Template/View.pm
similarity index 100%
rename from perl/Template/View.pm
rename to bench/perl/Template/View.pm
diff --git a/bench/perl/gas.pm b/bench/perl/gas.pm
new file mode 100644
index 0000000..c9f3f81
--- /dev/null
+++ b/bench/perl/gas.pm
@@ -0,0 +1,211 @@
+#!/usr/bin/perl
+
+package as;
+use Data::Dumper;
+use isax86;
+use isax86_64;
+
+$AS = { HEADER => '.intel_syntax noprefix',
+ FOOTER => ''};
+
+$LOCAL = {};
+$MODE = 'GLOBAL';
+
+my $CURRENT_SECTION='NONE';
+my $WORDLENGTH;
+my $STACKPTR;
+my $BASEPTR;
+my $REG;
+my $ARG;
+
+sub emit_code
+{
+ my $code = shift;
+ $code =~ s/([GF]PR[0-9]+)/$REG->{$1}/g;
+ $code =~ s/(ARG[0-9]+)/$ARG->{$1}/g;
+ $code =~ s/(LOCAL[0-9]+)/$LOCAL->{$1}/g;
+ print "$code\n";
+}
+
+sub align
+{
+ my $number = shift;
+ print ".align $number\n";
+
+}
+
+sub mode
+{
+ $cmd = shift;
+
+ if ($cmd eq 'START') {
+ $MODE = 'LOCAL';
+ } elsif ($cmd eq 'STOP') {
+ $MODE = 'GLOBAL';
+ }
+}
+
+sub function_entry
+{
+ my $symbolname = shift;
+ my $allocate = shift;
+ my $distance;
+
+ foreach ( (0 .. $allocate) ) {
+ $distance = $_ * $WORDLENGTH;
+ $LOCAL->{"LOCAL$_"} = "[$BASEPTR-$distance]";
+ }
+
+ if($CURRENT_SECTION ne 'text') {
+ $CURRENT_SECTION = 'text';
+ print ".text\n";
+ }
+
+ print ".globl $symbolname\n";
+ print ".type $symbolname, \@function\n";
+ print "$symbolname :\n";
+
+ if ($main::ISA eq 'x86') {
+ print "push ebp\n";
+ print "mov ebp, esp\n";
+ $distance = $allocate * $WORDLENGTH;
+ print "sub esp, $distance\n" if ($allocate);
+ print "push ebx\n";
+ print "push esi\n";
+ print "push edi\n";
+ } elsif ($main::ISA eq 'x86-64') {
+ print "push rbp\n";
+ print "mov rbp, rsp\n";
+ $distance = $allocate * $WORDLENGTH;
+ print "sub rsp, $distance\n" if ($allocate);
+ print "push rbx\n";
+ print "push r12\n";
+ print "push r13\n";
+ print "push r14\n";
+ print "push r15\n";
+ }
+}
+
+sub function_exit
+{
+ my $symbolname = shift;
+
+ $LOCAL = {};
+
+ if ($main::ISA eq 'x86') {
+ print "pop edi\n";
+ print "pop esi\n";
+ print "pop ebx\n";
+ print "mov esp, ebp\n";
+ print "pop ebp\n";
+ } elsif ($main::ISA eq 'x86-64') {
+ print "pop r15\n";
+ print "pop r14\n";
+ print "pop r13\n";
+ print "pop r12\n";
+ print "pop rbx\n";
+ print "mov rsp, rbp\n";
+ print "pop rbp\n";
+ }
+ print "ret\n";
+ print ".size $symbolname, .-$symbolname\n";
+ print "\n";
+}
+
+sub define_data
+{
+ my $symbolname = shift;
+ my $type = shift;
+ my $value = shift;
+
+ if($CURRENT_SECTION ne 'data') {
+ $CURRENT_SECTION = 'data';
+ print ".data\n";
+ }
+ print ".align 64\n";
+ print "$symbolname:\n";
+ if ($type eq 'DOUBLE') {
+ print ".double $value, $value, $value, $value, $value, $value, $value, $value\n"
+ } elsif ($type eq 'SINGLE') {
+ print ".single $value, $value, $value, $value, $value, $value, $value, $value\n"
+ } elsif ($type eq 'INT') {
+ print ".int $value, $value\n"
+ }
+}
+
+sub define_offset
+{
+ my $symbolname = shift;
+ my $type = shift;
+ my $value = shift;
+
+ if($CURRENT_SECTION ne 'data') {
+ $CURRENT_SECTION = 'data';
+ print ".data\n";
+ }
+ print ".align 16\n";
+ print "$symbolname:\n";
+ print ".int $value\n";
+}
+
+
+sub loop_entry
+{
+ my $symbolname = shift;
+ my $stopping_criterion = shift;
+ $stopping_criterion = $REG->{$stopping_criterion} if( exists $REG->{$stopping_criterion});
+
+ if ($main::ISA eq 'x86') {
+ print "xor eax, eax\n";
+ } elsif ($main::ISA eq 'x86-64') {
+ print "xor rax, rax\n";
+ }
+ print ".align 16\n";
+ if ($MODE eq 'GLOBAL') {
+ print "$symbolname :\n";
+ }else {
+ print "1:\n";
+ }
+
+}
+
+
+sub loop_exit
+{
+ my $symbolname = shift;
+ my $step = shift;
+
+ if ($main::ISA eq 'x86') {
+ print "add eax, $step\n";
+ print "cmp eax, edi\n";
+ } elsif ($main::ISA eq 'x86-64') {
+ print "addq rax, $step\n";
+ print "cmpq rax, rdi\n";
+ }
+ if ($MODE eq 'GLOBAL') {
+ print "jl $symbolname\n";
+ }else {
+ print "jl 1b\n";
+ }
+ print "\n";
+}
+
+sub isa_init
+{
+ if ($main::ISA eq 'x86') {
+ $WORDLENGTH = $isax86::WORDLENGTH_X86 ;
+ $STACKPTR = $isax86::STACKPTR_X86 ;
+ $BASEPTR = $isax86::BASEPTR_X86 ;
+ $REG = $isax86::REG_X86;
+ $ARG = $isax86::ARG_X86 ;
+ } elsif ($main::ISA eq 'x86-64') {
+ $WORDLENGTH = $isax86_64::WORDLENGTH_X86_64;
+ $STACKPTR = $isax86_64::STACKPTR_X86_64 ;
+ $BASEPTR = $isax86_64::BASEPTR_X86_64 ;
+ $REG = $isax86_64::REG_X86_64;
+ $ARG = $isax86_64::ARG_X86_64 ;
+ }
+}
+
+
+1;
diff --git a/bench/perl/generatePas.pl b/bench/perl/generatePas.pl
new file mode 100755
index 0000000..2dcd530
--- /dev/null
+++ b/bench/perl/generatePas.pl
@@ -0,0 +1,198 @@
+#!/usr/bin/perl
+
+use lib 'util';
+use strict;
+use warnings;
+use lib './perl';
+use File::Copy;
+use Cwd 'abs_path';
+use Data::Dumper;
+use Template;
+
+my @Testcases;
+my $name;
+my $streams;
+my $type;
+my $flops;
+my $bytes;
+my $desc;
+my $prolog='';
+my $loop='';
+my $increment;
+my $isLoop=0;
+my $skip=0;
+my $multi=0;
+
+my $BenchRoot = $ARGV[0];
+my $OutputDirectory = $ARGV[1];
+my $TemplateRoot = $ARGV[2];
+my $InputFile = "";
+if (@ARGV == 4)
+{
+ $InputFile = $ARGV[3];
+}
+my $DEBUG = 0;
+
+my $stream_lookup = {
+ STR0 => 'ARG2',
+ STR1 => 'ARG3',
+ STR2 => 'ARG4',
+ STR3 => 'ARG5',
+ STR4 => 'ARG6',
+ STR5 => '[rbp+16]',
+ STR6 => '[rbp+24]',
+ STR7 => '[rbp+32]',
+ STR8 => '[rbp+40]',
+ STR9 => '[rbp+48]',
+ STR10 => '[rbp+56]',
+ STR11 => '[rbp+64]',
+ STR12 => '[rbp+72]',
+ STR13 => '[rbp+80]',
+ STR14 => '[rbp+88]',
+ STR15 => '[rbp+96]',
+ STR16 => '[rbp+104]',
+ STR17 => '[rbp+112]',
+ STR18 => '[rbp+120]',
+ STR19 => '[rbp+128]',
+ STR20 => '[rbp+136]',
+ STR21 => '[rbp+144]',
+ STR22 => '[rbp+152]',
+ STR23 => '[rbp+160]',
+ STR24 => '[rbp+168]',
+ STR25 => '[rbp+176]',
+ STR26 => '[rbp+184]',
+ STR27 => '[rbp+192]',
+ STR28 => '[rbp+200]',
+ STR29 => '[rbp+208]',
+ STR30 => '[rbp+216]',
+ STR31 => '[rbp+224]',
+ STR32 => '[rbp+232]',
+ STR33 => '[rbp+240]',
+ STR34 => '[rbp+248]',
+ STR35 => '[rbp+256]',
+ STR36 => '[rbp+264]',
+ STR37 => '[rbp+272]',
+ STR38 => '[rbp+280]',
+ STR39 => '[rbp+288]',
+ STR40 => '[rbp+296]'};
+
+opendir (DIR, "./$BenchRoot") or die "Cannot open bench directory: $!\n";
+my $tpl = Template->new({
+ INCLUDE_PATH => ["$TemplateRoot"]
+ });
+
+while (defined(my $file = readdir(DIR))) {
+ if ($file !~ /^\./) {
+ print "SCANNING $file\n" if ($DEBUG);
+
+ $file =~ /([A-Za-z_0-9]+)\.ptt/;
+ $name = $1;
+
+ $isLoop = 0;
+ $skip=0;
+ $multi=0;
+ $prolog='';
+ $loop='';
+ $desc='';
+ my $loads=-1;
+ my $stores=-1;
+ my $branches=-1;
+ my $instr=-1;
+ my $loop_instr=-1;
+ my $uops = -1;
+ open FILE, "<$BenchRoot/$file";
+ while (<FILE>) {
+ my $line = $_;
+
+ if($line =~ /STREAMS[ ]+([0-9]+)/) {
+ $streams = $1;
+ if ($streams > 10) {
+ $multi = 1;
+ }
+ } elsif ($line =~ /TYPE[ ]+(SINGLE|DOUBLE|INT)/) {
+ $type = $1;
+ } elsif ($line =~ /FLOPS[ ]+([0-9]+)/) {
+ $flops = $1;
+ } elsif ($line =~ /BYTES[ ]+([0-9]+)/) {
+ $bytes = $1;
+ } elsif ($line =~ /LOADS[ ]+([0-9]+)/) {
+ $loads = $1;
+ } elsif ($line =~ /STORES[ ]+([0-9]+)/) {
+ $stores = $1;
+ } elsif ($line =~ /BRANCHES[ ]+([0-9]+)/) {
+ $branches = $1;
+ } elsif ($line =~ /INSTR_CONST[ ]+([0-9]+)/) {
+ $instr = $1;
+ } elsif ($line =~ /INSTR_LOOP[ ]+([0-9]+)/) {
+ $loop_instr = $1;
+ } elsif ($line =~ /UOPS[ ]+([0-9]+)/) {
+ $uops = $1;
+ } elsif ($line =~ /DESC[ ]+([a-zA-Z ,.\-_\(\)\+\*\/=]+)/) {
+ $desc = $1;
+ } elsif ($line =~ /INC[ ]+([0-9]+)/) {
+ $increment = $1;
+ $skip = 1;
+ } elsif ($line =~ /LOOP[ ]+([0-9]+)/) {
+ $increment = $1;
+ $isLoop = 1;
+ } else {
+ if ($isLoop) {
+ if($line =~ /SET[ ]+(STR[0-9]+)[ ]+(GPR[0-9]+)/) {
+ $loop .= "#define $1 $2\n";
+ $loop .= "mov $2, $stream_lookup->{$1}\n";
+ } else {
+ $loop .= $line;
+ }
+ } else {
+ $prolog .= $line;
+ }
+ }
+ }
+ close FILE;
+
+ if (($streams > 5) && ($streams < 10)) {
+ my $arg = 7;
+ foreach my $stream ( 5 .. $streams ) {
+ $prolog .= "mov STR$stream, ARG$arg\n";
+ $arg++;
+ }
+ }
+
+ $streams = 'STREAM_'.$streams;
+ my $Vars;
+ $Vars->{name} = $name;
+ $Vars->{prolog} = $prolog;
+ $Vars->{increment} = $increment;
+ $Vars->{loop} = $loop;
+ $Vars->{skip} = $skip;
+ $Vars->{multi} = $multi;
+ $Vars->{desc} = $desc;
+
+#print Dumper($Vars);
+
+ $tpl->process('bench.tt', $Vars, "$OutputDirectory/$name.pas");
+ push(@Testcases,{name => $name,
+ streams => $streams,
+ type => $type,
+ stride => $increment,
+ flops => $flops,
+ bytes => $bytes,
+ desc => $desc,
+ loads => $loads,
+ stores => $stores,
+ branches => $branches,
+ instr_const => $instr,
+ instr_loop => $loop_instr,
+ uops => $uops});
+ }
+}
+#print Dumper(@Testcases);
+my @TestcasesSorted = sort {$a->{name} cmp $b->{name}} @Testcases;
+
+my $Vars;
+$Vars->{Testcases} = \@TestcasesSorted;
+$Vars->{numKernels} = $#TestcasesSorted+1;
+$Vars->{allTests} = join('\n',map {$_->{name}." - ".$_->{desc}} @TestcasesSorted);
+$tpl->process('testcases.tt', $Vars, "$OutputDirectory/testcases.h");
+
+
diff --git a/perl/isax86.pm b/bench/perl/isax86.pm
similarity index 100%
rename from perl/isax86.pm
rename to bench/perl/isax86.pm
diff --git a/perl/isax86_64.pm b/bench/perl/isax86_64.pm
similarity index 100%
rename from perl/isax86_64.pm
rename to bench/perl/isax86_64.pm
diff --git a/perl/templates/bench.tt b/bench/perl/templates/bench.tt
similarity index 100%
rename from perl/templates/bench.tt
rename to bench/perl/templates/bench.tt
diff --git a/bench/perl/templates/group.tt b/bench/perl/templates/group.tt
new file mode 100644
index 0000000..5676318
--- /dev/null
+++ b/bench/perl/templates/group.tt
@@ -0,0 +1,157 @@
+/* GENERATED FILE: DO NOT EDIT */
+
+#define NUM_GROUPS_[% arch FILTER upper %] [% numGroups %]
+
+static PerfmonGroupMap [% arch %]_group_map[NUM_GROUPS_[% arch FILTER upper %]] = {
+[% FOREACH group IN groups %]
+ {"[% group.name %]",[% group.name %],[% group.isUncore %],"[% group.shortHelp %]","[% group.eventSet %]"},
+[% END %]
+};
+
+/*void
+perfmon_printDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup group)
+{
+ int threadId;
+ double time = rdtscTime;
+ double inverseClock = 1.0 /(double) timer_getCpuClock();
+ PerfmonResultTable tableData;
+ int numRows;
+ int numColumns = perfmon_numThreads;
+ bstring label;
+ bstrList* fc;
+ double** stat;
+ double tmpValue;
+ uint64_t cpi_instr = 0;
+ uint64_t cpi_cyc = 0;
+ int cpi_index = 0;
+
+ switch ( group )
+ {
+[% FOREACH group IN groups %]
+ case [% group.name %]:
+ numRows = [% group.numRows %];
+ stat = (double**) malloc(numRows * sizeof(double*));
+ for (int i=0; i<numRows; i++)
+ {
+ stat[i] = (double*) malloc(4 * sizeof(double));
+ stat[i][0] = 0;
+ stat[i][1] = 0;
+ stat[i][2] = DBL_MAX;
+ }
+ INIT_BASIC;
+[% FOREACH metric IN group.metrics %]
+ bstrListAdd(fc,[% loop.count %],[% metric.label %]);
+[% END %]
+ initResultTable(&tableData, fc, numRows, numColumns);
+
+ for(threadId=0; threadId < perfmon_numThreads; threadId++)
+ {
+[% FOREACH metric IN group.metrics %]
+ tmpValue = [% metric.rule %];
+ if (!isnan(tmpValue))
+ {
+ tableData.rows[[% loop.index %]].value[threadId] = tmpValue;
+ }
+ else
+ {
+ tableData.rows[[% loop.index %]].value[threadId] = 0.0;
+ }
+[% IF metric.label == 'CPI' && arch == 'westmere' %]
+ cpi_instr += perfmon_getResult(threadId,"FIXC0");
+ cpi_cyc += perfmon_getResult(threadId,"FIXC1");
+ cpi_index = [% loop.index %];
+[% ELSE %]
+ stat[[% loop.index %]][0] += (double) tableData.rows[[% loop.index %]].value[threadId];
+[% END %]
+ stat[[% loop.index %]][1] = MAX(stat[[% loop.index %]][1],(double) tableData.rows[[% loop.index %]].value[threadId]);
+ stat[[% loop.index %]][2] = MIN(stat[[% loop.index %]][2],(double) tableData.rows[[% loop.index %]].value[threadId]);
+[% END %]
+ }
+
+ if (cpi_instr)
+ {
+ stat[cpi_index][0] = (double) cpi_cyc / (double) cpi_instr;
+ }
+
+ break;
+[% END %]
+
+ default:
+ fprintf (stderr, "perfmon_printDerivedMetrics[% arch %]: Unknown group! Exiting!\n" );
+ exit (EXIT_FAILURE);
+ break;
+ }
+
+ printResultTable(&tableData);
+ freeResultTable(&tableData);
+
+ // for threaded results print sum, max, min and avg
+ if (perfmon_numThreads > 1)
+ {
+ initStatisticTable(&tableData, fc, numRows);
+ for (int i=0; i<numRows; i++)
+ {
+ stat[i][3] = stat[i][0]/perfmon_numThreads;
+ for (int j=0; j<4; j++)
+ {
+ tableData.rows[i].value[j] = stat[i][j];
+ }
+ }
+ printResultTable(&tableData);
+ freeResultTable(&tableData);
+ }
+
+ for (int i=0; i<numRows; i++)
+ {
+ free(stat[i]);
+ }
+ free(stat);
+ bstrListDestroy(fc);
+}
+
+void
+perfmon_logDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup group, double time,double timeStamp)
+{
+ int threadId;
+ double tmpValue;
+ double inverseClock = 1.0 /(double) timer_getCpuClock();
+
+ switch ( group )
+ {
+ [% FOREACH group IN groups %]
+ case [% group.name %]:
+
+ [% FOREACH metric IN group.metrics %]
+ printf("[% metric.label %] %e ",timeStamp);
+ for(threadId=0; threadId < perfmon_numThreads; threadId++)
+ {
+ tmpValue = [% metric.rule %];
+ if (!isnan(tmpValue))
+ {
+ printf(" %e ", tmpValue);
+ }
+ else
+ {
+ printf(" 0.0 ");
+ }
+ }
+ printf("\n");
+ [% END %]
+ break;
+ [% END %]
+
+ default:
+ fprintf (stderr, "perfmon_printDerivedMetrics[% arch %]: Unknown group! Exiting!\n" );
+ exit (EXIT_FAILURE);
+ break;
+ }
+}*/
+
+
+
+static PerfmonGroupHelp [% arch %]_group_help[NUM_GROUPS_[% arch FILTER upper %]] = {
+[% FOREACH group IN groups %]
+ {"[% group.name %]","[% group.longHelp %]"},
+[% END %]
+};
+
diff --git a/perl/templates/group_types.tt b/bench/perl/templates/group_types.tt
similarity index 100%
rename from perl/templates/group_types.tt
rename to bench/perl/templates/group_types.tt
diff --git a/bench/perl/templates/testcases.tt b/bench/perl/templates/testcases.tt
new file mode 100644
index 0000000..ceaa23b
--- /dev/null
+++ b/bench/perl/templates/testcases.tt
@@ -0,0 +1,19 @@
+#ifndef TESTCASES_H
+#define TESTCASES_H
+
+#include <test_types.h>
+
+[% FOREACH test IN Testcases %]
+extern void [% test.name %]();
+[% END %]
+
+#define TESTS "[% allTests %]"
+#define NUMKERNELS [% numKernels %]
+
+static const TestCase kernels[NUMKERNELS] = {
+ [% FOREACH test IN Testcases %]
+ {"[% test.name %]" , [% test.streams %], [% test.type %], [% test.stride %], &[% test.name %], [% test.flops %], [% test.bytes %], "[% test.desc %]", [% test.loads %], [% test.stores %], [% test.branches %], [% test.instr_const %], [% test.instr_loop %], [% test.uops %]},
+ [% END %]
+};
+
+#endif /* TESTCASES_H */
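
For illustration, a hypothetical testcases.h that this template would expand to for a single kernel; every name and number below is made up and only shows the field order (name, streams, type, stride, function pointer, flops, bytes, description, loads, stores, branches, constant instructions, loop instructions, uops):

    #ifndef TESTCASES_H
    #define TESTCASES_H

    #include <test_types.h>

    extern void copy();

    #define TESTS "copy - Double-precision vector copy, A[i] = B[i]"
    #define NUMKERNELS 1

    static const TestCase kernels[NUMKERNELS] = {
        {"copy", STREAM_2, DOUBLE, 4, &copy, 0, 16, "Double-precision vector copy, A[i] = B[i]", 1, 1, 1, 16, 10, -1},
    };

    #endif /* TESTCASES_H */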
diff --git a/bench/phi/store.ptt b/bench/phi/store.ptt
index 533501c..3aa5bd2 100644
--- a/bench/phi/store.ptt
+++ b/bench/phi/store.ptt
@@ -2,10 +2,10 @@ STREAMS 1
TYPE DOUBLE
FLOPS 0
BYTES 8
-vmovaps zmm0, [SCALAR]
-vmovaps zmm1, [SCALAR]
-vmovaps zmm2, [SCALAR]
-vmovaps zmm3, [SCALAR]
+vmovaps zmm0, [rip+SCALAR]
+vmovaps zmm1, [rip+SCALAR]
+vmovaps zmm2, [rip+SCALAR]
+vmovaps zmm3, [rip+SCALAR]
LOOP 32
vprefetch0 [STR0 + GPR1 * 8 + 1024]
vmovaps [STR0 + GPR1 * 8] , zmm0
diff --git a/bench/phi/store_mem.ptt b/bench/phi/store_mem.ptt
index fa8d262..0aeccd6 100644
--- a/bench/phi/store_mem.ptt
+++ b/bench/phi/store_mem.ptt
@@ -2,10 +2,10 @@ STREAMS 1
TYPE DOUBLE
FLOPS 0
BYTES 8
-vmovaps zmm0, [SCALAR]
-vmovaps zmm1, [SCALAR]
-vmovaps zmm2, [SCALAR]
-vmovaps zmm3, [SCALAR]
+vmovaps zmm0, [rip+SCALAR]
+vmovaps zmm1, [rip+SCALAR]
+vmovaps zmm2, [rip+SCALAR]
+vmovaps zmm3, [rip+SCALAR]
LOOP 32
vprefetch0 [STR0 + GPR1 * 8 + 1024]
vmovnrngoaps [STR0 + GPR1 * 8], zmm0
diff --git a/bench/src/allocator.c b/bench/src/allocator.c
new file mode 100644
index 0000000..ea0be48
--- /dev/null
+++ b/bench/src/allocator.c
@@ -0,0 +1,209 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: allocator.c
+ *
+ * Description: Implementation of allocator module.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+/* ##### HEADER FILE INCLUDES ######################################### */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <allocator_types.h>
+#include <allocator.h>
+#include <likwid.h>
+
+/* ##### EXPORTED VARIABLES ########################################### */
+
+
+/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
+
+
+/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
+
+static int numberOfAllocatedVectors = 0;
+static allocation* allocList;
+static AffinityDomains_t domains = NULL;
+
+/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
+
+
+/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
+
+void
+allocator_init(int numVectors)
+{
+ allocList = (allocation*) malloc(numVectors * sizeof(allocation));
+ domains = get_affinityDomains();
+}
+
+
+void
+allocator_finalize()
+{
+ int i;
+
+ for (i=0; i<numberOfAllocatedVectors; i++)
+ {
+ free(allocList[i].ptr);
+ allocList[i].ptr = NULL;
+ allocList[i].size = 0;
+ allocList[i].offset = 0;
+ }
+ numberOfAllocatedVectors = 0;
+}
+
+void
+allocator_allocateVector(
+ void** ptr,
+ int alignment,
+ uint64_t size,
+ int offset,
+ DataType type,
+ bstring domainString)
+{
+ int i;
+ size_t bytesize = 0;
+ const AffinityDomain* domain = NULL;
+ int errorCode;
+ int elements = 0;
+
+ switch ( type )
+ {
+ case INT:
+ bytesize = (size+offset) * sizeof(int);
+ elements = alignment / sizeof(int);
+ break;
+
+ case SINGLE:
+ bytesize = (size+offset) * sizeof(float);
+ elements = alignment / sizeof(float);
+ break;
+
+ case DOUBLE:
+ bytesize = (size+offset) * sizeof(double);
+ elements = alignment / sizeof(double);
+ break;
+ }
+
+ for (i=0;i<domains->numberOfAffinityDomains;i++)
+ {
+ if (biseq(domainString, domains->domains[i].tag))
+ {
+ domain = domains->domains + i;
+ }
+ }
+ if (!domain)
+ {
+ fprintf(stderr, "Error: Cannot use desired domain %s for vector placement, Domain %s does not exist.\n",
+ bdata(domainString), bdata(domainString));
+ exit(EXIT_FAILURE);
+ }
+
+ errorCode = posix_memalign(ptr, alignment, bytesize);
+
+ if (errorCode)
+ {
+ if (errorCode == EINVAL)
+ {
+ fprintf(stderr,
+ "Error: Alignment parameter is not a power of two\n");
+ exit(EXIT_FAILURE);
+ }
+ if (errorCode == ENOMEM)
+ {
+ fprintf(stderr,
+ "Error: Insufficient memory to fulfill the request\n");
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ if ((*ptr) == NULL)
+ {
+ fprintf(stderr, "Error: posix_memalign failed!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ allocList[numberOfAllocatedVectors].ptr = *ptr;
+ allocList[numberOfAllocatedVectors].size = bytesize;
+ allocList[numberOfAllocatedVectors].offset = offset;
+ allocList[numberOfAllocatedVectors].type = type;
+ numberOfAllocatedVectors++;
+
+ affinity_pinProcess(domain->processorList[0]);
+ printf("Allocate: Process running on core %d (Domain %s) - Vector length %llu Offset %d Alignment %llu\n",
+ affinity_processGetProcessorId(),
+ bdata(domain->tag),
+ LLU_CAST bytesize,
+ offset,
+ LLU_CAST elements);
+
+ switch ( type )
+ {
+ case INT:
+ {
+ int* sptr = (int*) (*ptr);
+ sptr += offset;
+
+ for ( uint64_t i=0; i < size; i++ )
+ {
+ sptr[i] = 1;
+ }
+ *ptr = (void*) sptr;
+
+ }
+ break;
+
+ case SINGLE:
+ {
+ float* sptr = (float*) (*ptr);
+ sptr += offset;
+
+ for ( uint64_t i=0; i < size; i++ )
+ {
+ sptr[i] = 1.0;
+ }
+ *ptr = (void*) sptr;
+
+ }
+ break;
+
+ case DOUBLE:
+ {
+ double* dptr = (double*) (*ptr);
+ dptr += offset;
+
+ for ( uint64_t i=0; i < size; i++ )
+ {
+ dptr[i] = 1.0;
+ }
+ *ptr = (void*) dptr;
+ }
+ break;
+ }
+}
+
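
A minimal sketch of how this module is driven, mirroring bench/likwid-bench.c earlier in this patch; the alignment, vector length and domain tag are placeholder values, and topology_init(), numa_init() and affinity_init() must already have been called:

    #include <allocator.h>
    #include <test_types.h>
    #include <bstrlib.h>

    static void allocate_one_stream(void)
    {
        void* ptr = NULL;
        bstring domain = bfromcstr("S0");       /* affinity domain tag, e.g. socket 0 */

        allocator_init(1);                      /* room for one vector in the allocation list */
        allocator_allocateVector(&ptr,
                                 4096,          /* alignment in bytes (PAGE_ALIGNMENT in likwid-bench) */
                                 1000000,       /* vector length in elements */
                                 0,             /* offset in elements */
                                 DOUBLE,        /* data type of the stream */
                                 domain);       /* place and initialize the vector in this domain */

        /* ... run benchmark kernels on ptr ... */

        allocator_finalize();                   /* free all registered vectors */
        bdestroy(domain);
    }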
diff --git a/bench/src/barrier.c b/bench/src/barrier.c
new file mode 100644
index 0000000..4b0e344
--- /dev/null
+++ b/bench/src/barrier.c
@@ -0,0 +1,167 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: barrier.c
+ *
+ * Description: Implementation of threaded spin loop barrier
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+/* ##### HEADER FILE INCLUDES ######################################### */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <errno.h>
+#include <barrier.h>
+
+/* ##### EXPORTED VARIABLES ########################################### */
+
+
+/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
+
+#define CACHELINE_SIZE 64
+
+/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
+
+static BarrierGroup* groups;
+static int currentGroupId = 0;
+static int maxGroupId = 0;
+
+/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
+
+
+/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
+
+int
+barrier_registerGroup(int numThreads)
+{
+ int ret;
+
+ if (currentGroupId > maxGroupId)
+ {
+ fprintf(stderr, "ERROR: Group ID %d larger than maxGroupID %d\n",currentGroupId,maxGroupId);
+ }
+
+ groups[currentGroupId].numberOfThreads = numThreads;
+ ret = posix_memalign(
+ (void**) &groups[currentGroupId].groupBval,
+ CACHELINE_SIZE,
+ numThreads * 32 * sizeof(int));
+
+ if (ret < 0)
+ {
+ fprintf(stderr, "ERROR: Cannot register thread group - %s\n", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+
+ return currentGroupId++;
+}
+
+void
+barrier_registerThread(BarrierData* barr, int groupId, int threadId)
+{
+ int ret;
+ int i;
+ int j = 1;
+ if (groupId > currentGroupId)
+ {
+ fprintf(stderr, "ERROR: Group not yet registered");
+ }
+ if (threadId > groups[groupId].numberOfThreads)
+ {
+ fprintf(stderr, "ERROR: Thread ID %d too large\n",threadId);
+ }
+
+ barr->numberOfThreads = groups[groupId].numberOfThreads;
+ barr->offset = 0;
+ barr->val = 1;
+ barr->bval = groups[groupId].groupBval;
+ ret = posix_memalign(
+ (void**) &(barr->index),
+ CACHELINE_SIZE,
+ barr->numberOfThreads * sizeof(int));
+
+ if (ret < 0)
+ {
+ fprintf(stderr, "ERROR: Cannot register thread - %s\n", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+
+ barr->index[0] = threadId;
+
+ for (i = 0; i < barr->numberOfThreads; i++)
+ {
+ if (!(i == threadId))
+ {
+ barr->index[j++] = i;
+ }
+ }
+}
+
+
+void
+barrier_init(int numberOfGroups)
+{
+ maxGroupId = numberOfGroups-1;
+ groups = (BarrierGroup*) malloc(numberOfGroups * sizeof(BarrierGroup));
+ if (!groups)
+ {
+ fprintf(stderr, "ERROR: Cannot allocate barrier - %s\n", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+}
+
+void
+barrier_synchronize(BarrierData* barr)
+{
+ int i;
+
+ barr->bval[barr->index[0] * 32 + barr->offset * 16] = barr->val;
+
+ for (i = 1; i < barr->numberOfThreads; i++)
+ {
+ while (barr->bval[barr->index[i] * 32 + barr->offset * 16] != barr->val)
+ {
+ __asm__ ("pause");
+ }
+ }
+
+ if (barr->offset)
+ {
+ barr->val = !barr->val;
+ }
+ barr->offset = !barr->offset;
+}
+
+void barrier_destroy(BarrierData* barr)
+{
+ if (currentGroupId > maxGroupId)
+ {
+ fprintf(stderr, "ERROR: Group ID %d larger than maxGroupID %d\n",currentGroupId,maxGroupId);
+ }
+ free(barr->index);
+ free(groups[currentGroupId].groupBval);
+}
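
A minimal sketch of the intended use of this spin barrier, following the call sequence in bench/likwid-bench.c and bench/src/bench.c; the thread counts and IDs are placeholders:

    #include <barrier.h>

    /* once, in the master thread, before the workers start (cf. likwid-bench.c) */
    static void barrier_setup(int numThreads)
    {
        barrier_init(1);                    /* one global barrier group */
        barrier_registerGroup(numThreads);  /* all threads belong to group 0 */
    }

    /* in every worker thread (cf. runTest() in bench.c) */
    static void worker(int globalThreadId)
    {
        BarrierData barr;
        barrier_registerThread(&barr, 0, globalThreadId);

        /* ... set up thread-local data ... */
        barrier_synchronize(&barr);         /* spin until all threads of the group arrive */
        /* ... timed benchmark region ... */
        barrier_synchronize(&barr);
    }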
diff --git a/bench/src/bench.c b/bench/src/bench.c
new file mode 100644
index 0000000..e1e1a97
--- /dev/null
+++ b/bench/src/bench.c
@@ -0,0 +1,770 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: bench.c
+ *
+ * Description: Benchmarking framework for likwid-bench
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+/* ##### HEADER FILE INCLUDES ######################################### */
+
+#include <pthread.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include <sched.h>
+#include <unistd.h>
+
+#include <allocator.h>
+#include <threads.h>
+#include <barrier.h>
+#include <likwid.h>
+
+/* ##### EXPORTED VARIABLES ########################################### */
+
+
+/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
+
+#define BARRIER barrier_synchronize(&barr)
+
+
+#define EXECUTE(func) \
+ BARRIER; \
+ LIKWID_MARKER_START("bench"); \
+ timer_start(&time); \
+ for (i=0; i<myData->iter; i++) \
+ { \
+ func; \
+ } \
+ BARRIER; \
+ timer_stop(&time); \
+ LIKWID_MARKER_STOP("bench"); \
+ data->cycles = timer_printCycles(&time); \
+ BARRIER
+
+
+
+
+/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
+
+void* runTest(void* arg)
+{
+ int threadId;
+ int offset;
+ size_t size;
+ size_t vecsize;
+ size_t i;
+ BarrierData barr;
+ ThreadData* data;
+ ThreadUserData* myData;
+ TimerData time;
+ FuncPrototype func;
+
+ data = (ThreadData*) arg;
+ myData = &(data->data);
+ func = myData->test->kernel;
+ threadId = data->threadId;
+ barrier_registerThread(&barr, 0, data->globalThreadId);
+
+ /* Prepare ptrs for thread */
+ vecsize = myData->size;
+ size = myData->size / data->numberOfThreads;
+ myData->size = size;
+ size -= (size % myData->test->stride);
+ offset = data->threadId * size;
+
+
+ switch ( myData->test->type )
+ {
+ case SINGLE:
+ {
+ float* sptr;
+ for (i=0; i < myData->test->streams; i++)
+ {
+ sptr = (float*) myData->streams[i];
+ sptr += offset;
+ myData->streams[i] = (float*) sptr;
+ }
+ }
+ break;
+ case INT:
+ {
+ int* sptr;
+ for (i=0; i < myData->test->streams; i++)
+ {
+ sptr = (int*) myData->streams[i];
+ sptr += offset;
+ myData->streams[i] = (int*) sptr;
+ }
+ }
+ break;
+ case DOUBLE:
+ {
+ double* dptr;
+ for (i=0; i < myData->test->streams; i++)
+ {
+ dptr = (double*) myData->streams[i];
+ dptr += offset;
+ myData->streams[i] = (double*) dptr;
+ }
+ }
+ break;
+ }
+
+
+ /* pin the thread */
+ likwid_pinThread(myData->processors[threadId]);
+ printf("Group: %d Thread %d Global Thread %d running on core %d - Vector length %llu Offset %d\n",
+ data->groupId,
+ threadId,
+ data->globalThreadId,
+ affinity_threadGetProcessorId(),
+ LLU_CAST vecsize,
+ offset);
+ BARRIER;
+
+    /* Up to 10 streams the following registers are used for the array pointers:
+     * size: rdi
+     * in registers: rsi rdx rcx r8 r9
+     * passed on the stack, then: r10 r11 r12 r13 r14 r15
+     * If more than 10 streams are used, the first 5 streams are kept in registers;
+     * above 5 a macro must be used to load them from the stack.
+     * */
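+    /* Illustrative sketch (assuming the x86-64 System V calling convention
+     * matches the register list above): for a three-stream kernel the call
+     *     EXECUTE(func(size, myData->streams[0], myData->streams[1], myData->streams[2]))
+     * passes size in rdi and the three stream pointers in rsi, rdx and rcx. */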
+
+ switch ( myData->test->streams ) {
+ case STREAM_1:
+ EXECUTE(func(size,myData->streams[0]));
+ break;
+ case STREAM_2:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1]));
+ break;
+ case STREAM_3:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2]));
+ break;
+ case STREAM_4:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3]));
+ break;
+ case STREAM_5:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4]));
+ break;
+ case STREAM_6:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5]));
+ break;
+ case STREAM_7:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6]));
+ break;
+ case STREAM_8:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7]));
+ break;
+ case STREAM_9:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8]));
+ break;
+ case STREAM_10:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9]));
+ break;
+ case STREAM_11:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10]));
+ break;
+ case STREAM_12:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11]));
+ break;
+ case STREAM_13:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12]));
+ break;
+ case STREAM_14:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13]));
+ break;
+ case STREAM_15:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14]));
+ break;
+ case STREAM_16:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15]));
+ break;
+ case STREAM_17:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16]));
+ break;
+ case STREAM_18:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17]));
+ break;
+ case STREAM_19:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18]));
+ break;
+ case STREAM_20:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19]));
+ break;
+ case STREAM_21:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20]));
+ break;
+ case STREAM_22:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21]));
+ break;
+ case STREAM_23:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22]));
+ break;
+ case STREAM_24:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23]));
+ break;
+ case STREAM_25:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24]));
+ break;
+ case STREAM_26:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25]));
+ break;
+ case STREAM_27:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26]));
+ break;
+ case STREAM_28:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27]));
+ break;
+ case STREAM_29:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28]));
+ break;
+ case STREAM_30:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29]));
+ break;
+ case STREAM_31:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29],myData->streams[30]));
+ break;
+ case STREAM_32:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31]));
+ break;
+ case STREAM_33:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+ myData->streams[32]));
+ break;
+ case STREAM_34:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+ myData->streams[32],myData->streams[33]));
+ break;
+ case STREAM_35:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+ myData->streams[32],myData->streams[33],myData->streams[34]));
+ break;
+ case STREAM_36:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+ myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35]));
+ break;
+ case STREAM_37:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+ myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
+ myData->streams[36]));
+ break;
+ case STREAM_38:
+ EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+ myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
+ myData->streams[36],myData->streams[37]));
+ break;
+ default:
+ break;
+ }
+ free(barr.index);
+ pthread_exit(NULL);
+}
+
+
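+/* Descriptive note: MEASURE(func) determines the iteration count
+ * automatically. It starts with 8 iterations, times the kernel, and keeps
+ * doubling the count until the measured runtime reaches
+ * data->data.min_runtime. */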
+#define MEASURE(func) \
+ iterations = 8; \
+ while (1) \
+ { \
+ timer_start(&time); \
+ for (i=0;i<iterations;i++) \
+ { \
+ func; \
+ } \
+ timer_stop(&time); \
+ if (timer_print(&time) < (double)data->data.min_runtime) \
+ iterations = iterations << 1; \
+ else \
+ break; \
+ } \
+
+
+void* getIterSingle(void* arg)
+{
+ int threadId = 0;
+ int offset = 0;
+ size_t size = 0;
+ size_t i;
+ ThreadData* data;
+ ThreadUserData* myData;
+ TimerData time;
+ FuncPrototype func;
+ size_t iterations = 0;
+
+ data = (ThreadData*) arg;
+ myData = &(data->data);
+ func = myData->test->kernel;
+ threadId = data->threadId;
+
+ size = myData->size - (myData->size % myData->test->stride);
+ likwid_pinThread(myData->processors[threadId]);
+
+#ifdef DEBUG_LIKWID
+ printf("Automatic iteration count detection:");
+#endif
+
+ switch ( myData->test->streams ) {
+ case STREAM_1:
+ MEASURE(func(size,myData->streams[0]));
+ break;
+ case STREAM_2:
+ MEASURE(func(size,myData->streams[0],myData->streams[1]));
+ break;
+ case STREAM_3:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2]));
+ break;
+ case STREAM_4:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3]));
+ break;
+ case STREAM_5:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4]));
+ break;
+ case STREAM_6:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5]));
+ break;
+ case STREAM_7:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6]));
+ break;
+ case STREAM_8:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7]));
+ break;
+ case STREAM_9:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8]));
+ break;
+ case STREAM_10:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9]));
+ break;
+ case STREAM_11:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10]));
+ break;
+ case STREAM_12:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11]));
+ break;
+ case STREAM_13:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12]));
+ break;
+ case STREAM_14:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13]));
+ break;
+ case STREAM_15:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14]));
+ break;
+ case STREAM_16:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15]));
+ break;
+ case STREAM_17:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16]));
+ break;
+ case STREAM_18:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17]));
+ break;
+ case STREAM_19:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18]));
+ break;
+ case STREAM_20:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19]));
+ break;
+ case STREAM_21:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20]));
+ break;
+ case STREAM_22:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21]));
+ break;
+ case STREAM_23:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22]));
+ break;
+ case STREAM_24:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23]));
+ break;
+ case STREAM_25:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24]));
+ break;
+ case STREAM_26:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25]));
+ break;
+ case STREAM_27:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26]));
+ break;
+ case STREAM_28:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27]));
+ break;
+ case STREAM_29:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28]));
+ break;
+ case STREAM_30:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29]));
+ break;
+ case STREAM_31:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29],myData->streams[30]));
+ break;
+ case STREAM_32:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31]));
+ break;
+ case STREAM_33:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+ myData->streams[32]));
+ break;
+ case STREAM_34:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+ myData->streams[32],myData->streams[33]));
+ break;
+ case STREAM_35:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+ myData->streams[32],myData->streams[33],myData->streams[34]));
+ break;
+ case STREAM_36:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+ myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35]));
+ break;
+ case STREAM_37:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+ myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
+ myData->streams[36]));
+ break;
+ case STREAM_38:
+ MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+ myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+ myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+ myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+ myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+ myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+ myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+ myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+ myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
+ myData->streams[36],myData->streams[37]));
+ break;
+ default:
+ break;
+ }
+ data->data.iter = iterations;
+#ifdef DEBUG_LIKWID
+ printf(" %d iterations per thread\n", iterations);
+ if (iterations < MIN_ITERATIONS)
+ printf("Sanitizing iterations count per thread to %d\n",MIN_ITERATIONS);
+#endif
+ return NULL;
+}
diff --git a/bench/src/bstrlib.c b/bench/src/bstrlib.c
new file mode 100644
index 0000000..380269c
--- /dev/null
+++ b/bench/src/bstrlib.c
@@ -0,0 +1,2955 @@
+/*
+ * =======================================================================================
+ * This source file is part of the bstring string library. This code was
+ * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source
+ * license and the GPL. Refer to the accompanying documentation for details
+ * on usage and license.
+ */
+/*
+ * bstrlib.c
+ *
+ * This file is the core module for implementing the bstring functions.
+ */
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <limits.h>  /* for UINT_MAX, INT_MAX, SHRT_MIN and UCHAR_MAX used below */
+#include "bstrlib.h"
+
+/* Optionally include a mechanism for debugging memory */
+
+#if defined(MEMORY_DEBUG) || defined(BSTRLIB_MEMORY_DEBUG)
+#include "memdbg.h"
+#endif
+
+#ifndef bstr__alloc
+#define bstr__alloc(x) malloc (x)
+#endif
+
+#ifndef bstr__free
+#define bstr__free(p) free (p)
+#endif
+
+#ifndef bstr__realloc
+#define bstr__realloc(p,x) realloc ((p), (x))
+#endif
+
+#ifndef bstr__memcpy
+#define bstr__memcpy(d,s,l) memcpy ((d), (s), (l))
+#endif
+
+#ifndef bstr__memmove
+#define bstr__memmove(d,s,l) memmove ((d), (s), (l))
+#endif
+
+#ifndef bstr__memset
+#define bstr__memset(d,c,l) memset ((d), (c), (l))
+#endif
+
+#ifndef bstr__memcmp
+#define bstr__memcmp(d,c,l) memcmp ((d), (c), (l))
+#endif
+
+#ifndef bstr__memchr
+#define bstr__memchr(s,c,l) memchr ((s), (c), (l))
+#endif
+
+/* Just a length safe wrapper for memmove. */
+
+#define bBlockCopy(D,S,L) { if ((L) > 0) bstr__memmove ((D),(S),(L)); }
+
+/* Compute the snapped size for a given requested size. By snapping to powers
+ of 2 like this, repeated reallocations are avoided. */
+static int snapUpSize (int i) {
+ if (i < 8) {
+ i = 8;
+ } else {
+ unsigned int j;
+ j = (unsigned int) i;
+
+ j |= (j >> 1);
+ j |= (j >> 2);
+ j |= (j >> 4);
+ j |= (j >> 8); /* Ok, since int >= 16 bits */
+#if (UINT_MAX != 0xffff)
+ j |= (j >> 16); /* For 32 bit int systems */
+#if (UINT_MAX > 0xffffffffUL)
+ j |= (j >> 32); /* For 64 bit int systems */
+#endif
+#endif
+ /* Least power of two greater than i */
+ j++;
+ if ((int) j >= i) i = (int) j;
+ }
+ return i;
+}
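+/* Illustrative values (a sketch added for clarity, not part of the original
+   bstrlib sources): snapUpSize (5) == 8, snapUpSize (100) == 128 and
+   snapUpSize (128) == 256, i.e. requests below 8 are rounded up to 8 and all
+   larger requests to the least power of two strictly greater than the
+   request. */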
+
+/* int balloc (bstring b, int len)
+ *
+ * Increase the size of the memory backing the bstring b to at least len.
+ */
+int balloc (bstring b, int olen) {
+ int len;
+ if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen <= 0 ||
+ b->mlen < b->slen || olen <= 0) {
+ return BSTR_ERR;
+ }
+
+ if (olen >= b->mlen) {
+ unsigned char * x;
+
+ if ((len = snapUpSize (olen)) <= b->mlen) return BSTR_OK;
+
+ /* Assume probability of a non-moving realloc is 0.125 */
+ if (7 * b->mlen < 8 * b->slen) {
+
+ /* If slen is close to mlen in size then use realloc to reduce
+ the memory defragmentation */
+
+ reallocStrategy:;
+
+ x = (unsigned char *) bstr__realloc (b->data, (size_t) len);
+ if (x == NULL) {
+
+            /* Since we failed, try allocating the tightest possible
+ allocation */
+
+ if (NULL == (x = (unsigned char *) bstr__realloc (b->data, (size_t) (len = olen)))) {
+ return BSTR_ERR;
+ }
+ }
+ } else {
+
+ /* If slen is not close to mlen then avoid the penalty of copying
+ the extra bytes that are allocated, but not considered part of
+ the string */
+
+ if (NULL == (x = (unsigned char *) bstr__alloc ((size_t) len))) {
+
+ /* Perhaps there is no available memory for the two
+ allocations to be in memory at once */
+
+ goto reallocStrategy;
+
+ } else {
+ if (b->slen) bstr__memcpy ((char *) x, (char *) b->data, (size_t) b->slen);
+ bstr__free (b->data);
+ }
+ }
+ b->data = x;
+ b->mlen = len;
+ b->data[b->slen] = (unsigned char) '\0';
+ }
+
+ return BSTR_OK;
+}
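+/* Illustrative usage sketch (added for clarity, not part of the original
+   bstrlib sources): grow the backing buffer ahead of time without touching
+   the string contents.
+
+       bstring b = bfromcstr ("likwid");     // slen == 6
+       if (balloc (b, 64) == BSTR_OK) {
+           // b->mlen is now at least 64 while b->slen is still 6
+       }
+       bdestroy (b);
+ */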
+
+/* int ballocmin (bstring b, int len)
+ *
+ * Set the size of the memory backing the bstring b to len or b->slen+1,
+ * whichever is larger. Note that repeated use of this function can degrade
+ * performance.
+ */
+int ballocmin (bstring b, int len) {
+ unsigned char * s;
+
+ if (b == NULL || b->data == NULL || (b->slen+1) < 0 || b->mlen <= 0 ||
+ b->mlen < b->slen || len <= 0) {
+ return BSTR_ERR;
+ }
+
+ if (len < b->slen + 1) len = b->slen + 1;
+
+ if (len != b->mlen) {
+ s = (unsigned char *) bstr__realloc (b->data, (size_t) len);
+ if (NULL == s) return BSTR_ERR;
+ s[b->slen] = (unsigned char) '\0';
+ b->data = s;
+ b->mlen = len;
+ }
+
+ return BSTR_OK;
+}
+
+/* bstring bfromcstr (const char * str)
+ *
+ * Create a bstring which contains the contents of the '\0' terminated char *
+ * buffer str.
+ */
+bstring bfromcstr (const char * str) {
+bstring b;
+int i;
+size_t j;
+
+ if (str == NULL) return NULL;
+ j = (strlen) (str);
+ i = snapUpSize ((int) (j + (2 - (j != 0))));
+ if (i <= (int) j) return NULL;
+
+ b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+ if (NULL == b) return NULL;
+ b->slen = (int) j;
+ if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
+ bstr__free (b);
+ return NULL;
+ }
+
+ bstr__memcpy (b->data, str, j+1);
+ return b;
+}
+
+/* bstring bfromcstralloc (int mlen, const char * str)
+ *
+ * Create a bstring which contains the contents of the '\0' terminated char *
+ * buffer str. The memory buffer backing the string is at least len
+ * characters in length.
+ */
+bstring bfromcstralloc (int mlen, const char * str) {
+bstring b;
+int i;
+size_t j;
+
+ if (str == NULL) return NULL;
+ j = (strlen) (str);
+ i = snapUpSize ((int) (j + (2 - (j != 0))));
+ if (i <= (int) j) return NULL;
+
+ b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+ if (b == NULL) return NULL;
+ b->slen = (int) j;
+ if (i < mlen) i = mlen;
+
+ if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
+ bstr__free (b);
+ return NULL;
+ }
+
+ bstr__memcpy (b->data, str, j+1);
+ return b;
+}
+
+/* bstring blk2bstr (const void * blk, int len)
+ *
+ * Create a bstring which contains the content of the block blk of length
+ * len.
+ */
+bstring blk2bstr (const void * blk, int len) {
+bstring b;
+int i;
+
+ if (blk == NULL || len < 0) return NULL;
+ b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+ if (b == NULL) return NULL;
+ b->slen = len;
+
+ i = len + (2 - (len != 0));
+ i = snapUpSize (i);
+
+ b->mlen = i;
+
+ b->data = (unsigned char *) bstr__alloc ((size_t) b->mlen);
+ if (b->data == NULL) {
+ bstr__free (b);
+ return NULL;
+ }
+
+ if (len > 0) bstr__memcpy (b->data, blk, (size_t) len);
+ b->data[len] = (unsigned char) '\0';
+
+ return b;
+}
+
+/* char * bstr2cstr (const_bstring s, char z)
+ *
+ * Create a '\0' terminated char * buffer which is equal to the contents of
+ * the bstring s, except that any contained '\0' characters are converted
+ * to the character in z. This returned value should be freed with a
+ * bcstrfree () call, by the calling application.
+ */
+char * bstr2cstr (const_bstring b, char z) {
+int i, l;
+char * r;
+
+ if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+ l = b->slen;
+ r = (char *) bstr__alloc ((size_t) (l + 1));
+ if (r == NULL) return r;
+
+ for (i=0; i < l; i ++) {
+ r[i] = (char) ((b->data[i] == '\0') ? z : (char) (b->data[i]));
+ }
+
+ r[l] = (unsigned char) '\0';
+
+ return r;
+}
+
+/* int bcstrfree (char * s)
+ *
+ * Frees a C-string generated by bstr2cstr (). This is normally unnecessary
+ * since it just wraps a call to bstr__free (); however, if bstr__alloc ()
+ * and bstr__free () have been redefined as macros within the bstrlib
+ * module (via defining them in memdbg.h after defining
+ * BSTRLIB_MEMORY_DEBUG) with some difference in behaviour from the std
+ * library functions, then this allows a correct way of freeing the memory
+ * that allows higher level code to be independent from these macro
+ * redefinitions.
+ */
+int bcstrfree (char * s) {
+ if (s) {
+ bstr__free (s);
+ return BSTR_OK;
+ }
+ return BSTR_ERR;
+}
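+/* Illustrative pairing (added for clarity, not part of the original bstrlib
+   sources):
+
+       char * c = bstr2cstr (b, '_');   // embedded '\0' bytes become '_'
+       if (c != NULL) {
+           // ... use c like an ordinary C string ...
+           bcstrfree (c);               // release it through the bstrlib allocator
+       }
+ */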
+
+/* int bconcat (bstring b0, const_bstring b1)
+ *
+ * Concatenate the bstring b1 to the bstring b0.
+ */
+int bconcat (bstring b0, const_bstring b1) {
+int len, d;
+bstring aux = (bstring) b1;
+
+ if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL) return BSTR_ERR;
+
+ d = b0->slen;
+ len = b1->slen;
+ if ((d | (b0->mlen - d) | len | (d + len)) < 0) return BSTR_ERR;
+
+ if (b0->mlen <= d + len + 1) {
+ ptrdiff_t pd = b1->data - b0->data;
+ if (0 <= pd && pd < b0->mlen) {
+ if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
+ }
+ if (balloc (b0, d + len + 1) != BSTR_OK) {
+ if (aux != b1) bdestroy (aux);
+ return BSTR_ERR;
+ }
+ }
+
+ bBlockCopy (&b0->data[d], &aux->data[0], (size_t) len);
+ b0->data[d + len] = (unsigned char) '\0';
+ b0->slen = d + len;
+ if (aux != b1) bdestroy (aux);
+ return BSTR_OK;
+}
+
+/* int bconchar (bstring b, char c)
+ *
+ * Concatenate the single character c to the bstring b.
+ */
+int bconchar (bstring b, char c) {
+int d;
+
+ if (b == NULL) return BSTR_ERR;
+ d = b->slen;
+ if ((d | (b->mlen - d)) < 0 || balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
+ b->data[d] = (unsigned char) c;
+ b->data[d + 1] = (unsigned char) '\0';
+ b->slen++;
+ return BSTR_OK;
+}
+
+/* int bcatcstr (bstring b, const char * s)
+ *
+ * Concatenate a char * string to a bstring.
+ */
+int bcatcstr (bstring b, const char * s) {
+char * d;
+int i, l;
+
+ if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
+ || b->mlen <= 0 || s == NULL) return BSTR_ERR;
+
+ /* Optimistically concatenate directly */
+ l = b->mlen - b->slen;
+ d = (char *) &b->data[b->slen];
+ for (i=0; i < l; i++) {
+ if ((*d++ = *s++) == '\0') {
+ b->slen += i;
+ return BSTR_OK;
+ }
+ }
+ b->slen += i;
+
+    /* Need to explicitly resize and concatenate tail */
+ return bcatblk (b, (const void *) s, (int) strlen (s));
+}
+
+/* int bcatblk (bstring b, const void * s, int len)
+ *
+ * Concatenate a fixed length buffer to a bstring.
+ */
+int bcatblk (bstring b, const void * s, int len) {
+int nl;
+
+ if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
+ || b->mlen <= 0 || s == NULL || len < 0) return BSTR_ERR;
+
+ if (0 > (nl = b->slen + len)) return BSTR_ERR; /* Overflow? */
+ if (b->mlen <= nl && 0 > balloc (b, nl + 1)) return BSTR_ERR;
+
+ bBlockCopy (&b->data[b->slen], s, (size_t) len);
+ b->slen = nl;
+ b->data[nl] = (unsigned char) '\0';
+ return BSTR_OK;
+}
+
+/* bstring bstrcpy (const_bstring b)
+ *
+ * Create a copy of the bstring b.
+ */
+bstring bstrcpy (const_bstring b) {
+bstring b0;
+int i,j;
+
+ /* Attempted to copy an invalid string? */
+ if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+
+ b0 = (bstring) bstr__alloc (sizeof (struct tagbstring));
+ if (b0 == NULL) {
+ /* Unable to allocate memory for string header */
+ return NULL;
+ }
+
+ i = b->slen;
+ j = snapUpSize (i + 1);
+
+ b0->data = (unsigned char *) bstr__alloc (j);
+ if (b0->data == NULL) {
+ j = i + 1;
+ b0->data = (unsigned char *) bstr__alloc (j);
+ if (b0->data == NULL) {
+ /* Unable to allocate memory for string data */
+ bstr__free (b0);
+ return NULL;
+ }
+ }
+
+ b0->mlen = j;
+ b0->slen = i;
+
+ if (i) bstr__memcpy ((char *) b0->data, (char *) b->data, i);
+ b0->data[b0->slen] = (unsigned char) '\0';
+
+ return b0;
+}
+
+/* int bassign (bstring a, const_bstring b)
+ *
+ * Overwrite the string a with the contents of string b.
+ */
+int bassign (bstring a, const_bstring b) {
+ if (b == NULL || b->data == NULL || b->slen < 0)
+ return BSTR_ERR;
+ if (b->slen != 0) {
+ if (balloc (a, b->slen) != BSTR_OK) return BSTR_ERR;
+ bstr__memmove (a->data, b->data, b->slen);
+ } else {
+ if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+ a->slen < 0 || a->mlen == 0)
+ return BSTR_ERR;
+ }
+ a->data[b->slen] = (unsigned char) '\0';
+ a->slen = b->slen;
+ return BSTR_OK;
+}
+
+/* int bassignmidstr (bstring a, const_bstring b, int left, int len)
+ *
+ * Overwrite the string a with the middle of contents of string b
+ * starting from position left and running for a length len. left and
+ * len are clamped to the ends of b as with the function bmidstr.
+ */
+int bassignmidstr (bstring a, const_bstring b, int left, int len) {
+ if (b == NULL || b->data == NULL || b->slen < 0)
+ return BSTR_ERR;
+
+ if (left < 0) {
+ len += left;
+ left = 0;
+ }
+
+ if (len > b->slen - left) len = b->slen - left;
+
+ if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+ a->slen < 0 || a->mlen == 0)
+ return BSTR_ERR;
+
+ if (len > 0) {
+ if (balloc (a, len) != BSTR_OK) return BSTR_ERR;
+ bstr__memmove (a->data, b->data + left, len);
+ a->slen = len;
+ } else {
+ a->slen = 0;
+ }
+ a->data[a->slen] = (unsigned char) '\0';
+ return BSTR_OK;
+}
+
+/* int bassigncstr (bstring a, const char * str)
+ *
+ * Overwrite the string a with the contents of char * string str. Note that
+ * the bstring a must be a well defined and writable bstring. If an error
+ * occurs, BSTR_ERR is returned; however, a may be partially overwritten.
+ */
+int bassigncstr (bstring a, const char * str) {
+int i;
+size_t len;
+ if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+ a->slen < 0 || a->mlen == 0 || NULL == str)
+ return BSTR_ERR;
+
+ for (i=0; i < a->mlen; i++) {
+ if ('\0' == (a->data[i] = str[i])) {
+ a->slen = i;
+ return BSTR_OK;
+ }
+ }
+
+ a->slen = i;
+ len = strlen (str + i);
+ if (len > INT_MAX || i + len + 1 > INT_MAX ||
+ 0 > balloc (a, (int) (i + len + 1))) return BSTR_ERR;
+ bBlockCopy (a->data + i, str + i, (size_t) len + 1);
+ a->slen += (int) len;
+ return BSTR_OK;
+}
+
+/* int bassignblk (bstring a, const void * s, int len)
+ *
+ * Overwrite the string a with the contents of the block (s, len). Note that
+ * the bstring a must be a well defined and writable bstring. If an error
+ * occurs BSTR_ERR is returned and a is not overwritten.
+ */
+int bassignblk (bstring a, const void * s, int len) {
+ if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+ a->slen < 0 || a->mlen == 0 || NULL == s || len + 1 < 1)
+ return BSTR_ERR;
+ if (len + 1 > a->mlen && 0 > balloc (a, len + 1)) return BSTR_ERR;
+ bBlockCopy (a->data, s, (size_t) len);
+ a->data[len] = (unsigned char) '\0';
+ a->slen = len;
+ return BSTR_OK;
+}
+
+/* int btrunc (bstring b, int n)
+ *
+ * Truncate the bstring to at most n characters.
+ */
+int btrunc (bstring b, int n) {
+ if (n < 0 || b == NULL || b->data == NULL || b->mlen < b->slen ||
+ b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+ if (b->slen > n) {
+ b->slen = n;
+ b->data[n] = (unsigned char) '\0';
+ }
+ return BSTR_OK;
+}
+
+#define upcase(c) (toupper ((unsigned char) c))
+#define downcase(c) (tolower ((unsigned char) c))
+#define wspace(c) (isspace ((unsigned char) c))
+
+/* int btoupper (bstring b)
+ *
+ * Convert contents of bstring to upper case.
+ */
+int btoupper (bstring b) {
+int i, len;
+ if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+ b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+ for (i=0, len = b->slen; i < len; i++) {
+ b->data[i] = (unsigned char) upcase (b->data[i]);
+ }
+ return BSTR_OK;
+}
+
+/* int btolower (bstring b)
+ *
+ * Convert contents of bstring to lower case.
+ */
+int btolower (bstring b) {
+int i, len;
+ if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+ b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+ for (i=0, len = b->slen; i < len; i++) {
+ b->data[i] = (unsigned char) downcase (b->data[i]);
+ }
+ return BSTR_OK;
+}
+
+/* int bstricmp (const_bstring b0, const_bstring b1)
+ *
+ * Compare two strings without differentiating between case. The return
+ * value is the difference of the values of the characters where the two
+ * strings first differ after lower case transformation, otherwise 0 is
+ * returned indicating that the strings are equal. If the lengths are
+ * different, then a difference from 0 is given, but if the first extra
+ * character is '\0', then it is taken to be the value UCHAR_MAX+1.
+ */
+int bstricmp (const_bstring b0, const_bstring b1) {
+int i, v, n;
+
+ if (bdata (b0) == NULL || b0->slen < 0 ||
+ bdata (b1) == NULL || b1->slen < 0) return SHRT_MIN;
+ if ((n = b0->slen) > b1->slen) n = b1->slen;
+ else if (b0->slen == b1->slen && b0->data == b1->data) return BSTR_OK;
+
+ for (i = 0; i < n; i ++) {
+ v = (char) downcase (b0->data[i])
+ - (char) downcase (b1->data[i]);
+ if (0 != v) return v;
+ }
+
+ if (b0->slen > n) {
+ v = (char) downcase (b0->data[n]);
+ if (v) return v;
+ return UCHAR_MAX + 1;
+ }
+ if (b1->slen > n) {
+ v = - (char) downcase (b1->data[n]);
+ if (v) return v;
+ return - (int) (UCHAR_MAX + 1);
+ }
+ return BSTR_OK;
+}
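+/* Illustrative behavior (added for clarity, not part of the original bstrlib
+   sources): with b0 = bfromcstr ("Likwid") and b1 = bfromcstr ("likwid"),
+   bstricmp (b0, b1) returns 0; comparing bfromcstr ("likwidX") against b1
+   returns a positive value because the strings first differ at the extra 'X'. */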
+
+/* int bstrnicmp (const_bstring b0, const_bstring b1, int n)
+ *
+ * Compare two strings without differentiating between case for at most n
+ * characters. If the position where the two strings first differ is
+ * before the nth position, the return value is the difference of the values
+ * of the characters, otherwise 0 is returned. If the lengths are different
+ * and less than n characters, then a difference from 0 is given, but if the
+ * first extra character is '\0', then it is taken to be the value
+ * UCHAR_MAX+1.
+ */
+int bstrnicmp (const_bstring b0, const_bstring b1, int n) {
+int i, v, m;
+
+ if (bdata (b0) == NULL || b0->slen < 0 ||
+ bdata (b1) == NULL || b1->slen < 0 || n < 0) return SHRT_MIN;
+ m = n;
+ if (m > b0->slen) m = b0->slen;
+ if (m > b1->slen) m = b1->slen;
+
+ if (b0->data != b1->data) {
+ for (i = 0; i < m; i ++) {
+ v = (char) downcase (b0->data[i]);
+ v -= (char) downcase (b1->data[i]);
+ if (v != 0) return b0->data[i] - b1->data[i];
+ }
+ }
+
+ if (n == m || b0->slen == b1->slen) return BSTR_OK;
+
+ if (b0->slen > m) {
+ v = (char) downcase (b0->data[m]);
+ if (v) return v;
+ return UCHAR_MAX + 1;
+ }
+
+ v = - (char) downcase (b1->data[m]);
+ if (v) return v;
+ return - (int) (UCHAR_MAX + 1);
+}
+
+/* int biseqcaseless (const_bstring b0, const_bstring b1)
+ *
+ * Compare two strings for equality without differentiating between case.
+ * If the strings differ other than in case, 0 is returned, if the strings
+ * are the same, 1 is returned, if there is an error, -1 is returned. If
+ * the lengths of the strings are different, this function is O(1). '\0'
+ * termination characters are not treated in any special way.
+ */
+int biseqcaseless (const_bstring b0, const_bstring b1) {
+int i, n;
+
+ if (bdata (b0) == NULL || b0->slen < 0 ||
+ bdata (b1) == NULL || b1->slen < 0) return BSTR_ERR;
+ if (b0->slen != b1->slen) return BSTR_OK;
+ if (b0->data == b1->data || b0->slen == 0) return 1;
+ for (i=0, n=b0->slen; i < n; i++) {
+ if (b0->data[i] != b1->data[i]) {
+ unsigned char c = (unsigned char) downcase (b0->data[i]);
+ if (c != (unsigned char) downcase (b1->data[i])) return 0;
+ }
+ }
+ return 1;
+}
+
+/* int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len)
+ *
+ * Compare beginning of string b0 with a block of memory of length len
+ * without differentiating between case for equality. If the beginning of b0
+ * differs from the memory block other than in case (or if b0 is too short),
+ * 0 is returned, if the strings are the same, 1 is returned, if there is an
+ * error, -1 is returned. '\0' characters are not treated in any special
+ * way.
+ */
+int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len) {
+int i;
+
+ if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
+ return BSTR_ERR;
+ if (b0->slen < len) return BSTR_OK;
+ if (b0->data == (const unsigned char *) blk || len == 0) return 1;
+
+ for (i = 0; i < len; i ++) {
+ if (b0->data[i] != ((const unsigned char *) blk)[i]) {
+ if (downcase (b0->data[i]) !=
+ downcase (((const unsigned char *) blk)[i])) return 0;
+ }
+ }
+ return 1;
+}
+
+/*
+ * int bltrimws (bstring b)
+ *
+ * Delete whitespace contiguous from the left end of the string.
+ */
+int bltrimws (bstring b) {
+int i, len;
+
+ if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+ b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+
+ for (len = b->slen, i = 0; i < len; i++) {
+ if (!wspace (b->data[i])) {
+ return bdelete (b, 0, i);
+ }
+ }
+
+ b->data[0] = (unsigned char) '\0';
+ b->slen = 0;
+ return BSTR_OK;
+}
+
+/*
+ * int brtrimws (bstring b)
+ *
+ * Delete whitespace contiguous from the right end of the string.
+ */
+int brtrimws (bstring b) {
+int i;
+
+ if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+ b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+
+ for (i = b->slen - 1; i >= 0; i--) {
+ if (!wspace (b->data[i])) {
+ if (b->mlen > i) b->data[i+1] = (unsigned char) '\0';
+ b->slen = i + 1;
+ return BSTR_OK;
+ }
+ }
+
+ b->data[0] = (unsigned char) '\0';
+ b->slen = 0;
+ return BSTR_OK;
+}
+
+/*
+ * int btrimws (bstring b)
+ *
+ * Delete whitespace contiguous from both ends of the string.
+ */
+int btrimws (bstring b) {
+int i, j;
+
+ if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+ b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+
+ for (i = b->slen - 1; i >= 0; i--) {
+ if (!wspace (b->data[i])) {
+ if (b->mlen > i) b->data[i+1] = (unsigned char) '\0';
+ b->slen = i + 1;
+ for (j = 0; wspace (b->data[j]); j++) {}
+ return bdelete (b, 0, j);
+ }
+ }
+
+ b->data[0] = (unsigned char) '\0';
+ b->slen = 0;
+ return BSTR_OK;
+}
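+
+/* Usage sketch for the trim functions (illustrative only; assumes
+ * "bstrlib.h" is included and error handling is elided):
+ *
+ *     bstring t = bfromcstr ("  likwid-bench \t");
+ *     btrimws (t);                        t now holds "likwid-bench"
+ *     bdestroy (t);
+ *
+ * bltrimws and brtrimws work the same way but trim only the left or the
+ * right end respectively.
+ */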
+
+/* int biseq (const_bstring b0, const_bstring b1)
+ *
+ * Compare the strings b0 and b1. If the strings differ, 0 is returned; if
+ * the strings are the same, 1 is returned; if there is an error, -1 is
+ * returned. If the lengths of the strings are different, this function is
+ * O(1). '\0' termination characters are not treated in any special way.
+ */
+int biseq (const_bstring b0, const_bstring b1) {
+ if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
+ b0->slen < 0 || b1->slen < 0) return BSTR_ERR;
+ if (b0->slen != b1->slen) return BSTR_OK;
+ if (b0->data == b1->data || b0->slen == 0) return 1;
+ return !bstr__memcmp (b0->data, b1->data, b0->slen);
+}
+
+/* int bisstemeqblk (const_bstring b0, const void * blk, int len)
+ *
+ * Compare the beginning of string b0 with a block of memory of length len
+ * for equality. If the beginning of b0 differs from the memory block (or
+ * if b0 is too short), 0 is returned; if the strings are the same, 1 is
+ * returned; if there is an error, -1 is returned. '\0' characters are not
+ * treated in any special way.
+ */
+int bisstemeqblk (const_bstring b0, const void * blk, int len) {
+int i;
+
+ if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
+ return BSTR_ERR;
+ if (b0->slen < len) return BSTR_OK;
+ if (b0->data == (const unsigned char *) blk || len == 0) return 1;
+
+ for (i = 0; i < len; i ++) {
+ if (b0->data[i] != ((const unsigned char *) blk)[i]) return BSTR_OK;
+ }
+ return 1;
+}
+
+/* int biseqcstr (const_bstring b, const char *s)
+ *
+ * Compare the bstring b and char * string s. The C string s must be '\0'
+ * terminated at exactly the length of the bstring b, and the contents
+ * between the two must be identical with the bstring b with no '\0'
+ * characters for the two contents to be considered equal. This is
+ * equivalent to the condition that their current contents will always be
+ * equal when comparing them in the same format after converting one or the
+ * other. If the strings are equal 1 is returned, if they are unequal 0 is
+ * returned and if there is a detectable error BSTR_ERR is returned.
+ */
+int biseqcstr (const_bstring b, const char * s) {
+int i;
+ if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
+ for (i=0; i < b->slen; i++) {
+ if (s[i] == '\0' || b->data[i] != (unsigned char) s[i]) return BSTR_OK;
+ }
+ return s[i] == '\0';
+}
+
+/* int biseqcstrcaseless (const_bstring b, const char *s)
+ *
+ * Compare the bstring b and char * string s. The C string s must be '\0'
+ * terminated at exactly the length of the bstring b, and the contents
+ * between the two must be identical except for case with the bstring b with
+ * no '\0' characters for the two contents to be considered equal. This is
+ * equivalent to the condition that their current contents will always be
+ * equal ignoring case when comparing them in the same format after
+ * converting one or the other. If the strings are equal, except for case,
+ * 1 is returned, if they are unequal regardless of case 0 is returned and
+ * if there is a detectable error BSTR_ERR is returned.
+ */
+int biseqcstrcaseless (const_bstring b, const char * s) {
+int i;
+ if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
+ for (i=0; i < b->slen; i++) {
+ if (s[i] == '\0' ||
+ (b->data[i] != (unsigned char) s[i] &&
+ downcase (b->data[i]) != (unsigned char) downcase (s[i])))
+ return BSTR_OK;
+ }
+ return s[i] == '\0';
+}
+
+/* int bstrcmp (const_bstring b0, const_bstring b1)
+ *
+ * Compare the strings b0 and b1. If there is an error, SHRT_MIN is returned,
+ * otherwise a value less than or greater than zero, indicating that the
+ * string pointed to by b0 is lexicographically less than or greater than
+ * the string pointed to by b1 is returned. If the string lengths are
+ * unequal but the characters up until the length of the shorter are equal
+ * then a value less than, or greater than zero, indicating that the string
+ * pointed to by b0 is shorter or longer than the string pointed to by b1 is
+ * returned. 0 is returned if and only if the two strings are the same. If
+ * the lengths of the strings are different, this function is O(n). Like
+ * its standard C library counterpart strcmp, the comparison does not proceed
+ * past any '\0' termination characters encountered.
+ */
+int bstrcmp (const_bstring b0, const_bstring b1) {
+int i, v, n;
+
+ if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
+ b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
+ n = b0->slen; if (n > b1->slen) n = b1->slen;
+ if (b0->slen == b1->slen && (b0->data == b1->data || b0->slen == 0))
+ return BSTR_OK;
+
+ for (i = 0; i < n; i ++) {
+ v = ((char) b0->data[i]) - ((char) b1->data[i]);
+ if (v != 0) return v;
+ if (b0->data[i] == (unsigned char) '\0') return BSTR_OK;
+ }
+
+ if (b0->slen > n) return 1;
+ if (b1->slen > n) return -1;
+ return BSTR_OK;
+}
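+
+/* Usage sketch for bstrcmp (illustrative only; assumes "bstrlib.h" is
+ * included and error handling is elided):
+ *
+ *     bstring a = bfromcstr ("alpha");
+ *     bstring b = bfromcstr ("beta");
+ *     int r = bstrcmp (a, b);             r < 0: "alpha" sorts before "beta"
+ *     bdestroy (a);
+ *     bdestroy (b);
+ */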
+
+/* int bstrncmp (const_bstring b0, const_bstring b1, int n)
+ *
+ * Compare the strings b0 and b1 for at most n characters. If there is an
+ * error, SHRT_MIN is returned, otherwise a value is returned as if b0 and
+ * b1 were first truncated to at most n characters and then bstrcmp was
+ * called with these new strings as parameters. If the lengths of the
+ * strings are different, this function is O(n). Like its standard C
+ * library counterpart strncmp, the comparison does not proceed past any
+ * '\0' termination characters encountered.
+ */
+int bstrncmp (const_bstring b0, const_bstring b1, int n) {
+int i, v, m;
+
+ if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
+ b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
+ m = n;
+ if (m > b0->slen) m = b0->slen;
+ if (m > b1->slen) m = b1->slen;
+
+ if (b0->data != b1->data) {
+ for (i = 0; i < m; i ++) {
+ v = ((char) b0->data[i]) - ((char) b1->data[i]);
+ if (v != 0) return v;
+ if (b0->data[i] == (unsigned char) '\0') return BSTR_OK;
+ }
+ }
+
+ if (n == m || b0->slen == b1->slen) return BSTR_OK;
+
+ if (b0->slen > m) return 1;
+ return -1;
+}
+
+/* bstring bmidstr (const_bstring b, int left, int len)
+ *
+ * Create a bstring which is the substring of b starting from position left
+ * and running for a length len (clamped by the end of the bstring b.) If
+ * b is detectably invalid, then NULL is returned. The section described
+ * by (left, len) is clamped to the boundaries of b.
+ */
+bstring bmidstr (const_bstring b, int left, int len) {
+
+ if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+
+ if (left < 0) {
+ len += left;
+ left = 0;
+ }
+
+ if (len > b->slen - left) len = b->slen - left;
+
+ if (len <= 0) return bfromcstr ("");
+ return blk2bstr (b->data + left, len);
+}
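+
+/* Usage sketch for bmidstr (illustrative only; assumes "bstrlib.h" is
+ * included and error handling is elided):
+ *
+ *     bstring s   = bfromcstr ("likwid-perfctr");
+ *     bstring sub = bmidstr (s, 7, 7);    sub holds "perfctr"
+ *     bdestroy (sub);
+ *     bdestroy (s);
+ */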
+
+/* int bdelete (bstring b, int pos, int len)
+ *
+ * Removes characters from pos to pos+len-1 inclusive and shifts the tail of
+ * the bstring starting from pos+len to pos. len must be positive for this
+ * call to have any effect. The section of the string described by (pos,
+ * len) is clamped to boundaries of the bstring b.
+ */
+int bdelete (bstring b, int pos, int len) {
+ /* Clamp to left side of bstring */
+ if (pos < 0) {
+ len += pos;
+ pos = 0;
+ }
+
+ if (len < 0 || b == NULL || b->data == NULL || b->slen < 0 ||
+ b->mlen < b->slen || b->mlen <= 0)
+ return BSTR_ERR;
+ if (len > 0 && pos < b->slen) {
+ if (pos + len >= b->slen) {
+ b->slen = pos;
+ } else {
+ bBlockCopy ((char *) (b->data + pos),
+ (char *) (b->data + pos + len),
+ b->slen - (pos+len));
+ b->slen -= len;
+ }
+ b->data[b->slen] = (unsigned char) '\0';
+ }
+ return BSTR_OK;
+}
+
+/* int bdestroy (bstring b)
+ *
+ * Free up the bstring. Note that if b is detectably invalid or not writable
+ * then no action is performed and BSTR_ERR is returned. Like a freed memory
+ * allocation, dereferences, writes or any other actions on b after it has
+ * been bdestroyed are undefined.
+ */
+int bdestroy (bstring b) {
+ if (b == NULL || b->slen < 0 || b->mlen <= 0 || b->mlen < b->slen ||
+ b->data == NULL)
+ return BSTR_ERR;
+
+ bstr__free (b->data);
+
+ /* In case there is any stale usage, there is one more chance to
+ notice this error. */
+
+ b->slen = -1;
+ b->mlen = -__LINE__;
+ b->data = NULL;
+
+ bstr__free (b);
+ return BSTR_OK;
+}
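+
+/* Usage sketch for bdelete/bdestroy (illustrative only; assumes
+ * "bstrlib.h" is included and error handling is elided):
+ *
+ *     bstring s = bfromcstr ("hello, world");
+ *     bdelete (s, 5, 7);                  s now holds "hello"
+ *     bdestroy (s);                       s must not be used afterwards
+ */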
+
+/* int binstr (const_bstring b1, int pos, const_bstring b2)
+ *
+ * Search for the bstring b2 in b1 starting from position pos, and searching
+ * forward. If it is found then return with the first position where it is
+ * found, otherwise return BSTR_ERR. Note that this is just a brute force
+ * string searcher that does not attempt clever things like the Boyer-Moore
+ * search algorithm. Because of this there are many degenerate cases where
+ * this can take much longer than it needs to.
+ */
+int binstr (const_bstring b1, int pos, const_bstring b2) {
+int j, ii, ll, lf;
+unsigned char * d0;
+unsigned char c0;
+register unsigned char * d1;
+register unsigned char c1;
+register int i;
+
+ if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+ b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+ if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
+ if (b1->slen < pos || pos < 0) return BSTR_ERR;
+ if (b2->slen == 0) return pos;
+
+ /* No space to find such a string? */
+ if ((lf = b1->slen - b2->slen + 1) <= pos) return BSTR_ERR;
+
+ /* An obvious alias case */
+ if (b1->data == b2->data && pos == 0) return 0;
+
+ i = pos;
+
+ d0 = b2->data;
+ d1 = b1->data;
+ ll = b2->slen;
+
+ /* Peel off the b2->slen == 1 case */
+ c0 = d0[0];
+ if (1 == ll) {
+ for (;i < lf; i++) if (c0 == d1[i]) return i;
+ return BSTR_ERR;
+ }
+
+ c1 = c0;
+ j = 0;
+ lf = b1->slen - 1;
+
+ ii = -1;
+ if (i < lf) do {
+ /* Unrolled current character test */
+ if (c1 != d1[i]) {
+ if (c1 != d1[1+i]) {
+ i += 2;
+ continue;
+ }
+ i++;
+ }
+
+ /* Take note if this is the start of a potential match */
+ if (0 == j) ii = i;
+
+ /* Shift the test character down by one */
+ j++;
+ i++;
+
+ /* If this isn't past the last character continue */
+ if (j < ll) {
+ c1 = d0[j];
+ continue;
+ }
+
+ N0:;
+
+ /* If no characters mismatched, then we matched */
+ if (i == ii+j) return ii;
+
+ /* Shift back to the beginning */
+ i -= j;
+ j = 0;
+ c1 = c0;
+ } while (i < lf);
+
+ /* Deal with last case if unrolling caused a misalignment */
+ if (i == lf && ll == j+1 && c1 == d1[i]) goto N0;
+
+ return BSTR_ERR;
+}
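+
+/* Usage sketch for binstr (illustrative only; assumes "bstrlib.h" is
+ * included and error handling is elided):
+ *
+ *     bstring hay    = bfromcstr ("likwid-pin");
+ *     bstring needle = bfromcstr ("pin");
+ *     int pos = binstr (hay, 0, needle);  pos == 7
+ *     bdestroy (needle);
+ *     bdestroy (hay);
+ */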
+
+/* int binstrr (const_bstring b1, int pos, const_bstring b2)
+ *
+ * Search for the bstring b2 in b1 starting from position pos, and searching
+ * backward. If it is found then return with the first position where it is
+ * found, otherwise return BSTR_ERR. Note that this is just a brute force
+ * string searcher that does not attempt clever things like the Boyer-Moore
+ * search algorithm. Because of this there are many degenerate cases where
+ * this can take much longer than it needs to.
+ */
+int binstrr (const_bstring b1, int pos, const_bstring b2) {
+int j, i, l;
+unsigned char * d0, * d1;
+
+ if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+ b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+ if (b1->slen == pos && b2->slen == 0) return pos;
+ if (b1->slen < pos || pos < 0) return BSTR_ERR;
+ if (b2->slen == 0) return pos;
+
+ /* Obvious alias case */
+ if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return 0;
+
+ i = pos;
+ if ((l = b1->slen - b2->slen) < 0) return BSTR_ERR;
+
+ /* If no space to find such a string then snap back */
+ if (l + 1 <= i) i = l;
+ j = 0;
+
+ d0 = b2->data;
+ d1 = b1->data;
+ l = b2->slen;
+
+ for (;;) {
+ if (d0[j] == d1[i + j]) {
+ j ++;
+ if (j >= l) return i;
+ } else {
+ i --;
+ if (i < 0) break;
+ j=0;
+ }
+ }
+
+ return BSTR_ERR;
+}
+
+/* int binstrcaseless (const_bstring b1, int pos, const_bstring b2)
+ *
+ * Search for the bstring b2 in b1 starting from position pos, and searching
+ * forward but without regard to case. If it is found then return with the
+ * first position where it is found, otherwise return BSTR_ERR. Note that
+ * this is just a brute force string searcher that does not attempt clever
+ * things like the Boyer-Moore search algorithm. Because of this there are
+ * many degenerate cases where this can take much longer than it needs to.
+ */
+int binstrcaseless (const_bstring b1, int pos, const_bstring b2) {
+int j, i, l, ll;
+unsigned char * d0, * d1;
+
+ if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+ b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+ if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
+ if (b1->slen < pos || pos < 0) return BSTR_ERR;
+ if (b2->slen == 0) return pos;
+
+ l = b1->slen - b2->slen + 1;
+
+ /* No space to find such a string? */
+ if (l <= pos) return BSTR_ERR;
+
+ /* An obvious alias case */
+ if (b1->data == b2->data && pos == 0) return BSTR_OK;
+
+ i = pos;
+ j = 0;
+
+ d0 = b2->data;
+ d1 = b1->data;
+ ll = b2->slen;
+
+ for (;;) {
+ if (d0[j] == d1[i + j] || downcase (d0[j]) == downcase (d1[i + j])) {
+ j ++;
+ if (j >= ll) return i;
+ } else {
+ i ++;
+ if (i >= l) break;
+ j=0;
+ }
+ }
+
+ return BSTR_ERR;
+}
+
+/* int binstrrcaseless (const_bstring b1, int pos, const_bstring b2)
+ *
+ * Search for the bstring b2 in b1 starting from position pos, and searching
+ * backward but without regard to case. If it is found then return with the
+ * first position where it is found, otherwise return BSTR_ERR. Note that
+ * this is just a brute force string searcher that does not attempt clever
+ * things like the Boyer-Moore search algorithm. Because of this there are
+ * many degenerate cases where this can take much longer than it needs to.
+ */
+int binstrrcaseless (const_bstring b1, int pos, const_bstring b2) {
+int j, i, l;
+unsigned char * d0, * d1;
+
+ if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+ b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+ if (b1->slen == pos && b2->slen == 0) return pos;
+ if (b1->slen < pos || pos < 0) return BSTR_ERR;
+ if (b2->slen == 0) return pos;
+
+ /* Obvious alias case */
+ if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return BSTR_OK;
+
+ i = pos;
+ if ((l = b1->slen - b2->slen) < 0) return BSTR_ERR;
+
+ /* If no space to find such a string then snap back */
+ if (l + 1 <= i) i = l;
+ j = 0;
+
+ d0 = b2->data;
+ d1 = b1->data;
+ l = b2->slen;
+
+ for (;;) {
+ if (d0[j] == d1[i + j] || downcase (d0[j]) == downcase (d1[i + j])) {
+ j ++;
+ if (j >= l) return i;
+ } else {
+ i --;
+ if (i < 0) break;
+ j=0;
+ }
+ }
+
+ return BSTR_ERR;
+}
+
+
+/* int bstrchrp (const_bstring b, int c, int pos)
+ *
+ * Search for the character c in b forwards from the position pos
+ * (inclusive).
+ */
+int bstrchrp (const_bstring b, int c, int pos) {
+unsigned char * p;
+
+ if (b == NULL || b->data == NULL || b->slen <= pos || pos < 0) return BSTR_ERR;
+ p = (unsigned char *) bstr__memchr ((b->data + pos), (unsigned char) c, (b->slen - pos));
+ if (p) return (int) (p - b->data);
+ return BSTR_ERR;
+}
+
+/* int bstrrchrp (const_bstring b, int c, int pos)
+ *
+ * Search for the character c in b backwards from the position pos in the
+ * string (inclusive).
+ */
+int bstrrchrp (const_bstring b, int c, int pos) {
+int i;
+
+ if (b == NULL || b->data == NULL || b->slen <= pos || pos < 0) return BSTR_ERR;
+ for (i=pos; i >= 0; i--) {
+ if (b->data[i] == (unsigned char) c) return i;
+ }
+ return BSTR_ERR;
+}
+
+#if !defined (BSTRLIB_AGGRESSIVE_MEMORY_FOR_SPEED_TRADEOFF)
+#define LONG_LOG_BITS_QTY (3)
+#define LONG_BITS_QTY (1 << LONG_LOG_BITS_QTY)
+#define LONG_TYPE unsigned char
+
+#define CFCLEN ((1 << CHAR_BIT) / LONG_BITS_QTY)
+struct charField { LONG_TYPE content[CFCLEN]; };
+#define testInCharField(cf,c) ((cf)->content[(c) >> LONG_LOG_BITS_QTY] & (((long)1) << ((c) & (LONG_BITS_QTY-1))))
+#define setInCharField(cf,idx) { \
+ unsigned int c = (unsigned int) (idx); \
+ (cf)->content[c >> LONG_LOG_BITS_QTY] |= (LONG_TYPE) (1ul << (c & (LONG_BITS_QTY-1))); \
+}
+
+#else
+
+#define CFCLEN (1 << CHAR_BIT)
+struct charField { unsigned char content[CFCLEN]; };
+#define testInCharField(cf,c) ((cf)->content[(unsigned char) (c)])
+#define setInCharField(cf,idx) (cf)->content[(unsigned int) (idx)] = ~0
+
+#endif
+
+/* Convert a bstring to charField */
+static int buildCharField (struct charField * cf, const_bstring b) {
+int i;
+ if (b == NULL || b->data == NULL || b->slen <= 0) return BSTR_ERR;
+ memset ((void *) cf->content, 0, sizeof (struct charField));
+ for (i=0; i < b->slen; i++) {
+ setInCharField (cf, b->data[i]);
+ }
+ return BSTR_OK;
+}
+
+static void invertCharField (struct charField * cf) {
+int i;
+ for (i=0; i < CFCLEN; i++) cf->content[i] = ~cf->content[i];
+}
+
+/* Inner engine for binchr */
+static int binchrCF (const unsigned char * data, int len, int pos, const struct charField * cf) {
+int i;
+ for (i=pos; i < len; i++) {
+ unsigned char c = (unsigned char) data[i];
+ if (testInCharField (cf, c)) return i;
+ }
+ return BSTR_ERR;
+}
+
+/* int binchr (const_bstring b0, int pos, const_bstring b1);
+ *
+ * Search for the first position in b0 starting from pos or after, in which
+ * one of the characters in b1 is found and return it. If such a position
+ * does not exist in b0, then BSTR_ERR is returned.
+ */
+int binchr (const_bstring b0, int pos, const_bstring b1) {
+struct charField chrs;
+ if (pos < 0 || b0 == NULL || b0->data == NULL ||
+ b0->slen <= pos) return BSTR_ERR;
+ if (1 == b1->slen) return bstrchrp (b0, b1->data[0], pos);
+ if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
+ return binchrCF (b0->data, b0->slen, pos, &chrs);
+}
+
+/* Inner engine for binchrr */
+static int binchrrCF (const unsigned char * data, int pos, const struct charField * cf) {
+int i;
+ for (i=pos; i >= 0; i--) {
+ unsigned int c = (unsigned int) data[i];
+ if (testInCharField (cf, c)) return i;
+ }
+ return BSTR_ERR;
+}
+
+/* int binchrr (const_bstring b0, int pos, const_bstring b1);
+ *
+ * Search for the last position in b0 no greater than pos, in which one of
+ * the characters in b1 is found and return it. If such a position does not
+ * exist in b0, then BSTR_ERR is returned.
+ */
+int binchrr (const_bstring b0, int pos, const_bstring b1) {
+struct charField chrs;
+ if (pos < 0 || b0 == NULL || b0->data == NULL || b1 == NULL ||
+ b0->slen < pos) return BSTR_ERR;
+ if (pos == b0->slen) pos--;
+ if (1 == b1->slen) return bstrrchrp (b0, b1->data[0], pos);
+ if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
+ return binchrrCF (b0->data, pos, &chrs);
+}
+
+/* int bninchr (const_bstring b0, int pos, const_bstring b1);
+ *
+ * Search for the first position in b0 starting from pos or after, in which
+ * none of the characters in b1 is found and return it. If such a position
+ * does not exist in b0, then BSTR_ERR is returned.
+ */
+int bninchr (const_bstring b0, int pos, const_bstring b1) {
+struct charField chrs;
+ if (pos < 0 || b0 == NULL || b0->data == NULL ||
+ b0->slen <= pos) return BSTR_ERR;
+ if (buildCharField (&chrs, b1) < 0) return BSTR_ERR;
+ invertCharField (&chrs);
+ return binchrCF (b0->data, b0->slen, pos, &chrs);
+}
+
+/* int bninchrr (const_bstring b0, int pos, const_bstring b1);
+ *
+ * Search for the last position in b0 no greater than pos, in which none of
+ * the characters in b1 is found and return it. If such a position does not
+ * exist in b0, then BSTR_ERR is returned.
+ */
+int bninchrr (const_bstring b0, int pos, const_bstring b1) {
+struct charField chrs;
+ if (pos < 0 || b0 == NULL || b0->data == NULL ||
+ b0->slen < pos) return BSTR_ERR;
+ if (pos == b0->slen) pos--;
+ if (buildCharField (&chrs, b1) < 0) return BSTR_ERR;
+ invertCharField (&chrs);
+ return binchrrCF (b0->data, pos, &chrs);
+}
+
+/* int bsetstr (bstring b0, int pos, const_bstring b1, unsigned char fill)
+ *
+ * Overwrite the string b0 starting at position pos with the string b1. If
+ * the position pos is past the end of b0, then the character "fill" is
+ * appended as necessary to make up the gap between the end of b0 and pos.
+ * If b1 is NULL, it behaves as if it were a 0-length string.
+ */
+int bsetstr (bstring b0, int pos, const_bstring b1, unsigned char fill) {
+int d, newlen;
+ptrdiff_t pd;
+bstring aux = (bstring) b1;
+
+ if (pos < 0 || b0 == NULL || b0->slen < 0 || NULL == b0->data ||
+ b0->mlen < b0->slen || b0->mlen <= 0) return BSTR_ERR;
+ if (b1 != NULL && (b1->slen < 0 || b1->data == NULL)) return BSTR_ERR;
+
+ d = pos;
+
+ /* Aliasing case */
+ if (NULL != aux) {
+ if ((pd = (ptrdiff_t) (b1->data - b0->data)) >= 0 && pd < (ptrdiff_t) b0->mlen) {
+ if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
+ }
+ d += aux->slen;
+ }
+
+ /* Increase memory size if necessary */
+ if (balloc (b0, d + 1) != BSTR_OK) {
+ if (aux != b1) bdestroy (aux);
+ return BSTR_ERR;
+ }
+
+ newlen = b0->slen;
+
+ /* Fill in "fill" character as necessary */
+ if (pos > newlen) {
+ bstr__memset (b0->data + b0->slen, (int) fill, (size_t) (pos - b0->slen));
+ newlen = pos;
+ }
+
+ /* Copy b1 to position pos in b0. */
+ if (aux != NULL) {
+ bBlockCopy ((char *) (b0->data + pos), (char *) aux->data, aux->slen);
+ if (aux != b1) bdestroy (aux);
+ }
+
+ /* Indicate the potentially increased size of b0 */
+ if (d > newlen) newlen = d;
+
+ b0->slen = newlen;
+ b0->data[newlen] = (unsigned char) '\0';
+
+ return BSTR_OK;
+}
+
+/* int binsert (bstring b1, int pos, const_bstring b2, unsigned char fill)
+ *
+ * Inserts the string b2 into b1 at position pos. If the position pos is
+ * past the end of b1, then the character "fill" is appended as necessary to
+ * make up the gap between the end of b1 and pos. Unlike bsetstr, binsert
+ * does not allow b2 to be NULL.
+ */
+int binsert (bstring b1, int pos, const_bstring b2, unsigned char fill) {
+int d, l;
+ptrdiff_t pd;
+bstring aux = (bstring) b2;
+
+ if (pos < 0 || b1 == NULL || b2 == NULL || b1->slen < 0 ||
+ b2->slen < 0 || b1->mlen < b1->slen || b1->mlen <= 0) return BSTR_ERR;
+
+ /* Aliasing case */
+ if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->mlen) {
+ if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
+ }
+
+ /* Compute the two possible end pointers */
+ d = b1->slen + aux->slen;
+ l = pos + aux->slen;
+ if ((d|l) < 0) return BSTR_ERR;
+
+ if (l > d) {
+ /* Inserting past the end of the string */
+ if (balloc (b1, l + 1) != BSTR_OK) {
+ if (aux != b2) bdestroy (aux);
+ return BSTR_ERR;
+ }
+ bstr__memset (b1->data + b1->slen, (int) fill, (size_t) (pos - b1->slen));
+ b1->slen = l;
+ } else {
+ /* Inserting in the middle of the string */
+ if (balloc (b1, d + 1) != BSTR_OK) {
+ if (aux != b2) bdestroy (aux);
+ return BSTR_ERR;
+ }
+ bBlockCopy (b1->data + l, b1->data + pos, d - l);
+ b1->slen = d;
+ }
+ bBlockCopy (b1->data + pos, aux->data, aux->slen);
+ b1->data[b1->slen] = (unsigned char) '\0';
+ if (aux != b2) bdestroy (aux);
+ return BSTR_OK;
+}
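+
+/* Usage sketch for binsert (illustrative only; assumes "bstrlib.h" is
+ * included and error handling is elided; the fill character is unused here
+ * because pos lies within the string):
+ *
+ *     bstring b   = bfromcstr ("likwidbench");
+ *     bstring sep = bfromcstr ("-");
+ *     binsert (b, 6, sep, ' ');           b now holds "likwid-bench"
+ *     bdestroy (sep);
+ *     bdestroy (b);
+ */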
+
+/* int breplace (bstring b1, int pos, int len, const_bstring b2,
+ * unsigned char fill)
+ *
+ * Replace a section of a string from pos for a length len with the string
+ * b2. fill is used if pos > b1->slen.
+ */
+int breplace (bstring b1, int pos, int len, const_bstring b2,
+ unsigned char fill) {
+int pl, ret;
+ptrdiff_t pd;
+bstring aux = (bstring) b2;
+
+ if (pos < 0 || len < 0 || (pl = pos + len) < 0 || b1 == NULL ||
+ b2 == NULL || b1->data == NULL || b2->data == NULL ||
+ b1->slen < 0 || b2->slen < 0 || b1->mlen < b1->slen ||
+ b1->mlen <= 0) return BSTR_ERR;
+
+ /* Straddles the end? */
+ if (pl >= b1->slen) {
+ if ((ret = bsetstr (b1, pos, b2, fill)) < 0) return ret;
+ if (pos + b2->slen < b1->slen) {
+ b1->slen = pos + b2->slen;
+ b1->data[b1->slen] = (unsigned char) '\0';
+ }
+ return ret;
+ }
+
+ /* Aliasing case */
+ if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->slen) {
+ if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
+ }
+
+ if (aux->slen > len) {
+ if (balloc (b1, b1->slen + aux->slen - len) != BSTR_OK) {
+ if (aux != b2) bdestroy (aux);
+ return BSTR_ERR;
+ }
+ }
+
+ if (aux->slen != len) bstr__memmove (b1->data + pos + aux->slen, b1->data + pos + len, b1->slen - (pos + len));
+ bstr__memcpy (b1->data + pos, aux->data, aux->slen);
+ b1->slen += aux->slen - len;
+ b1->data[b1->slen] = (unsigned char) '\0';
+ if (aux != b2) bdestroy (aux);
+ return BSTR_OK;
+}
+
+/* int bfindreplace (bstring b, const_bstring find, const_bstring repl,
+ * int pos)
+ *
+ * Replace all occurrences of a find string with a replace string after a
+ * given point in a bstring.
+ */
+
+typedef int (*instr_fnptr) (const_bstring s1, int pos, const_bstring s2);
+
+static int findreplaceengine (bstring b, const_bstring find, const_bstring repl, int pos, instr_fnptr instr) {
+int i, ret, slen, mlen, delta, acc;
+int * d;
+int static_d[32];
+ptrdiff_t pd;
+bstring auxf = (bstring) find;
+bstring auxr = (bstring) repl;
+
+ if (b == NULL || b->data == NULL || find == NULL ||
+ find->data == NULL || repl == NULL || repl->data == NULL ||
+ pos < 0 || find->slen <= 0 || b->mlen < 0 || b->slen > b->mlen ||
+ b->mlen <= 0 || b->slen < 0 || repl->slen < 0) return BSTR_ERR;
+ if (pos > b->slen - find->slen) return BSTR_OK;
+
+ /* Alias with find string */
+ pd = (ptrdiff_t) (find->data - b->data);
+ if ((ptrdiff_t) (pos - find->slen) < pd && pd < (ptrdiff_t) b->slen) {
+ if (NULL == (auxf = bstrcpy (find))) return BSTR_ERR;
+ }
+
+ /* Alias with repl string */
+ pd = (ptrdiff_t) (repl->data - b->data);
+ if ((ptrdiff_t) (pos - repl->slen) < pd && pd < (ptrdiff_t) b->slen) {
+ if (NULL == (auxr = bstrcpy (repl))) {
+ if (auxf != find) bdestroy (auxf);
+ return BSTR_ERR;
+ }
+ }
+
+ delta = auxf->slen - auxr->slen;
+
+ /* in-place replacement since find and replace strings are of equal
+ length */
+ if (delta == 0) {
+ while ((pos = instr (b, pos, auxf)) >= 0) {
+ bstr__memcpy (b->data + pos, auxr->data, auxr->slen);
+ pos += auxf->slen;
+ }
+ if (auxf != find) bdestroy (auxf);
+ if (auxr != repl) bdestroy (auxr);
+ return BSTR_OK;
+ }
+
+ /* shrinking replacement since auxf->slen > auxr->slen */
+ if (delta > 0) {
+ acc = 0;
+
+ while ((i = instr (b, pos, auxf)) >= 0) {
+ if (acc && i > pos)
+ bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
+ if (auxr->slen)
+ bstr__memcpy (b->data + i - acc, auxr->data, auxr->slen);
+ acc += delta;
+ pos = i + auxf->slen;
+ }
+
+ if (acc) {
+ i = b->slen;
+ if (i > pos)
+ bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
+ b->slen -= acc;
+ b->data[b->slen] = (unsigned char) '\0';
+ }
+
+ if (auxf != find) bdestroy (auxf);
+ if (auxr != repl) bdestroy (auxr);
+ return BSTR_OK;
+ }
+
+ /* expanding replacement since find->slen < repl->slen. It's a lot
+ more complicated. */
+
+ mlen = 32;
+ d = (int *) static_d; /* Avoid malloc for trivial cases */
+ acc = slen = 0;
+
+ while ((pos = instr (b, pos, auxf)) >= 0) {
+ if (slen + 1 >= mlen) {
+ int sl;
+ int * t;
+ mlen += mlen;
+ sl = sizeof (int *) * mlen;
+ if (static_d == d) d = NULL;
+ if (sl < mlen || NULL == (t = (int *) bstr__realloc (d, sl))) {
+ ret = BSTR_ERR;
+ goto done;
+ }
+ if (NULL == d) bstr__memcpy (t, static_d, sizeof (static_d));
+ d = t;
+ }
+ d[slen] = pos;
+ slen++;
+ acc -= delta;
+ pos += auxf->slen;
+ if (pos < 0 || acc < 0) {
+ ret = BSTR_ERR;
+ goto done;
+ }
+ }
+ d[slen] = b->slen;
+
+ if (BSTR_OK == (ret = balloc (b, b->slen + acc + 1))) {
+ b->slen += acc;
+ for (i = slen-1; i >= 0; i--) {
+ int s, l;
+ s = d[i] + auxf->slen;
+ l = d[i+1] - s;
+ if (l) {
+ bstr__memmove (b->data + s + acc, b->data + s, l);
+ }
+ if (auxr->slen) {
+ bstr__memmove (b->data + s + acc - auxr->slen,
+ auxr->data, auxr->slen);
+ }
+ acc += delta;
+ }
+ b->data[b->slen] = (unsigned char) '\0';
+ }
+
+ done:;
+ if (static_d == d) d = NULL;
+ bstr__free (d);
+ if (auxf != find) bdestroy (auxf);
+ if (auxr != repl) bdestroy (auxr);
+ return ret;
+}
+
+/* int bfindreplace (bstring b, const_bstring find, const_bstring repl,
+ * int pos)
+ *
+ * Replace all occurrences of a find string with a replace string after a
+ * given point in a bstring.
+ */
+int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos) {
+ return findreplaceengine (b, find, repl, pos, binstr);
+}
+
+/* int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl,
+ * int pos)
+ *
+ * Replace all occurrences of a find string, ignoring case, with a replace
+ * string after a given point in a bstring.
+ */
+int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, int pos) {
+ return findreplaceengine (b, find, repl, pos, binstrcaseless);
+}
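+
+/* Usage sketch for bfindreplace (illustrative only; assumes "bstrlib.h" is
+ * included and error handling is elided):
+ *
+ *     bstring b    = bfromcstr ("CPU 0, CPU 1, CPU 2");
+ *     bstring find = bfromcstr ("CPU ");
+ *     bstring repl = bfromcstr ("core");
+ *     bfindreplace (b, find, repl, 0);    b now holds "core0, core1, core2"
+ *     bdestroy (repl);
+ *     bdestroy (find);
+ *     bdestroy (b);
+ */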
+
+/* int binsertch (bstring b, int pos, int len, unsigned char fill)
+ *
+ * Inserts the character fill repeatedly into b at position pos for a
+ * length len. If the position pos is past the end of b, then the
+ * character "fill" is appended as necessary to make up the gap between the
+ * end of b and the position pos + len.
+ */
+int binsertch (bstring b, int pos, int len, unsigned char fill) {
+int d, l, i;
+
+ if (pos < 0 || b == NULL || b->slen < 0 || b->mlen < b->slen ||
+ b->mlen <= 0 || len < 0) return BSTR_ERR;
+
+ /* Compute the two possible end pointers */
+ d = b->slen + len;
+ l = pos + len;
+ if ((d|l) < 0) return BSTR_ERR;
+
+ if (l > d) {
+ /* Inserting past the end of the string */
+ if (balloc (b, l + 1) != BSTR_OK) return BSTR_ERR;
+ pos = b->slen;
+ b->slen = l;
+ } else {
+ /* Inserting in the middle of the string */
+ if (balloc (b, d + 1) != BSTR_OK) return BSTR_ERR;
+ for (i = d - 1; i >= l; i--) {
+ b->data[i] = b->data[i - len];
+ }
+ b->slen = d;
+ }
+
+ for (i=pos; i < l; i++) b->data[i] = fill;
+ b->data[b->slen] = (unsigned char) '\0';
+ return BSTR_OK;
+}
+
+/* int bpattern (bstring b, int len)
+ *
+ * Replicate the bstring, b in place, end to end repeatedly until it
+ * surpasses len characters, then chop the result to exactly len characters.
+ * This function operates in-place. The function will return with BSTR_ERR
+ * if b is NULL or of length 0, otherwise BSTR_OK is returned.
+ */
+int bpattern (bstring b, int len) {
+int i, d;
+
+ d = blength (b);
+ if (d <= 0 || len < 0 || balloc (b, len + 1) != BSTR_OK) return BSTR_ERR;
+ if (len > 0) {
+ if (d == 1) return bsetstr (b, len, NULL, b->data[0]);
+ for (i = d; i < len; i++) b->data[i] = b->data[i - d];
+ }
+ b->data[len] = (unsigned char) '\0';
+ b->slen = len;
+ return BSTR_OK;
+}
+
+#define BS_BUFF_SZ (1024)
+
+/* int breada (bstring b, bNread readPtr, void * parm)
+ *
+ * Use a finite buffer fread-like function readPtr to concatenate to the
+ * bstring b the entire contents of file-like source data in a roughly
+ * efficient way.
+ */
+int breada (bstring b, bNread readPtr, void * parm) {
+int i, l, n;
+
+ if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
+ b->mlen <= 0 || readPtr == NULL) return BSTR_ERR;
+
+ i = b->slen;
+ for (n=i+16; ; n += ((n < BS_BUFF_SZ) ? n : BS_BUFF_SZ)) {
+ if (BSTR_OK != balloc (b, n + 1)) return BSTR_ERR;
+ l = (int) readPtr ((void *) (b->data + i), 1, n - i, parm);
+ i += l;
+ b->slen = i;
+ if (i < n) break;
+ }
+
+ b->data[i] = (unsigned char) '\0';
+ return BSTR_OK;
+}
+
+/* bstring bread (bNread readPtr, void * parm)
+ *
+ * Use a finite buffer fread-like function readPtr to create a bstring
+ * filled with the entire contents of file-like source data in a roughly
+ * efficient way.
+ */
+bstring bread (bNread readPtr, void * parm) {
+bstring buff;
+
+ if (0 > breada (buff = bfromcstr (""), readPtr, parm)) {
+ bdestroy (buff);
+ return NULL;
+ }
+ return buff;
+}
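+
+/* Usage sketch for bread (illustrative only; assumes "bstrlib.h" and
+ * <stdio.h> are included; casting fread to bNread is one common way to
+ * feed a FILE * to the bstring read functions):
+ *
+ *     FILE * fp = fopen ("/proc/cpuinfo", "r");
+ *     if (fp != NULL) {
+ *         bstring contents = bread ((bNread) fread, fp);
+ *         fclose (fp);
+ *         bdestroy (contents);            contents held the whole file
+ *     }
+ */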
+
+/* int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator)
+ *
+ * Use an fgetc-like single character stream reading function (getcPtr) to
+ * obtain a sequence of characters which are concatenated to the end of the
+ * bstring b. The stream read is terminated by the passed in terminator
+ * parameter.
+ *
+ * If getcPtr returns with a negative number, or the terminator character
+ * (which is appended) is read, then the stream reading is halted and the
+ * function returns with a partial result in b. If there is an empty partial
+ * result, 1 is returned. If no characters are read, or there is some other
+ * detectable error, BSTR_ERR is returned.
+ */
+int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator) {
+int c, d, e;
+
+ if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
+ b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
+ d = 0;
+ e = b->mlen - 2;
+
+ while ((c = getcPtr (parm)) >= 0) {
+ if (d > e) {
+ b->slen = d;
+ if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
+ e = b->mlen - 2;
+ }
+ b->data[d] = (unsigned char) c;
+ d++;
+ if (c == terminator) break;
+ }
+
+ b->data[d] = (unsigned char) '\0';
+ b->slen = d;
+
+ return d == 0 && c < 0;
+}
+
+/* int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator)
+ *
+ * Use an fgetc-like single character stream reading function (getcPtr) to
+ * obtain a sequence of characters which are concatenated to the end of the
+ * bstring b. The stream read is terminated by the passed in terminator
+ * parameter.
+ *
+ * If getcPtr returns with a negative number, or the terminator character
+ * (which is appended) is read, then the stream reading is halted and the
+ * function returns with a partial result concatenated to b. If there is
+ * an empty partial result, 1 is returned. If no characters are read, or
+ * there is some other detectable error, BSTR_ERR is returned.
+ */
+int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator) {
+int c, d, e;
+
+ if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
+ b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
+ d = b->slen;
+ e = b->mlen - 2;
+
+ while ((c = getcPtr (parm)) >= 0) {
+ if (d > e) {
+ b->slen = d;
+ if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
+ e = b->mlen - 2;
+ }
+ b->data[d] = (unsigned char) c;
+ d++;
+ if (c == terminator) break;
+ }
+
+ b->data[d] = (unsigned char) '\0';
+ b->slen = d;
+
+ return d == 0 && c < 0;
+}
+
+/* bstring bgets (bNgetc getcPtr, void * parm, char terminator)
+ *
+ * Use an fgetc-like single character stream reading function (getcPtr) to
+ * obtain a sequence of characters which are concatenated into a bstring.
+ * The stream read is terminated by the passed in terminator character.
+ *
+ * If getcPtr returns with a negative number, or the terminator character
+ * (which is appended) is read, then the stream reading is halted and the
+ * result obtained thus far is returned. If no characters are read, or
+ * there is some other detectable error, NULL is returned.
+ */
+bstring bgets (bNgetc getcPtr, void * parm, char terminator) {
+bstring buff;
+
+ if (0 > bgetsa (buff = bfromcstr (""), getcPtr, parm, terminator) || 0 >= buff->slen) {
+ bdestroy (buff);
+ buff = NULL;
+ }
+ return buff;
+}
+
+struct bStream {
+ bstring buff; /* Buffer for over-reads */
+ void * parm; /* The stream handle for core stream */
+ bNread readFnPtr; /* fread compatible fnptr for core stream */
+ int isEOF; /* track file's EOF state */
+ int maxBuffSz;
+};
+
+/* struct bStream * bsopen (bNread readPtr, void * parm)
+ *
+ * Wrap a given open stream (described by a fread compatible function
+ * pointer and stream handle) into an open bStream suitable for the bstring
+ * library streaming functions.
+ */
+struct bStream * bsopen (bNread readPtr, void * parm) {
+struct bStream * s;
+
+ if (readPtr == NULL) return NULL;
+ s = (struct bStream *) bstr__alloc (sizeof (struct bStream));
+ if (s == NULL) return NULL;
+ s->parm = parm;
+ s->buff = bfromcstr ("");
+ s->readFnPtr = readPtr;
+ s->maxBuffSz = BS_BUFF_SZ;
+ s->isEOF = 0;
+ return s;
+}
+
+/* int bsbufflength (struct bStream * s, int sz)
+ *
+ * Set the length of the buffer used by the bStream. If sz is zero, the
+ * length is not set. This function returns with the previous length.
+ */
+int bsbufflength (struct bStream * s, int sz) {
+int oldSz;
+ if (s == NULL || sz < 0) return BSTR_ERR;
+ oldSz = s->maxBuffSz;
+ if (sz > 0) s->maxBuffSz = sz;
+ return oldSz;
+}
+
+int bseof (const struct bStream * s) {
+ if (s == NULL || s->readFnPtr == NULL) return BSTR_ERR;
+ return s->isEOF && (s->buff->slen == 0);
+}
+
+/* void * bsclose (struct bStream * s)
+ *
+ * Close the bStream, and return the handle to the stream that was originally
+ * used to open the given stream.
+ */
+void * bsclose (struct bStream * s) {
+void * parm;
+ if (s == NULL) return NULL;
+ s->readFnPtr = NULL;
+ if (s->buff) bdestroy (s->buff);
+ s->buff = NULL;
+ parm = s->parm;
+ s->parm = NULL;
+ s->isEOF = 1;
+ bstr__free (s);
+ return parm;
+}
+
+/* int bsreadlna (bstring r, struct bStream * s, char terminator)
+ *
+ * Read a bstring terminated by the terminator character or the end of the
+ * stream from the bStream (s) and return it into the parameter r. This
+ * function may read additional characters from the core stream that are not
+ * returned, but will be retained for subsequent read operations.
+ */
+int bsreadlna (bstring r, struct bStream * s, char terminator) {
+int i, l, ret, rlo;
+char * b;
+struct tagbstring x;
+
+ if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0 ||
+ r->slen < 0 || r->mlen < r->slen) return BSTR_ERR;
+ l = s->buff->slen;
+ if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+ b = (char *) s->buff->data;
+ x.data = (unsigned char *) b;
+
+ /* First check if the current buffer holds the terminator */
+ b[l] = terminator; /* Set sentinel */
+ for (i=0; b[i] != terminator; i++) ;
+ if (i < l) {
+ x.slen = i + 1;
+ ret = bconcat (r, &x);
+ s->buff->slen = l;
+ if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
+ return BSTR_OK;
+ }
+
+ rlo = r->slen;
+
+ /* If not then just concatenate the entire buffer to the output */
+ x.slen = l;
+ if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
+
+ /* Perform direct in-place reads into the destination to allow for
+ the minimum of data-copies */
+ for (;;) {
+ if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
+ b = (char *) (r->data + r->slen);
+ l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
+ if (l <= 0) {
+ r->data[r->slen] = (unsigned char) '\0';
+ s->buff->slen = 0;
+ s->isEOF = 1;
+ /* If nothing was read, return with an error */
+ return BSTR_ERR & -(r->slen == rlo);
+ }
+ b[l] = terminator; /* Set sentinel */
+ for (i=0; b[i] != terminator; i++) ;
+ if (i < l) break;
+ r->slen += l;
+ }
+
+ /* Terminator found, push over-read back to buffer */
+ i++;
+ r->slen += i;
+ s->buff->slen = l - i;
+ bstr__memcpy (s->buff->data, b + i, l - i);
+ r->data[r->slen] = (unsigned char) '\0';
+ return BSTR_OK;
+}
+
+/* int bsreadlnsa (bstring r, struct bStream * s, const_bstring term)
+ *
+ * Read a bstring terminated by any character in the term string or the end
+ * of the stream from the bStream (s) and return it into the parameter r.
+ * This function may read additional characters from the core stream that
+ * are not returned, but will be retained for subsequent read operations.
+ */
+int bsreadlnsa (bstring r, struct bStream * s, const_bstring term) {
+int i, l, ret, rlo;
+unsigned char * b;
+struct tagbstring x;
+struct charField cf;
+
+ if (s == NULL || s->buff == NULL || r == NULL || term == NULL ||
+ term->data == NULL || r->mlen <= 0 || r->slen < 0 ||
+ r->mlen < r->slen) return BSTR_ERR;
+ if (term->slen == 1) return bsreadlna (r, s, term->data[0]);
+ if (term->slen < 1 || buildCharField (&cf, term)) return BSTR_ERR;
+
+ l = s->buff->slen;
+ if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+ b = (unsigned char *) s->buff->data;
+ x.data = b;
+
+ /* First check if the current buffer holds the terminator */
+ b[l] = term->data[0]; /* Set sentinel */
+ for (i=0; !testInCharField (&cf, b[i]); i++) ;
+ if (i < l) {
+ x.slen = i + 1;
+ ret = bconcat (r, &x);
+ s->buff->slen = l;
+ if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
+ return BSTR_OK;
+ }
+
+ rlo = r->slen;
+
+ /* If not then just concatenate the entire buffer to the output */
+ x.slen = l;
+ if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
+
+ /* Perform direct in-place reads into the destination to allow for
+ the minimum of data-copies */
+ for (;;) {
+ if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
+ b = (unsigned char *) (r->data + r->slen);
+ l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
+ if (l <= 0) {
+ r->data[r->slen] = (unsigned char) '\0';
+ s->buff->slen = 0;
+ s->isEOF = 1;
+ /* If nothing was read, return with an error */
+ return BSTR_ERR & -(r->slen == rlo);
+ }
+
+ b[l] = term->data[0]; /* Set sentinel */
+ for (i=0; !testInCharField (&cf, b[i]); i++) ;
+ if (i < l) break;
+ r->slen += l;
+ }
+
+ /* Terminator found, push over-read back to buffer */
+ i++;
+ r->slen += i;
+ s->buff->slen = l - i;
+ bstr__memcpy (s->buff->data, b + i, l - i);
+ r->data[r->slen] = (unsigned char) '\0';
+ return BSTR_OK;
+}
+
+/* int bsreada (bstring r, struct bStream * s, int n)
+ *
+ * Read a bstring of length n (or, if it is fewer, as many bytes as are
+ * remaining) from the bStream. This function may read additional
+ * characters from the core stream that are not returned, but will be
+ * retained for subsequent read operations. This function will not read
+ * additional characters from the core stream beyond the virtual stream pointer.
+ */
+int bsreada (bstring r, struct bStream * s, int n) {
+int l, ret, orslen;
+char * b;
+struct tagbstring x;
+
+ if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
+ || r->slen < 0 || r->mlen < r->slen || n <= 0) return BSTR_ERR;
+
+ n += r->slen;
+ if (n <= 0) return BSTR_ERR;
+
+ l = s->buff->slen;
+
+ orslen = r->slen;
+
+ if (0 == l) {
+ if (s->isEOF) return BSTR_ERR;
+ if (r->mlen > n) {
+ l = (int) s->readFnPtr (r->data + r->slen, 1, n - r->slen, s->parm);
+ if (0 >= l || l > n - r->slen) {
+ s->isEOF = 1;
+ return BSTR_ERR;
+ }
+ r->slen += l;
+ r->data[r->slen] = (unsigned char) '\0';
+ return 0;
+ }
+ }
+
+ if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+ b = (char *) s->buff->data;
+ x.data = (unsigned char *) b;
+
+ do {
+ if (l + r->slen >= n) {
+ x.slen = n - r->slen;
+ ret = bconcat (r, &x);
+ s->buff->slen = l;
+ if (BSTR_OK == ret) bdelete (s->buff, 0, x.slen);
+ return BSTR_ERR & -(r->slen == orslen);
+ }
+
+ x.slen = l;
+ if (BSTR_OK != bconcat (r, &x)) break;
+
+ l = n - r->slen;
+ if (l > s->maxBuffSz) l = s->maxBuffSz;
+
+ l = (int) s->readFnPtr (b, 1, l, s->parm);
+
+ } while (l > 0);
+ if (l < 0) l = 0;
+ if (l == 0) s->isEOF = 1;
+ s->buff->slen = l;
+ return BSTR_ERR & -(r->slen == orslen);
+}
+
+/* int bsreadln (bstring r, struct bStream * s, char terminator)
+ *
+ * Read a bstring terminated by the terminator character or the end of the
+ * stream from the bStream (s) and return it into the parameter r. This
+ * function may read additional characters from the core stream that are not
+ * returned, but will be retained for subsequent read operations.
+ */
+int bsreadln (bstring r, struct bStream * s, char terminator) {
+ if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0)
+ return BSTR_ERR;
+ if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+ r->slen = 0;
+ return bsreadlna (r, s, terminator);
+}
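+
+/* Usage sketch for line-by-line bStream reading (illustrative only;
+ * assumes "bstrlib.h" and <stdio.h> are included; "topology.txt" is a
+ * hypothetical input file and error handling is elided):
+ *
+ *     FILE * fp = fopen ("topology.txt", "r");
+ *     struct bStream * s = bsopen ((bNread) fread, fp);
+ *     bstring line = bfromcstr ("");
+ *     while (BSTR_OK == bsreadln (line, s, '\n')) {
+ *         process line here; a trailing '\n', if found, is retained
+ *     }
+ *     bdestroy (line);
+ *     fclose ((FILE *) bsclose (s));
+ */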
+
+/* int bsreadlns (bstring r, struct bStream * s, const_bstring term)
+ *
+ * Read a bstring terminated by any character in the term string or the end
+ * of the stream from the bStream (s) and return it into the parameter r.
+ * This function may read additional characters from the core stream that
+ * are not returned, but will be retained for subsequent read operations.
+ */
+int bsreadlns (bstring r, struct bStream * s, const_bstring term) {
+ if (s == NULL || s->buff == NULL || r == NULL || term == NULL
+ || term->data == NULL || r->mlen <= 0) return BSTR_ERR;
+ if (term->slen == 1) return bsreadln (r, s, term->data[0]);
+ if (term->slen < 1) return BSTR_ERR;
+ if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+ r->slen = 0;
+ return bsreadlnsa (r, s, term);
+}
+
+/* int bsread (bstring r, struct bStream * s, int n)
+ *
+ * Read a bstring of length n (or, if it is fewer, as many bytes as are
+ * remaining) from the bStream. This function may read additional
+ * characters from the core stream that are not returned, but will be
+ * retained for subsequent read operations. This function will not read
+ * additional characters from the core stream beyond the virtual stream pointer.
+ */
+int bsread (bstring r, struct bStream * s, int n) {
+ if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
+ || n <= 0) return BSTR_ERR;
+ if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+ r->slen = 0;
+ return bsreada (r, s, n);
+}
+
+/* int bsunread (struct bStream * s, const_bstring b)
+ *
+ * Insert a bstring into the bStream at the current position. These
+ * characters will be read prior to those that actually come from the core
+ * stream.
+ */
+int bsunread (struct bStream * s, const_bstring b) {
+ if (s == NULL || s->buff == NULL) return BSTR_ERR;
+ return binsert (s->buff, 0, b, (unsigned char) '?');
+}
+
+/* int bspeek (bstring r, const struct bStream * s)
+ *
+ * Return the currently buffered characters from the bStream that will be
+ * read prior to reads from the core stream.
+ */
+int bspeek (bstring r, const struct bStream * s) {
+ if (s == NULL || s->buff == NULL) return BSTR_ERR;
+ return bassign (r, s->buff);
+}
+
+/* bstring bjoin (const struct bstrList * bl, const_bstring sep);
+ *
+ * Join the entries of a bstrList into one bstring by sequentially
+ * concatenating them with the sep string in between. If there is an error
+ * NULL is returned, otherwise a bstring with the correct result is returned.
+ */
+bstring bjoin (const struct bstrList * bl, const_bstring sep) {
+bstring b;
+int i, c, v;
+
+ if (bl == NULL || bl->qty < 0) return NULL;
+ if (sep != NULL && (sep->slen < 0 || sep->data == NULL)) return NULL;
+
+ for (i = 0, c = 1; i < bl->qty; i++) {
+ v = bl->entry[i]->slen;
+ if (v < 0) return NULL; /* Invalid input */
+ c += v;
+ if (c < 0) return NULL; /* Wrap around ?? */
+ }
+
+ if (sep != NULL) c += (bl->qty - 1) * sep->slen;
+
+ b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+ if (NULL == b) return NULL; /* Out of memory */
+ b->data = (unsigned char *) bstr__alloc (c);
+ if (b->data == NULL) {
+ bstr__free (b);
+ return NULL;
+ }
+
+ b->mlen = c;
+ b->slen = c-1;
+
+ for (i = 0, c = 0; i < bl->qty; i++) {
+ if (i > 0 && sep != NULL) {
+ bstr__memcpy (b->data + c, sep->data, sep->slen);
+ c += sep->slen;
+ }
+ v = bl->entry[i]->slen;
+ bstr__memcpy (b->data + c, bl->entry[i]->data, v);
+ c += v;
+ }
+ b->data[c] = (unsigned char) '\0';
+ return b;
+}
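+
+/* Usage sketch for bjoin (illustrative only; assumes "bstrlib.h" is
+ * included and error handling is elided; bsplit is defined further below):
+ *
+ *     bstring csv    = bfromcstr ("0,1,2,3");
+ *     struct bstrList * fields = bsplit (csv, ',');
+ *     bstring sep    = bfromcstr (" ");
+ *     bstring joined = bjoin (fields, sep);    joined holds "0 1 2 3"
+ *     bdestroy (joined);
+ *     bdestroy (sep);
+ *     bstrListDestroy (fields);
+ *     bdestroy (csv);
+ */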
+
+#define BSSSC_BUFF_LEN (256)
+
+/* int bssplitscb (struct bStream * s, const_bstring splitStr,
+ * int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
+ *
+ * Iterate the set of disjoint sequential substrings read from a stream
+ * divided by any of the characters in splitStr. An empty splitStr causes
+ * the whole stream to be iterated once.
+ *
+ * Note: At the point of calling the cb function, the bStream pointer is
+ * pointed exactly at the position right after having read the split
+ * character. The cb function can act on the stream by causing the bStream
+ * pointer to move, and bssplitscb will continue by starting the next split
+ * at the position of the pointer after the return from cb.
+ *
+ * However, if the cb causes the bStream s to be destroyed then the cb must
+ * return with a negative value, otherwise bssplitscb will continue in an
+ * undefined manner.
+ */
+int bssplitscb (struct bStream * s, const_bstring splitStr,
+ int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
+struct charField chrs;
+bstring buff;
+int i, p, ret;
+
+ if (cb == NULL || s == NULL || s->readFnPtr == NULL
+ || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+
+ if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
+
+ if (splitStr->slen == 0) {
+ while (bsreada (buff, s, BSSSC_BUFF_LEN) >= 0) ;
+ if ((ret = cb (parm, 0, buff)) > 0)
+ ret = 0;
+ } else {
+ buildCharField (&chrs, splitStr);
+ ret = p = i = 0;
+ for (;;) {
+ if (i >= buff->slen) {
+ bsreada (buff, s, BSSSC_BUFF_LEN);
+ if (i >= buff->slen) {
+ if (0 < (ret = cb (parm, p, buff))) ret = 0;
+ break;
+ }
+ }
+ if (testInCharField (&chrs, buff->data[i])) {
+ struct tagbstring t;
+ unsigned char c;
+
+ blk2tbstr (t, buff->data + i + 1, buff->slen - (i + 1));
+ if ((ret = bsunread (s, &t)) < 0) break;
+ buff->slen = i;
+ c = buff->data[i];
+ buff->data[i] = (unsigned char) '\0';
+ if ((ret = cb (parm, p, buff)) < 0) break;
+ buff->data[i] = c;
+ buff->slen = 0;
+ p += i + 1;
+ i = -1;
+ }
+ i++;
+ }
+ }
+
+ bdestroy (buff);
+ return ret;
+}
+
+/* int bssplitstrcb (struct bStream * s, const_bstring splitStr,
+ * int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
+ *
+ * Iterate the set of disjoint sequential substrings read from a stream
+ * divided by the entire substring splitStr. An empty splitStr causes
+ * each character of the stream to be iterated.
+ *
+ * Note: At the point of calling the cb function, the bStream pointer is
+ * pointed exactly at the position right after having read the split
+ * character. The cb function can act on the stream by causing the bStream
+ * pointer to move, and bssplitstrcb will continue by starting the next split
+ * at the position of the pointer after the return from cb.
+ *
+ * However, if the cb causes the bStream s to be destroyed then the cb must
+ * return with a negative value, otherwise bssplitstrcb will continue in an
+ * undefined manner.
+ */
+int bssplitstrcb (struct bStream * s, const_bstring splitStr,
+ int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
+bstring buff;
+int i, p, ret;
+
+ if (cb == NULL || s == NULL || s->readFnPtr == NULL
+ || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+
+ if (splitStr->slen == 1) return bssplitscb (s, splitStr, cb, parm);
+
+ if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
+
+ if (splitStr->slen == 0) {
+ for (i=0; bsreada (buff, s, BSSSC_BUFF_LEN) >= 0; i++) {
+ if ((ret = cb (parm, 0, buff)) < 0) {
+ bdestroy (buff);
+ return ret;
+ }
+ buff->slen = 0;
+ }
+ return BSTR_OK;
+ } else {
+ ret = p = i = 0;
+ for (i=p=0;;) {
+ if ((ret = binstr (buff, 0, splitStr)) >= 0) {
+ struct tagbstring t;
+ blk2tbstr (t, buff->data, ret);
+ i = ret + splitStr->slen;
+ if ((ret = cb (parm, p, &t)) < 0) break;
+ p += i;
+ bdelete (buff, 0, i);
+ } else {
+ bsreada (buff, s, BSSSC_BUFF_LEN);
+ if (bseof (s)) {
+ if ((ret = cb (parm, p, buff)) > 0) ret = 0;
+ break;
+ }
+ }
+ }
+ }
+
+ bdestroy (buff);
+ return ret;
+}
+
+/* struct bstrList * bstrListCreate (void)
+ *
+ * Create a bstrList.
+ */
+struct bstrList * bstrListCreate (void) {
+struct bstrList * sl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+ if (sl) {
+ sl->entry = (bstring *) bstr__alloc (1*sizeof (bstring));
+ if (!sl->entry) {
+ bstr__free (sl);
+ sl = NULL;
+ } else {
+ sl->qty = 0;
+ sl->mlen = 1;
+ }
+ }
+ return sl;
+}
+
+/* int bstrListDestroy (struct bstrList * sl)
+ *
+ * Destroy a bstrList that has been created by bsplit, bsplits or bstrListCreate.
+ */
+int bstrListDestroy (struct bstrList * sl) {
+int i;
+ if (sl == NULL || sl->qty < 0) return BSTR_ERR;
+ for (i=0; i < sl->qty; i++) {
+ if (sl->entry[i]) {
+ bdestroy (sl->entry[i]);
+ sl->entry[i] = NULL;
+ }
+ }
+ sl->qty = -1;
+ sl->mlen = -1;
+ bstr__free (sl->entry);
+ sl->entry = NULL;
+ bstr__free (sl);
+ return BSTR_OK;
+}
+
+/* int bstrListAlloc (struct bstrList * sl, int msz)
+ *
+ * Ensure that there is memory for at least msz entries in the
+ * list.
+ */
+int bstrListAlloc (struct bstrList * sl, int msz) {
+bstring * l;
+int smsz;
+size_t nsz;
+ if (!sl || msz <= 0 || !sl->entry || sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
+ if (sl->mlen >= msz) return BSTR_OK;
+ smsz = snapUpSize (msz);
+ nsz = ((size_t) smsz) * sizeof (bstring);
+ if (nsz < (size_t) smsz) return BSTR_ERR;
+ l = (bstring *) bstr__realloc (sl->entry, nsz);
+ if (!l) {
+ smsz = msz;
+ nsz = ((size_t) smsz) * sizeof (bstring);
+ l = (bstring *) bstr__realloc (sl->entry, nsz);
+ if (!l) return BSTR_ERR;
+ }
+ sl->mlen = smsz;
+ sl->entry = l;
+ return BSTR_OK;
+}
+
+/* int bstrListAllocMin (struct bstrList * sl, int msz)
+ *
+ * Try to allocate the minimum amount of memory for the list to include at
+ * least msz entries or sl->qty, whichever is greater.
+ */
+int bstrListAllocMin (struct bstrList * sl, int msz) {
+bstring * l;
+size_t nsz;
+ if (!sl || msz <= 0 || !sl->entry || sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
+ if (msz < sl->qty) msz = sl->qty;
+ if (sl->mlen == msz) return BSTR_OK;
+ nsz = ((size_t) msz) * sizeof (bstring);
+ if (nsz < (size_t) msz) return BSTR_ERR;
+ l = (bstring *) bstr__realloc (sl->entry, nsz);
+ if (!l) return BSTR_ERR;
+ sl->mlen = msz;
+ sl->entry = l;
+ return BSTR_OK;
+}
+
+/* int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
+ * int (* cb) (void * parm, int ofs, int len), void * parm)
+ *
+ * Iterate the set of disjoint sequential substrings over str divided by the
+ * character in splitChar.
+ *
+ * Note: Non-destructive modification of str from within the cb function
+ * while performing this split is well-defined. bsplitcb behaves in
+ * sequential lock step with calls to cb. I.e., after returning from a cb
+ * that returns a non-negative integer, bsplitcb continues from the position
+ * 1 character after the last detected split character and it will halt
+ * immediately if the length of str falls below this point. However, if the
+ * cb function destroys str, then it *must* return with a negative value,
+ * otherwise bsplitcb will continue in an undefined manner.
+ */
+int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
+ int (* cb) (void * parm, int ofs, int len), void * parm) {
+int i, p, ret;
+
+ if (cb == NULL || str == NULL || pos < 0 || pos > str->slen)
+ return BSTR_ERR;
+
+ p = pos;
+ do {
+ for (i=p; i < str->slen; i++) {
+ if (str->data[i] == splitChar) break;
+ }
+ if ((ret = cb (parm, p, i - p)) < 0) return ret;
+ p = i + 1;
+ } while (p <= str->slen);
+ return BSTR_OK;
+}
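+
+/* Usage sketch for bsplitcb (illustrative only; assumes "bstrlib.h" is
+ * included; the callback name count_fields is made up for this example):
+ *
+ *     static int count_fields (void * parm, int ofs, int len) {
+ *         (void) ofs; (void) len;
+ *         (*(int *) parm)++;
+ *         return 0;
+ *     }
+ *
+ *     int n = 0;
+ *     bstring line = bfromcstr ("S0,S1,,S2");
+ *     bsplitcb (line, ',', 0, count_fields, &n);    n == 4 afterwards
+ *     bdestroy (line);
+ */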
+
+/* int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
+ * int (* cb) (void * parm, int ofs, int len), void * parm)
+ *
+ * Iterate the set of disjoint sequential substrings over str divided by any
+ * of the characters in splitStr. An empty splitStr causes the whole str to
+ * be iterated once.
+ *
+ * Note: Non-destructive modification of str from within the cb function
+ * while performing this split is well-defined. bsplitscb behaves in
+ * sequential lock step with calls to cb. I.e., after returning from a cb
+ * that returns a non-negative integer, bsplitscb continues from the position
+ * 1 character after the last detected split character and it will halt
+ * immediately if the length of str falls below this point. However, if the
+ * cb function destroys str, then it *must* return with a negative value,
+ * otherwise bsplitscb will continue in an undefined manner.
+ */
+int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
+ int (* cb) (void * parm, int ofs, int len), void * parm) {
+struct charField chrs;
+int i, p, ret;
+
+ if (cb == NULL || str == NULL || pos < 0 || pos > str->slen
+ || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+ if (splitStr->slen == 0) {
+ if ((ret = cb (parm, 0, str->slen)) > 0) ret = 0;
+ return ret;
+ }
+
+ if (splitStr->slen == 1)
+ return bsplitcb (str, splitStr->data[0], pos, cb, parm);
+
+ buildCharField (&chrs, splitStr);
+
+ p = pos;
+ do {
+ for (i=p; i < str->slen; i++) {
+ if (testInCharField (&chrs, str->data[i])) break;
+ }
+ if ((ret = cb (parm, p, i - p)) < 0) return ret;
+ p = i + 1;
+ } while (p <= str->slen);
+ return BSTR_OK;
+}
+
+/* int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
+ * int (* cb) (void * parm, int ofs, int len), void * parm)
+ *
+ * Iterate the set of disjoint sequential substrings over str divided by the
+ * substring splitStr. An empty splitStr causes each character of str to be
+ * iterated individually.
+ *
+ * Note: Non-destructive modification of str from within the cb function
+ * while performing this split is well defined. bsplitstrcb behaves in
+ * sequential lock step with calls to cb. I.e., after returning from a cb
+ * that returns a non-negative integer, bsplitstrcb continues from the
+ * position just past the last detected split substring and it will halt
+ * immediately if the length of str falls below this point. However, if the
+ * cb function destroys str, then it *must* return with a negative value,
+ * otherwise bsplitstrcb will continue in an undefined manner.
+ */
+int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
+ int (* cb) (void * parm, int ofs, int len), void * parm) {
+int i, p, ret;
+
+ if (cb == NULL || str == NULL || pos < 0 || pos > str->slen
+ || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+
+ if (0 == splitStr->slen) {
+ for (i=pos; i < str->slen; i++) {
+ if ((ret = cb (parm, i, 1)) < 0) return ret;
+ }
+ return BSTR_OK;
+ }
+
+ if (splitStr->slen == 1)
+ return bsplitcb (str, splitStr->data[0], pos, cb, parm);
+
+ for (i=p=pos; i <= str->slen - splitStr->slen; i++) {
+ if (0 == bstr__memcmp (splitStr->data, str->data + i, splitStr->slen)) {
+ if ((ret = cb (parm, p, i - p)) < 0) return ret;
+ i += splitStr->slen;
+ p = i;
+ }
+ }
+ if ((ret = cb (parm, p, str->slen - p)) < 0) return ret;
+ return BSTR_OK;
+}
+
+struct genBstrList {
+ bstring b;
+ struct bstrList * bl;
+};
+
+static int bscb (void * parm, int ofs, int len) {
+struct genBstrList * g = (struct genBstrList *) parm;
+ if (g->bl->qty >= g->bl->mlen) {
+ int mlen = g->bl->mlen * 2;
+ bstring * tbl;
+
+ while (g->bl->qty >= mlen) {
+ if (mlen < g->bl->mlen) return BSTR_ERR;
+ mlen += mlen;
+ }
+
+ tbl = (bstring *) bstr__realloc (g->bl->entry, sizeof (bstring) * mlen);
+ if (tbl == NULL) return BSTR_ERR;
+
+ g->bl->entry = tbl;
+ g->bl->mlen = mlen;
+ }
+
+ g->bl->entry[g->bl->qty] = bmidstr (g->b, ofs, len);
+ g->bl->qty++;
+ return BSTR_OK;
+}
+
+/* struct bstrList * bsplit (const_bstring str, unsigned char splitChar)
+ *
+ * Create an array of sequential substrings from str divided by the character
+ * splitChar.
+ */
+struct bstrList * bsplit (const_bstring str, unsigned char splitChar) {
+struct genBstrList g;
+
+ if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
+
+ g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+ if (g.bl == NULL) return NULL;
+ g.bl->mlen = 4;
+ g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
+ if (NULL == g.bl->entry) {
+ bstr__free (g.bl);
+ return NULL;
+ }
+
+ g.b = (bstring) str;
+ g.bl->qty = 0;
+ if (bsplitcb (str, splitChar, 0, bscb, &g) < 0) {
+ bstrListDestroy (g.bl);
+ return NULL;
+ }
+ return g.bl;
+}
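[Editorial aside, not part of the patch: a minimal round trip for bsplit, assuming <stdio.h> and the bstrlib header are included. The returned bstrList owns its entries and is released with bstrListDestroy.]

    /* Sketch: split a command line on spaces and print each token. */
    void printTokens (const char * line) {
        bstring b = bfromcstr (line);
        struct bstrList * parts = bsplit (b, ' ');
        if (parts != NULL) {
            for (int i = 0; i < parts->qty; i++)
                printf ("token %d: %s\n", i, bdata (parts->entry[i]));
            bstrListDestroy (parts);      /* frees all entries and the list */
        }
        bdestroy (b);
    }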
+
+/* struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr)
+ *
+ * Create an array of sequential substrings from str divided by the entire
+ * substring splitStr.
+ */
+struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr) {
+struct genBstrList g;
+
+ if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
+
+ g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+ if (g.bl == NULL) return NULL;
+ g.bl->mlen = 4;
+ g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
+ if (NULL == g.bl->entry) {
+ bstr__free (g.bl);
+ return NULL;
+ }
+
+ g.b = (bstring) str;
+ g.bl->qty = 0;
+ if (bsplitstrcb (str, splitStr, 0, bscb, &g) < 0) {
+ bstrListDestroy (g.bl);
+ return NULL;
+ }
+ return g.bl;
+}
+
+/* struct bstrList * bsplits (const_bstring str, bstring splitStr)
+ *
+ * Create an array of sequential substrings from str divided by any of the
+ * characters in splitStr. An empty splitStr causes a single entry bstrList
+ * containing a copy of str to be returned.
+ */
+struct bstrList * bsplits (const_bstring str, const_bstring splitStr) {
+struct genBstrList g;
+
+ if ( str == NULL || str->slen < 0 || str->data == NULL ||
+ splitStr == NULL || splitStr->slen < 0 || splitStr->data == NULL)
+ return NULL;
+
+ g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+ if (g.bl == NULL) return NULL;
+ g.bl->mlen = 4;
+ g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
+ if (NULL == g.bl->entry) {
+ bstr__free (g.bl);
+ return NULL;
+ }
+ g.b = (bstring) str;
+ g.bl->qty = 0;
+
+ if (bsplitscb (str, splitStr, 0, bscb, &g) < 0) {
+ bstrListDestroy (g.bl);
+ return NULL;
+ }
+ return g.bl;
+}
+
+#if defined (__TURBOC__) && !defined (__BORLANDC__)
+# ifndef BSTRLIB_NOVSNP
+# define BSTRLIB_NOVSNP
+# endif
+#endif
+
+/* Give WATCOM C/C++, MSVC some latitude for their non-support of vsnprintf */
+#if defined(__WATCOMC__) || defined(_MSC_VER)
+#define exvsnprintf(r,b,n,f,a) {r = _vsnprintf (b,n,f,a);}
+#else
+#ifdef BSTRLIB_NOVSNP
+/* This is just a hack. If you are using a system without a vsnprintf, it is
+ not recommended that bformat be used at all. */
+#define exvsnprintf(r,b,n,f,a) {vsprintf (b,f,a); r = -1;}
+#define START_VSNBUFF (256)
+#else
+
+#ifdef __GNUC__
+/* Something is making gcc complain about this prototype not being here, so
+ I've just gone ahead and put it in. */
+extern int vsnprintf (char *buf, size_t count, const char *format, va_list arg);
+#endif
+
+#define exvsnprintf(r,b,n,f,a) {r = vsnprintf (b,n,f,a);}
+#endif
+#endif
+
+#if !defined (BSTRLIB_NOVSNP)
+
+#ifndef START_VSNBUFF
+#define START_VSNBUFF (16)
+#endif
+
+/* On IRIX vsnprintf returns n-1 when the operation would overflow the target
+ buffer, WATCOM and MSVC both return -1, while C99 requires that the
+ returned value be exactly what the length would be if the buffer would be
+ large enough. This leads to the idea that if the return value is larger
+ than n, then changing n to the return value will reduce the number of
+ iterations required. */
+
+/* int bformata (bstring b, const char * fmt, ...)
+ *
+ * After the first parameter, it takes the same parameters as printf (), but
+ * rather than outputting results to stdio, it appends the results to
+ * a bstring which contains what would have been output. Note that if there
+ * is an early generation of a '\0' character, the bstring will be truncated
+ * to this end point.
+ */
+int bformata (bstring b, const char * fmt, ...) {
+va_list arglist;
+bstring buff;
+int n, r;
+
+ if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0
+ || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+
+ /* Since the length is not determinable beforehand, a search is
+ performed using the truncating "vsnprintf" call (to avoid buffer
+ overflows) on increasing potential sizes for the output result. */
+
+ if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+ n = 1;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
+ }
+
+ for (;;) {
+ va_start (arglist, fmt);
+ exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+ va_end (arglist);
+
+ buff->data[n] = (unsigned char) '\0';
+ buff->slen = (int) (strlen) ((char *) buff->data);
+
+ if (buff->slen < n) break;
+
+ if (r > n) n = r; else n += n;
+
+ if (BSTR_OK != balloc (buff, n + 2)) {
+ bdestroy (buff);
+ return BSTR_ERR;
+ }
+ }
+
+ r = bconcat (b, buff);
+ bdestroy (buff);
+ return r;
+}
+
+/* int bassignformat (bstring b, const char * fmt, ...)
+ *
+ * After the first parameter, it takes the same parameters as printf (), but
+ * rather than outputting results to stdio, it outputs the results to
+ * the bstring parameter b. Note that if there is an early generation of a
+ * '\0' character, the bstring will be truncated to this end point.
+ */
+int bassignformat (bstring b, const char * fmt, ...) {
+va_list arglist;
+bstring buff;
+int n, r;
+
+ if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0
+ || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+
+ /* Since the length is not determinable beforehand, a search is
+ performed using the truncating "vsnprintf" call (to avoid buffer
+ overflows) on increasing potential sizes for the output result. */
+
+ if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+ n = 1;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
+ }
+
+ for (;;) {
+ va_start (arglist, fmt);
+ exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+ va_end (arglist);
+
+ buff->data[n] = (unsigned char) '\0';
+ buff->slen = (int) (strlen) ((char *) buff->data);
+
+ if (buff->slen < n) break;
+
+ if (r > n) n = r; else n += n;
+
+ if (BSTR_OK != balloc (buff, n + 2)) {
+ bdestroy (buff);
+ return BSTR_ERR;
+ }
+ }
+
+ r = bassign (b, buff);
+ bdestroy (buff);
+ return r;
+}
+
+/* bstring bformat (const char * fmt, ...)
+ *
+ * Takes the same parameters as printf (), but rather than outputting results
+ * to stdio, it forms a bstring which contains what would have been output.
+ * Note that if there is an early generation of a '\0' character, the
+ * bstring will be truncated to this end point.
+ */
+bstring bformat (const char * fmt, ...) {
+va_list arglist;
+bstring buff;
+int n, r;
+
+ if (fmt == NULL) return NULL;
+
+ /* Since the length is not determinable beforehand, a search is
+ performed using the truncating "vsnprintf" call (to avoid buffer
+ overflows) on increasing potential sizes for the output result. */
+
+ if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+ n = 1;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) return NULL;
+ }
+
+ for (;;) {
+ va_start (arglist, fmt);
+ exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+ va_end (arglist);
+
+ buff->data[n] = (unsigned char) '\0';
+ buff->slen = (int) (strlen) ((char *) buff->data);
+
+ if (buff->slen < n) break;
+
+ if (r > n) n = r; else n += n;
+
+ if (BSTR_OK != balloc (buff, n + 2)) {
+ bdestroy (buff);
+ return NULL;
+ }
+ }
+
+ return buff;
+}
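[Editorial aside, not part of the patch: a sketch showing how the three formatting routines above relate. bformat allocates a fresh bstring, bformata appends to an existing one, and bassignformat overwrites its contents.]

    /* Sketch: build "copy: 4 threads", then append ", 2 streams". */
    void formatExample (void) {
        bstring msg = bformat ("%s: %d threads", "copy", 4);  /* new bstring */
        if (msg == NULL) return;
        bformata (msg, ", %d streams", 2);                    /* append in place */
        /* bassignformat (msg, "%d", 0); would replace the contents instead */
        bdestroy (msg);
    }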
+
+/* int bvcformata (bstring b, int count, const char * fmt, va_list arglist)
+ *
+ * The bvcformata function formats data under control of the format control
+ * string fmt and attempts to append the result to b. The fmt parameter is
+ * the same as that of the printf function. The variable argument list is
+ * replaced with arglist, which has been initialized by the va_start macro.
+ * The size of the output is upper bounded by count. If the required output
+ * exceeds count, the string b is not augmented with any contents and a value
+ * below BSTR_ERR is returned. If a value below -count is returned then it
+ * is recommended that the negative of this value be used as an update to the
+ * count in a subsequent pass. On other errors, such as running out of
+ * memory, parameter errors or numeric wrap around BSTR_ERR is returned.
+ * BSTR_OK is returned when the output is successfully generated and
+ * appended to b.
+ *
+ * Note: There is no sanity checking of arglist, and this function is
+ * destructive of the contents of b from the b->slen point onward. If there
+ * is an early generation of a '\0' character, the bstring will be truncated
+ * to this end point.
+ */
+int bvcformata (bstring b, int count, const char * fmt, va_list arg) {
+int n, r, l;
+
+ if (b == NULL || fmt == NULL || count <= 0 || b->data == NULL
+ || b->mlen <= 0 || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+
+ if (count > (n = b->slen + count) + 2) return BSTR_ERR;
+ if (BSTR_OK != balloc (b, n + 2)) return BSTR_ERR;
+
+ exvsnprintf (r, (char *) b->data + b->slen, count + 2, fmt, arg);
+
+ /* Did the operation complete successfully within bounds? */
+
+ if (n >= (l = b->slen + (int) (strlen) ((const char *) b->data + b->slen))) {
+ b->slen = l;
+ return BSTR_OK;
+ }
+
+ /* Abort, since the buffer was not large enough. The return value
+ tries to help set what the retry length should be. */
+
+ b->data[b->slen] = '\0';
+ if (r > count+1) l = r; else {
+ l = count+count;
+ if (count > l) l = INT_MAX;
+ }
+ n = -l;
+ if (n > BSTR_ERR-1) n = BSTR_ERR-1;
+ return n;
+}
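[Editorial aside, not part of the patch: a sketch of the negative-return retry protocol described above; the wrapper name appendFormatted and the initial count of 32 are arbitrary choices.]

    #include <stdarg.h>

    /* Sketch: append formatted output to b, growing the count hint on demand. */
    int appendFormatted (bstring b, const char * fmt, ...) {
        va_list args;
        int ret, count = 32;                      /* arbitrary initial guess */
        for (;;) {
            va_start (args, fmt);
            ret = bvcformata (b, count, fmt, args);
            va_end (args);
            if (ret >= 0 || ret == BSTR_ERR) return ret;  /* success or hard error */
            count = -ret;                         /* suggested retry length */
        }
    }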
+
+#endif
diff --git a/bench/src/strUtil.c b/bench/src/strUtil.c
new file mode 100644
index 0000000..8a4c429
--- /dev/null
+++ b/bench/src/strUtil.c
@@ -0,0 +1,319 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: strUtil.c
+ *
+ * Description: Utility string routines building upon bstrlib
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com.
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#include <strUtil.h>
+#include <math.h>
+#include <likwid.h>
+
+static int str2int(const char* str)
+{
+ char* endptr;
+ errno = 0;
+ unsigned long val;
+ val = strtoul(str, &endptr, 10);
+
+ if ((errno == ERANGE && val == ULONG_MAX)
+ || (errno != 0 && val == 0))
+ {
+ fprintf(stderr, "Value in string %s out of range\n", str);
+ return -EINVAL;
+ }
+
+ if (endptr == str)
+ {
+ fprintf(stderr, "No digits were found in %s\n", str);
+ return -EINVAL;
+ }
+
+ return (int) val;
+}
+
+uint64_t bstr_to_doubleSize(const_bstring str, DataType type)
+{
+ int ret;
+ bstring unit = bmidstr(str, blength(str)-2, 2);
+ bstring sizeStr = bmidstr(str, 0, blength(str)-2);
+ uint64_t sizeU = 0;
+ uint64_t junk = 0;
+ uint64_t bytesize = 0;
+ if (blength(sizeStr) == 0)
+ {
+ return 0;
+ }
+ ret = str2int(bdata(sizeStr));
+ if (ret >= 0)
+ {
+ sizeU = str2int(bdata(sizeStr));
+ }
+ else
+ {
+ return 0;
+ }
+
+ switch (type)
+ {
+ case SINGLE:
+ bytesize = sizeof(float);
+ break;
+
+ case DOUBLE:
+ bytesize = sizeof(double);
+ break;
+
+ case INT:
+ bytesize = sizeof(int);
+ break;
+ }
+
+ if ((biseqcstr(unit, "kB"))||(biseqcstr(unit, "KB")))
+ {
+ junk = (sizeU *1000)/bytesize;
+ }
+ else if (biseqcstr(unit, "MB"))
+ {
+ junk = (sizeU *1000000)/bytesize;
+ }
+ else if (biseqcstr(unit, "GB"))
+ {
+ junk = (sizeU *1000000000)/bytesize;
+ }
+ else if (biseqcstr(unit, "B"))
+ {
+ junk = (sizeU)/bytesize;
+ }
+ bdestroy(unit);
+ bdestroy(sizeStr);
+ return junk;
+}
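[Editorial aside, not part of the patch: a worked example of the conversion above; the values follow directly from the code, and note the decimal (1000-based) unit factors.]

    bstring sz = bfromcstr ("2MB");
    uint64_t elems = bstr_to_doubleSize (sz, DOUBLE);  /* (2 * 1000000) / sizeof(double) = 250000 */
    bdestroy (sz);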
+
+
+bstring parse_workgroup(Workgroup* group, const_bstring str, DataType type)
+{
+ CpuTopology_t topo;
+ struct bstrList* tokens;
+ bstring cpustr;
+ int numThreads = 0;
+ bstring domain;
+
+
+ tokens = bsplit(str,':');
+ if (tokens->qty == 2)
+ {
+ topo = get_cpuTopology();
+ numThreads = topo->activeHWThreads;
+ cpustr = bformat("E:%s:%d", bdata(tokens->entry[0]), numThreads );
+ }
+ else if (tokens->qty == 3)
+ {
+ cpustr = bformat("E:%s:%s", bdata(tokens->entry[0]), bdata(tokens->entry[2]));
+ numThreads = str2int(bdata(tokens->entry[2]));
+ if (numThreads < 0)
+ {
+ fprintf(stderr, "Cannot convert %s to integer\n", bdata(tokens->entry[2]));
+ bstrListDestroy(tokens);
+ return NULL;
+ }
+ }
+ else if (tokens->qty == 5)
+ {
+ cpustr = bformat("E:%s:%s:%s:%s", bdata(tokens->entry[0]),
+ bdata(tokens->entry[2]),
+ bdata(tokens->entry[3]),
+ bdata(tokens->entry[4]));
+ numThreads = str2int(bdata(tokens->entry[2]));
+ if (numThreads < 0)
+ {
+ fprintf(stderr, "Cannot convert %s to integer\n", bdata(tokens->entry[2]));
+ bstrListDestroy(tokens);
+ return NULL;
+ }
+ }
+ else
+ {
+ fprintf(stderr, "Misformated workgroup string\n");
+ bstrListDestroy(tokens);
+ return NULL;
+ }
+
+ group->size = bstr_to_doubleSize(tokens->entry[1], type);
+ if (group->size == 0)
+ {
+ fprintf(stderr, "Stream size cannot be read, should look like <domain>:<size>\n");
+ bstrListDestroy(tokens);
+ return NULL;
+ }
+ group->processorIds = (int*) malloc(numThreads * sizeof(int));
+ if (group->processorIds == NULL)
+ {
+ fprintf(stderr, "No more memory to allocate list of processors\n");
+ bstrListDestroy(tokens);
+ return NULL;
+ }
+ group->numberOfThreads = numThreads;
+ if (cpustr_to_cpulist(bdata(cpustr),group->processorIds, numThreads) < 0 )
+ {
+ free(group->processorIds);
+ bstrListDestroy(tokens);
+ return NULL;
+ }
+ domain = bstrcpy(tokens->entry[0]);
+ bdestroy(cpustr);
+ bstrListDestroy(tokens);
+ return domain;
+}
+
+int parse_streams(Workgroup* group, const_bstring str, int numberOfStreams)
+{
+ struct bstrList* tokens;
+ struct bstrList* subtokens;
+ tokens = bsplit(str,',');
+
+ if (tokens->qty < numberOfStreams)
+ {
+ fprintf(stderr, "Error: Testcase requires at least %d streams\n", numberOfStreams);
+ bstrListDestroy(tokens);
+ return -1;
+ }
+
+ group->streams = (Stream*) malloc(numberOfStreams * sizeof(Stream));
+ if (group->streams == NULL)
+ {
+ bstrListDestroy(tokens);
+ return -1;
+ }
+ for (int i=0; i<numberOfStreams; i++)
+ {
+ subtokens = bsplit(tokens->entry[i],':');
+ if (subtokens->qty >= 2)
+ {
+ int index = str2int(bdata(subtokens->entry[0]));
+ if ((index < 0) || (index >= numberOfStreams))
+ {
+ free(group->streams);
+ bstrListDestroy(subtokens);
+ bstrListDestroy(tokens);
+ return -1;
+ }
+ group->streams[index].domain = bstrcpy(subtokens->entry[1]);
+ group->streams[index].offset = 0;
+ if (subtokens->qty == 3)
+ {
+ group->streams[index].offset = str2int(bdata(subtokens->entry[2]));
+ if (group->streams[index].offset < 0)
+ {
+ free(group->streams);
+ bstrListDestroy(subtokens);
+ bstrListDestroy(tokens);
+ return -1;
+ }
+ }
+ }
+ else
+ {
+ fprintf(stderr, "Error in parsing stream definition %s\n", bdata(tokens->entry[i]));
+ bstrListDestroy(subtokens);
+ bstrListDestroy(tokens);
+ free(group->streams);
+ return -1;
+ }
+ bstrListDestroy(subtokens);
+ }
+
+ bstrListDestroy(tokens);
+ return 0;
+}
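[Editorial aside, not part of the patch: as the parser above implies, a stream definition is a comma-separated list of <index>:<domain>[:<offset>] entries. A hedged usage sketch; the domain names S0/S1 and the offset value are placeholders.]

    /* Sketch: stream 0 in domain S0, stream 1 in domain S1 with offset 100. */
    Workgroup wg;
    bstring streams = bfromcstr ("0:S0,1:S1:100");
    int err = parse_streams (&wg, streams, 2);   /* fills wg.streams[0..1] */
    bdestroy (streams);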
+
+int bstr_to_workgroup(Workgroup* group, const_bstring str, DataType type, int numberOfStreams)
+{
+ int parseStreams = 0;
+ struct bstrList* tokens;
+ tokens = bsplit(str,'-');
+ bstring domain;
+ if (tokens->qty == 2)
+ {
+ domain = parse_workgroup(group, tokens->entry[0], type);
+ if (domain == NULL)
+ {
+ bstrListDestroy(tokens);
+ return 1;
+ }
+ parse_streams(group, tokens->entry[1], numberOfStreams);
+ bdestroy(domain);
+ }
+ else if (tokens->qty == 1)
+ {
+ domain = parse_workgroup(group, tokens->entry[0], type);
+ if (domain == NULL)
+ {
+ bstrListDestroy(tokens);
+ return 1;
+ }
+ group->streams = (Stream*) malloc(numberOfStreams * sizeof(Stream));
+ if (group->streams == NULL)
+ {
+ bstrListDestroy(tokens);
+ return 1;
+ }
+ for (int i = 0; i< numberOfStreams; i++)
+ {
+ group->streams[i].domain = bstrcpy(domain);
+ group->streams[i].offset = 0;
+ }
+ bdestroy(domain);
+ }
+ else
+ {
+ fprintf(stderr, "Error in parsing workgroup string %s\n", bdata(str));
+ bstrListDestroy(tokens);
+ return 1;
+ }
+ bstrListDestroy(tokens);
+ group->size /= numberOfStreams;
+ return 0;
+}
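[Editorial aside, not part of the patch: putting the two parsers together, a full workgroup string is <domain>:<size>[:<nrThreads>[:<chunk>:<stride>]], optionally followed by '-' and a stream list. The sketch below assumes the likwid topology/affinity modules are initialized so cpustr_to_cpulist can resolve the domain.]

    /* Sketch: 1 GB working set in domain S0, 4 threads, 2 explicitly placed streams. */
    Workgroup wg;
    bstring def = bfromcstr ("S0:1GB:4-0:S0,1:S1");
    if (bstr_to_workgroup (&wg, def, DOUBLE, 2) == 0) {
        /* wg.numberOfThreads == 4; wg.size is the per-stream element count,
           i.e. (1000000000 / sizeof(double)) / 2 = 62500000 */
    }
    bdestroy (def);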
+
+void workgroups_destroy(Workgroup** groupList, int numberOfGroups, int numberOfStreams)
+{
+ int i = 0, j = 0;
+ if (groupList == NULL)
+ return;
+ if (*groupList == NULL)
+ return;
+ Workgroup* list = *groupList;
+ for (i = 0; i < numberOfGroups; i++)
+ {
+ free(list[i].processorIds);
+ for (j = 0; j < numberOfStreams; j++)
+ {
+ bdestroy(list[i].streams[j].domain);
+ }
+ free(list[i].streams);
+ }
+ free(list);
+}
diff --git a/bench/src/threads.c b/bench/src/threads.c
new file mode 100644
index 0000000..70a90ec
--- /dev/null
+++ b/bench/src/threads.c
@@ -0,0 +1,293 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: threads.c
+ *
+ * Description: High level interface to pthreads
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+/* ##### HEADER FILE INCLUDES ######################################### */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+
+#include <errno.h>
+#include <threads.h>
+
+
+
+/* ##### EXPORTED VARIABLES ########################################### */
+
+pthread_barrier_t threads_barrier;
+ThreadData* threads_data;
+ThreadGroup* threads_groups;
+
+/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
+
+static pthread_t* threads = NULL;
+static pthread_attr_t attr;
+static int numThreads = 0;
+
+/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ################## */
+static int count_characters(const char *str, char character)
+{
+ if (str == 0)
+ return 0;
+ const char *p = str;
+ int count = 0;
+
+ do {
+ if (*p == character)
+ count++;
+ } while (*(p++));
+
+ return count;
+}
+
+void* dummy_function(void* arg)
+{
+ return 0;
+}
+/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
+
+
+
+int threads_test()
+{
+ int cnt = 0;
+ int err;
+ pthread_t pid;
+ int likwid_pin = count_characters(getenv("LIKWID_PIN"), ',');
+ int max_cpus = sysconf(_SC_NPROCESSORS_CONF);
+ int max = likwid_pin;
+ if (likwid_pin == 0)
+ {
+ max = max_cpus;
+ }
+ while (cnt < max) {
+ err = pthread_create(&pid, NULL, dummy_function, NULL);
+ cnt++;
+ }
+ return cnt;
+}
+
+
+void
+threads_init(int numberOfThreads)
+{
+ int i;
+ numThreads = numberOfThreads;
+
+ threads = (pthread_t*) malloc(numThreads * sizeof(pthread_t));
+ threads_data = (ThreadData*) malloc(numThreads * sizeof(ThreadData));
+
+ for(i = 0; i < numThreads; i++)
+ {
+ threads_data[i].numberOfThreads = numThreads;
+ threads_data[i].globalNumberOfThreads = numThreads;
+ threads_data[i].globalThreadId = i;
+ threads_data[i].threadId = i;
+ }
+
+ pthread_barrier_init(&threads_barrier, NULL, numThreads);
+ pthread_attr_init(&attr);
+ pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+}
+
+
+void
+threads_create(void *(*startRoutine)(void*))
+{
+ int i;
+
+ for(i = 0; i < numThreads; i++)
+ {
+ pthread_create(&threads[i],
+ &attr,
+ startRoutine,
+ (void*) &threads_data[i]);
+ }
+}
+
+void
+threads_createGroups(int numberOfGroups)
+{
+ int i;
+ int j;
+ int numThreadsPerGroup;
+ int globalId = 0;
+
+ if (numThreads % numberOfGroups)
+ {
+ fprintf(stderr, "ERROR: Cannot divide %d threads evenly into %d groups\n",numThreads,numberOfGroups);
+ return;
+ }
+ numThreadsPerGroup = numThreads / numberOfGroups;
+
+ threads_groups = (ThreadGroup*) malloc(numberOfGroups * sizeof(ThreadGroup));
+ if (!threads_groups)
+ {
+ fprintf(stderr, "ERROR: Cannot allocate thread groups - %s\n", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ for (i = 0; i < numberOfGroups; i++)
+ {
+ threads_groups[i].numberOfThreads = numThreadsPerGroup;
+ threads_groups[i].threadIds = (int*) malloc(numThreadsPerGroup * sizeof(int));
+ if (!threads_groups[i].threadIds)
+ {
+ fprintf(stderr, "ERROR: Cannot allocate threadID list for thread groups - %s\n", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ for (j = 0; j < numThreadsPerGroup; j++)
+ {
+ threads_data[globalId].threadId = j;
+ threads_data[globalId].groupId = i;
+ threads_data[globalId].numberOfGroups = numberOfGroups;
+ threads_data[globalId].numberOfThreads = numThreadsPerGroup;
+ threads_groups[i].threadIds[j] = globalId++;
+ }
+ }
+}
+
+
+void
+threads_registerDataAll(ThreadUserData* data, threads_copyDataFunc func)
+{
+ int i;
+
+ if (func == NULL)
+ {
+ for(i = 0; i < numThreads; i++)
+ {
+ threads_data[i].data = (*data);
+ }
+ }
+ else
+ {
+ for(i = 0; i < numThreads; i++)
+ {
+ func( data, &threads_data[i].data);
+ }
+ }
+}
+
+void
+threads_registerDataThread(int threadId,
+ ThreadUserData* data,
+ threads_copyDataFunc func)
+{
+ if (func == NULL)
+ {
+ threads_data[threadId].data = (*data);
+ }
+ else
+ {
+ func( data, &threads_data[threadId].data);
+ }
+}
+
+void
+threads_registerDataGroup(int groupId,
+ ThreadUserData* data,
+ threads_copyDataFunc func)
+{
+ int i;
+
+ if (func == NULL)
+ {
+ for (i = 0; i < threads_groups[groupId].numberOfThreads; i++)
+ {
+ threads_data[threads_groups[groupId].threadIds[i]].data = (*data);
+ }
+ }
+ else
+ {
+ for (i = 0; i < threads_groups[groupId].numberOfThreads; i++)
+ {
+ func( data,
+ &threads_data[threads_groups[groupId].threadIds[i]].data);
+ }
+ }
+}
+
+size_t
+threads_updateIterations(int groupId, size_t demandIter)
+{
+ int i = 0;
+ size_t iterations = threads_data[0].data.iter;
+ if (demandIter > 0)
+ {
+ iterations = demandIter;
+ }
+ iterations = (iterations < MIN_ITERATIONS ? MIN_ITERATIONS : iterations);
+
+ for (i = 0; i < threads_groups[groupId].numberOfThreads; i++)
+ {
+ threads_data[threads_groups[groupId].threadIds[i]].data.iter = iterations;
+ threads_data[threads_groups[groupId].threadIds[i]].data.cycles = 0;
+ threads_data[threads_groups[groupId].threadIds[i]].cycles = 0;
+ threads_data[threads_groups[groupId].threadIds[i]].time = 0;
+ }
+ return iterations;
+}
+
+void
+threads_join(void)
+{
+ int i = 0;
+
+ for(i=0; i < numThreads; i++)
+ {
+ pthread_join(threads[i], NULL);
+ }
+}
+
+void
+threads_destroy(int numberOfGroups, int numberOfStreams)
+{
+ int i = 0, j = 0;
+ pthread_attr_destroy(&attr);
+ pthread_barrier_destroy(&threads_barrier);
+
+
+ for(i=0;i<numberOfGroups;i++)
+ {
+ for (j = 0; j < threads_groups[i].numberOfThreads; j++)
+ {
+ free(threads_data[threads_groups[i].threadIds[j]].data.processors);
+ free(threads_data[threads_groups[i].threadIds[j]].data.streams);
+ }
+ free(threads_groups[i].threadIds);
+ }
+ free(threads_groups);
+ free(threads);
+}
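[Editorial aside, not part of the patch: for orientation, an illustrative sketch of the order in which the benchmark driver is expected to call the functions in this file; the wrapper name runThreads and its parameters are hypothetical.]

    /* Sketch of the intended call sequence. */
    void runThreads (void *(*worker)(void *), ThreadUserData * cfg,
                     int nThreads, int nGroups, int nStreams)
    {
        threads_init (nThreads);               /* allocate thread and data arrays  */
        threads_createGroups (nGroups);        /* partition threads into groups    */
        threads_registerDataAll (cfg, NULL);   /* copy cfg into every thread slot  */
        threads_create (worker);               /* launch the pthreads              */
        threads_join ();                       /* wait for all workers to finish   */
        threads_destroy (nGroups, nStreams);   /* free groups and per-thread data  */
    }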
diff --git a/bench/x86-64/branch.ptt b/bench/x86-64/branch.ptt
deleted file mode 100644
index e15086d..0000000
--- a/bench/x86-64/branch.ptt
+++ /dev/null
@@ -1,36 +0,0 @@
-STREAMS 4
-TYPE DOUBLE_RAND
-FLOPS 2
-BYTES 32
-LOOP 8
-movaps FPR1, [STR1 + GPR1*8]
-movaps FPR2, [STR1 + GPR1*8+16]
-movaps FPR3, [STR1 + GPR1*8+32]
-movaps FPR4, [STR1 + GPR1*8+48]
-cvtsd2si GPR2, FPR1
-cmp GPR2, 0
-jl sub
-mulpd FPR1, [STR2 + GPR1*8]
-addpd FPR1, [STR3 + GPR1*8]
-mulpd FPR2, [STR2 + GPR1*8+16]
-addpd FPR2, [STR3 + GPR1*8+16]
-mulpd FPR3, [STR2 + GPR1*8+32]
-addpd FPR3, [STR3 + GPR1*8+32]
-mulpd FPR4, [STR2 + GPR1*8+48]
-addpd FPR4, [STR3 + GPR1*8+48]
-jmp end
-sub:
-mulpd FPR1, [STR2 + GPR1*8]
-subpd FPR1, [STR3 + GPR1*8]
-mulpd FPR2, [STR2 + GPR1*8+16]
-subpd FPR2, [STR3 + GPR1*8+16]
-mulpd FPR3, [STR2 + GPR1*8+32]
-subpd FPR3, [STR3 + GPR1*8+32]
-mulpd FPR4, [STR2 + GPR1*8+48]
-subpd FPR4, [STR3 + GPR1*8+48]
-end:
-movaps [STR0 + GPR1*8], FPR1
-movaps [STR0 + GPR1*8+16], FPR2
-movaps [STR0 + GPR1*8+32], FPR3
-movaps [STR0 + GPR1*8+48], FPR4
-
diff --git a/bench/x86-64/clcopy.ptt b/bench/x86-64/clcopy.ptt
index b59c2be..3d95760 100644
--- a/bench/x86-64/clcopy.ptt
+++ b/bench/x86-64/clcopy.ptt
@@ -2,6 +2,12 @@ STREAMS 2
TYPE DOUBLE
FLOPS 0
BYTES 16
+DESC Double-precision cache line copy, only touches first element of each cache line.
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
LOOP 32
movaps FPR1, [STR0 + GPR1 * 8 ]
movaps FPR2, [STR0 + GPR1 * 8 + 64 ]
diff --git a/bench/x86-64/clload.ptt b/bench/x86-64/clload.ptt
index 8c3ddc2..7cd9c38 100644
--- a/bench/x86-64/clload.ptt
+++ b/bench/x86-64/clload.ptt
@@ -2,6 +2,12 @@ STREAMS 1
TYPE DOUBLE
FLOPS 0
BYTES 8
+DESC Double-precision cache line load, only loads first element of each cache line.
+LOADS 1
+STORES 0
+INSTR_CONST 16
+INSTR_LOOP 7
+UOPS 6
LOOP 32
movaps FPR1, [STR0 + GPR1 * 8]
movaps FPR2, [STR0 + GPR1 * 8 + 64]
diff --git a/bench/x86-64/clstore.ptt b/bench/x86-64/clstore.ptt
index 5541b8e..1b70c45 100644
--- a/bench/x86-64/clstore.ptt
+++ b/bench/x86-64/clstore.ptt
@@ -2,10 +2,16 @@ STREAMS 1
TYPE DOUBLE
FLOPS 0
BYTES 8
-movaps FPR1, [SCALAR]
-movaps FPR2, [SCALAR]
-movaps FPR3, [SCALAR]
-movaps FPR4, [SCALAR]
+DESC Double-precision cache line store, only stores first element of each cache line.
+LOADS 0
+STORES 1
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
+movaps FPR1, [rip+SCALAR]
+movaps FPR2, [rip+SCALAR]
+movaps FPR3, [rip+SCALAR]
+movaps FPR4, [rip+SCALAR]
LOOP 32
movaps [STR0 + GPR1 * 8], FPR1
movaps [STR0 + GPR1 * 8 + 64], FPR2
diff --git a/bench/x86-64/copy.ptt b/bench/x86-64/copy.ptt
index ffca4f5..b47e322 100644
--- a/bench/x86-64/copy.ptt
+++ b/bench/x86-64/copy.ptt
@@ -2,14 +2,20 @@ STREAMS 2
TYPE DOUBLE
FLOPS 0
BYTES 16
-LOOP 8
-movaps FPR1, [STR0 + GPR1 * 8]
-movaps FPR2, [STR0 + GPR1 * 8 + 16]
-movaps FPR3, [STR0 + GPR1 * 8 + 32]
-movaps FPR4, [STR0 + GPR1 * 8 + 48]
-movaps [STR1 + GPR1 * 8] , FPR1
-movaps [STR1 + GPR1 * 8 + 16], FPR2
-movaps [STR1 + GPR1 * 8 + 32], FPR3
-movaps [STR1 + GPR1 * 8 + 48], FPR4
+DESC Double-precision vector copy, only scalar operations
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
+LOOP 4
+movsd FPR1, [STR0 + GPR1 * 8]
+movsd FPR2, [STR0 + GPR1 * 8 + 8]
+movsd FPR3, [STR0 + GPR1 * 8 + 16]
+movsd FPR4, [STR0 + GPR1 * 8 + 24]
+movsd [STR1 + GPR1 * 8] , FPR1
+movsd [STR1 + GPR1 * 8 + 8] , FPR2
+movsd [STR1 + GPR1 * 8 + 16], FPR3
+movsd [STR1 + GPR1 * 8 + 24], FPR4
diff --git a/bench/x86-64/copy_avx.ptt b/bench/x86-64/copy_avx.ptt
index 814bb78..53b02b3 100644
--- a/bench/x86-64/copy_avx.ptt
+++ b/bench/x86-64/copy_avx.ptt
@@ -2,6 +2,12 @@ STREAMS 2
TYPE DOUBLE
FLOPS 0
BYTES 16
+DESC Double-precision vector copy, optimized for AVX
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
LOOP 16
vmovaps ymm1, [STR0 + GPR1 * 8]
vmovaps ymm2, [STR0 + GPR1 * 8 + 32]
diff --git a/bench/x86-64/copy_mem.ptt b/bench/x86-64/copy_mem.ptt
index fab5a66..3fa0b57 100644
--- a/bench/x86-64/copy_mem.ptt
+++ b/bench/x86-64/copy_mem.ptt
@@ -2,14 +2,20 @@ STREAMS 2
TYPE DOUBLE
FLOPS 0
BYTES 16
-LOOP 8
-movaps FPR1, [STR0 + GPR1 * 8]
-movaps FPR2, [STR0 + GPR1 * 8 + 16]
-movaps FPR3, [STR0 + GPR1 * 8 + 32]
-movaps FPR4, [STR0 + GPR1 * 8 + 48]
-movntpd [STR1 + GPR1 * 8] , FPR1
-movntpd [STR1 + GPR1 * 8 + 16], FPR2
-movntpd [STR1 + GPR1 * 8 + 32], FPR3
-movntpd [STR1 + GPR1 * 8 + 48], FPR4
+DESC Double-precision vector copy, only scalar operations but with non-temporal stores
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
+LOOP 4
+movq mm0, [STR0 + GPR1 * 8]
+movq mm1, [STR0 + GPR1 * 8 + 8]
+movq mm2, [STR0 + GPR1 * 8 + 16]
+movq mm3, [STR0 + GPR1 * 8 + 24]
+movntq [STR1 + GPR1 * 8] , mm0
+movntq [STR1 + GPR1 * 8 + 8] , mm1
+movntq [STR1 + GPR1 * 8 + 16], mm2
+movntq [STR1 + GPR1 * 8 + 24], mm3
diff --git a/bench/x86-64/copy_mem_avx.ptt b/bench/x86-64/copy_mem_avx.ptt
index 651a55e..3c393a4 100644
--- a/bench/x86-64/copy_mem_avx.ptt
+++ b/bench/x86-64/copy_mem_avx.ptt
@@ -2,7 +2,13 @@ STREAMS 2
TYPE DOUBLE
FLOPS 0
BYTES 16
-LOOP 32
+DESC Double-precision vector copy, uses AVX and non-temporal stores
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
+LOOP 16
vmovaps ymm1, [STR0 + GPR1 * 8]
vmovaps ymm2, [STR0 + GPR1 * 8 + 32]
vmovaps ymm3, [STR0 + GPR1 * 8 + 64]
diff --git a/bench/x86-64/copy_mem_sse.ptt b/bench/x86-64/copy_mem_sse.ptt
index f803bce..5a8c5d6 100644
--- a/bench/x86-64/copy_mem_sse.ptt
+++ b/bench/x86-64/copy_mem_sse.ptt
@@ -2,6 +2,12 @@ STREAMS 2
TYPE DOUBLE
FLOPS 0
BYTES 16
+DESC Double-precision vector copy, uses SSE and non-temporal stores
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
LOOP 8
movaps FPR1, [STR0 + GPR1 * 8]
movaps FPR2, [STR0 + GPR1 * 8 + 16]
diff --git a/bench/x86-64/copy_plain.ptt b/bench/x86-64/copy_plain.ptt
deleted file mode 100644
index 4fcbbbc..0000000
--- a/bench/x86-64/copy_plain.ptt
+++ /dev/null
@@ -1,16 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 0
-BYTES 16
-LOOP 4
-movsd FPR1, [STR0 + GPR1 * 8]
-movsd FPR2, [STR0 + GPR1 * 8 + 8]
-movsd FPR3, [STR0 + GPR1 * 8 + 16]
-movsd FPR4, [STR0 + GPR1 * 8 + 24]
-movsd [STR1 + GPR1 * 8] , FPR1
-movsd [STR1 + GPR1 * 8 + 8] , FPR2
-movsd [STR1 + GPR1 * 8 + 16], FPR3
-movsd [STR1 + GPR1 * 8 + 24], FPR4
-
-
-
diff --git a/bench/x86-64/copy_sse.ptt b/bench/x86-64/copy_sse.ptt
index ffca4f5..75aaee4 100644
--- a/bench/x86-64/copy_sse.ptt
+++ b/bench/x86-64/copy_sse.ptt
@@ -2,6 +2,12 @@ STREAMS 2
TYPE DOUBLE
FLOPS 0
BYTES 16
+DESC Double-precision vector copy, optimized for SSE
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
LOOP 8
movaps FPR1, [STR0 + GPR1 * 8]
movaps FPR2, [STR0 + GPR1 * 8 + 16]
diff --git a/bench/x86-64/daxpy.ptt b/bench/x86-64/daxpy.ptt
new file mode 100644
index 0000000..fae8bbf
--- /dev/null
+++ b/bench/x86-64/daxpy.ptt
@@ -0,0 +1,28 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, only scalar operations
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+movsd FPR7, [rip+SCALAR]
+LOOP 4
+movsd FPR1, [STR0 + GPR1*8]
+movsd FPR2, [STR0 + GPR1*8 + 8]
+mulsd FPR1, FPR7
+mulsd FPR2, FPR7
+movsd FPR3, [STR0 + GPR1*8 + 16]
+movsd FPR4, [STR0 + GPR1*8 + 24]
+mulsd FPR3, FPR7
+mulsd FPR4, FPR7
+addsd FPR1, [STR1 + GPR1*8]
+addsd FPR2, [STR1 + GPR1*8 + 8]
+addsd FPR3, [STR1 + GPR1*8 + 16]
+addsd FPR4, [STR1 + GPR1*8 + 24]
+movsd [STR1 + GPR1*8], FPR1
+movsd [STR1 + GPR1*8 + 8], FPR2
+movsd [STR1 + GPR1*8 + 16], FPR3
+movsd [STR1 + GPR1*8 + 24], FPR4
diff --git a/bench/x86-64/daxpy_avx.ptt b/bench/x86-64/daxpy_avx.ptt
new file mode 100644
index 0000000..7b2ecd8
--- /dev/null
+++ b/bench/x86-64/daxpy_avx.ptt
@@ -0,0 +1,31 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, optimized for AVX
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 21
+UOPS 38
+vmovaps ymm7, [rip+SCALAR]
+LOOP 24
+vmulpd ymm1, ymm7, [STR0 + GPR1*8]
+vaddpd ymm1, ymm1, [STR1 + GPR1*8]
+vmulpd ymm2, ymm7, [STR0 + GPR1*8+32]
+vaddpd ymm2, ymm2, [STR1 + GPR1*8+32]
+vmovaps [STR1 + GPR1*8], ymm1
+vmovaps [STR1 + GPR1*8+32], ymm2
+vmulpd ymm3, ymm7, [STR0 + GPR1*8+64]
+vaddpd ymm3, ymm3, [STR1 + GPR1*8+64]
+vmulpd ymm4, ymm7, [STR0 + GPR1*8+96]
+vaddpd ymm4, ymm4, [STR1 + GPR1*8+96]
+vmovaps [STR1 + GPR1*8+64], ymm3
+vmovaps [STR1 + GPR1*8+96], ymm4
+vmulpd ymm5, ymm7, [STR0 + GPR1*8+128]
+vaddpd ymm5, ymm5, [STR1 + GPR1*8+128]
+vmulpd ymm6, ymm7, [STR0 + GPR1*8+160]
+vaddpd ymm6, ymm6, [STR1 + GPR1*8+160]
+vmovaps [STR1 + GPR1*8+128], ymm5
+vmovaps [STR1 + GPR1*8+160], ymm6
+
diff --git a/bench/x86-64/daxpy_avx_fma.ptt b/bench/x86-64/daxpy_avx_fma.ptt
new file mode 100644
index 0000000..8a77482
--- /dev/null
+++ b/bench/x86-64/daxpy_avx_fma.ptt
@@ -0,0 +1,25 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, optimized for AVX FMAs
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+vmovaps ymm7, [rip+SCALAR]
+LOOP 16
+vmovaps ymm1, [STR0 + GPR1*8]
+vmovaps ymm2, [STR0 + GPR1*8+32]
+vmovaps ymm3, [STR0 + GPR1*8+64]
+vmovaps ymm4, [STR0 + GPR1*8+96]
+vfmadd213pd ymm1, ymm7, [STR1 + GPR1*8]
+vfmadd213pd ymm2, ymm7, [STR1 + GPR1*8+32]
+vfmadd213pd ymm3, ymm7, [STR1 + GPR1*8+64]
+vfmadd213pd ymm4, ymm7, [STR1 + GPR1*8+96]
+vmovaps [STR1 + GPR1*8], ymm1
+vmovaps [STR1 + GPR1*8+32], ymm2
+vmovaps [STR1 + GPR1*8+64], ymm3
+vmovaps [STR1 + GPR1*8+96], ymm4
+
diff --git a/bench/x86-64/daxpy_mem_avx.ptt b/bench/x86-64/daxpy_mem_avx.ptt
new file mode 100644
index 0000000..fbbee94
--- /dev/null
+++ b/bench/x86-64/daxpy_mem_avx.ptt
@@ -0,0 +1,30 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, optimized for AVX and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 21
+UOPS 38
+vmovaps ymm7, [rip+SCALAR]
+LOOP 24
+vmulpd ymm1, ymm7, [STR0 + GPR1*8]
+vaddpd ymm1, ymm1, [STR1 + GPR1*8]
+vmulpd ymm2, ymm7, [STR0 + GPR1*8+32]
+vaddpd ymm2, ymm2, [STR1 + GPR1*8+32]
+vmovntps [STR1 + GPR1*8], ymm1
+vmovntps [STR1 + GPR1*8+32], ymm2
+vmulpd ymm3, ymm7, [STR0 + GPR1*8+64]
+vaddpd ymm3, ymm3, [STR1 + GPR1*8+64]
+vmulpd ymm4, ymm7, [STR0 + GPR1*8+96]
+vaddpd ymm4, ymm4, [STR1 + GPR1*8+96]
+vmovntps [STR1 + GPR1*8+64], ymm3
+vmovntps [STR1 + GPR1*8+96], ymm4
+vmulpd ymm5, ymm7, [STR0 + GPR1*8+128]
+vaddpd ymm5, ymm5, [STR1 + GPR1*8+128]
+vmulpd ymm6, ymm7, [STR0 + GPR1*8+160]
+vaddpd ymm6, ymm6, [STR1 + GPR1*8+160]
+vmovntps [STR1 + GPR1*8+128], ymm5
+vmovntps [STR1 + GPR1*8+160], ymm6
diff --git a/bench/x86-64/daxpy_mem_avx_fma.ptt b/bench/x86-64/daxpy_mem_avx_fma.ptt
new file mode 100644
index 0000000..1c7e434
--- /dev/null
+++ b/bench/x86-64/daxpy_mem_avx_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, optimized for AVX FMAs and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+vmovaps ymm7, [rip+SCALAR]
+LOOP 16
+vmovaps ymm1, [STR0 + GPR1*8]
+vmovaps ymm2, [STR0 + GPR1*8+32]
+vmovaps ymm3, [STR0 + GPR1*8+64]
+vmovaps ymm4, [STR0 + GPR1*8+96]
+vfmadd213pd ymm1, ymm7, [STR1 + GPR1*8]
+vfmadd213pd ymm2, ymm7, [STR1 + GPR1*8+32]
+vfmadd213pd ymm3, ymm7, [STR1 + GPR1*8+64]
+vfmadd213pd ymm4, ymm7, [STR1 + GPR1*8+96]
+vmovntps [STR1 + GPR1*8], ymm1
+vmovntps [STR1 + GPR1*8+32], ymm2
+vmovntps [STR1 + GPR1*8+64], ymm3
+vmovntps [STR1 + GPR1*8+96], ymm4
diff --git a/bench/x86-64/daxpy_mem_sse.ptt b/bench/x86-64/daxpy_mem_sse.ptt
new file mode 100644
index 0000000..37d538e
--- /dev/null
+++ b/bench/x86-64/daxpy_mem_sse.ptt
@@ -0,0 +1,28 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, optimized for SSE and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 26
+movaps FPR7, [rip+SCALAR]
+LOOP 8
+movaps FPR1, [STR0 + GPR1*8]
+movaps FPR2, [STR0 + GPR1*8+16]
+movaps FPR3, [STR0 + GPR1*8+32]
+movaps FPR4, [STR0 + GPR1*8+48]
+mulpd FPR1, FPR7
+addpd FPR1, [STR1 + GPR1*8]
+mulpd FPR2, FPR7
+addpd FPR2, [STR1 + GPR1*8+16]
+mulpd FPR3, FPR7
+addpd FPR3, [STR1 + GPR1*8+32]
+mulpd FPR4, FPR7
+addpd FPR4, [STR1 + GPR1*8+48]
+vmovntps [STR1 + GPR1*8], FPR1
+vmovntps [STR1 + GPR1*8+16], FPR2
+vmovntps [STR1 + GPR1*8+32], FPR3
+vmovntps [STR1 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/daxpy_mem_sse_fma.ptt b/bench/x86-64/daxpy_mem_sse_fma.ptt
new file mode 100644
index 0000000..ba90537
--- /dev/null
+++ b/bench/x86-64/daxpy_mem_sse_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, optimized for SSE FMAs and non temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+movaps FPR7, [rip+SCALAR]
+LOOP 8
+movaps FPR1, [STR0 + GPR1*8]
+movaps FPR2, [STR0 + GPR1*8+16]
+movaps FPR3, [STR0 + GPR1*8+32]
+movaps FPR4, [STR0 + GPR1*8+48]
+vfmadd213pd FPR1, FPR7, [STR1 + GPR1*8]
+vfmadd213pd FPR2, FPR7, [STR1 + GPR1*8+16]
+vfmadd213pd FPR3, FPR7, [STR1 + GPR1*8+32]
+vfmadd213pd FPR4, FPR7, [STR1 + GPR1*8+48]
+movntps [STR1 + GPR1*8], FPR1
+movntps [STR1 + GPR1*8+16], FPR2
+movntps [STR1 + GPR1*8+32], FPR3
+movntps [STR1 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/daxpy_sp.ptt b/bench/x86-64/daxpy_sp.ptt
new file mode 100644
index 0000000..3f4a326
--- /dev/null
+++ b/bench/x86-64/daxpy_sp.ptt
@@ -0,0 +1,44 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, only scalar operations
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 32
+UOPS 50
+movss FPR9, [rip+SCALAR]
+LOOP 8
+movss FPR1, [STR0 + GPR1*4]
+movss FPR2, [STR0 + GPR1*4 + 4]
+movss FPR3, [STR0 + GPR1*4 + 8]
+movss FPR4, [STR0 + GPR1*4 + 12]
+mulss FPR1, FPR9
+addss FPR1, [STR1 + GPR1*4]
+mulss FPR2, FPR9
+addss FPR2, [STR1 + GPR1*4 + 4]
+mulss FPR3, FPR9
+addss FPR3, [STR1 + GPR1*4 + 8]
+mulss FPR4, FPR9
+addss FPR4, [STR1 + GPR1*4 + 12]
+movss FPR5, [STR0 + GPR1*4 + 16]
+movss FPR6, [STR0 + GPR1*4 + 20]
+movss FPR7, [STR0 + GPR1*4 + 24]
+movss FPR8, [STR0 + GPR1*4 + 28]
+mulss FPR5, FPR9
+addss FPR5, [STR1 + GPR1*4 + 16]
+mulss FPR6, FPR9
+addss FPR6, [STR1 + GPR1*4 + 20]
+mulss FPR7, FPR9
+addss FPR7, [STR1 + GPR1*4 + 24]
+mulss FPR8, FPR9
+addss FPR8, [STR1 + GPR1*4 + 28]
+movss [STR1 + GPR1*4], FPR1
+movss [STR1 + GPR1*4 + 4], FPR2
+movss [STR1 + GPR1*4 + 8], FPR3
+movss [STR1 + GPR1*4 + 12], FPR4
+movss [STR1 + GPR1*4 + 16], FPR5
+movss [STR1 + GPR1*4 + 20], FPR6
+movss [STR1 + GPR1*4 + 24], FPR7
+movss [STR1 + GPR1*4 + 28], FPR8
diff --git a/bench/x86-64/daxpy_sp_avx.ptt b/bench/x86-64/daxpy_sp_avx.ptt
new file mode 100644
index 0000000..4602982
--- /dev/null
+++ b/bench/x86-64/daxpy_sp_avx.ptt
@@ -0,0 +1,19 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, optimized for AVX
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 9
+UOPS 14
+vmovaps ymm7, [rip+SCALAR]
+LOOP 16
+vmulps ymm1, ymm7, [STR0 + GPR1*4]
+vaddps ymm1, ymm1, [STR1 + GPR1*4]
+vmovaps [STR1 + GPR1*4], ymm1
+vmulps ymm2, ymm7, [STR0 + GPR1*4+32]
+vaddps ymm2, ymm2, [STR1 + GPR1*4+32]
+vmovaps [STR1 + GPR1*4+32], ymm2
+
diff --git a/bench/x86-64/daxpy_sp_avx_fma.ptt b/bench/x86-64/daxpy_sp_avx_fma.ptt
new file mode 100644
index 0000000..f5216a1
--- /dev/null
+++ b/bench/x86-64/daxpy_sp_avx_fma.ptt
@@ -0,0 +1,25 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, optimized for AVX FMAs
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+vmovaps ymm7, [rip+SCALAR]
+LOOP 16
+vmovaps ymm1, [STR0 + GPR1*4]
+vmovaps ymm2, [STR0 + GPR1*4+32]
+vmovaps ymm3, [STR0 + GPR1*4+64]
+vmovaps ymm4, [STR0 + GPR1*4+96]
+vfmadd213ps ymm1, ymm7, [STR1 + GPR1*4]
+vfmadd213ps ymm2, ymm7, [STR1 + GPR1*4+32]
+vfmadd213ps ymm3, ymm7, [STR1 + GPR1*4+64]
+vfmadd213ps ymm4, ymm7, [STR1 + GPR1*4+96]
+vmovaps [STR1 + GPR1*4], ymm1
+vmovaps [STR1 + GPR1*4+32], ymm2
+vmovaps [STR1 + GPR1*4+64], ymm3
+vmovaps [STR1 + GPR1*4+96], ymm4
+
diff --git a/bench/x86-64/daxpy_sp_mem_avx.ptt b/bench/x86-64/daxpy_sp_mem_avx.ptt
new file mode 100644
index 0000000..0f26304
--- /dev/null
+++ b/bench/x86-64/daxpy_sp_mem_avx.ptt
@@ -0,0 +1,19 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, optimized for AVX and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 9
+UOPS 14
+vmovaps ymm7, [rip+SCALAR]
+LOOP 16
+vmulps ymm1, ymm7, [STR0 + GPR1*4]
+vaddps ymm1, ymm1, [STR1 + GPR1*4]
+vmovntps [STR1 + GPR1*4], ymm1
+vmulps ymm2, ymm7, [STR0 + GPR1*4+32]
+vaddps ymm2, ymm2, [STR1 + GPR1*4+32]
+vmovntps [STR1 + GPR1*4+32], ymm2
+
diff --git a/bench/x86-64/daxpy_sp_mem_avx_fma.ptt b/bench/x86-64/daxpy_sp_mem_avx_fma.ptt
new file mode 100644
index 0000000..4c316d4
--- /dev/null
+++ b/bench/x86-64/daxpy_sp_mem_avx_fma.ptt
@@ -0,0 +1,25 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, optimized for AVX FMAs and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+vmovaps ymm7, [rip+SCALAR]
+LOOP 16
+vmovaps ymm1, [STR0 + GPR1*4]
+vmovaps ymm2, [STR0 + GPR1*4+32]
+vmovaps ymm3, [STR0 + GPR1*4+64]
+vmovaps ymm4, [STR0 + GPR1*4+96]
+vfmadd213ps ymm1, ymm7, [STR1 + GPR1*4]
+vfmadd213ps ymm2, ymm7, [STR1 + GPR1*4+32]
+vfmadd213ps ymm3, ymm7, [STR1 + GPR1*4+64]
+vfmadd213ps ymm4, ymm7, [STR1 + GPR1*4+96]
+vmovntps [STR1 + GPR1*4], ymm1
+vmovntps [STR1 + GPR1*4+32], ymm2
+vmovntps [STR1 + GPR1*4+64], ymm3
+vmovntps [STR1 + GPR1*4+96], ymm4
+
diff --git a/bench/x86-64/daxpy_sp_mem_sse.ptt b/bench/x86-64/daxpy_sp_mem_sse.ptt
new file mode 100644
index 0000000..a9e6ec5
--- /dev/null
+++ b/bench/x86-64/daxpy_sp_mem_sse.ptt
@@ -0,0 +1,20 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, optimized for SSE and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 11
+UOPS 14
+movaps FPR7, [rip+SCALAR]
+LOOP 8
+movaps FPR1, [STR0 + GPR1*4]
+mulps FPR1, FPR7
+addps FPR1, [STR1 + GPR1*4]
+movntps [STR1 + GPR1*4], FPR1
+movaps FPR2, [STR0 + GPR1*4+16]
+mulps FPR2, FPR7
+addps FPR2, [STR1 + GPR1*4+16]
+movntps [STR1 + GPR1*4+16], FPR2
diff --git a/bench/x86-64/daxpy_sp_mem_sse_fma.ptt b/bench/x86-64/daxpy_sp_mem_sse_fma.ptt
new file mode 100644
index 0000000..ee85f28
--- /dev/null
+++ b/bench/x86-64/daxpy_sp_mem_sse_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, optimized for SSE FMAs and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+movaps FPR7, [rip+SCALAR]
+LOOP 32
+movaps FPR1, [STR0 + GPR1*4]
+movaps FPR2, [STR0 + GPR1*4+32]
+movaps FPR3, [STR0 + GPR1*4+64]
+movaps FPR4, [STR0 + GPR1*4+96]
+vfmadd213ps FPR1, FPR7, [STR1 + GPR1*4]
+vfmadd213ps FPR2, FPR7, [STR1 + GPR1*4+32]
+vfmadd213ps FPR3, FPR7, [STR1 + GPR1*4+64]
+vfmadd213ps FPR4, FPR7, [STR1 + GPR1*4+96]
+movntps [STR1 + GPR1*4], FPR1
+movntps [STR1 + GPR1*4+32], FPR2
+movntps [STR1 + GPR1*4+64], FPR3
+movntps [STR1 + GPR1*4+96], FPR4
diff --git a/bench/x86-64/daxpy_sp_sse.ptt b/bench/x86-64/daxpy_sp_sse.ptt
new file mode 100644
index 0000000..fc20441
--- /dev/null
+++ b/bench/x86-64/daxpy_sp_sse.ptt
@@ -0,0 +1,28 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, optimized for SSE
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+movaps FPR7, [rip+SCALAR]
+LOOP 8
+movaps FPR1, [STR0 + GPR1*4]
+movaps FPR2, [STR0 + GPR1*4+16]
+movaps FPR3, [STR0 + GPR1*4+32]
+movaps FPR4, [STR0 + GPR1*4+48]
+mulps FPR1, FPR7
+addps FPR1, [STR1 + GPR1*4]
+mulps FPR2, FPR7
+addps FPR2, [STR1 + GPR1*4+16]
+mulps FPR3, FPR7
+addps FPR3, [STR1 + GPR1*4+32]
+mulps FPR4, FPR7
+addps FPR4, [STR1 + GPR1*4+48]
+movaps [STR1 + GPR1*4], FPR1
+movaps [STR1 + GPR1*4+16], FPR2
+movaps [STR1 + GPR1*4+32], FPR3
+movaps [STR1 + GPR1*4+48], FPR4
diff --git a/bench/x86-64/daxpy_sp_sse_fma.ptt b/bench/x86-64/daxpy_sp_sse_fma.ptt
new file mode 100644
index 0000000..148d750
--- /dev/null
+++ b/bench/x86-64/daxpy_sp_sse_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, optimized for SSE FMAs
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+movaps FPR7, [rip+SCALAR]
+LOOP 32
+movaps FPR1, [STR0 + GPR1*4]
+movaps FPR2, [STR0 + GPR1*4+32]
+movaps FPR3, [STR0 + GPR1*4+64]
+movaps FPR4, [STR0 + GPR1*4+96]
+vfmadd213ps FPR1, FPR7, [STR1 + GPR1*4]
+vfmadd213ps FPR2, FPR7, [STR1 + GPR1*4+32]
+vfmadd213ps FPR3, FPR7, [STR1 + GPR1*4+64]
+vfmadd213ps FPR4, FPR7, [STR1 + GPR1*4+96]
+movaps [STR1 + GPR1*4], FPR1
+movaps [STR1 + GPR1*4+32], FPR2
+movaps [STR1 + GPR1*4+64], FPR3
+movaps [STR1 + GPR1*4+96], FPR4
diff --git a/bench/x86-64/daxpy_sse.ptt b/bench/x86-64/daxpy_sse.ptt
new file mode 100644
index 0000000..747aed6
--- /dev/null
+++ b/bench/x86-64/daxpy_sse.ptt
@@ -0,0 +1,28 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, optimized for SSE
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+movaps FPR7, [rip+SCALAR]
+LOOP 8
+movaps FPR1, [STR0 + GPR1*8]
+movaps FPR2, [STR0 + GPR1*8+16]
+movaps FPR3, [STR0 + GPR1*8+32]
+movaps FPR4, [STR0 + GPR1*8+48]
+mulpd FPR1, FPR7
+addpd FPR1, [STR1 + GPR1*8]
+mulpd FPR2, FPR7
+addpd FPR2, [STR1 + GPR1*8+16]
+mulpd FPR3, FPR7
+addpd FPR3, [STR1 + GPR1*8+32]
+mulpd FPR4, FPR7
+addpd FPR4, [STR1 + GPR1*8+48]
+movaps [STR1 + GPR1*8], FPR1
+movaps [STR1 + GPR1*8+16], FPR2
+movaps [STR1 + GPR1*8+32], FPR3
+movaps [STR1 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/daxpy_sse_fma.ptt b/bench/x86-64/daxpy_sse_fma.ptt
new file mode 100644
index 0000000..21c022f
--- /dev/null
+++ b/bench/x86-64/daxpy_sse_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, optimized for SSE FMAs
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+movaps FPR7, [rip+SCALAR]
+LOOP 8
+movaps FPR1, [STR0 + GPR1*8]
+movaps FPR2, [STR0 + GPR1*8+16]
+movaps FPR3, [STR0 + GPR1*8+32]
+movaps FPR4, [STR0 + GPR1*8+48]
+vfmadd213pd FPR1, FPR7, [STR1 + GPR1*8]
+vfmadd213pd FPR2, FPR7, [STR1 + GPR1*8+16]
+vfmadd213pd FPR3, FPR7, [STR1 + GPR1*8+32]
+vfmadd213pd FPR4, FPR7, [STR1 + GPR1*8+48]
+movaps [STR1 + GPR1*8], FPR1
+movaps [STR1 + GPR1*8+16], FPR2
+movaps [STR1 + GPR1*8+32], FPR3
+movaps [STR1 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/ddot.ptt b/bench/x86-64/ddot.ptt
new file mode 100644
index 0000000..e438c49
--- /dev/null
+++ b/bench/x86-64/ddot.ptt
@@ -0,0 +1,27 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 16
+DESC Double-precision dot product of two vectors, only scalar operations
+LOADS 2
+STORES 0
+INSTR_CONST 20
+INSTR_LOOP 15
+UOPS 18
+xorpd FPR1, FPR1
+xorpd FPR6, FPR6
+xorpd FPR7, FPR7
+xorpd FPR8, FPR8
+LOOP 4
+movsd FPR2, [STR0 + GPR1 * 8]
+mulsd FPR2, [STR1 + GPR1 * 8]
+addsd FPR1, FPR2
+movsd FPR3, [STR0 + GPR1 * 8 + 8]
+mulsd FPR3, [STR1 + GPR1 * 8 + 8]
+addsd FPR6, FPR3
+movsd FPR4, [STR0 + GPR1 * 8 + 16]
+mulsd FPR4, [STR1 + GPR1 * 8 + 16]
+addsd FPR7, FPR4
+movsd FPR5, [STR0 + GPR1 * 8 + 24]
+mulsd FPR5, [STR1 + GPR1 * 8 + 24]
+addsd FPR8, FPR5
diff --git a/bench/x86-64/ddot_avx.ptt b/bench/x86-64/ddot_avx.ptt
new file mode 100644
index 0000000..24dc330
--- /dev/null
+++ b/bench/x86-64/ddot_avx.ptt
@@ -0,0 +1,27 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 16
+DESC Double-precision dot product of two vectors, optimized for AVX
+LOADS 2
+STORES 0
+INSTR_CONST 20
+INSTR_LOOP 15
+UOPS 18
+vxorpd ymm0, ymm0, ymm0
+vxorpd ymm5, ymm5, ymm5
+vxorpd ymm6, ymm6, ymm6
+vxorpd ymm7, ymm7, ymm7
+LOOP 16
+vmovaps ymm1, [STR0 + GPR1 * 8]
+vmulpd ymm1, ymm1, [STR1 + GPR1 * 8]
+vaddpd ymm0, ymm0, ymm1
+vmovaps ymm2, [STR0 + GPR1 * 8 + 32]
+vmulpd ymm2, ymm2, [STR1 + GPR1 * 8 + 32]
+vaddpd ymm5, ymm5, ymm2
+vmovaps ymm3, [STR0 + GPR1 * 8 + 64]
+vmulpd ymm3, ymm3, [STR1 + GPR1 * 8 + 64]
+vaddpd ymm6, ymm6, ymm3
+vmovaps ymm4, [STR0 + GPR1 * 8 + 96]
+vmulpd ymm4, ymm4, [STR1 + GPR1 * 8 + 96]
+vaddpd ymm7, ymm7, ymm4
diff --git a/bench/x86-64/ddot_sp.ptt b/bench/x86-64/ddot_sp.ptt
new file mode 100644
index 0000000..4a108b9
--- /dev/null
+++ b/bench/x86-64/ddot_sp.ptt
@@ -0,0 +1,27 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 8
+DESC Single-precision dot product of two vectors, only scalar operations
+LOADS 2
+STORES 0
+INSTR_CONST 20
+INSTR_LOOP 15
+UOPS 18
+xorps FPR1, FPR1
+xorps FPR6, FPR6
+xorps FPR7, FPR7
+xorps FPR8, FPR8
+LOOP 4
+movss FPR2, [STR0 + GPR1 * 4]
+mulss FPR2, [STR1 + GPR1 * 4]
+addss FPR1, FPR2
+movss FPR3, [STR0 + GPR1 * 4 + 4]
+mulss FPR3, [STR1 + GPR1 * 4 + 4]
+addss FPR6, FPR3
+movss FPR4, [STR0 + GPR1 * 4 + 8]
+mulss FPR4, [STR1 + GPR1 * 4 + 8]
+addss FPR7, FPR4
+movss FPR5, [STR0 + GPR1 * 4 + 12]
+mulss FPR5, [STR1 + GPR1 * 4 + 12]
+addss FPR8, FPR5
diff --git a/bench/x86-64/ddot_sp_avx.ptt b/bench/x86-64/ddot_sp_avx.ptt
new file mode 100644
index 0000000..45afad6
--- /dev/null
+++ b/bench/x86-64/ddot_sp_avx.ptt
@@ -0,0 +1,19 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 8
+DESC Single-precision dot product of two vectors, optimized for AVX
+LOADS 2
+STORES 0
+INSTR_CONST 18
+INSTR_LOOP 9
+UOPS 10
+vxorps ymm0, ymm0, ymm0
+vxorps ymm3, ymm3, ymm3
+LOOP 16
+vmovaps ymm1, [STR0 + GPR1 * 4]
+vmulps ymm1, ymm1, [STR1 + GPR1 * 4]
+vaddps ymm0, ymm0, ymm1
+vmovaps ymm2, [STR0 + GPR1 * 4 + 32]
+vmulps ymm2, ymm2, [STR1 + GPR1 * 4 + 32]
+vaddps ymm3, ymm3, ymm2
diff --git a/bench/x86-64/ddot_sp_sse.ptt b/bench/x86-64/ddot_sp_sse.ptt
new file mode 100644
index 0000000..b445cb3
--- /dev/null
+++ b/bench/x86-64/ddot_sp_sse.ptt
@@ -0,0 +1,19 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 8
+DESC Single-precision dot product of two vectors, optimized for SSE
+LOADS 2
+STORES 0
+INSTR_CONST 18
+INSTR_LOOP 9
+UOPS 10
+xorps FPR1, FPR1
+xorps FPR4, FPR4
+LOOP 8
+movaps FPR2, [STR0 + GPR1 * 4]
+mulps FPR2, [STR1 + GPR1 * 4]
+addps FPR1, FPR2
+movaps FPR3, [STR0 + GPR1 * 4 + 16]
+mulps FPR3, [STR1 + GPR1 * 4 + 16]
+addps FPR4, FPR3
diff --git a/bench/x86-64/ddot_sse.ptt b/bench/x86-64/ddot_sse.ptt
new file mode 100644
index 0000000..42ff526
--- /dev/null
+++ b/bench/x86-64/ddot_sse.ptt
@@ -0,0 +1,27 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 16
+DESC Double-precision dot product of two vectors, optimized for SSE
+LOADS 2
+STORES 0
+INSTR_CONST 20
+INSTR_LOOP 15
+UOPS 18
+xorpd FPR1, FPR1
+xorpd FPR6, FPR6
+xorpd FPR7, FPR7
+xorpd FPR8, FPR8
+LOOP 8
+movaps FPR2, [STR0 + GPR1 * 8]
+mulpd FPR2, [STR1 + GPR1 * 8]
+addpd FPR1, FPR2
+movaps FPR3, [STR0 + GPR1 * 8 + 16]
+mulpd FPR3, [STR1 + GPR1 * 8 + 16]
+addpd FPR6, FPR3
+movaps FPR4, [STR0 + GPR1 * 8 + 32]
+mulpd FPR4, [STR1 + GPR1 * 8 + 32]
+addpd FPR7, FPR4
+movaps FPR5, [STR0 + GPR1 * 8 + 48]
+mulpd FPR5, [STR1 + GPR1 * 8 + 48]
+addpd FPR8, FPR5
diff --git a/bench/x86-64/load.ptt b/bench/x86-64/load.ptt
index 36aaab1..eb1b954 100644
--- a/bench/x86-64/load.ptt
+++ b/bench/x86-64/load.ptt
@@ -2,11 +2,19 @@ STREAMS 1
TYPE DOUBLE
FLOPS 0
BYTES 8
+DESC Double-precision load, only scalar operations
+LOADS 1
+STORES 0
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 10
LOOP 8
-mov GPR12, [STR0 + GPR1 * 8 + 256]
-movaps FPR1, [STR0 + GPR1 * 8]
-movaps FPR2, [STR0 + GPR1 * 8 + 16]
-movaps FPR3, [STR0 + GPR1 * 8 + 32]
-movaps FPR4, [STR0 + GPR1 * 8 + 48]
-
+movsd FPR1, [STR0 + GPR1 * 8]
+movsd FPR2, [STR0 + GPR1 * 8 + 8]
+movsd FPR3, [STR0 + GPR1 * 8 + 16]
+movsd FPR4, [STR0 + GPR1 * 8 + 24]
+movsd FPR5, [STR0 + GPR1 * 8 + 32]
+movsd FPR6, [STR0 + GPR1 * 8 + 40]
+movsd FPR7, [STR0 + GPR1 * 8 + 48]
+movsd FPR8, [STR0 + GPR1 * 8 + 56]
diff --git a/bench/x86-64/load_avx.ptt b/bench/x86-64/load_avx.ptt
index 93b45c7..7ce7989 100644
--- a/bench/x86-64/load_avx.ptt
+++ b/bench/x86-64/load_avx.ptt
@@ -2,8 +2,14 @@ STREAMS 1
TYPE DOUBLE
FLOPS 0
BYTES 8
+DESC Double-precision load, optimized for AVX
+LOADS 1
+STORES 0
+INSTR_CONST 16
+INSTR_LOOP 7
+UOPS 6
LOOP 16
-mov GPR12, [STR0 + GPR1 * 8 + 256]
+#mov GPR12, [STR0 + GPR1 * 8 + 256]
vmovaps ymm1, [STR0 + GPR1 * 8]
vmovaps ymm2, [STR0 + GPR1 * 8 + 32]
vmovaps ymm3, [STR0 + GPR1 * 8 + 64]
diff --git a/bench/x86-64/load_mem.ptt b/bench/x86-64/load_mem.ptt
new file mode 100644
index 0000000..06b0440
--- /dev/null
+++ b/bench/x86-64/load_mem.ptt
@@ -0,0 +1,15 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 0
+BYTES 8
+DESC Double-precision load, using non-temporal loads
+LOADS 1
+STORES 0
+INSTR_CONST 16
+INSTR_LOOP 7
+UOPS 6
+LOOP 8
+MOVNTDQA FPR1, [STR0 + GPR1 * 8]
+MOVNTDQA FPR2, [STR0 + GPR1 * 8 + 16]
+MOVNTDQA FPR3, [STR0 + GPR1 * 8 + 32]
+MOVNTDQA FPR4, [STR0 + GPR1 * 8 + 48]
diff --git a/bench/x86-64/load_plain.ptt b/bench/x86-64/load_plain.ptt
deleted file mode 100644
index be6d21c..0000000
--- a/bench/x86-64/load_plain.ptt
+++ /dev/null
@@ -1,12 +0,0 @@
-STREAMS 1
-TYPE DOUBLE
-FLOPS 0
-BYTES 8
-LOOP 4
-mov GPR12, [STR0 + GPR1 * 8 + 256]
-movsd FPR1, [STR0 + GPR1 * 8]
-movsd FPR2, [STR0 + GPR1 * 8 + 8]
-movsd FPR3, [STR0 + GPR1 * 8 + 16]
-movsd FPR4, [STR0 + GPR1 * 8 + 24]
-
-
diff --git a/bench/x86-64/load_sse.ptt b/bench/x86-64/load_sse.ptt
index 36aaab1..fa95f51 100644
--- a/bench/x86-64/load_sse.ptt
+++ b/bench/x86-64/load_sse.ptt
@@ -2,8 +2,14 @@ STREAMS 1
TYPE DOUBLE
FLOPS 0
BYTES 8
+DESC Double-precision load, optimized for SSE
+LOADS 1
+STORES 0
+INSTR_CONST 16
+INSTR_LOOP 7
+UOPS 6
LOOP 8
-mov GPR12, [STR0 + GPR1 * 8 + 256]
+#mov GPR12, [STR0 + GPR1 * 8 + 256]
movaps FPR1, [STR0 + GPR1 * 8]
movaps FPR2, [STR0 + GPR1 * 8 + 16]
movaps FPR3, [STR0 + GPR1 * 8 + 32]
diff --git a/bench/x86-64/peak.ptt b/bench/x86-64/peak.ptt
deleted file mode 100644
index c03e2c8..0000000
--- a/bench/x86-64/peak.ptt
+++ /dev/null
@@ -1,49 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 8
-movaps FPR1, [SCALAR]
-sub GPR2, 4
-sub STR0, 32
-sub STR1, 32
-mov GPR1, GPR2
-neg GPR1
-.align 16
-1:
-movaps FPR2, [STR0 + GPR1 * 8 ]
-addpd FPR2, FPR1
-mulpd FPR2, FPR1
-movaps FPR6, [STR0 + GPR1 * 8 ]
-addpd FPR2, FPR1
-mulpd FPR2, FPR1
-pshufd FPR2, FPR1, 0x1
-#movaps [STR1 + GPR1 * 8], FPR2
-movaps FPR3, [STR0 + GPR1 * 8 + 16]
-addpd FPR3, FPR1
-mulpd FPR3, FPR1
-movaps FPR7, [STR0 + GPR1 * 8 + 16 ]
-addpd FPR3, FPR1
-mulpd FPR3, FPR1
-pshufd FPR3, FPR1, 0x1
-#movaps [STR1 + GPR1 * 8 + 16], FPR3
-movaps FPR4, [STR0 + GPR1 * 8 + 32]
-addpd FPR4, FPR1
-mulpd FPR4, FPR1
-movaps FPR8, [STR0 + GPR1 * 8 + 32 ]
-addpd FPR4, FPR1
-mulpd FPR4, FPR1
-pshufd FPR4, FPR1, 0x1
-#movaps [STR1 + GPR1 * 8 + 32], FPR4
-movaps FPR5, [STR0 + GPR1 * 8 + 48]
-addpd FPR5, FPR1
-mulpd FPR5, FPR1
-movaps FPR9, [STR0 + GPR1 * 8 + 48 ]
-addpd FPR5, FPR1
-mulpd FPR5, FPR1
-pshufd FPR5, FPR1, 0x1
-#movaps [STR1 + GPR1 * 8 + 48], FPR5
-add GPR1, 8
-js 1b
-
-
diff --git a/bench/x86-64/peak_avx.ptt b/bench/x86-64/peak_avx.ptt
deleted file mode 100644
index 047178e..0000000
--- a/bench/x86-64/peak_avx.ptt
+++ /dev/null
@@ -1,49 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 16
-vmovaps ymm1, [SCALAR]
-sub GPR2, 8
-sub STR0, 64
-sub STR1, 64
-mov GPR1, GPR2
-neg GPR1
-.align 32
-1:
-vmovaps ymm2, [STR0 + GPR1 * 8 ]
-vaddpd ymm2, ymm2, ymm1
-vmulpd ymm2, ymm2, ymm1
-vmovaps ymm6, [STR0 + GPR1 * 8 ]
-vaddpd ymm2, ymm2, ymm1
-vmulpd ymm2, ymm2, ymm1
-#vpshufd ymm2, ymm1, 0x1
-vmovaps [STR1 + GPR1 * 8], ymm2
-vmovaps ymm3, [STR0 + GPR1 * 8 + 32]
-vaddpd ymm3, ymm3, ymm1
-vmulpd ymm3, ymm3, ymm1
-vmovaps ymm7, [STR0 + GPR1 * 8 + 32 ]
-vaddpd ymm3, ymm3, ymm1
-vmulpd ymm3, ymm3, ymm1
-#vpshufd ymm3, ymm1, 0x1
-vmovaps [STR1 + GPR1 * 8 + 32], ymm3
-vmovaps ymm4, [STR0 + GPR1 * 8 + 64]
-vaddpd ymm4, ymm4, ymm1
-vmulpd ymm4, ymm4, ymm1
-vmovaps ymm8, [STR0 + GPR1 * 8 + 64 ]
-vaddpd ymm4, ymm4, ymm1
-vmulpd ymm4, ymm4, ymm1
-#vpshufd ymm4, ymm1, 0x1
-vmovaps [STR1 + GPR1 * 8 + 32], ymm4
-vmovaps ymm5, [STR0 + GPR1 * 8 + 96]
-vaddpd ymm5, ymm5, ymm1
-vmulpd ymm5, ymm5, ymm1
-vmovaps ymm9, [STR0 + GPR1 * 8 + 96]
-vaddpd ymm5, ymm5, ymm1
-vmulpd ymm5, ymm5, ymm1
-#vpshufd ymm5, ymm1, 0x1
-vmovaps [STR1 + GPR1 * 8 + 96], ymm5
-add GPR1, 16
-js 1b
-
-
diff --git a/bench/x86-64/peak_sse.ptt b/bench/x86-64/peak_sse.ptt
deleted file mode 100644
index c03e2c8..0000000
--- a/bench/x86-64/peak_sse.ptt
+++ /dev/null
@@ -1,49 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 8
-movaps FPR1, [SCALAR]
-sub GPR2, 4
-sub STR0, 32
-sub STR1, 32
-mov GPR1, GPR2
-neg GPR1
-.align 16
-1:
-movaps FPR2, [STR0 + GPR1 * 8 ]
-addpd FPR2, FPR1
-mulpd FPR2, FPR1
-movaps FPR6, [STR0 + GPR1 * 8 ]
-addpd FPR2, FPR1
-mulpd FPR2, FPR1
-pshufd FPR2, FPR1, 0x1
-#movaps [STR1 + GPR1 * 8], FPR2
-movaps FPR3, [STR0 + GPR1 * 8 + 16]
-addpd FPR3, FPR1
-mulpd FPR3, FPR1
-movaps FPR7, [STR0 + GPR1 * 8 + 16 ]
-addpd FPR3, FPR1
-mulpd FPR3, FPR1
-pshufd FPR3, FPR1, 0x1
-#movaps [STR1 + GPR1 * 8 + 16], FPR3
-movaps FPR4, [STR0 + GPR1 * 8 + 32]
-addpd FPR4, FPR1
-mulpd FPR4, FPR1
-movaps FPR8, [STR0 + GPR1 * 8 + 32 ]
-addpd FPR4, FPR1
-mulpd FPR4, FPR1
-pshufd FPR4, FPR1, 0x1
-#movaps [STR1 + GPR1 * 8 + 32], FPR4
-movaps FPR5, [STR0 + GPR1 * 8 + 48]
-addpd FPR5, FPR1
-mulpd FPR5, FPR1
-movaps FPR9, [STR0 + GPR1 * 8 + 48 ]
-addpd FPR5, FPR1
-mulpd FPR5, FPR1
-pshufd FPR5, FPR1, 0x1
-#movaps [STR1 + GPR1 * 8 + 48], FPR5
-add GPR1, 8
-js 1b
-
-
diff --git a/bench/x86-64/peakflops.ptt b/bench/x86-64/peakflops.ptt
deleted file mode 100644
index 94c769a..0000000
--- a/bench/x86-64/peakflops.ptt
+++ /dev/null
@@ -1,37 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 8
-movaps FPR1, [SCALAR]
-sub GPR2, 4
-sub STR0, 32
-sub STR1, 32
-mov GPR1, GPR2
-neg GPR1
-.align 32
-1:
-movaps FPR2, [STR0 + GPR1 * 8 ]
-addpd FPR2, FPR1
-mulpd FPR2, FPR1
-addpd FPR2, FPR1
-mulpd FPR2, FPR1
-movaps FPR3, [STR0 + GPR1 * 8 + 16]
-add GPR1, 8
-addpd FPR3, FPR1
-mulpd FPR3, FPR1
-addpd FPR3, FPR1
-mulpd FPR3, FPR1
-movaps FPR4, [STR0 + GPR1 * 8 - 32]
-addpd FPR4, FPR1
-mulpd FPR4, FPR1
-addpd FPR4, FPR1
-mulpd FPR4, FPR1
-movaps FPR5, [STR0 + GPR1 * 8 - 16]
-addpd FPR5, FPR1
-mulpd FPR5, FPR1
-addpd FPR5, FPR1
-mulpd FPR5, FPR1
-js 1b
-
-
diff --git a/bench/x86-64/peakflops_avx.ptt b/bench/x86-64/peakflops_avx.ptt
deleted file mode 100644
index d9f9885..0000000
--- a/bench/x86-64/peakflops_avx.ptt
+++ /dev/null
@@ -1,37 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 16
-vmovaps ymm1, [SCALAR]
-sub GPR2, 8
-sub STR0, 64
-sub STR1, 64
-mov GPR1, GPR2
-neg GPR1
-.align 32
-1:
-vmovaps ymm2, [STR0 + GPR1 * 8 ]
-vaddpd ymm2, ymm2, ymm1
-vmulpd ymm2, ymm2, ymm1
-vaddpd ymm2, ymm2, ymm1
-vmulpd ymm2, ymm2, ymm1
-vmovaps ymm3, [STR0 + GPR1 * 8 + 32]
-add GPR1, 16
-vaddpd ymm3, ymm3, ymm1
-vmulpd ymm3, ymm3, ymm1
-vaddpd ymm3, ymm3, ymm1
-vmulpd ymm3, ymm3, ymm1
-vmovaps ymm4, [STR0 + GPR1 * 8 - 64]
-vaddpd ymm4, ymm4, ymm1
-vmulpd ymm4, ymm4, ymm1
-vaddpd ymm4, ymm4, ymm1
-vmulpd ymm4, ymm4, ymm1
-vmovaps ymm5, [STR0 + GPR1 * 8 - 32]
-vaddpd ymm5, ymm5, ymm1
-vmulpd ymm5, ymm5, ymm1
-vaddpd ymm5, ymm5, ymm1
-vmulpd ymm5, ymm5, ymm1
-js 1b
-
-
diff --git a/bench/x86-64/peakflops_sse.ptt b/bench/x86-64/peakflops_sse.ptt
deleted file mode 100644
index 94c769a..0000000
--- a/bench/x86-64/peakflops_sse.ptt
+++ /dev/null
@@ -1,37 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 8
-movaps FPR1, [SCALAR]
-sub GPR2, 4
-sub STR0, 32
-sub STR1, 32
-mov GPR1, GPR2
-neg GPR1
-.align 32
-1:
-movaps FPR2, [STR0 + GPR1 * 8 ]
-addpd FPR2, FPR1
-mulpd FPR2, FPR1
-addpd FPR2, FPR1
-mulpd FPR2, FPR1
-movaps FPR3, [STR0 + GPR1 * 8 + 16]
-add GPR1, 8
-addpd FPR3, FPR1
-mulpd FPR3, FPR1
-addpd FPR3, FPR1
-mulpd FPR3, FPR1
-movaps FPR4, [STR0 + GPR1 * 8 - 32]
-addpd FPR4, FPR1
-mulpd FPR4, FPR1
-addpd FPR4, FPR1
-mulpd FPR4, FPR1
-movaps FPR5, [STR0 + GPR1 * 8 - 16]
-addpd FPR5, FPR1
-mulpd FPR5, FPR1
-addpd FPR5, FPR1
-mulpd FPR5, FPR1
-js 1b
-
-
diff --git a/bench/x86-64/store.ptt b/bench/x86-64/store.ptt
index 4ef9ab9..196f9dc 100644
--- a/bench/x86-64/store.ptt
+++ b/bench/x86-64/store.ptt
@@ -1,15 +1,20 @@
STREAMS 1
TYPE DOUBLE
FLOPS 0
+DESC Double-precision store, only scalar operations
BYTES 8
-movaps FPR1, [SCALAR]
-movaps FPR2, [SCALAR]
-movaps FPR3, [SCALAR]
-movaps FPR4, [SCALAR]
-LOOP 8
-#mov GPR14, [STR0 + GPR1 * 8 + 256]
-movaps [STR0 + GPR1 * 8] , FPR1
-movaps [STR0 + GPR1 * 8 + 16], FPR2
-movaps [STR0 + GPR1 * 8 + 32], FPR3
-movaps [STR0 + GPR1 * 8 + 48], FPR4
+LOADS 0
+STORES 1
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
+movsd FPR1, [rip+SCALAR]
+movsd FPR2, [rip+SCALAR]
+movsd FPR3, [rip+SCALAR]
+movsd FPR4, [rip+SCALAR]
+LOOP 4
+movsd [STR0 + GPR1 * 8] , FPR1
+movsd [STR0 + GPR1 * 8 + 8] , FPR2
+movsd [STR0 + GPR1 * 8 + 16], FPR3
+movsd [STR0 + GPR1 * 8 + 24], FPR4
diff --git a/bench/x86-64/store_avx.ptt b/bench/x86-64/store_avx.ptt
index 7b589a8..71ba7e1 100644
--- a/bench/x86-64/store_avx.ptt
+++ b/bench/x86-64/store_avx.ptt
@@ -2,12 +2,17 @@ STREAMS 1
TYPE DOUBLE
FLOPS 0
BYTES 8
-vmovaps ymm1, [SCALAR]
-vmovaps ymm2, [SCALAR]
-vmovaps ymm3, [SCALAR]
-vmovaps ymm4, [SCALAR]
+DESC Double-precision store, optimized for AVX
+LOADS 0
+STORES 1
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
+vmovaps ymm1, [rip+SCALAR]
+vmovaps ymm2, [rip+SCALAR]
+vmovaps ymm3, [rip+SCALAR]
+vmovaps ymm4, [rip+SCALAR]
LOOP 16
-#mov GPR14, [STR0 + GPR1 * 8 + 256]
vmovaps [STR0 + GPR1 * 8] , ymm1
vmovaps [STR0 + GPR1 * 8 + 32], ymm2
vmovaps [STR0 + GPR1 * 8 + 64], ymm3
diff --git a/bench/x86-64/store_mem.ptt b/bench/x86-64/store_mem.ptt
index 0a0222d..4b511c0 100644
--- a/bench/x86-64/store_mem.ptt
+++ b/bench/x86-64/store_mem.ptt
@@ -1,11 +1,17 @@
STREAMS 1
TYPE DOUBLE
FLOPS 0
+DESC Double-precision store, uses non-temporal stores
+LOADS 0
+STORES 1
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
BYTES 8
-movaps FPR1, [SCALAR]
-movaps FPR2, [SCALAR]
-movaps FPR3, [SCALAR]
-movaps FPR4, [SCALAR]
+movaps FPR1, [rip+SCALAR]
+movaps FPR2, [rip+SCALAR]
+movaps FPR3, [rip+SCALAR]
+movaps FPR4, [rip+SCALAR]
LOOP 8
movntpd [STR0 + GPR1 * 8] , FPR1
movntpd [STR0 + GPR1 * 8 + 16], FPR2
diff --git a/bench/x86-64/store_mem_avx.ptt b/bench/x86-64/store_mem_avx.ptt
index e023fd0..c4dd0a4 100644
--- a/bench/x86-64/store_mem_avx.ptt
+++ b/bench/x86-64/store_mem_avx.ptt
@@ -2,10 +2,16 @@ STREAMS 1
TYPE DOUBLE
FLOPS 0
BYTES 8
-vmovaps ymm1, [SCALAR]
-vmovaps ymm2, [SCALAR]
-vmovaps ymm3, [SCALAR]
-vmovaps ymm4, [SCALAR]
+DESC Double-precision store, uses AVX and non-temporal stores
+LOADS 0
+STORES 1
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
+vmovaps ymm1, [rip+SCALAR]
+vmovaps ymm2, [rip+SCALAR]
+vmovaps ymm3, [rip+SCALAR]
+vmovaps ymm4, [rip+SCALAR]
LOOP 16
vmovntpd [STR0 + GPR1 * 8] , ymm1
vmovntpd [STR0 + GPR1 * 8 + 32], ymm2
diff --git a/bench/x86-64/store_mem_sse.ptt b/bench/x86-64/store_mem_sse.ptt
index 0a0222d..54aeed3 100644
--- a/bench/x86-64/store_mem_sse.ptt
+++ b/bench/x86-64/store_mem_sse.ptt
@@ -2,10 +2,16 @@ STREAMS 1
TYPE DOUBLE
FLOPS 0
BYTES 8
-movaps FPR1, [SCALAR]
-movaps FPR2, [SCALAR]
-movaps FPR3, [SCALAR]
-movaps FPR4, [SCALAR]
+DESC Double-precision store, uses SSE and non-temporal stores
+LOADS 0
+STORES 1
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
+movaps FPR1, [rip+SCALAR]
+movaps FPR2, [rip+SCALAR]
+movaps FPR3, [rip+SCALAR]
+movaps FPR4, [rip+SCALAR]
LOOP 8
movntpd [STR0 + GPR1 * 8] , FPR1
movntpd [STR0 + GPR1 * 8 + 16], FPR2
diff --git a/bench/x86-64/store_plain.ptt b/bench/x86-64/store_plain.ptt
deleted file mode 100644
index 0f667cd..0000000
--- a/bench/x86-64/store_plain.ptt
+++ /dev/null
@@ -1,15 +0,0 @@
-STREAMS 1
-TYPE DOUBLE
-FLOPS 0
-BYTES 8
-movsd FPR1, [SCALAR]
-movsd FPR2, [SCALAR]
-movsd FPR3, [SCALAR]
-movsd FPR4, [SCALAR]
-LOOP 4
-#mov GPR14, [STR0 + GPR1 * 8 + 256]
-movsd [STR0 + GPR1 * 8] , FPR1
-movsd [STR0 + GPR1 * 8 + 8], FPR2
-movsd [STR0 + GPR1 * 8 + 16], FPR3
-movsd [STR0 + GPR1 * 8 + 24], FPR4
-
diff --git a/bench/x86-64/store_sse.ptt b/bench/x86-64/store_sse.ptt
index 4ef9ab9..8e124b2 100644
--- a/bench/x86-64/store_sse.ptt
+++ b/bench/x86-64/store_sse.ptt
@@ -2,12 +2,17 @@ STREAMS 1
TYPE DOUBLE
FLOPS 0
BYTES 8
-movaps FPR1, [SCALAR]
-movaps FPR2, [SCALAR]
-movaps FPR3, [SCALAR]
-movaps FPR4, [SCALAR]
+DESC Double-precision store, optimized for SSE
+LOADS 0
+STORES 1
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
+movaps FPR1, [rip+SCALAR]
+movaps FPR2, [rip+SCALAR]
+movaps FPR3, [rip+SCALAR]
+movaps FPR4, [rip+SCALAR]
LOOP 8
-#mov GPR14, [STR0 + GPR1 * 8 + 256]
movaps [STR0 + GPR1 * 8] , FPR1
movaps [STR0 + GPR1 * 8 + 16], FPR2
movaps [STR0 + GPR1 * 8 + 32], FPR3
diff --git a/bench/x86-64/stream.ptt b/bench/x86-64/stream.ptt
index 7c84c3c..554243a 100644
--- a/bench/x86-64/stream.ptt
+++ b/bench/x86-64/stream.ptt
@@ -2,22 +2,28 @@ STREAMS 3
TYPE DOUBLE
FLOPS 2
BYTES 24
-movaps FPR5, [SCALAR]
-LOOP 8
-movaps FPR1, [STR1 + GPR1*8]
-movaps FPR2, [STR1 + GPR1*8+16]
-movaps FPR3, [STR1 + GPR1*8+32]
-movaps FPR4, [STR1 + GPR1*8+48]
-mulpd FPR1, FPR5
-addpd FPR1, [STR2 + GPR1*8]
-mulpd FPR2, FPR5
-addpd FPR2, [STR2 + GPR1*8+16]
-mulpd FPR3, FPR5
-addpd FPR3, [STR2 + GPR1*8+32]
-mulpd FPR4, FPR5
-addpd FPR4, [STR2 + GPR1*8+48]
-movaps [STR0 + GPR1*8] , FPR1
-movaps [STR0 + GPR1*8+16], FPR2
-movaps [STR0 + GPR1*8+32], FPR3
-movaps [STR0 + GPR1*8+48], FPR4
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), only scalar operations
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+movss FPR5, [rip+SCALAR]
+LOOP 4
+movsd FPR1, [STR1 + GPR1*8]
+movsd FPR2, [STR1 + GPR1*8+8]
+movsd FPR3, [STR1 + GPR1*8+16]
+movsd FPR4, [STR1 + GPR1*8+24]
+mulsd FPR1, FPR5
+addsd FPR1, [STR2 + GPR1*8]
+mulsd FPR2, FPR5
+addsd FPR2, [STR2 + GPR1*8+8]
+mulsd FPR3, FPR5
+addsd FPR3, [STR2 + GPR1*8+16]
+mulsd FPR4, FPR5
+addsd FPR4, [STR2 + GPR1*8+24]
+movsd [STR0 + GPR1*8] , FPR1
+movsd [STR0 + GPR1*8+8] , FPR2
+movsd [STR0 + GPR1*8+16], FPR3
+movsd [STR0 + GPR1*8+24], FPR4
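
(The stream.ptt hunk above converts the kernel to purely scalar movsd/mulsd/addsd operations, as
its new DESC line states. The computation all stream*.ptt variants benchmark is the stream triad
A(i) = B(i)*c + C(i); a minimal C reference, with illustrative names, is:)

    #include <stddef.h>

    /* Stream triad: a[i] = b[i]*scalar + c[i].
     * Two loads, one store and two flops per element, which is what the
     * LOADS 2 / STORES 1 / FLOPS 2 metadata in the kernels above encodes
     * (BYTES 24 = three double accesses per iteration). */
    static void stream_triad(double *a, const double *b, const double *c,
                             double scalar, size_t n)
    {
        for (size_t i = 0; i < n; i++)
            a[i] = b[i] * scalar + c[i];
    }
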
diff --git a/bench/x86-64/stream_avx.ptt b/bench/x86-64/stream_avx.ptt
index 8fbaf7c..0ebbb74 100644
--- a/bench/x86-64/stream_avx.ptt
+++ b/bench/x86-64/stream_avx.ptt
@@ -1,22 +1,29 @@
STREAMS 3
-TYPE SINGLE
-FLOPS 4
-BYTES 48
-vbroadcastss ymm1, [SCALAR]
-LOOP 8
-vmovaps ymm2, [STR1 + GPR1*8]
-vmovaps ymm3, [STR1 + GPR1*8+16]
-vmovaps ymm4, [STR1 + GPR1*8+32]
-vmovaps ymm5, [STR1 + GPR1*8+48]
-vmulps ymm2, ymm2, ymm1
-vaddps ymm2, ymm2, [STR2 + GPR1*8]
-vmulps ymm3, ymm3, ymm1
-vaddps ymm3, ymm3, [STR2 + GPR1*8]
-vmulps ymm4, ymm4, ymm1
-vaddps ymm4, ymm4, [STR2 + GPR1*8]
-vmulps ymm5, ymm5, ymm1
-vaddps ymm5, ymm5, [STR2 + GPR1*8]
-vmovaps [STR0 + GPR1*8], ymm2
-vmovaps [STR0 + GPR1*8+16], ymm3
-vmovaps [STR0 + GPR1*8+32], ymm4
-vmovaps [STR0 + GPR1*8+48], ymm5
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), optimized for AVX
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+vmovaps ymm5, [rip+SCALAR]
+LOOP 16
+vmovaps ymm1, [STR1 + GPR1*8]
+vmovaps ymm2, [STR1 + GPR1*8+32]
+vmovaps ymm3, [STR1 + GPR1*8+64]
+vmovaps ymm4, [STR1 + GPR1*8+96]
+vmulpd ymm1, ymm1, ymm5
+vaddpd ymm1, ymm1, [STR2 + GPR1*8]
+vmulpd ymm2, ymm2, ymm5
+vaddpd ymm2, ymm2, [STR2 + GPR1*8+32]
+vmulpd ymm3, ymm3, ymm5
+vaddpd ymm3, ymm3, [STR2 + GPR1*8+64]
+vmulpd ymm4, ymm4, ymm5
+vaddpd ymm4, ymm4, [STR2 + GPR1*8+96]
+vmovaps [STR0 + GPR1*8] , ymm1
+vmovaps [STR0 + GPR1*8+32], ymm2
+vmovaps [STR0 + GPR1*8+64], ymm3
+vmovaps [STR0 + GPR1*8+96], ymm4
+
diff --git a/bench/x86-64/stream_avx_fma.ptt b/bench/x86-64/stream_avx_fma.ptt
new file mode 100644
index 0000000..a868f61
--- /dev/null
+++ b/bench/x86-64/stream_avx_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), optimized for AVX FMAs
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+vmovaps ymm5, [rip+SCALAR]
+LOOP 16
+vmovaps ymm1, [STR1 + GPR1*8]
+vmovaps ymm2, [STR1 + GPR1*8+32]
+vmovaps ymm3, [STR1 + GPR1*8+64]
+vmovaps ymm4, [STR1 + GPR1*8+96]
+vfmadd213pd ymm1, ymm5, [STR2 + GPR1*8]
+vfmadd213pd ymm2, ymm5, [STR2 + GPR1*8+32]
+vfmadd213pd ymm3, ymm5, [STR2 + GPR1*8+64]
+vfmadd213pd ymm4, ymm5, [STR2 + GPR1*8+96]
+vmovaps [STR0 + GPR1*8] , ymm1
+vmovaps [STR0 + GPR1*8+32], ymm2
+vmovaps [STR0 + GPR1*8+64], ymm3
+vmovaps [STR0 + GPR1*8+96], ymm4
diff --git a/bench/x86-64/stream_mem.ptt b/bench/x86-64/stream_mem.ptt
index b8364cc..fd0f8f1 100644
--- a/bench/x86-64/stream_mem.ptt
+++ b/bench/x86-64/stream_mem.ptt
@@ -2,10 +2,27 @@ STREAMS 3
TYPE DOUBLE
FLOPS 2
BYTES 24
-movaps FPR5, [SCALAR]
-LOOP 2
-movaps FPR1, [STR2 + GPR1*8]
-mulpd FPR1, FPR5
-addpd FPR1, [STR1 + GPR1*8]
-movntpd [STR0 + GPR1*8], FPR1
-
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), uses SSE and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 7
+UOPS 8
+movsd FPR5, [rip+SCALAR]
+LOOP 4
+movsd FPR1, [STR1 + GPR1*8]
+movsd FPR2, [STR1 + GPR1*8+8]
+movsd FPR3, [STR1 + GPR1*8+16]
+movsd FPR4, [STR1 + GPR1*8+24]
+mulsd FPR1, FPR5
+addsd FPR1, [STR2 + GPR1*8]
+mulsd FPR2, FPR5
+addsd FPR2, [STR2 + GPR1*8+8]
+mulsd FPR3, FPR5
+addsd FPR3, [STR2 + GPR1*8+16]
+mulsd FPR4, FPR5
+addsd FPR4, [STR2 + GPR1*8+24]
+movntdq [STR0 + GPR1*8], FPR1
+movntdq [STR0 + GPR1*8+8], FPR2
+movntdq [STR0 + GPR1*8+16], FPR3
+movntdq [STR0 + GPR1*8+24], FPR4
diff --git a/bench/x86-64/stream_mem_avx.ptt b/bench/x86-64/stream_mem_avx.ptt
new file mode 100644
index 0000000..1a138f4
--- /dev/null
+++ b/bench/x86-64/stream_mem_avx.ptt
@@ -0,0 +1,17 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), uses AVX and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 7
+UOPS 8
+vmovaps ymm5, [rip+SCALAR]
+LOOP 4
+vmovaps ymm1, [STR2 + GPR1*8]
+vmulpd ymm1, ymm1, ymm5
+vaddpd ymm1, ymm1, [STR1 + GPR1*8]
+vmovntpd [STR0 + GPR1*8], ymm1
+
diff --git a/bench/x86-64/stream_mem_avx_fma.ptt b/bench/x86-64/stream_mem_avx_fma.ptt
new file mode 100644
index 0000000..05bbbc2
--- /dev/null
+++ b/bench/x86-64/stream_mem_avx_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), optimized for AVX FMAs and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+vmovaps ymm5, [rip+SCALAR]
+LOOP 16
+vmovaps ymm1, [STR1 + GPR1*8]
+vmovaps ymm2, [STR1 + GPR1*8+32]
+vmovaps ymm3, [STR1 + GPR1*8+64]
+vmovaps ymm4, [STR1 + GPR1*8+96]
+vfmadd213pd ymm1, ymm5, [STR2 + GPR1*8]
+vfmadd213pd ymm2, ymm5, [STR2 + GPR1*8+32]
+vfmadd213pd ymm3, ymm5, [STR2 + GPR1*8+64]
+vfmadd213pd ymm4, ymm5, [STR2 + GPR1*8+96]
+vmovntpd [STR0 + GPR1*8], ymm1
+vmovntpd [STR0 + GPR1*8+32], ymm2
+vmovntpd [STR0 + GPR1*8+64], ymm3
+vmovntpd [STR0 + GPR1*8+96], ymm4
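
(stream_mem_avx_fma.ptt above is a new kernel combining AVX FMAs with non-temporal stores. A
hedged intrinsics sketch of the same operation follows; it assumes standard immintrin.h
intrinsics, 32-byte aligned arrays and a length that is a multiple of 4, and is not the
likwid-bench implementation, which is assembled from the .ptt file itself.)

    #include <immintrin.h>
    #include <stddef.h>

    /* AVX-FMA stream triad with cache-bypassing stores:
     * a[i] = b[i]*scalar + c[i], four doubles per vector. */
    static void stream_triad_avx_fma_nt(double *a, const double *b,
                                        const double *c, double scalar,
                                        size_t n)
    {
        __m256d s = _mm256_set1_pd(scalar);
        for (size_t i = 0; i < n; i += 4) {
            __m256d bv = _mm256_load_pd(&b[i]);   /* aligned load of B */
            __m256d cv = _mm256_load_pd(&c[i]);   /* aligned load of C */
            _mm256_stream_pd(&a[i], _mm256_fmadd_pd(bv, s, cv)); /* NT store */
        }
        _mm_sfence();  /* order the non-temporal stores */
    }
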
diff --git a/bench/x86-64/stream_mem_sse.ptt b/bench/x86-64/stream_mem_sse.ptt
new file mode 100644
index 0000000..6b7106a
--- /dev/null
+++ b/bench/x86-64/stream_mem_sse.ptt
@@ -0,0 +1,17 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), uses SSE and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 7
+UOPS 8
+movaps FPR5, [rip+SCALAR]
+LOOP 2
+movaps FPR1, [STR2 + GPR1*8]
+mulpd FPR1, FPR5
+addpd FPR1, [STR1 + GPR1*8]
+movntpd [STR0 + GPR1*8], FPR1
+
diff --git a/bench/x86-64/stream_mem_sse_fma.ptt b/bench/x86-64/stream_mem_sse_fma.ptt
new file mode 100644
index 0000000..22b2758
--- /dev/null
+++ b/bench/x86-64/stream_mem_sse_fma.ptt
@@ -0,0 +1,15 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), uses SSE FMAs and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 6
+UOPS 7
+movaps FPR5, [rip+SCALAR]
+LOOP 2
+movapd FPR1, [STR2 + GPR1*8]
+vfmadd213pd FPR1, FPR5, [STR1 + GPR1*8]
+movntpd [STR0 + GPR1*8], FPR1
diff --git a/bench/x86-64/stream_sp.ptt b/bench/x86-64/stream_sp.ptt
new file mode 100644
index 0000000..cedba15
--- /dev/null
+++ b/bench/x86-64/stream_sp.ptt
@@ -0,0 +1,45 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), only scalar operations
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 35
+UOPS 50
+movss FPR9, [rip+SCALAR]
+LOOP 8
+movss FPR1, [STR1 + GPR1*4]
+movss FPR2, [STR1 + GPR1*4+4]
+movss FPR3, [STR1 + GPR1*4+8]
+movss FPR4, [STR1 + GPR1*4+12]
+movss FPR5, [STR1 + GPR1*4+16]
+movss FPR6, [STR1 + GPR1*4+20]
+movss FPR7, [STR1 + GPR1*4+24]
+movss FPR8, [STR1 + GPR1*4+28]
+mulss FPR1, FPR9
+addss FPR1, [STR2 + GPR1*4]
+mulss FPR2, FPR9
+addss FPR2, [STR2 + GPR1*4+4]
+mulss FPR3, FPR9
+addss FPR3, [STR2 + GPR1*4+8]
+mulss FPR4, FPR9
+addss FPR4, [STR2 + GPR1*4+12]
+mulss FPR5, FPR9
+addss FPR5, [STR2 + GPR1*4+16]
+mulss FPR6, FPR9
+addss FPR6, [STR2 + GPR1*4+20]
+mulss FPR7, FPR9
+addss FPR7, [STR2 + GPR1*4+24]
+mulss FPR8, FPR9
+addss FPR8, [STR2 + GPR1*4+28]
+movss [STR0 + GPR1*4] , FPR1
+movss [STR0 + GPR1*4+4] , FPR2
+movss [STR0 + GPR1*4+8] , FPR3
+movss [STR0 + GPR1*4+12], FPR4
+movss [STR0 + GPR1*4+16], FPR5
+movss [STR0 + GPR1*4+20], FPR6
+movss [STR0 + GPR1*4+24], FPR7
+movss [STR0 + GPR1*4+28], FPR8
+
diff --git a/bench/x86-64/stream_sp_avx.ptt b/bench/x86-64/stream_sp_avx.ptt
new file mode 100644
index 0000000..f01a6ff
--- /dev/null
+++ b/bench/x86-64/stream_sp_avx.ptt
@@ -0,0 +1,28 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), optimized for AVX
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+vmovaps ymm1, [rip+SCALAR]
+LOOP 32
+vmovaps ymm2, [STR1 + GPR1*4]
+vmovaps ymm3, [STR1 + GPR1*4+32]
+vmovaps ymm4, [STR1 + GPR1*4+64]
+vmovaps ymm5, [STR1 + GPR1*4+96]
+vmulps ymm2, ymm2, ymm1
+vaddps ymm2, ymm2, [STR2 + GPR1*4]
+vmulps ymm3, ymm3, ymm1
+vaddps ymm3, ymm3, [STR2 + GPR1*4+32]
+vmulps ymm4, ymm4, ymm1
+vaddps ymm4, ymm4, [STR2 + GPR1*4+64]
+vmulps ymm5, ymm5, ymm1
+vaddps ymm5, ymm5, [STR2 + GPR1*4+96]
+vmovaps [STR0 + GPR1*4], ymm2
+vmovaps [STR0 + GPR1*4+32], ymm3
+vmovaps [STR0 + GPR1*4+64], ymm4
+vmovaps [STR0 + GPR1*4+96], ymm5
diff --git a/bench/x86-64/stream_sp_avx_fma.ptt b/bench/x86-64/stream_sp_avx_fma.ptt
new file mode 100644
index 0000000..351b84f
--- /dev/null
+++ b/bench/x86-64/stream_sp_avx_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), optimized for AVX FMAs
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+vmovaps ymm1, [rip+SCALAR]
+LOOP 32
+vmovaps ymm2, [STR1 + GPR1*4]
+vmovaps ymm3, [STR1 + GPR1*4+32]
+vmovaps ymm4, [STR1 + GPR1*4+64]
+vmovaps ymm5, [STR1 + GPR1*4+96]
+vfmadd213ps ymm2, ymm1, [STR2 + GPR1*4]
+vfmadd213ps ymm3, ymm1, [STR2 + GPR1*4+32]
+vfmadd213ps ymm4, ymm1, [STR2 + GPR1*4+64]
+vfmadd213ps ymm5, ymm1, [STR2 + GPR1*4+96]
+vmovaps [STR0 + GPR1*4], ymm2
+vmovaps [STR0 + GPR1*4+32], ymm3
+vmovaps [STR0 + GPR1*4+64], ymm4
+vmovaps [STR0 + GPR1*4+96], ymm5
diff --git a/bench/x86-64/stream_sp_mem_avx.ptt b/bench/x86-64/stream_sp_mem_avx.ptt
new file mode 100644
index 0000000..5fee0ec
--- /dev/null
+++ b/bench/x86-64/stream_sp_mem_avx.ptt
@@ -0,0 +1,28 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), optimized for AVX and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+vmovaps ymm1, [rip+SCALAR]
+LOOP 32
+vmovaps ymm2, [STR1 + GPR1*4]
+vmovaps ymm3, [STR1 + GPR1*4+32]
+vmovaps ymm4, [STR1 + GPR1*4+64]
+vmovaps ymm5, [STR1 + GPR1*4+96]
+vmulps ymm2, ymm2, ymm1
+vaddps ymm2, ymm2, [STR2 + GPR1*4]
+vmulps ymm3, ymm3, ymm1
+vaddps ymm3, ymm3, [STR2 + GPR1*4+32]
+vmulps ymm4, ymm4, ymm1
+vaddps ymm4, ymm4, [STR2 + GPR1*4+64]
+vmulps ymm5, ymm5, ymm1
+vaddps ymm5, ymm5, [STR2 + GPR1*4+96]
+vmovntps [STR0 + GPR1*4], ymm2
+vmovntps [STR0 + GPR1*4+32], ymm3
+vmovntps [STR0 + GPR1*4+64], ymm4
+vmovntps [STR0 + GPR1*4+96], ymm5
diff --git a/bench/x86-64/stream_sp_mem_avx_fma.ptt b/bench/x86-64/stream_sp_mem_avx_fma.ptt
new file mode 100644
index 0000000..22c6a4d
--- /dev/null
+++ b/bench/x86-64/stream_sp_mem_avx_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), optimized for AVX FMAs and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+vmovaps ymm1, [rip+SCALAR]
+LOOP 32
+vmovaps ymm2, [STR1 + GPR1*4]
+vmovaps ymm3, [STR1 + GPR1*4+32]
+vmovaps ymm4, [STR1 + GPR1*4+64]
+vmovaps ymm5, [STR1 + GPR1*4+96]
+vfmadd213ps ymm2, ymm1, [STR2 + GPR1*4]
+vfmadd213ps ymm3, ymm1, [STR2 + GPR1*4+32]
+vfmadd213ps ymm4, ymm1, [STR2 + GPR1*4+64]
+vfmadd213ps ymm5, ymm1, [STR2 + GPR1*4+96]
+vmovntps [STR0 + GPR1*4], ymm2
+vmovntps [STR0 + GPR1*4+32], ymm3
+vmovntps [STR0 + GPR1*4+64], ymm4
+vmovntps [STR0 + GPR1*4+96], ymm5
diff --git a/bench/x86-64/stream_sp_mem_sse.ptt b/bench/x86-64/stream_sp_mem_sse.ptt
new file mode 100644
index 0000000..b92c3ae
--- /dev/null
+++ b/bench/x86-64/stream_sp_mem_sse.ptt
@@ -0,0 +1,16 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), optimized for SSE and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 7
+UOPS 8
+movss FPR5, [rip+SCALAR]
+LOOP 4
+movaps FPR1, [STR2 + GPR1*4]
+mulps FPR1, FPR5
+addps FPR1, [STR1 + GPR1*4]
+movntps [STR0 + GPR1*4], FPR1
diff --git a/bench/x86-64/stream_sp_mem_sse_fma.ptt b/bench/x86-64/stream_sp_mem_sse_fma.ptt
new file mode 100644
index 0000000..9ca42ca
--- /dev/null
+++ b/bench/x86-64/stream_sp_mem_sse_fma.ptt
@@ -0,0 +1,15 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), optimized for SSE FMAs and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 6
+UOPS 7
+movss FPR5, [rip+SCALAR]
+LOOP 4
+movaps FPR1, [STR2 + GPR1*4]
+vfmadd213ps FPR1, FPR5, [STR1 + GPR1*4]
+movntps [STR0 + GPR1*4], FPR1
diff --git a/bench/x86-64/stream_sp_sse.ptt b/bench/x86-64/stream_sp_sse.ptt
new file mode 100644
index 0000000..f82e299
--- /dev/null
+++ b/bench/x86-64/stream_sp_sse.ptt
@@ -0,0 +1,16 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), optimized for SSE
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 7
+UOPS 8
+movss FPR5, [rip+SCALAR]
+LOOP 4
+movaps FPR1, [STR2 + GPR1*4]
+mulps FPR1, FPR5
+addps FPR1, [STR1 + GPR1*4]
+movaps [STR0 + GPR1*4], FPR1
diff --git a/bench/x86-64/stream_sp_sse_fma.ptt b/bench/x86-64/stream_sp_sse_fma.ptt
new file mode 100644
index 0000000..28a87d4
--- /dev/null
+++ b/bench/x86-64/stream_sp_sse_fma.ptt
@@ -0,0 +1,15 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), optimized for SSE FMAs
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 6
+UOPS 7
+movss FPR5, [rip+SCALAR]
+LOOP 4
+movaps FPR1, [STR2 + GPR1*4]
+vfmadd213ps FPR1, FPR5, [STR1 + GPR1*4]
+movaps [STR0 + GPR1*4], FPR1
diff --git a/bench/x86-64/stream_sse.ptt b/bench/x86-64/stream_sse.ptt
new file mode 100644
index 0000000..c373336
--- /dev/null
+++ b/bench/x86-64/stream_sse.ptt
@@ -0,0 +1,29 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), optimized for SSE
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+movaps FPR5, [rip+SCALAR]
+LOOP 8
+movaps FPR1, [STR1 + GPR1*8]
+movaps FPR2, [STR1 + GPR1*8+16]
+movaps FPR3, [STR1 + GPR1*8+32]
+movaps FPR4, [STR1 + GPR1*8+48]
+mulpd FPR1, FPR5
+addpd FPR1, [STR2 + GPR1*8]
+mulpd FPR2, FPR5
+addpd FPR2, [STR2 + GPR1*8+16]
+mulpd FPR3, FPR5
+addpd FPR3, [STR2 + GPR1*8+32]
+mulpd FPR4, FPR5
+addpd FPR4, [STR2 + GPR1*8+48]
+movaps [STR0 + GPR1*8] , FPR1
+movaps [STR0 + GPR1*8+16], FPR2
+movaps [STR0 + GPR1*8+32], FPR3
+movaps [STR0 + GPR1*8+48], FPR4
+
diff --git a/bench/x86-64/stream_sse_fma.ptt b/bench/x86-64/stream_sse_fma.ptt
new file mode 100644
index 0000000..7b3a338
--- /dev/null
+++ b/bench/x86-64/stream_sse_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), optimized for SSE FMAs
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+movaps FPR5, [rip+SCALAR]
+LOOP 8
+movaps FPR1, [STR1 + GPR1*8]
+movaps FPR2, [STR1 + GPR1*8+16]
+movaps FPR3, [STR1 + GPR1*8+32]
+movaps FPR4, [STR1 + GPR1*8+48]
+vfmadd213pd FPR1, FPR5, [STR2 + GPR1*8]
+vfmadd213pd FPR2, FPR5, [STR2 + GPR1*8+16]
+vfmadd213pd FPR3, FPR5, [STR2 + GPR1*8+32]
+vfmadd213pd FPR4, FPR5, [STR2 + GPR1*8+48]
+movaps [STR0 + GPR1*8] , FPR1
+movaps [STR0 + GPR1*8+16], FPR2
+movaps [STR0 + GPR1*8+32], FPR3
+movaps [STR0 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/striad_avx.ptt b/bench/x86-64/striad_avx.ptt
deleted file mode 100644
index b3c1317..0000000
--- a/bench/x86-64/striad_avx.ptt
+++ /dev/null
@@ -1,23 +0,0 @@
-STREAMS 3
-TYPE DOUBLE
-FLOPS 2
-BYTES 24
-vmovaps ymm5, [SCALAR]
-LOOP 16
-vmovaps ymm1, [STR1 + GPR1*8]
-vmovaps ymm2, [STR1 + GPR1*8+32]
-vmovaps ymm3, [STR1 + GPR1*8+64]
-vmovaps ymm4, [STR1 + GPR1*8+96]
-vmulpd ymm1, ymm1, ymm5
-vaddpd ymm1, ymm1, [STR2 + GPR1*8]
-vmulpd ymm2, ymm2, ymm5
-vaddpd ymm2, ymm2, [STR2 + GPR1*8+32]
-vmulpd ymm3, ymm3, ymm5
-vaddpd ymm3, ymm3, [STR2 + GPR1*8+64]
-vmulpd ymm4, ymm4, ymm5
-vaddpd ymm4, ymm4, [STR2 + GPR1*8+96]
-vmovaps [STR0 + GPR1*8] , ymm1
-vmovaps [STR0 + GPR1*8+32], ymm2
-vmovaps [STR0 + GPR1*8+64], ymm3
-vmovaps [STR0 + GPR1*8+96], ymm4
-
diff --git a/bench/x86-64/striad_mem_avx.ptt b/bench/x86-64/striad_mem_avx.ptt
deleted file mode 100644
index cef2688..0000000
--- a/bench/x86-64/striad_mem_avx.ptt
+++ /dev/null
@@ -1,11 +0,0 @@
-STREAMS 3
-TYPE DOUBLE
-FLOPS 2
-BYTES 24
-vmovaps ymm5, [SCALAR]
-LOOP 4
-vmovaps ymm1, [STR2 + GPR1*8]
-vmulpd ymm1, ymm1, ymm5
-vaddpd ymm1, ymm1, [STR1 + GPR1*8]
-vmovntpd [STR0 + GPR1*8], ymm1
-
diff --git a/bench/x86-64/striad_mem_sse.ptt b/bench/x86-64/striad_mem_sse.ptt
deleted file mode 100644
index b8364cc..0000000
--- a/bench/x86-64/striad_mem_sse.ptt
+++ /dev/null
@@ -1,11 +0,0 @@
-STREAMS 3
-TYPE DOUBLE
-FLOPS 2
-BYTES 24
-movaps FPR5, [SCALAR]
-LOOP 2
-movaps FPR1, [STR2 + GPR1*8]
-mulpd FPR1, FPR5
-addpd FPR1, [STR1 + GPR1*8]
-movntpd [STR0 + GPR1*8], FPR1
-
diff --git a/bench/x86-64/striad_plain.ptt b/bench/x86-64/striad_plain.ptt
deleted file mode 100644
index 7b29664..0000000
--- a/bench/x86-64/striad_plain.ptt
+++ /dev/null
@@ -1,23 +0,0 @@
-STREAMS 3
-TYPE DOUBLE
-FLOPS 2
-BYTES 24
-movss FPR5, [SCALAR]
-LOOP 4
-movsd FPR1, [STR1 + GPR1*8]
-movsd FPR2, [STR1 + GPR1*8+8]
-movsd FPR3, [STR1 + GPR1*8+16]
-movsd FPR4, [STR1 + GPR1*8+24]
-mulsd FPR1, FPR5
-addsd FPR1, [STR2 + GPR1*8]
-mulsd FPR2, FPR5
-addsd FPR2, [STR2 + GPR1*8+8]
-mulsd FPR3, FPR5
-addsd FPR3, [STR2 + GPR1*8+16]
-mulsd FPR4, FPR5
-addsd FPR4, [STR2 + GPR1*8+24]
-movsd [STR0 + GPR1*8] , FPR1
-movsd [STR0 + GPR1*8+8] , FPR2
-movsd [STR0 + GPR1*8+16], FPR3
-movsd [STR0 + GPR1*8+24], FPR4
-
diff --git a/bench/x86-64/striad_sse.ptt b/bench/x86-64/striad_sse.ptt
deleted file mode 100644
index 7c84c3c..0000000
--- a/bench/x86-64/striad_sse.ptt
+++ /dev/null
@@ -1,23 +0,0 @@
-STREAMS 3
-TYPE DOUBLE
-FLOPS 2
-BYTES 24
-movaps FPR5, [SCALAR]
-LOOP 8
-movaps FPR1, [STR1 + GPR1*8]
-movaps FPR2, [STR1 + GPR1*8+16]
-movaps FPR3, [STR1 + GPR1*8+32]
-movaps FPR4, [STR1 + GPR1*8+48]
-mulpd FPR1, FPR5
-addpd FPR1, [STR2 + GPR1*8]
-mulpd FPR2, FPR5
-addpd FPR2, [STR2 + GPR1*8+16]
-mulpd FPR3, FPR5
-addpd FPR3, [STR2 + GPR1*8+32]
-mulpd FPR4, FPR5
-addpd FPR4, [STR2 + GPR1*8+48]
-movaps [STR0 + GPR1*8] , FPR1
-movaps [STR0 + GPR1*8+16], FPR2
-movaps [STR0 + GPR1*8+32], FPR3
-movaps [STR0 + GPR1*8+48], FPR4
-
diff --git a/bench/x86-64/sum.ptt b/bench/x86-64/sum.ptt
index 3374843..a75fa93 100644
--- a/bench/x86-64/sum.ptt
+++ b/bench/x86-64/sum.ptt
@@ -1,23 +1,29 @@
STREAMS 1
-TYPE SINGLE
+TYPE DOUBLE
FLOPS 1
-BYTES 4
-xorps FPR1, FPR1
-movaps FPR2, FPR1
-movaps FPR3, FPR1
-movaps FPR4, FPR1
-movaps FPR5, FPR1
-movaps FPR6, FPR1
-movaps FPR7, FPR1
-movaps FPR8, FPR1
-LOOP 32
-addps FPR1, [STR0 + GPR1 * 4]
-addps FPR2, [STR0 + GPR1 * 4 + 16]
-addps FPR3, [STR0 + GPR1 * 4 + 32]
-addps FPR4, [STR0 + GPR1 * 4 + 48]
-addps FPR5, [STR0 + GPR1 * 4 + 64]
-addps FPR6, [STR0 + GPR1 * 4 + 80]
-addps FPR7, [STR0 + GPR1 * 4 + 96]
-addps FPR8, [STR0 + GPR1 * 4 + 112]
+BYTES 8
+DESC Double-precision sum of a vector, only scalar operations
+LOADS 1
+STORES 0
+INSTR_CONST 24
+INSTR_LOOP 11
+UOPS 18
+xorpd FPR1, FPR1
+movapd FPR2, FPR1
+movapd FPR3, FPR1
+movapd FPR4, FPR1
+movapd FPR5, FPR1
+movapd FPR6, FPR1
+movapd FPR7, FPR1
+movapd FPR8, FPR1
+LOOP 8
+addsd FPR1, [STR0 + GPR1 * 8]
+addsd FPR2, [STR0 + GPR1 * 8 + 8]
+addsd FPR3, [STR0 + GPR1 * 8 + 16]
+addsd FPR4, [STR0 + GPR1 * 8 + 24]
+addsd FPR5, [STR0 + GPR1 * 8 + 32]
+addsd FPR6, [STR0 + GPR1 * 8 + 40]
+addsd FPR7, [STR0 + GPR1 * 8 + 48]
+addsd FPR8, [STR0 + GPR1 * 8 + 56]
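
(The sum.ptt hunk above switches the kernel from packed single-precision to scalar
double-precision adds and keeps eight independent accumulator registers, FPR1-FPR8. The point of
the multiple accumulators is to break the addsd dependency chain; a rough C counterpart, with
illustrative names, looks like this:)

    #include <stddef.h>

    /* Scalar sum with independent partial sums, mirroring the eight
     * accumulators used in sum.ptt (four are enough to show the idea).
     * Without them every addition would have to wait on the previous one. */
    static double sum_unrolled(const double *x, size_t n)
    {
        double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
        size_t i;
        for (i = 0; i + 4 <= n; i += 4) {
            s0 += x[i];
            s1 += x[i + 1];
            s2 += x[i + 2];
            s3 += x[i + 3];
        }
        for (; i < n; i++)        /* remainder elements */
            s0 += x[i];
        return (s0 + s1) + (s2 + s3);
    }
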
diff --git a/bench/x86-64/sum_avx.ptt b/bench/x86-64/sum_avx.ptt
index e2e8e40..29d8ff0 100644
--- a/bench/x86-64/sum_avx.ptt
+++ b/bench/x86-64/sum_avx.ptt
@@ -1,14 +1,30 @@
STREAMS 1
-TYPE SINGLE
+TYPE DOUBLE
FLOPS 1
-BYTES 4
-vxorps ymm1, ymm1, ymm1
-vmovaps ymm2, ymm1
-vmovaps ymm3, ymm1
-vmovaps ymm4, ymm1
+BYTES 8
+DESC Double-precision sum of a vector, optimized for AVX
+LOADS 1
+STORES 0
+INSTR_CONST 24
+INSTR_LOOP 11
+UOPS 18
+vxorpd FPR9, FPR9, FPR9
+vxorpd FPR1, FPR1, FPR1
+vmovapd FPR2, FPR1
+vmovapd FPR3, FPR1
+vmovapd FPR4, FPR1
+vmovapd FPR5, FPR1
+vmovapd FPR6, FPR1
+vmovapd FPR7, FPR1
+vmovapd FPR8, FPR1
LOOP 32
-vaddps ymm1, ymm1, [STR0 + GPR1*4]
-vaddps ymm2, ymm2, [STR0 + GPR1*4+32]
-vaddps ymm3, ymm3, [STR0 + GPR1*4+64]
-vaddps ymm4, ymm4, [STR0 + GPR1*4+96]
+vaddpd FPR1, FPR1, [STR0 + GPR1 * 8]
+vaddpd FPR2, FPR2, [STR0 + GPR1 * 8 + 32]
+vaddpd FPR3, FPR3, [STR0 + GPR1 * 8 + 64]
+vaddpd FPR4, FPR4, [STR0 + GPR1 * 8 + 96]
+vaddpd FPR5, FPR5, [STR0 + GPR1 * 8 + 128]
+vaddpd FPR6, FPR6, [STR0 + GPR1 * 8 + 160]
+vaddpd FPR7, FPR7, [STR0 + GPR1 * 8 + 192]
+vaddpd FPR8, FPR8, [STR0 + GPR1 * 8 + 224]
+
diff --git a/bench/x86-64/sum_plain.ptt b/bench/x86-64/sum_plain.ptt
deleted file mode 100644
index 23fe237..0000000
--- a/bench/x86-64/sum_plain.ptt
+++ /dev/null
@@ -1,15 +0,0 @@
-STREAMS 1
-TYPE SINGLE
-FLOPS 1
-BYTES 4
-xorps FPR1, FPR1
-xorps FPR2, FPR2
-xorps FPR3, FPR3
-xorps FPR4, FPR4
-LOOP 4
-addss FPR1, [STR0 + GPR1 * 4]
-addss FPR2, [STR0 + GPR1 * 4 + 4]
-addss FPR3, [STR0 + GPR1 * 4 + 8]
-addss FPR4, [STR0 + GPR1 * 4 + 12]
-
-
diff --git a/bench/x86-64/sum_sp.ptt b/bench/x86-64/sum_sp.ptt
new file mode 100644
index 0000000..21a6702
--- /dev/null
+++ b/bench/x86-64/sum_sp.ptt
@@ -0,0 +1,21 @@
+STREAMS 1
+TYPE SINGLE
+FLOPS 1
+BYTES 4
+DESC Single-precision sum of a vector, only scalar operations
+LOADS 1
+STORES 0
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
+xorps FPR1, FPR1
+xorps FPR2, FPR2
+xorps FPR3, FPR3
+xorps FPR4, FPR4
+LOOP 4
+addss FPR1, [STR0 + GPR1 * 4]
+addss FPR2, [STR0 + GPR1 * 4 + 4]
+addss FPR3, [STR0 + GPR1 * 4 + 8]
+addss FPR4, [STR0 + GPR1 * 4 + 12]
+
+
diff --git a/bench/x86-64/sum_sp_avx.ptt b/bench/x86-64/sum_sp_avx.ptt
new file mode 100644
index 0000000..4a3a6e9
--- /dev/null
+++ b/bench/x86-64/sum_sp_avx.ptt
@@ -0,0 +1,20 @@
+STREAMS 1
+TYPE SINGLE
+FLOPS 1
+BYTES 4
+DESC Single-precision sum of a vector, optimized for AVX
+LOADS 1
+STORES 0
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
+vxorps ymm1, ymm1, ymm1
+vmovaps ymm2, ymm1
+vmovaps ymm3, ymm1
+vmovaps ymm4, ymm1
+LOOP 32
+vaddps ymm1, ymm1, [STR0 + GPR1*4]
+vaddps ymm2, ymm2, [STR0 + GPR1*4+32]
+vaddps ymm3, ymm3, [STR0 + GPR1*4+64]
+vaddps ymm4, ymm4, [STR0 + GPR1*4+96]
+
diff --git a/bench/x86-64/sum_sp_sse.ptt b/bench/x86-64/sum_sp_sse.ptt
new file mode 100644
index 0000000..21cff6d
--- /dev/null
+++ b/bench/x86-64/sum_sp_sse.ptt
@@ -0,0 +1,29 @@
+STREAMS 1
+TYPE SINGLE
+FLOPS 1
+BYTES 4
+DESC Single-precision sum of a vector, optimized for SSE
+LOADS 1
+STORES 0
+INSTR_CONST 24
+INSTR_LOOP 11
+UOPS 18
+xorps FPR1, FPR1
+movaps FPR2, FPR1
+movaps FPR3, FPR1
+movaps FPR4, FPR1
+movaps FPR5, FPR1
+movaps FPR6, FPR1
+movaps FPR7, FPR1
+movaps FPR8, FPR1
+LOOP 32
+addps FPR1, [STR0 + GPR1 * 4]
+addps FPR2, [STR0 + GPR1 * 4 + 16]
+addps FPR3, [STR0 + GPR1 * 4 + 32]
+addps FPR4, [STR0 + GPR1 * 4 + 48]
+addps FPR5, [STR0 + GPR1 * 4 + 64]
+addps FPR6, [STR0 + GPR1 * 4 + 80]
+addps FPR7, [STR0 + GPR1 * 4 + 96]
+addps FPR8, [STR0 + GPR1 * 4 + 112]
+
+
diff --git a/bench/x86-64/sum_sse.ptt b/bench/x86-64/sum_sse.ptt
index 3e7a2bb..8aad8cf 100644
--- a/bench/x86-64/sum_sse.ptt
+++ b/bench/x86-64/sum_sse.ptt
@@ -2,6 +2,12 @@ STREAMS 1
TYPE DOUBLE
FLOPS 1
BYTES 8
+DESC Double-precision sum of a vector, optimized for SSE
+LOADS 1
+STORES 0
+INSTR_CONST 24
+INSTR_LOOP 11
+UOPS 18
xorpd FPR1, FPR1
movapd FPR2, FPR1
movapd FPR3, FPR1
diff --git a/bench/x86-64/triad.ptt b/bench/x86-64/triad.ptt
index d521aa0..4eec70f 100644
--- a/bench/x86-64/triad.ptt
+++ b/bench/x86-64/triad.ptt
@@ -2,21 +2,27 @@ STREAMS 4
TYPE DOUBLE
FLOPS 2
BYTES 32
-LOOP 8
-movaps FPR1, [STR1 + GPR1*8]
-movaps FPR2, [STR1 + GPR1*8+16]
-movaps FPR3, [STR1 + GPR1*8+32]
-movaps FPR4, [STR1 + GPR1*8+48]
-mulpd FPR1, [STR2 + GPR1*8]
-addpd FPR1, [STR3 + GPR1*8]
-mulpd FPR2, [STR2 + GPR1*8+16]
-addpd FPR2, [STR3 + GPR1*8+16]
-mulpd FPR3, [STR2 + GPR1*8+32]
-addpd FPR3, [STR3 + GPR1*8+32]
-mulpd FPR4, [STR2 + GPR1*8+48]
-addpd FPR4, [STR3 + GPR1*8+48]
-movaps [STR0 + GPR1*8], FPR1
-movaps [STR0 + GPR1*8+16], FPR2
-movaps [STR0 + GPR1*8+32], FPR3
-movaps [STR0 + GPR1*8+48], FPR4
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), only scalar operations
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 30
+LOOP 4
+movsd FPR1, [STR1 + GPR1*8]
+movsd FPR2, [STR1 + GPR1*8+8]
+movsd FPR3, [STR1 + GPR1*8+16]
+movsd FPR4, [STR1 + GPR1*8+24]
+mulsd FPR1, [STR2 + GPR1*8]
+addsd FPR1, [STR3 + GPR1*8]
+mulsd FPR2, [STR2 + GPR1*8+8]
+addsd FPR2, [STR3 + GPR1*8+8]
+mulsd FPR3, [STR2 + GPR1*8+16]
+addsd FPR3, [STR3 + GPR1*8+16]
+mulsd FPR4, [STR2 + GPR1*8+24]
+addsd FPR4, [STR3 + GPR1*8+24]
+movsd [STR0 + GPR1*8], FPR1
+movsd [STR0 + GPR1*8+8], FPR2
+movsd [STR0 + GPR1*8+16], FPR3
+movsd [STR0 + GPR1*8+24], FPR4
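
(triad.ptt above becomes the scalar variant of the vector triad A(i) = B(i)*C(i) + D(i), often
called the Schönauer triad, the four-stream counterpart of the stream triad. A minimal C
reference, names illustrative:)

    #include <stddef.h>

    /* Vector triad: a[i] = b[i]*c[i] + d[i].
     * Three loads and one store per element (LOADS 3 / STORES 1), i.e.
     * BYTES 32 of traffic, and FLOPS 2 per element. */
    static void vector_triad(double *a, const double *b, const double *c,
                             const double *d, size_t n)
    {
        for (size_t i = 0; i < n; i++)
            a[i] = b[i] * c[i] + d[i];
    }
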
diff --git a/bench/x86-64/triad_avx.ptt b/bench/x86-64/triad_avx.ptt
index 3514cfd..7e83b0b 100644
--- a/bench/x86-64/triad_avx.ptt
+++ b/bench/x86-64/triad_avx.ptt
@@ -1,12 +1,28 @@
STREAMS 4
TYPE DOUBLE
FLOPS 2
-BYTES 16
-LOOP 32
-vmovapd ymm1, [STR1 + GPR1]
-vmovapd ymm2, [STR2 + GPR1]
-vmovapd ymm3, [STR3 + GPR1]
-vmulpd ymm0, ymm1, ymm2
-vaddpd ymm0, ymm0, ymm3
-vmovapd [STR0 + GPR1], ymm0
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), optimized for AVX
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 30
+LOOP 16
+vmovaps ymm1, [STR1 + GPR1*8]
+vmovaps ymm2, [STR1 + GPR1*8+32]
+vmovaps ymm3, [STR1 + GPR1*8+64]
+vmovaps ymm4, [STR1 + GPR1*8+96]
+vmulpd ymm1, ymm1, [STR2 + GPR1*8]
+vaddpd ymm1, ymm1, [STR3 + GPR1*8]
+vmulpd ymm2, ymm2, [STR2 + GPR1*8+32]
+vaddpd ymm2, ymm2, [STR3 + GPR1*8+32]
+vmulpd ymm3, ymm3, [STR2 + GPR1*8+64]
+vaddpd ymm3, ymm3, [STR3 + GPR1*8+64]
+vmulpd ymm4, ymm4, [STR2 + GPR1*8+96]
+vaddpd ymm4, ymm4, [STR3 + GPR1*8+96]
+vmovaps [STR0 + GPR1*8], ymm1
+vmovaps [STR0 + GPR1*8+32], ymm2
+vmovaps [STR0 + GPR1*8+64], ymm3
+vmovaps [STR0 + GPR1*8+96], ymm4
diff --git a/bench/x86-64/triad_avx_fma.ptt b/bench/x86-64/triad_avx_fma.ptt
new file mode 100644
index 0000000..535fe8a
--- /dev/null
+++ b/bench/x86-64/triad_avx_fma.ptt
@@ -0,0 +1,27 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), optimized for AVX FMAs
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 26
+LOOP 16
+vmovaps ymm1, [STR1 + GPR1*8]
+vmovaps ymm2, [STR1 + GPR1*8+32]
+vmovaps ymm3, [STR1 + GPR1*8+64]
+vmovaps ymm4, [STR1 + GPR1*8+96]
+vmovaps ymm5, [STR2 + GPR1*8]
+vmovaps ymm6, [STR2 + GPR1*8+32]
+vmovaps ymm7, [STR2 + GPR1*8+64]
+vmovaps ymm8, [STR2 + GPR1*8+96]
+vfmadd213pd ymm1, ymm5, [STR3 + GPR1*8]
+vfmadd213pd ymm2, ymm6, [STR3 + GPR1*8+32]
+vfmadd213pd ymm3, ymm7, [STR3 + GPR1*8+64]
+vfmadd213pd ymm4, ymm8, [STR3 + GPR1*8+96]
+vmovaps [STR0 + GPR1*8], ymm1
+vmovaps [STR0 + GPR1*8+32], ymm2
+vmovaps [STR0 + GPR1*8+64], ymm3
+vmovaps [STR0 + GPR1*8+96], ymm4
diff --git a/bench/x86-64/triad_mem.ptt b/bench/x86-64/triad_mem.ptt
deleted file mode 100644
index 7c24748..0000000
--- a/bench/x86-64/triad_mem.ptt
+++ /dev/null
@@ -1,10 +0,0 @@
-STREAMS 4
-TYPE DOUBLE
-FLOPS 2
-BYTES 32
-LOOP 2
-movaps FPR1, [STR1 + GPR1*8]
-mulpd FPR1, [STR2 + GPR1*8]
-addpd FPR1, [STR3 + GPR1*8]
-movntpd [STR0 + GPR1*8], FPR1
-
diff --git a/bench/x86-64/triad_mem_avx.ptt b/bench/x86-64/triad_mem_avx.ptt
new file mode 100644
index 0000000..45af749
--- /dev/null
+++ b/bench/x86-64/triad_mem_avx.ptt
@@ -0,0 +1,18 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), uses AVX and non-temporal stores
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 9
+UOPS 9
+LOOP 4
+vmovapd ymm1, [STR1 + GPR1*8]
+vmovapd ymm2, [STR2 + GPR1*8]
+vmovapd ymm3, [STR3 + GPR1*8]
+vmulpd ymm0, ymm1, ymm2
+vaddpd ymm0, ymm0, ymm3
+vmovntpd [STR0 + GPR1*8], ymm0
+
diff --git a/bench/x86-64/triad_mem_avx_fma.ptt b/bench/x86-64/triad_mem_avx_fma.ptt
new file mode 100644
index 0000000..fbc73b7
--- /dev/null
+++ b/bench/x86-64/triad_mem_avx_fma.ptt
@@ -0,0 +1,20 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), optimized for AVX FMAs and non-temporal stores
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
+LOOP 8
+vmovaps ymm1, [STR1 + GPR1*8]
+vmovaps ymm2, [STR2 + GPR1*8]
+vmovaps ymm3, [STR1 + GPR1*8+32]
+vmovaps ymm4, [STR2 + GPR1*8+32]
+vfmadd213pd ymm1, ymm2, [STR3 + GPR1*8]
+vfmadd213pd ymm3, ymm4, [STR3 + GPR1*8+32]
+vmovntpd [STR0 + GPR1*8], ymm1
+vmovntpd [STR0 + GPR1*8+32], ymm3
+
diff --git a/bench/x86-64/triad_mem_sse.ptt b/bench/x86-64/triad_mem_sse.ptt
new file mode 100644
index 0000000..32e10a0
--- /dev/null
+++ b/bench/x86-64/triad_mem_sse.ptt
@@ -0,0 +1,27 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), optimized for SSE and non-temporal stores
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 30
+LOOP 8
+movaps FPR1, [STR1 + GPR1*8]
+movaps FPR2, [STR1 + GPR1*8+16]
+movaps FPR3, [STR1 + GPR1*8+32]
+movaps FPR4, [STR1 + GPR1*8+48]
+mulpd FPR1, [STR2 + GPR1*8]
+addpd FPR1, [STR3 + GPR1*8]
+mulpd FPR2, [STR2 + GPR1*8+16]
+addpd FPR2, [STR3 + GPR1*8+16]
+mulpd FPR3, [STR2 + GPR1*8+32]
+addpd FPR3, [STR3 + GPR1*8+32]
+mulpd FPR4, [STR2 + GPR1*8+48]
+addpd FPR4, [STR3 + GPR1*8+48]
+movntpd [STR0 + GPR1*8], FPR1
+movntpd [STR0 + GPR1*8+16], FPR2
+movntpd [STR0 + GPR1*8+32], FPR3
+movntpd [STR0 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/triad_mem_sse_fma.ptt b/bench/x86-64/triad_mem_sse_fma.ptt
new file mode 100644
index 0000000..f96f194
--- /dev/null
+++ b/bench/x86-64/triad_mem_sse_fma.ptt
@@ -0,0 +1,27 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), optimized for SSE FMAs and non-temporal stores
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 26
+LOOP 8
+movaps FPR1, [STR1 + GPR1*8]
+movaps FPR2, [STR1 + GPR1*8+16]
+movaps FPR3, [STR1 + GPR1*8+32]
+movaps FPR4, [STR1 + GPR1*8+48]
+movaps FPR5, [STR2 + GPR1*8]
+movaps FPR6, [STR2 + GPR1*8+16]
+movaps FPR7, [STR2 + GPR1*8+32]
+movaps FPR8, [STR2 + GPR1*8+48]
+vfmadd213pd FPR1, FPR5, [STR3 + GPR1*8]
+vfmadd213pd FPR2, FPR6, [STR3 + GPR1*8+16]
+vfmadd213pd FPR3, FPR7, [STR3 + GPR1*8+32]
+vfmadd213pd FPR4, FPR8, [STR3 + GPR1*8+48]
+movntpd [STR0 + GPR1*8], FPR1
+movntpd [STR0 + GPR1*8+16], FPR2
+movntpd [STR0 + GPR1*8+32], FPR3
+movntpd [STR0 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/triad_sp.ptt b/bench/x86-64/triad_sp.ptt
new file mode 100644
index 0000000..17ba5f4
--- /dev/null
+++ b/bench/x86-64/triad_sp.ptt
@@ -0,0 +1,43 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), only scalar operations
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 35
+UOPS 58
+LOOP 8
+movss FPR1, [STR1 + GPR1*4]
+movss FPR2, [STR1 + GPR1*4 + 4]
+movss FPR3, [STR1 + GPR1*4 + 8]
+movss FPR4, [STR1 + GPR1*4 + 12]
+mulss FPR1, [STR2 + GPR1*4]
+addss FPR1, [STR3 + GPR1*4]
+movss FPR5, [STR1 + GPR1*4 + 16]
+mulss FPR2, [STR2 + GPR1*4 + 4]
+addss FPR2, [STR3 + GPR1*4 + 4]
+movss FPR6, [STR1 + GPR1*4 + 20]
+mulss FPR3, [STR2 + GPR1*4 + 8]
+addss FPR3, [STR3 + GPR1*4 + 8]
+movss FPR7, [STR1 + GPR1*4 + 24]
+mulss FPR4, [STR2 + GPR1*4 + 12]
+addss FPR4, [STR3 + GPR1*4 + 12]
+movss FPR8, [STR1 + GPR1*4 + 28]
+mulss FPR5, [STR2 + GPR1*4 + 16]
+addss FPR5, [STR3 + GPR1*4 + 16]
+mulss FPR6, [STR2 + GPR1*4 + 20]
+addss FPR6, [STR3 + GPR1*4 + 20]
+mulss FPR7, [STR2 + GPR1*4 + 24]
+addss FPR7, [STR3 + GPR1*4 + 24]
+mulss FPR8, [STR2 + GPR1*4 + 28]
+addss FPR8, [STR3 + GPR1*4 + 28]
+movss [STR0 + GPR1*4], FPR1
+movss [STR0 + GPR1*4 + 4], FPR2
+movss [STR0 + GPR1*4 + 8], FPR3
+movss [STR0 + GPR1*4 + 12], FPR4
+movss [STR0 + GPR1*4 + 16], FPR5
+movss [STR0 + GPR1*4 + 20], FPR6
+movss [STR0 + GPR1*4 + 24], FPR7
+movss [STR0 + GPR1*4 + 28], FPR8
diff --git a/bench/x86-64/triad_sp_avx.ptt b/bench/x86-64/triad_sp_avx.ptt
new file mode 100644
index 0000000..a977776
--- /dev/null
+++ b/bench/x86-64/triad_sp_avx.ptt
@@ -0,0 +1,18 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), optimized for AVX
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 9
+UOPS 9
+LOOP 8
+vmovaps ymm1, [STR1 + GPR1*4]
+vmovaps ymm2, [STR2 + GPR1*4]
+vmovaps ymm3, [STR3 + GPR1*4]
+vmulps ymm0, ymm1, ymm2
+vaddps ymm0, ymm0, ymm3
+vmovaps [STR0 + GPR1*4], ymm0
+
diff --git a/bench/x86-64/triad_sp_avx_fma.ptt b/bench/x86-64/triad_sp_avx_fma.ptt
new file mode 100644
index 0000000..4d78a58
--- /dev/null
+++ b/bench/x86-64/triad_sp_avx_fma.ptt
@@ -0,0 +1,16 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), optimized for AVX FMAs
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 7
+UOPS 8
+LOOP 8
+vmovaps ymm1, [STR1 + GPR1*4]
+vmovaps ymm2, [STR2 + GPR1*4]
+vfmadd213ps ymm1, ymm2, [STR3 + GPR1*4]
+vmovaps [STR0 + GPR1*4], ymm1
+
diff --git a/bench/x86-64/triad_sp_mem_avx.ptt b/bench/x86-64/triad_sp_mem_avx.ptt
new file mode 100644
index 0000000..2a04586
--- /dev/null
+++ b/bench/x86-64/triad_sp_mem_avx.ptt
@@ -0,0 +1,16 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), optimized for AVX and non-temporal stores
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 9
+UOPS 9
+LOOP 8
+vmovaps ymm1, [STR1 + GPR1*4]
+vmovaps ymm2, [STR2 + GPR1*4]
+vmulps ymm0, ymm1, ymm2
+vaddps ymm0, ymm0, [STR3 + GPR1*4]
+vmovntps [STR0 + GPR1*4], ymm0
diff --git a/bench/x86-64/triad_sp_mem_avx_fma.ptt b/bench/x86-64/triad_sp_mem_avx_fma.ptt
new file mode 100644
index 0000000..72b5a8e
--- /dev/null
+++ b/bench/x86-64/triad_sp_mem_avx_fma.ptt
@@ -0,0 +1,16 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), optimized for AVX FMAs and non-temporal stores
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 7
+UOPS 8
+LOOP 8
+vmovaps ymm1, [STR1 + GPR1*4]
+vmovaps ymm2, [STR2 + GPR1*4]
+vfmadd213ps ymm1, ymm2, [STR3 + GPR1*4]
+vmovntps [STR0 + GPR1*4], ymm1
+
diff --git a/bench/x86-64/triad_sp_mem_sse.ptt b/bench/x86-64/triad_sp_mem_sse.ptt
new file mode 100644
index 0000000..38e8abc
--- /dev/null
+++ b/bench/x86-64/triad_sp_mem_sse.ptt
@@ -0,0 +1,27 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), optimized for SSE and non-temporal stores
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 30
+LOOP 16
+movaps FPR1, [STR1 + GPR1*4]
+movaps FPR2, [STR1 + GPR1*4+16]
+movaps FPR3, [STR1 + GPR1*4+32]
+movaps FPR4, [STR1 + GPR1*4+48]
+mulps FPR1, [STR2 + GPR1*4]
+addps FPR1, [STR3 + GPR1*4]
+mulps FPR2, [STR2 + GPR1*4+16]
+addps FPR2, [STR3 + GPR1*4+16]
+mulps FPR3, [STR2 + GPR1*4+32]
+addps FPR3, [STR3 + GPR1*4+32]
+mulps FPR4, [STR2 + GPR1*4+48]
+addps FPR4, [STR3 + GPR1*4+48]
+movntps [STR0 + GPR1*4], FPR1
+movntps [STR0 + GPR1*4+16], FPR2
+movntps [STR0 + GPR1*4+32], FPR3
+movntps [STR0 + GPR1*4+48], FPR4
diff --git a/bench/x86-64/triad_sp_mem_sse_fma.ptt b/bench/x86-64/triad_sp_mem_sse_fma.ptt
new file mode 100644
index 0000000..6b3ba66
--- /dev/null
+++ b/bench/x86-64/triad_sp_mem_sse_fma.ptt
@@ -0,0 +1,27 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), optimized for SSE FMAs and non-temporal stores
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 30
+LOOP 16
+movaps FPR1, [STR1 + GPR1*4]
+movaps FPR2, [STR1 + GPR1*4+16]
+movaps FPR3, [STR1 + GPR1*4+32]
+movaps FPR4, [STR1 + GPR1*4+48]
+movaps FPR5, [STR2 + GPR1*4]
+movaps FPR6, [STR2 + GPR1*4+16]
+movaps FPR7, [STR2 + GPR1*4+32]
+movaps FPR8, [STR2 + GPR1*4+48]
+vfmadd213ps FPR1, FPR5, [STR3 + GPR1*4]
+vfmadd213ps FPR2, FPR6, [STR3 + GPR1*4+16]
+vfmadd213ps FPR3, FPR7, [STR3 + GPR1*4+32]
+vfmadd213ps FPR4, FPR8, [STR3 + GPR1*4+48]
+movntps [STR0 + GPR1*4], FPR1
+movntps [STR0 + GPR1*4+16], FPR2
+movntps [STR0 + GPR1*4+32], FPR3
+movntps [STR0 + GPR1*4+48], FPR4
diff --git a/bench/x86-64/triad_sp_sse.ptt b/bench/x86-64/triad_sp_sse.ptt
new file mode 100644
index 0000000..deba3c5
--- /dev/null
+++ b/bench/x86-64/triad_sp_sse.ptt
@@ -0,0 +1,27 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), optimized for SSE
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 30
+LOOP 16
+movaps FPR1, [STR1 + GPR1*4]
+movaps FPR2, [STR1 + GPR1*4+16]
+movaps FPR3, [STR1 + GPR1*4+32]
+movaps FPR4, [STR1 + GPR1*4+48]
+mulps FPR1, [STR2 + GPR1*4]
+addps FPR1, [STR3 + GPR1*4]
+mulps FPR2, [STR2 + GPR1*4+16]
+addps FPR2, [STR3 + GPR1*4+16]
+mulps FPR3, [STR2 + GPR1*4+32]
+addps FPR3, [STR3 + GPR1*4+32]
+mulps FPR4, [STR2 + GPR1*4+48]
+addps FPR4, [STR3 + GPR1*4+48]
+movaps [STR0 + GPR1*4], FPR1
+movaps [STR0 + GPR1*4+16], FPR2
+movaps [STR0 + GPR1*4+32], FPR3
+movaps [STR0 + GPR1*4+48], FPR4
diff --git a/bench/x86-64/triad_sp_sse_fma.ptt b/bench/x86-64/triad_sp_sse_fma.ptt
new file mode 100644
index 0000000..f2147da
--- /dev/null
+++ b/bench/x86-64/triad_sp_sse_fma.ptt
@@ -0,0 +1,27 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), optimized for SSE FMAs
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 30
+LOOP 16
+movaps FPR1, [STR1 + GPR1*4]
+movaps FPR2, [STR1 + GPR1*4+16]
+movaps FPR3, [STR1 + GPR1*4+32]
+movaps FPR4, [STR1 + GPR1*4+48]
+movaps FPR5, [STR2 + GPR1*4]
+movaps FPR6, [STR2 + GPR1*4+16]
+movaps FPR7, [STR2 + GPR1*4+32]
+movaps FPR8, [STR2 + GPR1*4+48]
+vfmadd213ps FPR1, FPR5, [STR3 + GPR1*4]
+vfmadd213ps FPR2, FPR6, [STR3 + GPR1*4+16]
+vfmadd213ps FPR3, FPR7, [STR3 + GPR1*4+32]
+vfmadd213ps FPR4, FPR8, [STR3 + GPR1*4+48]
+movaps [STR0 + GPR1*4], FPR1
+movaps [STR0 + GPR1*4+16], FPR2
+movaps [STR0 + GPR1*4+32], FPR3
+movaps [STR0 + GPR1*4+48], FPR4
diff --git a/bench/x86-64/triad_split.ptt b/bench/x86-64/triad_split.ptt
deleted file mode 100644
index 7b30e47..0000000
--- a/bench/x86-64/triad_split.ptt
+++ /dev/null
@@ -1,30 +0,0 @@
-STREAMS 4
-TYPE DOUBLE
-FLOPS 2
-BYTES 32
-LOOP 8
-movapd FPR1, [STR1 + GPR1*8]
-movapd FPR2, [STR1 + GPR1*8+16]
-movapd FPR3, [STR1 + GPR1*8+32]
-movapd FPR4, [STR1 + GPR1*8+48]
-movapd FPR5, [STR2 + GPR1*8]
-movapd FPR6, [STR3 + GPR1*8]
-movapd FPR7, [STR2 + GPR1*8+16]
-movapd FPR8, [STR3 + GPR1*8+16]
-movapd FPR9, [STR2 + GPR1*8+32]
-movapd FPR10, [STR3 + GPR1*8+32]
-movapd FPR11, [STR2 + GPR1*8+48]
-movapd FPR12, [STR3 + GPR1*8+48]
-mulpd FPR1, FPR5
-addpd FPR1, FPR6
-mulpd FPR2, FPR7
-addpd FPR2, FPR8
-mulpd FPR3, FPR9
-addpd FPR3, FPR10
-mulpd FPR4, FPR11
-addpd FPR4, FPR12
-movapd [STR0 + GPR1*8], FPR1
-movapd [STR0 + GPR1*8+16], FPR2
-movapd [STR0 + GPR1*8+32], FPR3
-movapd [STR0 + GPR1*8+48], FPR4
-
diff --git a/bench/x86-64/triad_sse.ptt b/bench/x86-64/triad_sse.ptt
new file mode 100644
index 0000000..11aabe3
--- /dev/null
+++ b/bench/x86-64/triad_sse.ptt
@@ -0,0 +1,28 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), optimized for SSE
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 30
+LOOP 8
+movaps FPR1, [STR1 + GPR1*8]
+movaps FPR2, [STR1 + GPR1*8+16]
+movaps FPR3, [STR1 + GPR1*8+32]
+movaps FPR4, [STR1 + GPR1*8+48]
+mulpd FPR1, [STR2 + GPR1*8]
+addpd FPR1, [STR3 + GPR1*8]
+mulpd FPR2, [STR2 + GPR1*8+16]
+addpd FPR2, [STR3 + GPR1*8+16]
+mulpd FPR3, [STR2 + GPR1*8+32]
+addpd FPR3, [STR3 + GPR1*8+32]
+mulpd FPR4, [STR2 + GPR1*8+48]
+addpd FPR4, [STR3 + GPR1*8+48]
+movaps [STR0 + GPR1*8], FPR1
+movaps [STR0 + GPR1*8+16], FPR2
+movaps [STR0 + GPR1*8+32], FPR3
+movaps [STR0 + GPR1*8+48], FPR4
+
diff --git a/bench/x86-64/triad_sse_fma.ptt b/bench/x86-64/triad_sse_fma.ptt
new file mode 100644
index 0000000..d6822fa
--- /dev/null
+++ b/bench/x86-64/triad_sse_fma.ptt
@@ -0,0 +1,27 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), optimized for SSE FMAs
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 26
+LOOP 8
+movaps FPR1, [STR1 + GPR1*8]
+movaps FPR2, [STR1 + GPR1*8+16]
+movaps FPR3, [STR1 + GPR1*8+32]
+movaps FPR4, [STR1 + GPR1*8+48]
+movaps FPR5, [STR2 + GPR1*8]
+movaps FPR6, [STR2 + GPR1*8+16]
+movaps FPR7, [STR2 + GPR1*8+32]
+movaps FPR8, [STR2 + GPR1*8+48]
+vfmadd213pd FPR1, FPR5, [STR3 + GPR1*8]
+vfmadd213pd FPR2, FPR6, [STR3 + GPR1*8+16]
+vfmadd213pd FPR3, FPR7, [STR3 + GPR1*8+32]
+vfmadd213pd FPR4, FPR8, [STR3 + GPR1*8+48]
+movaps [STR0 + GPR1*8], FPR1
+movaps [STR0 + GPR1*8+16], FPR2
+movaps [STR0 + GPR1*8+32], FPR3
+movaps [STR0 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/update.ptt b/bench/x86-64/update.ptt
index ac1129b..422981e 100644
--- a/bench/x86-64/update.ptt
+++ b/bench/x86-64/update.ptt
@@ -2,14 +2,20 @@ STREAMS 1
TYPE DOUBLE
FLOPS 0
BYTES 16
-LOOP 8
-movaps FPR1, [STR0 + GPR1 * 8]
-movaps [STR0 + GPR1 * 8] , FPR1
-movaps FPR2, [STR0 + GPR1 * 8 + 16]
-movaps FPR3, [STR0 + GPR1 * 8 + 32]
-movaps FPR4, [STR0 + GPR1 * 8 + 48]
-movaps [STR0 + GPR1 * 8 + 16], FPR2
-movaps [STR0 + GPR1 * 8 + 32], FPR3
-movaps [STR0 + GPR1 * 8 + 48], FPR4
+DESC Double-precision vector update, only scalar operations
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
+LOOP 4
+movsd FPR1, [STR0 + GPR1 * 8]
+movsd [STR0 + GPR1 * 8] , FPR1
+movsd FPR2, [STR0 + GPR1 * 8 + 8]
+movsd FPR3, [STR0 + GPR1 * 8 + 16]
+movsd FPR4, [STR0 + GPR1 * 8 + 24]
+movsd [STR0 + GPR1 * 8 + 8], FPR2
+movsd [STR0 + GPR1 * 8 + 16], FPR3
+movsd [STR0 + GPR1 * 8 + 24], FPR4
diff --git a/bench/x86-64/update_avx.ptt b/bench/x86-64/update_avx.ptt
index 2e9178e..eeca2fb 100644
--- a/bench/x86-64/update_avx.ptt
+++ b/bench/x86-64/update_avx.ptt
@@ -2,6 +2,12 @@ STREAMS 1
TYPE DOUBLE
FLOPS 0
BYTES 16
+DESC Double-precision vector update, optimized for AVX
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
LOOP 16
vmovaps ymm1, [STR0 + GPR1 * 8]
vmovaps [STR0 + GPR1 * 8] , ymm1
diff --git a/bench/x86-64/update_plain.ptt b/bench/x86-64/update_plain.ptt
deleted file mode 100644
index b5a3e4a..0000000
--- a/bench/x86-64/update_plain.ptt
+++ /dev/null
@@ -1,15 +0,0 @@
-STREAMS 1
-TYPE DOUBLE
-FLOPS 0
-BYTES 16
-LOOP 4
-movss FPR1, [STR0 + GPR1 * 8]
-movss [STR0 + GPR1 * 8] , FPR1
-movss FPR2, [STR0 + GPR1 * 8 + 8]
-movss FPR3, [STR0 + GPR1 * 8 + 16]
-movss FPR4, [STR0 + GPR1 * 8 + 24]
-movss [STR0 + GPR1 * 8 + 8], FPR2
-movss [STR0 + GPR1 * 8 + 16], FPR3
-movss [STR0 + GPR1 * 8 + 24], FPR4
-
-
diff --git a/bench/x86-64/update_sse.ptt b/bench/x86-64/update_sse.ptt
index ac1129b..fe1be1d 100644
--- a/bench/x86-64/update_sse.ptt
+++ b/bench/x86-64/update_sse.ptt
@@ -2,6 +2,12 @@ STREAMS 1
TYPE DOUBLE
FLOPS 0
BYTES 16
+DESC Double-precision vector update, optimized for SSE
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
LOOP 8
movaps FPR1, [STR0 + GPR1 * 8]
movaps [STR0 + GPR1 * 8] , FPR1
diff --git a/bench/x86-64/vtriad_avx.ptt b/bench/x86-64/vtriad_avx.ptt
deleted file mode 100644
index 4a542d2..0000000
--- a/bench/x86-64/vtriad_avx.ptt
+++ /dev/null
@@ -1,22 +0,0 @@
-STREAMS 4
-TYPE DOUBLE
-FLOPS 2
-BYTES 32
-LOOP 16
-vmovaps ymm1, [STR1 + GPR1*8]
-vmovaps ymm2, [STR1 + GPR1*8+32]
-vmovaps ymm3, [STR1 + GPR1*8+64]
-vmovaps ymm4, [STR1 + GPR1*8+96]
-vmulpd ymm1, ymm1, [STR2 + GPR1*8]
-vaddpd ymm1, ymm1, [STR3 + GPR1*8]
-vmulpd ymm2, ymm2, [STR2 + GPR1*8+32]
-vaddpd ymm2, ymm2, [STR3 + GPR1*8+32]
-vmulpd ymm3, ymm3, [STR2 + GPR1*8+64]
-vaddpd ymm3, ymm3, [STR3 + GPR1*8+64]
-vmulpd ymm4, ymm4, [STR2 + GPR1*8+96]
-vaddpd ymm4, ymm4, [STR3 + GPR1*8+96]
-vmovaps [STR0 + GPR1*8], ymm1
-vmovaps [STR0 + GPR1*8+32], ymm2
-vmovaps [STR0 + GPR1*8+64], ymm3
-vmovaps [STR0 + GPR1*8+96], ymm4
-
diff --git a/bench/x86-64/vtriad_mem_avx.ptt b/bench/x86-64/vtriad_mem_avx.ptt
deleted file mode 100644
index 315ef14..0000000
--- a/bench/x86-64/vtriad_mem_avx.ptt
+++ /dev/null
@@ -1,10 +0,0 @@
-STREAMS 4
-TYPE DOUBLE
-FLOPS 2
-BYTES 32
-LOOP 4
-vmovaps ymm1, [STR1 + GPR1*8]
-vmulpd ymm1, ymm1, [STR2 + GPR1*8]
-vaddpd ymm1, ymm1, [STR3 + GPR1*8]
-vmovntpd [STR0 + GPR1*8], ymm1
-
diff --git a/bench/x86-64/vtriad_mem_sse.ptt b/bench/x86-64/vtriad_mem_sse.ptt
deleted file mode 100644
index 7c24748..0000000
--- a/bench/x86-64/vtriad_mem_sse.ptt
+++ /dev/null
@@ -1,10 +0,0 @@
-STREAMS 4
-TYPE DOUBLE
-FLOPS 2
-BYTES 32
-LOOP 2
-movaps FPR1, [STR1 + GPR1*8]
-mulpd FPR1, [STR2 + GPR1*8]
-addpd FPR1, [STR3 + GPR1*8]
-movntpd [STR0 + GPR1*8], FPR1
-
diff --git a/bench/x86-64/vtriad_plain.ptt b/bench/x86-64/vtriad_plain.ptt
deleted file mode 100644
index 120331c..0000000
--- a/bench/x86-64/vtriad_plain.ptt
+++ /dev/null
@@ -1,22 +0,0 @@
-STREAMS 4
-TYPE DOUBLE
-FLOPS 2
-BYTES 32
-LOOP 4
-movsd FPR1, [STR1 + GPR1*8]
-movsd FPR2, [STR1 + GPR1*8+8]
-movsd FPR3, [STR1 + GPR1*8+16]
-movss FPR4, [STR1 + GPR1*8+24]
-mulsd FPR1, [STR2 + GPR1*8]
-addsd FPR1, [STR3 + GPR1*8]
-mulsd FPR2, [STR2 + GPR1*8+8]
-addsd FPR2, [STR3 + GPR1*8+8]
-mulsd FPR3, [STR2 + GPR1*8+16]
-addsd FPR3, [STR3 + GPR1*8+16]
-mulsd FPR4, [STR2 + GPR1*8+24]
-addsd FPR4, [STR3 + GPR1*8+24]
-movsd [STR0 + GPR1*8], FPR1
-movsd [STR0 + GPR1*8+8], FPR2
-movsd [STR0 + GPR1*8+16], FPR3
-movsd [STR0 + GPR1*8+24], FPR4
-
diff --git a/bench/x86-64/vtriad_sse.ptt b/bench/x86-64/vtriad_sse.ptt
deleted file mode 100644
index d521aa0..0000000
--- a/bench/x86-64/vtriad_sse.ptt
+++ /dev/null
@@ -1,22 +0,0 @@
-STREAMS 4
-TYPE DOUBLE
-FLOPS 2
-BYTES 32
-LOOP 8
-movaps FPR1, [STR1 + GPR1*8]
-movaps FPR2, [STR1 + GPR1*8+16]
-movaps FPR3, [STR1 + GPR1*8+32]
-movaps FPR4, [STR1 + GPR1*8+48]
-mulpd FPR1, [STR2 + GPR1*8]
-addpd FPR1, [STR3 + GPR1*8]
-mulpd FPR2, [STR2 + GPR1*8+16]
-addpd FPR2, [STR3 + GPR1*8+16]
-mulpd FPR3, [STR2 + GPR1*8+32]
-addpd FPR3, [STR3 + GPR1*8+32]
-mulpd FPR4, [STR2 + GPR1*8+48]
-addpd FPR4, [STR3 + GPR1*8+48]
-movaps [STR0 + GPR1*8], FPR1
-movaps [STR0 + GPR1*8+16], FPR2
-movaps [STR0 + GPR1*8+32], FPR3
-movaps [STR0 + GPR1*8+48], FPR4
-
diff --git a/bench/x86/copy.ptt b/bench/x86/copy.ptt
index 111d38b..737b087 100644
--- a/bench/x86/copy.ptt
+++ b/bench/x86/copy.ptt
@@ -2,17 +2,23 @@ STREAMS 2
TYPE DOUBLE
FLOPS 0
BYTES 16
+DESC Double-precision vector copy, only scalar operations
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
mov GPR6, ARG1
mov GPR2, STR0
mov GPR3, STR1
-LOOP 8
-movaps FPR1, [GPR2 + GPR1 * 8]
-movaps FPR2, [GPR2 + GPR1 * 8 + 16]
-movaps FPR3, [GPR2 + GPR1 * 8 + 32]
-movaps FPR4, [GPR2 + GPR1 * 8 + 48]
-movaps [GPR3 + GPR1 * 8] , FPR1
-movaps [GPR3 + GPR1 * 8 + 16], FPR2
-movaps [GPR3 + GPR1 * 8 + 32], FPR3
-movaps [GPR3 + GPR1 * 8 + 48], FPR4
+LOOP 4
+movsd FPR1, [GPR2 + GPR1 * 8]
+movsd FPR2, [GPR2 + GPR1 * 8 + 8]
+movsd FPR3, [GPR2 + GPR1 * 8 + 16]
+movsd FPR4, [GPR2 + GPR1 * 8 + 24]
+movsd [GPR3 + GPR1 * 8] , FPR1
+movsd [GPR3 + GPR1 * 8 + 8], FPR2
+movsd [GPR3 + GPR1 * 8 + 16], FPR3
+movsd [GPR3 + GPR1 * 8 + 24], FPR4
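
With this change the 32-bit x86 kernels move from packed 128-bit movaps to
scalar movsd, so every instruction transfers a single double, and each kernel
gains DESC/LOADS/STORES/INSTR_CONST/INSTR_LOOP/UOPS metadata. A minimal C
sketch of the new copy loop, assuming the length is divisible by the unroll
factor (the function name is illustrative):

    /* Scalar copy, unrolled by four like the kernel body: one 8-byte load
     * and one 8-byte store per element (LOADS 1, STORES 1, BYTES 16). */
    void copy_scalar(double *dst, const double *src, long n)
    {
        for (long i = 0; i < n; i += 4) {
            dst[i]     = src[i];
            dst[i + 1] = src[i + 1];
            dst[i + 2] = src[i + 2];
            dst[i + 3] = src[i + 3];
        }
    }
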
diff --git a/bench/x86/load.ptt b/bench/x86/load.ptt
index cf001a4..473d8aa 100644
--- a/bench/x86/load.ptt
+++ b/bench/x86/load.ptt
@@ -2,12 +2,21 @@ STREAMS 1
TYPE DOUBLE
FLOPS 0
BYTES 8
+DESC Double-precision load, only scalar operations
+LOADS 1
+STORES 0
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 10
mov GPR6, ARG1
mov GPR2, STR0
LOOP 8
-movaps FPR1, [GPR2 + GPR1 * 8]
-movaps FPR2, [GPR2 + GPR1 * 8 + 16]
-movaps FPR3, [GPR2 + GPR1 * 8 + 32]
-movaps FPR4, [GPR2 + GPR1 * 8 + 48]
-
+movsd FPR1, [GPR2 + GPR1 * 8]
+movsd FPR2, [GPR2 + GPR1 * 8 + 8]
+movsd FPR3, [GPR2 + GPR1 * 8 + 16]
+movsd FPR4, [GPR2 + GPR1 * 8 + 24]
+movsd FPR5, [GPR2 + GPR1 * 8 + 32]
+movsd FPR6, [GPR2 + GPR1 * 8 + 40]
+movsd FPR7, [GPR2 + GPR1 * 8 + 48]
+movsd FPR8, [GPR2 + GPR1 * 8 + 56]
diff --git a/bench/x86/store.ptt b/bench/x86/store.ptt
index 1cf15da..07ed59a 100644
--- a/bench/x86/store.ptt
+++ b/bench/x86/store.ptt
@@ -1,16 +1,22 @@
STREAMS 1
TYPE DOUBLE
FLOPS 0
+DESC Double-precision store, only scalar operations
BYTES 8
-movaps FPR1, [SCALAR]
-movaps FPR2, [SCALAR]
-movaps FPR3, [SCALAR]
-movaps FPR4, [SCALAR]
+LOADS 0
+STORES 1
+INSTR_CONST 22
+INSTR_LOOP 7
+UOPS 10
+movsd FPR1, [SCALAR]
+movsd FPR2, [SCALAR]
+movsd FPR3, [SCALAR]
+movsd FPR4, [SCALAR]
mov GPR6, ARG1
mov GPR2, STR0
-LOOP 8
-movaps [GPR2 + GPR1 * 8] , FPR1
-movaps [GPR2 + GPR1 * 8 + 16], FPR2
-movaps [GPR2 + GPR1 * 8 + 32], FPR3
-movaps [GPR2 + GPR1 * 8 + 48], FPR4
+LOOP 4
+movsd [GPR2 + GPR1 * 8] , FPR1
+movsd [GPR2 + GPR1 * 8 + 8], FPR2
+movsd [GPR2 + GPR1 * 8 + 16], FPR3
+movsd [GPR2 + GPR1 * 8 + 24], FPR4
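
The store kernel fills four registers from the SCALAR constant before the
loop, so the measured loop issues stores only. A rough C equivalent (sketch;
the scalar argument stands in for whatever value SCALAR provides):

    /* Store-only kernel: the constant is read once outside the loop, the
     * loop itself performs only 8-byte stores (LOADS 0, STORES 1, BYTES 8). */
    void store_scalar(double *dst, double scalar, long n)
    {
        for (long i = 0; i < n; i += 4) {
            dst[i]     = scalar;
            dst[i + 1] = scalar;
            dst[i + 2] = scalar;
            dst[i + 3] = scalar;
        }
    }
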
diff --git a/bench/x86/stream.ptt b/bench/x86/stream.ptt
index bab4ecb..067a06c 100644
--- a/bench/x86/stream.ptt
+++ b/bench/x86/stream.ptt
@@ -2,26 +2,32 @@ STREAMS 3
TYPE DOUBLE
FLOPS 2
BYTES 24
-movaps FPR5, [SCALAR]
-mov GPR6, ARG1
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), only scalar operations
+LOADS 2
+STORES 1
+INSTR_CONST 21
+INSTR_LOOP 19
+UOPS 26
+movsd FPR5, [SCALAR]
+mov GPR6, ARG1
mov GPR2, STR0
mov GPR3, STR1
mov GPR4, STR2
LOOP 8
-movaps FPR1, [GPR3 + GPR1*8]
-movaps FPR2, [GPR3 + GPR1*8+16]
-movaps FPR3, [GPR3 + GPR1*8+32]
-movaps FPR4, [GPR3 + GPR1*8+48]
-mulpd FPR1, FPR5
-addpd FPR1, [GPR4 + GPR1*8]
-mulpd FPR2, FPR5
-addpd FPR2, [GPR4 + GPR1*8+16]
-mulpd FPR3, FPR5
-addpd FPR3, [GPR4 + GPR1*8+32]
-mulpd FPR4, FPR5
-addpd FPR4, [GPR4 + GPR1*8+48]
-movaps [GPR2 + GPR1*8] , FPR1
-movaps [GPR2 + GPR1*8+16], FPR2
-movaps [GPR2 + GPR1*8+32], FPR3
-movaps [GPR2 + GPR1*8+48], FPR4
+movsd FPR1, [GPR3 + GPR1*8]
+movsd FPR2, [GPR3 + GPR1*8+8]
+movsd FPR3, [GPR3 + GPR1*8+16]
+movsd FPR4, [GPR3 + GPR1*8+24]
+mulsd FPR1, FPR5
+addsd FPR1, [GPR4 + GPR1*8]
+mulsd FPR2, FPR5
+addsd FPR2, [GPR4 + GPR1*8+8]
+mulsd FPR3, FPR5
+addsd FPR3, [GPR4 + GPR1*8+16]
+mulsd FPR4, FPR5
+addsd FPR4, [GPR4 + GPR1*8+24]
+movsd [GPR2 + GPR1*8], FPR1
+movsd [GPR2 + GPR1*8+8], FPR2
+movsd [GPR2 + GPR1*8+16], FPR3
+movsd [GPR2 + GPR1*8+24], FPR4
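
As its DESC line states, the x86 stream kernel is the STREAM-style triad
A(i) = B(i)*c + C(i), now written with scalar movsd/mulsd/addsd. A minimal C
sketch of the unrolled loop (illustrative function name, length assumed
divisible by four):

    /* Scalar stream triad: 2 flops and 3 x 8 bytes of traffic per element
     * (FLOPS 2, BYTES 24, LOADS 2, STORES 1 in the kernel header). */
    void stream_scalar(double *a, const double *b, const double *c,
                       double scalar, long n)
    {
        for (long i = 0; i < n; i += 4) {
            a[i]     = b[i]     * scalar + c[i];
            a[i + 1] = b[i + 1] * scalar + c[i + 1];
            a[i + 2] = b[i + 2] * scalar + c[i + 2];
            a[i + 3] = b[i + 3] * scalar + c[i + 3];
        }
    }
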
diff --git a/config.mk b/config.mk
index 2c3f3be..46fbe78 100644
--- a/config.mk
+++ b/config.mk
@@ -1,6 +1,6 @@
# Please have a look in INSTALL and the WIKI for details on
# configuration options and setup steps.
-# supported: GCC, GCCX86, MIC (ICC)
+# supported: GCC, CLANG, ICC, MIC (ICC), GCCX86 (for 32bit systems)
COMPILER = GCC#NO SPACE
# Define the color of the likwid-pin output
@@ -10,12 +10,41 @@ COLOR = BLUE#NO SPACE
# Path where to install likwid
PREFIX = /usr/local#NO SPACE
+
+#################################################################
+# Common users do not need to change values below this comment! #
+#################################################################
+
MANPREFIX = $(PREFIX)/man#NO SPACE
+BINPREFIX = $(PREFIX)/bin#NO SPACE
+LIBPREFIX = $(PREFIX)/lib#NO SPACE
+
+# These paths are hardcoded into executables and libraries. Usually
+# they'll be the same as above, but package maintainers may want to
+# distinguish between the image directories and the final install
+# target.
+# Keep in mind that the access and setFreq daemons need elevated
+# privileges, which may be lost when copying the files to
+# the INSTALLED_PREFIX
+INSTALLED_PREFIX = $(PREFIX)#NO SPACE
+INSTALLED_BINPREFIX = $(INSTALLED_PREFIX)/bin#NO SPACE
+INSTALLED_LIBPREFIX = $(INSTALLED_PREFIX)/lib#NO SPACE
+
+# chown installed tools to this user/group
+# if you change anything here, make sure that the user/group can access
+# the MSR devices and (on Intel) the PCI devices.
+INSTALL_CHOWN = -g root -o root
# For the daemon-based secure msr/pci access, configure
# the absolute path to the msr daemon executable.
-# Usually you can leave this to the default.
+# $(INSTALLED_PREFIX)/bin/likwid-accessD
ACCESSDAEMON = $(PREFIX)/sbin/likwid-accessD#NO SPACE
+INSTALLED_ACCESSDAEMON = $(INSTALLED_PREFIX)/sbin/likwid-accessD#NO SPACE
+
+# Build the accessDaemon. Have a look in the WIKI for details.
+BUILDDAEMON = true#NO SPACE
+# Build the setFrequencies tool
+BUILDFREQ = true#NO SPACE
# Set the default mode for MSR access.
# This can usually be overriden on the commandline.
@@ -23,7 +52,7 @@ ACCESSDAEMON = $(PREFIX)/sbin/likwid-accessD#NO SPACE
ACCESSMODE = accessdaemon#NO SPACE
# Change to true to build a shared library instead of a static one
-SHARED_LIBRARY = false#NO SPACE
+SHARED_LIBRARY = true#NO SPACE
# Build Fortran90 module interface for marker API. Adapt the Fortran compiler
# in ./make/include_<COMPILER>.mk if necessary. Default: ifort.
@@ -32,17 +61,25 @@ FORTRAN_INTERFACE = false#NO SPACE
# Instrument likwid-bench for use with likwid-perfctr
INSTRUMENT_BENCH = false#NO SPACE
-# Usually you do not need to edit below
+# Use the recommended Portable Hardware Locality (hwloc) instead of CPUID
+USE_HWLOC = true#NO SPACE
+
+# Build LIKWID with debug flags
+DEBUG = false#NO SPACE
+
+# Basic configuration (compiled into library, can be changed by creating
+# a proper config file at CFG_FILE_PATH)
MAX_NUM_THREADS = 263
-MAX_NUM_NODES = 4
-HASH_TABLE_SIZE = 20
+MAX_NUM_NODES = 64
CFG_FILE_PATH = /etc/likwid.cfg
+TOPO_FILE_PATH = /etc/likwid_topo.cfg
# Versioning Information
-VERSION = 3
+VERSION = 4
RELEASE = 1
-DATE = 12.2.2014
-
-LIBLIKWIDPIN = $(abspath $(PREFIX)/lib/liblikwidpin.so)
-LIKWIDFILTERPATH = $(abspath $(PREFIX)/share/likwid)
+DATE = 19.05.2016
+RPATHS = -Wl,-rpath=$(INSTALLED_LIBPREFIX)
+LIBLIKWIDPIN = $(abspath $(INSTALLED_PREFIX)/lib/liblikwidpin.so.$(VERSION).$(RELEASE))
+LIKWIDFILTERPATH = $(abspath $(INSTALLED_PREFIX)/share/likwid/filter)
+LIKWIDGROUPPATH = $(abspath $(INSTALLED_PREFIX)/share/likwid/perfgroups)
diff --git a/doc/Doxyfile b/doc/Doxyfile
new file mode 100644
index 0000000..dbfba97
--- /dev/null
+++ b/doc/Doxyfile
@@ -0,0 +1,1781 @@
+# Doxyfile 1.7.6.1
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or sequence of words) that should
+# identify the project. Note that if you do not use Doxywizard you need
+# to put quotes around the project name if it contains spaces.
+
+PROJECT_NAME = "LIKWID"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give the viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF =
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO = doc/logo.png
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY =
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding
+# "class=itcl::class" will allow you to use the command class in the
+# itcl::class meaning.
+
+TCL_SUBST =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this
+# tag. The format is ext=language, where ext is a file extension, and language
+# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C,
+# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
+# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and
+# unions with only public data fields will be shown inline in the documentation
+# of the scope in which they are defined (i.e. file, namespace, or group
+# documentation), provided this scope is documented. If set to NO (the default),
+# structs, classes, and unions are shown on a separate page (for HTML and Man
+# pages) or section (for LaTeX and RTF).
+
+INLINE_SIMPLE_STRUCTS = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+SYMBOL_CACHE_SIZE = 0
+
+# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
+# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
+# their name and scope. Since this can be an expensive process and often the
+# same symbol appears multiple times in the code, doxygen keeps a cache of
+# pre-resolved symbols. If the cache is too small doxygen will become slower.
+# If the cache is too large, memory is wasted. The cache size is given by this
+# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+LOOKUP_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = NO
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS = YES
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to
+# do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even
+# if there is only one candidate or it is obvious which candidate to choose
+# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files
+# containing the references data. This must be a list of .bib files. The
+# .bib extension is automatically appended if omitted. Using this command
+# requires the bibtex tool to be installed. See also
+# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
+# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this
+# feature you need bibtex and perl available in the search path.
+
+CITE_BIB_FILES =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = NO
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT = ./src/includes/likwid.h ./doc/likwid-doxygen.md ./src/includes/perfmon_types.h ./src/includes/topology_types.h ./src/includes/power_types.h ./src/includes/tree_types.h ./doc/archs/ ./doc/lua-doxygen.md ./doc/applications/ ./doc/likwid.cfg.md ./src/likwid.f90
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS = *.md
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS = AccessDataRecord
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH = ./examples
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = YES
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT = doc/html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header. Note that when using a custom header you are responsible
+# for the proper inclusion of any scripts and style sheets that doxygen
+# needs, which is dependent on the configuration options used.
+# It is advised to generate a default header using "doxygen -w html
+# header.html footer.html stylesheet.css YourConfigFile" and then modify
+# that header. Note that the header is subject to change so you typically
+# have to redo this when upgrading to a newer version of doxygen or when
+# changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# style sheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the style sheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP = YES
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs)
+# at top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it. Since the tabs have the same information as the
+# navigation tree you can set this option to NO if you already set
+# GENERATE_TREEVIEW to YES.
+
+DISABLE_INDEX = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+# Since the tree basically has the same information as the tab index you
+# could consider to set DISABLE_INDEX to NO when enabling this option.
+
+GENERATE_TREEVIEW = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML
+# documentation. Note that a value of 0 will completely suppress the enum
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE = 4
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want the formulas to look prettier in the HTML
+# output. When enabled you also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the
+# mathjax.org site, so you can quickly see the result without installing
+# MathJax, but it is strongly recommended to install a local copy of MathJax
+# before deployment.
+
+MATHJAX_RELPATH = http://www.mathjax.org/mathjax
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension
+# names that should be enabled during MathJax rendering.
+
+MATHJAX_EXTENSIONS =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow; in that case enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to set up
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
+# the generated latex document. The footer should contain everything after
+# the last chapter. If it is left blank doxygen will generate a
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See
+# http://en.wikipedia.org/wiki/BibTeX for more info.
+
+LATEX_BIB_STYLE = plain
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output.
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load style sheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader. This is useful
+# if you want to understand what is going on. On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# pointed to by INCLUDE_PATH will be searched when a #include is found.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT = YES
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS = 0
+
+# By default doxygen will use the Helvetica font for all dot files that
+# doxygen generates. When you want a differently looking font you can specify
+# the font name using DOT_FONTNAME. You need to make sure dot is able to find
+# the font, which can be done by putting it in a standard location or by setting
+# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the
+# directory containing the font.
+
+DOT_FONTNAME = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the Helvetica font.
+# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to
+# set the path where dot can find it.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used. If you choose svg you need to set
+# HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible in IE 9+ (other browsers do not have this requirement).
+
+DOT_IMAGE_FORMAT = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+# Note that this requires a modern browser other than Internet Explorer.
+# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you
+# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible. Older versions of IE do not have SVG support.
+
+INTERACTIVE_SVG = NO
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES, the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = YES
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
diff --git a/doc/applications/likwid-accessD.md b/doc/applications/likwid-accessD.md
new file mode 100644
index 0000000..c80481e
--- /dev/null
+++ b/doc/applications/likwid-accessD.md
@@ -0,0 +1,55 @@
+/*! \page likwid-accessD <CODE>likwid-accessD</CODE>
+
+<H1>Information</H1>
+
+<CODE>likwid-accessD</CODE> is a command line application that opens a UNIX file socket and waits for access
+operations from LIKWID tools that require access to the MSR and PCI device
+files. The MSR and PCI device files are commonly only accessible to users with root
+privileges; therefore <CODE>likwid-accessD</CODE> requires the suid bit to be set or a suitable libcap setting.
+Depending on the current system architecture, <CODE>likwid-accessD</CODE> permits access only to the registers defined for that architecture.
+
+<!--<H1>Security concerns</H1>
+The <CODE>likwid-accessD</CODE> is a critical part of LIKWID. The accesses to the MSR and often also PCI devices are restricted to users with root privileges. In order to allow users the access to the MSR/PCI devices, the users have to get temporarily elevated privileges. There are currently two ways of achieving this in the Linux operating system. The convenient method are the suid/guid bits that allow an application to execute with the privileges of the owner (suid) or group (guid). Th [...]
+Both methods should be safe but there are exploits for the MSR devices, general suid applications and the <CODE>cap_sys_rawio</CODE>. We checked all exploits we found and built the access daemon so that it is not vulnerable for the exploits. By restricting the accessible registers and closing all file handles -->
+
+<H1>Build</H1>
+Building <CODE>likwid-accessD</CODE> is controlled through the <CODE>config.mk</CODE> file. The variable <CODE>BUILDDAEMON</CODE> determines whether the daemon code is built. The path to <CODE>likwid-accessD</CODE> is compiled into the LIKWID library, so if you want to use the access daemon from a non-default path, you have to set the <CODE>ACCESSDAEMON</CODE> variable. A sketch of the relevant settings is shown below.
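+
+A minimal sketch of the relevant <CODE>config.mk</CODE> settings; the values shown here are only illustrative and may differ from the defaults shipped with your LIKWID version:<BR>
+<CODE>
+BUILDDAEMON = true<BR>
+ACCESSDAEMON = /usr/local/sbin/likwid-accessD<BR>
+</CODE>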
+
+<H1>Setup</H1>
+In order to allow <CODE>likwid-accessD</CODE> to run with elevated privileges, there are three ways:
+<UL>
+<LI>SUID Method:<BR>
+<CODE>
+root: # chown root:root likwid-accessD<BR>
+root: # chmod u+s likwid-accessD<BR>
+</CODE>
+</LI>
+<LI>GUID Method: (PCI devices cannot be accessed with this method but we are working on it)<BR>
+<CODE>
+root: # groupadd likwid<BR>
+root: # chown root:likwid likwid-accessD<BR>
+root: # chmod g+s likwid-accessD<BR>
+</CODE>
+</LI>
+<LI>Libcap Method:<BR>
+<CODE>
+root: # setcap cap_sys_rawio+ep likwid-accessD
+</CODE>
+</LI>
+</UL>
+There are Linux distributions where setting the suid permission on <CODE>likwid-accessD</CODE> is not enough. In that case, also set the capabilities for <CODE>likwid-accessD</CODE>.
+
+<H1>Protocol</H1>
+Every likwid instance will start its own daemon. This client-server pair communicates through a socket file in <CODE>/tmp</CODE> named <CODE>likwid-$PID</CODE>. The daemon accepts only one connection. As soon as the connection is established, the socket file is deleted.
+
+From there the communication consists of write/read pairs issued by the client. The daemon restricts accesses to the register ranges relevant for the LIKWID applications. Accesses to other registers are silently dropped and logged to <CODE>syslog</CODE>.
+
+On shutdown the client will terminate the daemon with an exit message.
+
+The daemon has the following error handling:
+<UL>
+<LI>To prevent leftover daemons that were not stopped correctly, the daemon has a timeout on startup.</LI>
+<LI>If the client disconnects prematurely, the daemon terminates.</LI>
+<LI>If the client disconnects between a read and a write, the daemon catches <CODE>SIGPIPE</CODE> and disconnects.</LI>
+</UL>
+*/
diff --git a/doc/applications/likwid-agent.md b/doc/applications/likwid-agent.md
new file mode 100644
index 0000000..44cbb65
--- /dev/null
+++ b/doc/applications/likwid-agent.md
@@ -0,0 +1,94 @@
+/*! \page likwid-agent <CODE>likwid-agent</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-agent</CODE> is a daemon application that uses \ref likwid-perfctr to measure hardware performance counters and write them to various output back-ends. The basic configuration is in a global configuration file that must be given on the command line. The configuration of the hardware event sets is done with extra files suitable for each architecture. Besides the hardware event configuration, the raw data can be transformed into the metrics of interest using formulas. In order to output [...]
+
+<H1>Config file</H1>
+The global configuration file has the following options:
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Description</TH>
+</TR>
+<TR>
+ <TD>GROUPPATH <path></TD>
+ <TD>Path to the group files containing event set and output definitions. See section <B>Group files</B> for information.</TD>
+</TR>
+<TR>
+ <TD>EVENTSET <group1> <group2> ...</TD>
+ <TD>Space separated list of groups (without .txt) that should be monitored.</TD>
+</TR>
+<TR>
+ <TD>DURATION <time></TD>
+ <TD>Measurement duration in seconds for each group.</TD>
+</TR>
+<TR>
+ <TD>LOGPATH <path></TD>
+ <TD>Sets the output logfile path for the measured data. Each monitoring group logs to its own file likwid.<group>.log</TD>
+</TR>
+<TR>
+ <TD>LOGSTYLE <update/log></TD>
+ <TD>Specifies whether new data should be appended to the files (log) or the file should be emptied first (update).<BR> Update is a common option if the data is read in afterwards by some monitoring tool like Cacti, Nagios, ... Default is log.</TD>
+</TR>
+<TR>
+ <TD>GMETRIC <True/False></TD>
+ <TD>Activates the output to gmetric.</TD>
+</TR>
+<TR>
+ <TD>GMETRICPATH <path></TD>
+ <TD>Set path to the gmetric executable.</TD>
+</TR>
+<TR>
+ <TD>GMETRICCONFIG <path></TD>
+ <TD>Set path to a custom gmetric config file.</TD>
+</TR>
+<TR>
+ <TD>RRD <True/False></TD>
+ <TD>Activates the output to RRD files (Round Robin Database).</TD>
+</TR>
+<TR>
+ <TD>RRDPATH <path></TD>
+ <TD>Output path for the RRD files. The files are named according to the group and each output metric is saved as a DS with function GAUGE. The RRD is configured with RRA entries to store the average, minimum and maximum over 10-minute intervals for one hour, over 60-minute intervals for one day, and daily values for one month.</TD>
+</TR>
+<TR>
+ <TD>SYSLOG <True/False></TD>
+ <TD>Activates the output to system log using logger.</TD>
+</TR>
+<TR>
+ <TD>SYSLOGPRIO <prio></TD>
+ <TD>Set the priority for the system log. The default priority is 'local0.notice'.</TD>
+</TR>
+</TABLE>
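+
+For illustration, a minimal global configuration file assembled from the options above could look as follows; all paths and group names are placeholders, not shipped defaults:<BR>
+<CODE>
+GROUPPATH /etc/likwid/agent/groups<BR>
+EVENTSET L2 L3 MEM<BR>
+DURATION 30<BR>
+LOGPATH /var/log/likwid<BR>
+LOGSTYLE log<BR>
+GMETRIC False<BR>
+RRD False<BR>
+SYSLOG True<BR>
+SYSLOGPRIO local0.notice<BR>
+</CODE>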
+
+<H1>Group files</H1>
+The group files are adapted performance group files as used by <CODE>likwid-perfctr</CODE>.
+This makes it easy to use the predefined and frequently used performance groups as a basis for the monitoring. The folder structure for the groups is <CODE><GROUPPATH>/<SHORT_ARCH_NAME>/</CODE> with <SHORT_ARCH_NAME> similar to the ones for the performance groups, like 'sandybridge' or 'haswellEP'.
+
+
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Description</TH>
+</TR>
+<TR>
+ <TD>SHORT <string></TD>
+ <TD>A short description of the group.</TD>
+</TR>
+<TR>
+ <TD>EVENTSET<BR><counter1> <event1><BR><counter2>:<option1>:<option2> <event2></TD>
+ <TD>Definition of the eventset similar to the performance groups. See performance_groups for details.</TD>
+</TR>
+<TR>
+ <TD>METRICS<BR><metricname> <formula><BR><filter> <metricname> <formula></TD>
+ <TD>Definition of the output metrics. The syntax follows the METRICS definition of the performance groups as used by \ref likwid-perfctr . If no filter function is set at the beginning of the line, <formula> is evaluated for every CPU and sent to the output back-ends. The <metricname> gets the prefix "T<cpuid> ". To avoid writing too much data to the back-ends, the data can be reduced by <filter>. The possible filter options are MIN, MAX, AVG, SUM, ONCE. The ONCE filter [...]
+</TR>
+
+</TABLE>
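+
+A small illustrative group file following this syntax; the counter and event names are taken from the Intel examples used elsewhere in this documentation, and the AVG filter is one of the documented reduction options:<BR>
+<CODE>
+SHORT Cycles per instruction<BR>
+<BR>
+EVENTSET<BR>
+FIXC0 INSTR_RETIRED_ANY<BR>
+FIXC1 CPU_CLK_UNHALTED_CORE<BR>
+<BR>
+METRICS<BR>
+AVG CPI FIXC1/FIXC0<BR>
+</CODE>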
+
+<H1>Notice</H1>
+There is currently no predefined init script for <CODE>likwid-agent</CODE>; you have to create one yourself for your distribution.
+*/
diff --git a/doc/applications/likwid-bench.md b/doc/applications/likwid-bench.md
new file mode 100644
index 0000000..fc642e1
--- /dev/null
+++ b/doc/applications/likwid-bench.md
@@ -0,0 +1,93 @@
+/*! \page likwid-bench <CODE>likwid-bench</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-bench</CODE> is a benchmark suite for low-level (assembly) benchmarks to measure bandwidths and instruction throughput for specific instruction code on x86 systems. The included benchmark kernels cover common data access patterns like load and store as well as computations like vector triad and sum.
+<CODE>likwid-bench</CODE> includes architecture-specific benchmarks for x86, x86_64 and x86 for Intel Xeon Phi coprocessors. The performance values can either be calculated by <CODE>likwid-bench</CODE> or measured using hardware performance counters by using \ref likwid-perfctr as a wrapper to <CODE>likwid-bench</CODE>. This requires building <CODE>likwid-bench</CODE> with instrumentation enabled in config.mk (<CODE>INSTRUMENT_BENCH</CODE>), as sketched below.
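+
+For reference, enabling the instrumentation in <CODE>config.mk</CODE> looks roughly like this; check your LIKWID version for the exact variable value:<BR>
+<CODE>
+INSTRUMENT_BENCH = true<BR>
+</CODE>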
+
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Description</TH>
+</TR>
+<TR>
+ <TD>-h</TD>
+ <TD>Print help message</TD>
+</TR>
+<TR>
+ <TD>-a</TD>
+ <TD>List all available benchmarks</TD>
+</TR>
+<TR>
+ <TD>-p</TD>
+ <TD>List all available thread affinity domains</TD>
+</TR>
+<TR>
+ <TD>-d <delim></TD>
+ <TD>Use <delim> instead of ',' for the output of -p</TD>
+</TR>
+<TR>
+ <TD>-l <test></TD>
+ <TD>List characteristics of <test> like number of streams, data used per loop iteration, ...</TD>
+</TR>
+<TR>
+ <TD>-t <test></TD>
+ <TD>Perform assembly benchmark <test></TD>
+</TR>
+<TR>
+ <TD>-s <min_time></TD>
+ <TD>Minimal time in seconds to run the benchmark.<BR>Using this time, the iteration count is determined automatically to provide reliable results. Default is 1. If the determined iteration count is below 10, it is normalized to 10.</TD>
+</TR>
+<TR>
+ <TD>-w <workgroup></TD>
+ <TD>Set a workgroup for the benchmark. A workgroup can have different formats:<BR>
+ <TABLE>
+ <TR>
+ <TH>Format</TH>
+ <TH>Description</TH>
+ </TR>
+ <TR>
+ <TD><affinity_domain>:<size></TD>
+ <TD>Allocate in total <size> in affinity domain <affinity_domain>.<BR><CODE>likwid-bench</CODE> starts as many threads as available in affinity domain <affinity_domain></TD>
+ </TR>
+ <TR>
+ <TD><affinity_domain>:<size>:<num_threads></TD>
+ <TD>Allocate in total <size> in affinity domain <affinity_domain>.<BR><CODE>likwid-bench</CODE> starts <num_threads> in affinity domain <affinity_domain></TD>
+ </TR>
+ <TR>
+ <TD><affinity_domain>:<size>:<num_threads>:<chunk_size>:<stride></TD>
+ <TD>Allocate in total <size> in affinity domain <affinity_domain>.<BR><CODE>likwid-bench</CODE> starts <num_threads> in affinity domain <affinity_domain> with <chunk_size> threads selected in a row and a distance of <stride>.<BR>See \ref CPU_expressions on the \ref likwid-pin page for further information.</TD>
+ </TR>
+ <TR>
+ <TD><above_formats>-<streamID>:<stream_domain></TD>
+ <TD>In combination with every above-mentioned format, the test streams (arrays, vectors) can be placed in different affinity domains than the threads.<BR>This can be achieved by adding a stream placement option -<streamID>:<stream_domain> for all streams of the test to the workgroup definition.<BR>The stream with <streamID> is placed in affinity domain <stream_domain>.<BR>The number of streams of a test can be determined with the -l <test> commandline o [...]
+ </TR>
+    </TABLE>
+ </TD>
+</TR>
+</TABLE>
+
+
+<H1>Examples</H1>
+<UL>
+<LI><CODE>likwid-bench -t copy -w S0:100kB</CODE><BR>
+Run test <CODE>copy</CODE> using all threads in affinity domain <CODE>S0</CODE>. The input and output stream of the <CODE>copy</CODE> benchmark sum up to <CODE>100kB</CODE> placed in affinity domain <CODE>S0</CODE>. The iteration count is calculated automatically.
+</LI>
+<LI><CODE>likwid-bench -t triad -i 100 -w S0:1GB:2:1:2</CODE><BR>
+Run test <CODE>triad</CODE> using <CODE>2</CODE> threads in affinity domain <CODE>S0</CODE>. Assuming <CODE>S0 = 0,4,1,5</CODE>, the threads are pinned to CPUs 0 and 1; one thread is skipped during selection. The streams of the <CODE>triad</CODE> benchmark sum up to <CODE>1GB</CODE> placed in affinity domain <CODE>S0</CODE>. The number of iterations is explicitly set to <CODE>100</CODE>.
+</LI>
+<LI><CODE>likwid-bench -t update -w S0:100kB -w S1:100kB</CODE><BR>
+Run test <CODE>update</CODE> using all threads in affinity domain <CODE>S0</CODE> and <CODE>S1</CODE>. The threads scheduled on <CODE>S0</CODE> use streams that sum up to <CODE>100kB</CODE>. The same holds for <CODE>S1</CODE>: the threads placed there work only on their socket-local streams. The results of both workgroups are combined.
+</LI>
+<LI><CODE>likwid-perfctr -c E:S0:4 -g MEM -m likwid-bench -t update -w S0:100kB:4</CODE><BR>
+Run test <CODE>update</CODE> using <CODE>4</CODE> threads in affinity domain <CODE>S0</CODE>. The streams of the <CODE>update</CODE> benchmark sum up to <CODE>100kB</CODE> placed in affinity domain <CODE>S0</CODE>. The benchmark execution is measured using the \ref Marker_API. It measures the <CODE>MEM</CODE> performance group on the first four CPUs of the <CODE>S0</CODE> affinity domain. For further information about hardware performance counters see \ref likwid-perfctr<BR [...]
+</LI>
+<LI><CODE>likwid-bench -t copy -w S0:1GB:2:1:2-0:S1,1:S1</CODE><BR>
+Run test <CODE>copy</CODE> using <CODE>2</CODE> threads in affinity domain <CODE>S0</CODE>, skipping one thread during selection. The two streams used in the <CODE>copy</CODE> benchmark have the IDs 0 and 1 and a combined size of <CODE>1GB</CODE>. Both streams are placed in affinity domain <CODE>S1</CODE>.
+</LI>
+</UL>
+
+
+
+*/
diff --git a/doc/applications/likwid-genTopoCfg.md b/doc/applications/likwid-genTopoCfg.md
new file mode 100644
index 0000000..ae758c8
--- /dev/null
+++ b/doc/applications/likwid-genTopoCfg.md
@@ -0,0 +1,29 @@
+/*! \page likwid-genTopoCfg <CODE>likwid-genTopoCfg</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-genTopoCfg</CODE> is a command line application that stores the system's CPU and NUMA topology to a
+file. LIKWID applications use this file to read in the topology quickly instead of re-gathering all values. The path to the topology configuration can be set in the global LIKWID configuration file, see \ref likwid.cfg.
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Description</TH>
+</TR>
+<TR>
+ <TD>-h, --help</TD>
+ <TD>Print help message.</TD>
+</TR>
+<TR>
+ <TD>-v, --version</TD>
+ <TD>Print version information.</TD>
+</TR>
+<TR>
+ <TD>-o <file></TD>
+ <TD>Use <file> instead of the default output /etc/likwid-topo.cfg.</TD>
+</TR>
+</TABLE>
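+
+<H1>Examples</H1>
+<UL>
+<LI><CODE>likwid-genTopoCfg -o /tmp/likwid-topo.cfg</CODE><BR>
+Illustrative invocation (the output path is just an example): gathers the CPU and NUMA topology and writes it to <CODE>/tmp/likwid-topo.cfg</CODE> instead of the default <CODE>/etc/likwid-topo.cfg</CODE>.
+</LI>
+</UL>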
+
+
+*/
+
diff --git a/doc/applications/likwid-memsweeper.md b/doc/applications/likwid-memsweeper.md
new file mode 100644
index 0000000..570c7cb
--- /dev/null
+++ b/doc/applications/likwid-memsweeper.md
@@ -0,0 +1,34 @@
+/*! \page likwid-memsweeper <CODE>likwid-memsweeper</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-memsweeper</CODE> is a command line application to shrink the file buffer cache by filling the NUMA domain with random pages. Moreover, the tool invalidates all cachelines in the LLC.
+
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Description</TH>
+</TR>
+<TR>
+ <TD>-h, --help</TD>
+ <TD>Print help message.</TD>
+</TR>
+<TR>
+ <TD>-v, --version</TD>
+ <TD>Print version information.</TD>
+</TR>
+<TR>
+ <TD>-c <list></TD>
+ <TD>Sweeps the memory and LLC cache for NUMA domains listed in <list>.</TD>
+</TR>
+</TABLE>
+
+<H1>Examples</H1>
+<UL>
+<LI><CODE>likwid-memsweeper -c 0,1</CODE><BR>
+Cleans the memory and LLC on NUMA nodes identified by the node IDs 0 and 1.
+</LI>
+</UL>
+
+*/
diff --git a/doc/applications/likwid-mpirun.md b/doc/applications/likwid-mpirun.md
new file mode 100644
index 0000000..aee12d6
--- /dev/null
+++ b/doc/applications/likwid-mpirun.md
@@ -0,0 +1,83 @@
+/*! \page likwid-mpirun <CODE>likwid-mpirun</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-mpirun</CODE> is a tool to start and monitor MPI applications with LIKWID. It can be used as a supplement to the MPI implementations' startup programs like <CODE>mpirun</CODE> or <CODE>mpiexec</CODE> with some enhancements for pinning of OpenMP threads in hybrid jobs. Moreover, <CODE>likwid-mpirun</CODE> can insert calls to \ref likwid-perfctr to measure hardware performance counters for each MPI process and its threads, including the Marker API. Since the <A HREF="http://modules.sourceforge.net/">modules</A> s [...]
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Description</TH>
+</TR>
+<TR>
+ <TD>-h, --help</TD>
+ <TD>Print help message</TD>
+</TR>
+<TR>
+ <TD>-v, --version</TD>
+ <TD>Print version information</TD>
+</TR>
+<TR>
+ <TD>-d, --debug</TD>
+ <TD>Print debug information</TD>
+</TR>
+<TR>
+ <TD>-n, -np, --n, --np <arg></TD>
+ <TD>Specify the number of processes for MPI</TD>
+</TR>
+<TR>
+ <TD>--nperdomain <domain>:<arg></TD>
+ <TD>Schedule <arg> MPI processes for each affinity domain starting with <domain>, e.g. S:2 translates to two MPI processes per socket.<BR><CODE>likwid-mpirun</CODE> assumes that all participating hosts have the same topology.</TD>
+</TR>
+<TR>
+ <TD>--hostfile <file></TD>
+ <TD>Specify the file that should be used as hostfile.<BR>If not set, <CODE>likwid-mpirun</CODE> checks the <CODE>PBS_NODEFILE</CODE>, <CODE>LOADL_HOSTFILE</CODE> and <CODE>SLURM_HOSTFILE</CODE> environment variables</TD>
+</TR>
+<TR>
+ <TD>--pin <expr></TD>
+ <TD>For hybrid pinning specify the thread pinning expression for each MPI process.<BR>The format is similar to \ref CPU_expressions separated by '_' for multiple processes.<BR>If -np is not set, the number of MPI processes is calculated using the pinning expressions.</TD>
+</TR>
+<TR>
+ <TD>-s, --skip <arg></TD>
+ <TD>'arg' must be a bitmask in hex. Threads with an ID corresponding to a set bit in the bitmask will be skipped during pinning.<BR>Example: 0x1 = Thread 0 is skipped.</TD>
+</TR>
+<TR>
+ <TD>--mpi <mpitype></TD>
+ <TD>Specify the type of the MPI implementation.<BR><CODE>likwid-mpirun</CODE> tries to read the MPI implementation from the <A HREF="http://modules.sourceforge.net/">modules</A> system.<BR>If not recognized automatically, possible values are <B>intelmpi</B>, <B>openmpi</B> and <B>mvapich2</B>.</TD>
+</TR>
+<TR>
+ <TD>--omp <omptype></TD>
+ <TD>Specify the type of OpenMP implementation.<BR><CODE>likwid-mpirun</CODE> tries to read the OpenMP implementation using <I>ldd</I> and the <A HREF="http://modules.sourceforge.net/">modules</A> system.<BR>If not recognized automatically, possible values are <B>intel</B> and <B>gnu</B></TD>
+</TR>
+<TR>
+ <TD>-g, --group <eventset></TD>
+ <TD>Use \ref likwid-perfctr to measure performance data for the MPI processes and OpenMP threads.<BR><eventset> can be either a performance group or a custom event string.<BR>For details see \ref performance_groups.</TD>
+</TR>
+<TR>
+ <TD>-m, --marker</TD>
+ <TD>Activate the \ref Marker_API for the measurements with \ref likwid-perfctr.</TD>
+</TR>
+<TR>
+ <TD>-O</TD>
+ <TD>Print results in CSV format (conform to <A HREF="https://tools.ietf.org/html/rfc4180">RFC 4180</A>)</TD>
+</TR>
+</TABLE>
+
+<H1>Examples</H1>
+<UL>
+<LI><CODE>likwid-mpirun -np 32 ./a.out</CODE><BR>
+Runs <CODE>./a.out</CODE> with 32 MPI processes distributed over the hosts in <CODE>PBS_NODEFILE</CODE>
+</LI>
+<LI><CODE>likwid-mpirun -nperdomain S:1 ./a.out</CODE><BR>
+Runs <CODE>./a.out</CODE> using one MPI process per socket over the hosts in <CODE>PBS_NODEFILE</CODE>, <CODE>LOADL_HOSTFILE</CODE> or <CODE>SLURM_HOSTFILE</CODE>.<BR>The total number of processes is calculated as <numberOfSocketDomains> * <processCountPerDomain> * <hostsInHostfile>
+</LI>
+<LI><CODE>likwid-mpirun --hostfile host.list -pin S0:2_S1:2 ./a.out</CODE><BR>
+Runs <CODE>./a.out</CODE> using two MPI processes per host in <CODE>host.list</CODE>.<BR>The first MPI process on each host and its 2 threads are pinned to the first two CPUs on socket <CODE>S0</CODE>,<BR>the second MPI process on each host and its 2 threads are pinned to the first two CPUs on socket <CODE>S1</CODE>
+</LI>
+<LI><CODE>likwid-mpirun -nperdomain S:2 -g MEM ./a.out</CODE><BR>
+Runs <CODE>./a.out</CODE> with 2 MPI processes per socket on each host in <CODE>PBS_NODEFILE</CODE>, <CODE>LOADL_HOSTFILE</CODE> or <CODE>SLURM_HOSTFILE</CODE> and measures the <CODE>MEM</CODE> performance group.<BR>
+Only one process per socket measures the Uncore/RAPL counters; the other one(s) measure only core-local counters.
+</LI>
+</UL>
+*/
diff --git a/doc/applications/likwid-perfctr.md b/doc/applications/likwid-perfctr.md
new file mode 100644
index 0000000..9efc789
--- /dev/null
+++ b/doc/applications/likwid-perfctr.md
@@ -0,0 +1,260 @@
+/*! \page likwid-perfctr <CODE>likwid-perfctr</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-perfctr</CODE> is a lightweight command line application to configure and read out hardware performance monitoring data
+on supported x86 processors. It can measure either as a wrapper without changing the measured application
+or with \ref Marker_API functions inside the code, which turn the counters on and off. Moreover, there are the timeline and stethoscope modes.
+There are preconfigured performance groups with useful event sets and derived metrics. Additionally, arbitrary events can be measured with
+custom event sets. The \ref Marker_API can measure multiple named regions and the results are accumulated over multiple region calls.
+<P>
+<B>Note</B> that <CODE>likwid-perfctr</CODE> measures all events on the specified CPUs and not only the context of the executable. On a highly loaded system it will be hard to determine which part of the given application caused the counter increments. Moreover, it is necessary to ensure that processes and threads are pinned to dedicated resources. You can either pin the application yourself or use the built-in pin functionality.
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Description</TH>
+</TR>
+<TR>
+ <TD>-h, --help</TD>
+ <TD>Print help message.</TD>
+</TR>
+<TR>
+ <TD>-v, --version</TD>
+ <TD>Print version information.</TD>
+</TR>
+<TR>
+ <TD>-V, --verbose <level></TD>
+ <TD>Verbose output during execution for debugging. Possible values for <level>:
+ <TABLE>
+ <TR>
+ <TD>0</TD>
+ <TD>Output only errors</TD>
+ </TR>
+ <TR>
+ <TD>1</TD>
+ <TD>Output some information</TD>
+ </TR>
+ <TR>
+ <TD>2</TD>
+ <TD>Output detailed information</TD>
+ </TR>
+ <TR>
+ <TD>3</TD>
+ <TD>Output developer information</TD>
+ </TR>
+ </TABLE>
+ </TD>
+</TR>
+<TR>
+ <TD>-i, --info</TD>
+ <TD>Print \a CPUID information about processor and about Intel Performance Monitoring features.</TD>
+</TR>
+<TR>
+ <TD>-g, --group <arg></TD>
+ <TD>Specify which event string or performance group should be measured.</TD>
+</TR>
+<TR>
+ <TD>-c <arg></TD>
+ <TD>Defines the CPUs that should be measured<BR>See \ref CPU_expressions on the \ref likwid-pin page for information about the syntax.</TD>
+</TR>
+<TR>
+ <TD>-C <arg></TD>
+ <TD>Defines the CPUs that should be measured and pin the executable to the CPUs<BR>See \ref CPU_expressions on the \ref likwid-pin page for information about the syntax.</TD>
+</TR>
+<TR>
+ <TD>-H</TD>
+ <TD>Print information about a performance group given with -g, --group option.</TD>
+</TR>
+<TR>
+ <TD>-m</TD>
+ <TD>Run in marker API mode</TD>
+</TR>
+<TR>
+ <TD>-a</TD>
+ <TD>Print available performance groups for current processor.</TD>
+</TR>
+<TR>
+ <TD>-e</TD>
+ <TD>Print available counters and performance events and suitable options of current processor.</TD>
+</TR>
+<TR>
+ <TD>-E <pattern></TD>
+ <TD>Print available performance events matching <pattern> and print the usable counters for the found events.<BR>The matching is done with *<pattern>*, so all events matching the substring are returned.</TD>
+</TR>
+<TR>
+ <TD>-o, --output <file></TD>
+ <TD>Store all output to a file instead of stdout. LIKWID enables the reformatting of output files according to their suffix.<BR>You can place additional output formatters in folder <CODE><PREFIX>/share/likwid/filter</CODE>. LIKWID ships with one filter script <CODE>xml</CODE> written in Perl and a Perl template for developing your own output scripts. If the suffix is <CODE>.csv</CODE>, the internal CSV printer is used for file output.<BR>Moreover, there are substitutions possible in the [...]
+</TR>
+<TR>
+ <TD>-S <time></TD>
+ <TD>Specify the time between starting and stopping of counters. Can be used to monitor applications. Option does not require an executable<BR>Examples for <time> are 1s, 250ms, 500us.</TD>
+</TR>
+<TR>
+ <TD>-t <time></TD>
+ <TD>Activates the timeline mode that reads the counters in the given frequency <time> during the whole run of the executable<BR>Examples for <time> are 1s, 250ms, 500us.</TD>
+</TR>
+<TR>
+ <TD>-T <time></TD>
+ <TD>If multiple event sets are given on the command line, switch to the next group every <time>. Default is 2s.<BR>Examples for <time> are 1s, 250ms, 500us.<BR>If only a single event set is given, the default read frequency is 30s to catch overflows.</TD>
+</TR>
+<TR>
+ <TD>-O</TD>
+ <TD>Print output in CSV format (conform to <A HREF="https://tools.ietf.org/html/rfc4180">RFC 4180</A>). The output contains some markers that help to parse the output.</TD>
+</TR>
+<TR>
+ <TD>-s, --skip <arg></TD>
+ <TD>'arg' must be a bitmask in hex. Threads with an ID corresponding to a set bit in the bitmask will be skipped during pinning.<BR>Example: 0x1 = Thread 0 is skipped.</TD>
+</TR>
+</TABLE>
+
+<H1>Examples</H1>
+<UL>
+<LI><CODE>likwid-perfctr -C 0-2 -g TLB ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPUs 0,1,2 and measure on the specified CPUs the performance group <CODE>TLB</CODE>. If not set, the environment variable <CODE>OMP_NUM_THREADS</CODE> is set to 3.
+</LI>
+<LI><CODE>likwid-perfctr -C 0-4 -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPUs 0,1,2,3,4 and measure on the specified CPUs the event set <CODE>INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3</CODE>.<BR>The event set consists of two event definitions:
+ <UL>
+ <LI><CODE>INSTRUCTIONS_RETIRED_SSE:PMC0</CODE> measures event <CODE>INSTRUCTIONS_RETIRED_SSE</CODE> using counter register named <CODE>PMC0</CODE></LI>
+ <LI><CODE>CPU_CLOCKS_UNHALTED:PMC3</CODE> measures event <CODE>CPU_CLOCKS_UNHALTED</CODE> using counter register named <CODE>PMC3</CODE>. This event can be used to calculate the run time of the application.</LI>
+ </UL>
+</LI>
+
+<LI><CODE>likwid-perfctr -C 0 -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,UNC_L3_LINES_IN_ANY:UPMC0 ./a.out</CODE><BR>
+Run and pin executable <CODE>./a.out</CODE> on CPU 0 with a custom event set containing three events.<BR>The event set consists of three event definitions:
+ <UL>
+ <LI><CODE>INSTR_RETIRED_ANY:FIXC0</CODE> measures event <CODE>INSTR_RETIRED_ANY</CODE> using Intel's fixed-purpose counter register named <CODE>FIXC0</CODE>.</LI>
+ <LI><CODE>CPU_CLK_UNHALTED_CORE:FIXC1</CODE> measures event <CODE>CPU_CLK_UNHALTED_CORE</CODE> using Intel's fixed-purpose counter register named <CODE>FIXC1</CODE>. This event can be used to calculate the run time of the application.</LI>
+ <LI><CODE>UNC_L3_LINES_IN_ANY:UPMC0</CODE> measures event <CODE>UNC_L3_LINES_IN_ANY</CODE> using Uncore counter register named <CODE>UPMC0</CODE>. Uncore counters are socket-specific, hence LIKWID reads the counter registers only on one CPU per socket.</LI>
+ </UL>
+</LI>
+
+<LI><CODE>likwid-perfctr -m -C 0-4 -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./a.out</CODE><BR>
+Run and pin the executable to CPUs 0,1,2,3,4 and activate the Marker API. The code in <CODE>a.out</CODE> is assumed to be instrumented with LIKWID's Marker API. Only the marked code regions are measured.
+ <UL>
+ <LI><CODE>INSTRUCTIONS_RETIRED_SSE:PMC0</CODE> measures event <CODE>INSTRUCTIONS_RETIRED_SSE</CODE> using counter register named <CODE>PMC0</CODE>.</LI>
+ <LI><CODE>CPU_CLOCKS_UNHALTED:PMC3</CODE> measures event <CODE>CPU_CLOCKS_UNHALTED</CODE> using counter register named <CODE>PMC3</CODE>. This event can be used to calculate the run time of the application.</LI>
+ </UL>
+The Marker API for C/C++ offers 6 functions to measure named regions. You can use instrumented code with and without LIKWID. In order to activate the Marker API, <CODE>-DLIKWID_PERFMON</CODE> needs to be added to the compiler call. The following listing describes each function briefly (for the complete list see \ref Marker_API):
+ <UL>
+ <LI><CODE>LIKWID_MARKER_INIT</CODE>: Initialize LIKWID globally. Must be called in serial region and only once.</LI>
+ <LI><CODE>LIKWID_MARKER_THREADINIT</CODE>: Initialize LIKWID for each thread. Must be called in parallel region and executed by every thread.</LI>
+ <LI><CODE>LIKWID_MARKER_START('compute')</CODE>: Start a code region and associate it with the name 'compute'. The names are freely selectable and are used for grouping and outputting regions.</LI>
+ <LI><CODE>LIKWID_MARKER_STOP('compute')</CODE>: Stop the code region associated with the name 'compute'.</LI>
+ <LI><CODE>LIKWID_MARKER_SWITCH</CODE>: Switches to the next performance group or event set in a round-robin fashion. Can be used to measure the same region with multiple events. If called inside a code region, the results for all groups will be faulty. Be aware that each programming of the config registers causes overhead.</LI>
+ <LI><CODE>LIKWID_MARKER_CLOSE</CODE>: Finalize LIKWID globally. Should be called at the end of your application. This writes out all region results to a file that is picked up by <CODE>likwid-perfctr</CODE> for evaluation.</LI>
+ </UL>
+</LI>
+
+<LI><CODE>likwid-perfctr -c 0-3 -g FLOPS_DP -t 300ms ./a.out 2> out.txt</CODE><BR>
+Runs the executable <CODE>a.out</CODE> and measures the performance group <CODE>FLOPS_DP</CODE> on CPUs 0,1,2,3 every 300 ms. Since <CODE>-c</CODE> is used, the application is not pinned to the CPUs and <CODE>OMP_NUM_THREADS</CODE> is not set. The performance group <CODE>FLOPS_DP</CODE> is not available on every architecture; use <CODE>likwid-perfctr -a</CODE> for a complete list. Please note that <CODE>likwid-perfctr</CODE> writes the measurements to stderr while the application's outp [...]
+The syntax of the timeline mode output lines is:<BR>
+<CODE><groupID> <numberOfEvents> <numberOfThreads> <Timestamp> <Event1_Thread1> <Event1_Thread2> ... <EventN_ThreadN></CODE><BR>
+You can also use the tool \ref likwid-perfscope to print the measured values live with <CODE>gnuplot</CODE>.
+</LI>
+
+<LI><CODE>likwid-perfctr -c 0-3 -g FLOPS_DP -S 2s</CODE><BR>
+Measures the performance group <CODE>FLOPS_DP</CODE> on CPUs 0,1,2,3 for 2 seconds. This option can be used to measure an application externally or to perform low-level system monitoring.
+</LI>
+
+<LI><CODE>likwid-perfctr -c S0:0\@S1:0 -g LLC_LOOKUPS_DATA_READ:CBOX0C0:STATE=0x9 -S 2s</CODE><BR>
+Measures the event <CODE>LLC_LOOKUPS_DATA_READ</CODE> on the first CPU of socket 0 and the first CPU of socket 1 for 2 seconds using counter 0 in CBOX 0 (LLC cache coherency engine). The counting is filtered to lookups in the 'invalid' and 'modified' states only. Look at the microarchitecture's Uncore documentation for possible bitmasks. Which options are available for which counter class can be found in section \ref Architectures.
+</LI>
+</UL>
+
+\anchor performance_groups
+<H1>Performance groups</H1>
+One of the outstanding features of LIKWID is the concept of performance groups. Each microarchitecture has its own set of events and related counters, and finding the suitable events in the documentation is tedious. Moreover, the raw results of the events are often not meaningful; they need to be combined with other quantities like run time or clock speed. LIKWID addresses those problems by providing performance groups that specify a set of events and counter combinations as well as a set of derived me [...]
+<B>Please note that performance groups are a feature of the Lua API and are not available in the C/C++ API.</B>
+<H3>Directory structure</H3>
+During the installation of LIKWID, the performance groups are copied to the path <CODE>${INSTALL_PREFIX}/share/likwid</CODE>. In this folder there is one subfolder per microarchitecture that contains all performance groups for that microarchitecture. The folder names are not freely selectable; they are defined in <CODE>src/topology.c</CODE>. For every microarchitecture at the time of release, there is already a folder that can be extended with your own performance groups. You can change the p [...]
+<H3>Syntax of performance group files</H3>
+<CODE>SHORT <string></CODE> // Short description of the performance group<BR>
+<BR>
+<CODE>EVENTSET</CODE> // Starts the event set definition<BR>
+<CODE><counter>(:<options>) <event></CODE> // Each line defines one event/counter combination with optional options.<BR>
+<CODE>FIXC0 INSTR_RETIRED_ANY</CODE> // Example<BR>
+<BR>
+<CODE>METRICS</CODE> // Starts the derived metric definitions<BR>
+<CODE><metricname> <formula></CODE> // Each line defines one derived metric. <CODE><metricname></CODE> can contain spaces, <CODE><formula></CODE> must be free of spaces. The counter names (with options) and the variables <CODE>time</CODE> and <CODE>inverseClock</CODE> can be used as variables in <CODE><formula></CODE>.<BR>
+<CODE>CPI FIXC1/FIXC0</CODE> // Example<BR>
+<BR>
+<CODE>LONG</CODE> // Starts the detailed description of the performance group<BR>
+<CODE><TEXT></CODE> // <CODE><TEXT></CODE> is displayed with <CODE>-H</CODE> commandline option
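+<BR>
+As an illustration, a minimal custom group file following this syntax could look like the sketch below (the fixed-purpose counters and the CPI formula are taken from the examples above; the short and long descriptions are arbitrary). The file is placed in the subfolder of the target microarchitecture, e.g. <CODE>${INSTALL_PREFIX}/share/likwid/<ARCH>/MYCPI.txt</CODE>, where the file name without suffix typically becomes the group name:<BR>
+<CODE>
+SHORT Cycles per instruction<BR>
+<BR>
+EVENTSET<BR>
+FIXC0 INSTR_RETIRED_ANY<BR>
+FIXC1 CPU_CLK_UNHALTED_CORE<BR>
+<BR>
+METRICS<BR>
+Runtime [s] time<BR>
+CPI FIXC1/FIXC0<BR>
+<BR>
+LONG<BR>
+Cycles per instruction, computed from the fixed-purpose counters.<BR>
+</CODE>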
+
+\anchor Marker_API
+<H1>Marker API</H1>
+The Marker API enables measurement of user-defined code regions in order to get a deeper insight into what is happening at a specific point in the application. The Marker API itself has 8 commands. In order to activate the Marker API, the code must be compiled with <CODE>-DLIKWID_PERFMON</CODE>. If the code is compiled without this define, the Marker API functions perform no operation and cause no overhead. You can also run code compiled with LIKWID_PERFMON defined without measurements but a me [...]
+Even pure serial applications have to call LIKWID_MARKER_THREADINIT to initialize the accessDaemon or the direct accesses.<BR>
+The names for the regions can be freely chosen but <I>whitespaces are not allowed</I>.
+<H2>C/C++ Code</H2>
+<H3>Original code</H3>
+<CODE>
+\#include <stdlib.h><BR>
+\#include <stdio.h><BR>
+\#include <omp.h><BR>
+<BR>
+int main(int argc, char* argv[])<BR>
+{<BR>
+ int i=0;<BR>
+ double sum = 0;<BR>
+\#pragma omp parallel for reduction(+:sum)<BR>
+ for(i=0;i<100000;i++)<BR>
+ {<BR>
+ sum += 1.0/(omp_get_thread_num()+1);<BR>
+ }<BR>
+ printf("Sum is %f\n", sum);<BR>
+ return 0;<BR>
+}<BR>
+</CODE>
+<H3>Instrumented code</H3>
+<CODE>
+\#include <stdlib.h><BR>
+\#include <stdio.h><BR>
+\#include <omp.h><BR>
+\#include <likwid.h><BR>
+<BR>
+int main(int argc, char* argv[])<BR>
+{<BR>
+ int i=0;<BR>
+ double sum = 0;<BR>
+ LIKWID_MARKER_INIT;<BR>
+\#pragma omp parallel<BR>
+{<BR>
+ LIKWID_MARKER_THREADINIT;<BR>
+}<BR>
+\#pragma omp parallel<BR>
+{<BR>
+ LIKWID_MARKER_START("sum");<BR>
+\#pragma omp for reduction(+:sum)<BR>
+ for(i=0;i<100000;i++)<BR>
+ {<BR>
+ sum += 1.0/(omp_get_thread_num()+1);<BR>
+ }<BR>
+ LIKWID_MARKER_STOP("sum");<BR>
+}<BR>
+ printf("Sum is %f\n", sum);<BR>
+ LIKWID_MARKER_CLOSE;<BR>
+ return 0;<BR>
+}<BR>
+</CODE>
+The LIKWID package contains an example code: see \ref C-markerAPI-code or \ref F-markerAPI-code.
+<H3>Running code</H3>
+With the help of <CODE>likwid-perfctr</CODE> the counters are configured to the selected events. The counters are also started and stopped by <CODE>likwid-perfctr</CODE>; the Marker API only reads the counters to minimize the overhead of the instrumented application. Only if you use <CODE>LIKWID_MARKER_SWITCH</CODE> does the Marker API itself program a new event set into the registers. Basically, <CODE>likwid-perfctr</CODE> exports the whole configuration needed by the Marker API through env [...]
+In order to build your instrumented application:<BR>
+<CODE>$CC -openmp -DLIKWID_PERFMON -L<PATH_TO_LIKWID_LIBRARY> -I<PATH_TO_LIKWID_INCLUDES> <SRC_CODE> -o <EXECUTABLE> -llikwid</CODE><BR>
+With a standard installation, the paths are <CODE><PATH_TO_LIKWID_LIBRARY>=/usr/local/lib</CODE> and <CODE><PATH_TO_LIKWID_INCLUDES>=/usr/local/include</CODE><BR>
+Example Marker API call:<BR>
+<CODE>likwid-perfctr -C 0-4 -g L3 <B>-m</B> ./a.out</CODE>
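+<BR>
+For instance, with GCC and a default installation the whole sequence could look like this (illustrative only; <CODE>myprog.c</CODE> is a placeholder for your instrumented source file):<BR>
+<CODE>
+gcc -fopenmp -DLIKWID_PERFMON -I/usr/local/include -L/usr/local/lib myprog.c -o myprog -llikwid<BR>
+likwid-perfctr -C 0-4 -g L3 -m ./myprog<BR>
+</CODE>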
+<BR>
+<BR>
+
+<H2>Fortran Code</H2>
+Besides the Marker API for C/C++ programs, LIKWID offers a Fortran module to access the Marker API functions from Fortran. Only the Marker API calls are exported, not the whole API. In <CODE>config.mk</CODE> the variable <CODE>FORTRAN_INTERFACE</CODE> must be set to true. LIKWID's default is to use the Intel Fortran compiler to build the interface, but it can be modified to use GCC's Fortran compiler in <CODE>make/include_<COMPILER></CODE>.<BR>
+The LIKWID package contains an example code: see \ref F-markerAPI-code.
+
+<H2>Hints for the usage of the Marker API</H2>
+Since the calls to the LIKWID library are executed by your application, the run time will increase, and in specific circumstances there can be further problems, e.g. with the time measurement. You can execute <CODE>LIKWID_MARKER_THREADINIT</CODE> and <CODE>LIKWID_MARKER_START</CODE> inside the same parallel region, but put a barrier between the calls to ensure that there is no big timing difference between the threads (see the sketch below). The common way is to init LIKWID and the participating threads inside of an init [...]
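+<BR>
+A minimal sketch of this pattern for an OpenMP code (the region name "work" is arbitrary; error handling is omitted):<BR>
+<CODE>
+ LIKWID_MARKER_INIT;<BR>
+\#pragma omp parallel<BR>
+{<BR>
+ LIKWID_MARKER_THREADINIT;<BR>
+ /* barrier so that all threads start the region at roughly the same time */<BR>
+\#pragma omp barrier<BR>
+ LIKWID_MARKER_START("work");<BR>
+ /* ... compute kernel ... */<BR>
+ LIKWID_MARKER_STOP("work");<BR>
+}<BR>
+ LIKWID_MARKER_CLOSE;<BR>
+</CODE>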
+
+*/
diff --git a/doc/applications/likwid-perfscope.md b/doc/applications/likwid-perfscope.md
new file mode 100644
index 0000000..71c8984
--- /dev/null
+++ b/doc/applications/likwid-perfscope.md
@@ -0,0 +1,107 @@
+/*! \page likwid-perfscope <CODE>likwid-perfscope</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-perfscope</CODE> is a command line application written in Lua that uses the timeline daemon mode of \ref likwid-perfctr
+to create on-the-fly plots of the current measurements. It uses the <A HREF="https://github.com/dkogan/feedgnuplot">feedGnuplot</A> Perl script to send the current data to <A HREF="http://www.gnuplot.info/">gnuplot</A>. In order to make it more convenient for users, preconfigured plots of interesting metrics are embedded into <CODE>likwid-perfscope</CODE>. Since the plot windows are normally closed directly after the execution of the monitored applications, <CODE>likwid-perfscope</ [...]
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Description</TH>
+</TR>
+<TR>
+ <TD>-h, --help</TD>
+ <TD>Print help message.</TD>
+</TR>
+<TR>
+ <TD>-v, --version</TD>
+ <TD>Print version information.</TD>
+</TR>
+<TR>
+ <TD>-a, --all</TD>
+ <TD>Print available predefined plot configurations for current processor.</TD>
+</TR>
+<TR>
+ <TD>-d, --dump</TD>
+ <TD>Print measurements to stdout.</TD>
+</TR>
+<TR>
+ <TD>-p, --plotdump</TD>
+ <TD>Use feedGnuplot's feature to dump the plot configuration and its data to stdout.</TD>
+</TR>
+<TR>
+ <TD>-c <arg></TD>
+ <TD>Defines the CPUs that should be measured<BR>See \ref CPU_expressions on the \ref likwid-pin page for information about the syntax.</TD>
+</TR>
+<TR>
+ <TD>-C <arg></TD>
+ <TD>Defines the CPUs that should be measured and pins the executable to these CPUs<BR>See \ref CPU_expressions on the \ref likwid-pin page for information about the syntax.</TD>
+</TR>
+<TR>
+ <TD>-t, --time <time></TD>
+ <TD>Specify the measurement time for each plot. <time> is handed over to \ref likwid-perfctr with the -t option.<BR>Examples for <time> are 1s, 250ms, 500us.</TD>
+</TR>
+<TR>
+ <TD>-g, --group <arg></TD>
+ <TD>Specify a predefined plot with optional changes or an eventset with plot configuration. See \ref plot_configuration for details.</TD>
+</TR>
+<TR>
+ <TD>-r, --range <arg></TD>
+ <TD>Specify the number of data points that should be visible in the plots, often referred to as a sliding window.</TD>
+</TR>
+<TR>
+ <TD>--host <arg></TD>
+ <TD>Connect to <arg> via ssh and execute likwid-perfctr and the application there. The plots are created on the local machine. Often used when measuring on hosts without X11 or gnuplot.</TD>
+</TR>
+</TABLE>
+
+\anchor plot_configuration
+<H1>Plot configurations</H1>
+<CODE>likwid-perfscope</CODE> extends the format of the eventset option of \ref likwid-perfctr to make it more convenient for the users. It accepts either a plot configuration of interesting metrics which are embedded into <CODE>likwid-perfscope</CODE> or a custom eventset suitable for \ref likwid-perfctr extended by the plot configuration. A plot configuration can be set with key=value pairs separated by ':' and has to contain at least a definition of a formula for plotting. If specified [...]
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Description</TH>
+</TR>
+<TR>
+ <TD>title=<string><BR>TITLE=<string></TD>
+ <TD>Use <string> as title for the plot. The string has to be quoted if it contains spaces. No ':' are allowed in the string</TD>
+</TR>
+<TR>
+ <TD>xtitle=<string><BR>XTITLE=<string></TD>
+ <TD>Use <string> as label for the x-axis. The default label is 'Time'. The string has to be quoted if it contains spaces. No ':' are allowed in the string</TD>
+</TR>
+<TR>
+ <TD>ytitle=<string><BR>YTITLE=<string></TD>
+ <TD>Use <string> as label for the left y-axis. The string has to be quoted if it contains spaces. No ':' are allowed in the string</TD>
+</TR>
+<TR>
+ <TD><string>=<string></TD>
+ <TD>Use the first <string> as legend entry and the second <string> as input formula for the plot. The result is plotted over the run time. The names of the specified counters can be used as variables in the formula. Additional variables are 'time' for the measurement time and 'inverseClock' for the inverted clock frequency. No spaces are allowed in the formula.</TD>
+</TR>
+<TR>
+ <TD>y2title=<string><BR>Y2TITLE=<string><BR>y2title=<id-string><BR>Y2TITLE=<id-string></TD>
+ <TD>Use <string> as label for the right y-axis. If <id-string> is given, the formula with id is associated with the y2-axis. If used with predefined plot configurations, be aware that the formula 1 is part of the plot configuration. If no id is given, the y2-axis is associated with the last given formula. The string has to be quoted if it contains spaces. No ':' are allowed in the string</TD>
+</TR>
+</TABLE>
+
+<H1>Examples</H1>
+<UL>
+<LI><CODE>likwid-perfscope -g L3_BAND -C 0-2 -t 1s ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPUs 0,1,2 and use the predefined plot configuration <CODE>L3_BAND</CODE>. The plot is updated every second.
+</LI>
+<LI><CODE>likwid-perfscope -g L3_BAND:TITLE="My Title" -C S0:1 -t 500ms ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPU 1 on Socket 0 and use the predefined plot configuration <CODE>L3_BAND</CODE> but change the title for the plot to "My Title".
+</LI>
+<LI><CODE>likwid-perfscope -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,CPI=FIXC0/FIXC1:YTITLE="CPI" -C 0 --time 2s ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPU 0 and use the custom event set <CODE>INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1</CODE>. The last event set entry specifies custom plot options. The plotted formula is <CODE>FIXC0/FIXC1</CODE>, and the y-axis label and legend entry are set to 'CPI'.
+</LI>
+<LI><CODE>likwid-perfscope -g L3_BAND,CPI=FIXC0/FIXC1:Y2TITLE="2-Cycles per Instruction" -C 0 --time 2s ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPU 0 and use the predefined plot configuration <CODE>L3_BAND</CODE> measuring every 2 seconds. Additionally, a formula <CODE>FIXC0/FIXC1</CODE> with the name <CODE>CPI</CODE> is given. The right y-axis is associated with the given formula and labeled with <CODE>Cycles per Instruction</CODE>. The formula ID 2 is not needed in this case as the default behavior is to associate the right y-axis with the last formula given.
+</LI>
+</UL>
+
+*/
diff --git a/doc/applications/likwid-pin.md b/doc/applications/likwid-pin.md
new file mode 100644
index 0000000..b8c8a1e
--- /dev/null
+++ b/doc/applications/likwid-pin.md
@@ -0,0 +1,170 @@
+/*! \page likwid-pin <CODE>likwid-pin</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-pin</CODE> is a command line application to pin a sequential or multithreaded application to dedicated processors. It can be used as a replacement for taskset.
+In contrast to taskset, no affinity mask but single processors are specified. For multithreaded applications based on the <A HREF="https://computing.llnl.gov/tutorials/pthreads/"><CODE>pthreads</CODE></A> library the <CODE>pthread_create</CODE> library call is overloaded through <CODE>LD_PRELOAD</CODE> and each created thread is pinned to a dedicated processor as specified in the pinning list. Per default every generated thread is pinned to the core in the order of calls to <CODE>pthread_cre [...]
+<BR>
+For OpenMP implementations, GCC and ICC compilers are explicitly supported. Clang's OpenMP backend should also work as it is built on top of Intel's OpenMP runtime library. Others may also work.<BR>
+<BR>
+<CODE>likwid-pin</CODE> sets the environment variable <CODE>OMP_NUM_THREADS</CODE> for you if not already present. It sets as many threads as there are CPUs in the pin expression. Be aware that with <A HREF="https://computing.llnl.gov/tutorials/pthreads/"><CODE>pthreads</CODE></A> the parent thread is always pinned. If you create, for example, 4 threads with <CODE>pthread_create</CODE> and do not use the parent process as a worker, you still have to provide <CODE>num_threads + 1</CODE> processo [...]
+<BR>
+<CODE>likwid-pin</CODE> supports different numberings for pinning. Per default, the physical numbering of the cores is used; this is the numbering \ref likwid-topology also reports. Logical numbering inside the node or the sockets can be used as well. For details look at \ref CPU_expressions. <!--If using with a N (e.g. -c N:0-6) the cores are logical numbered over the whole node. Physical cores come first. If a system e.g. has 8 cores with 16 SMT threads with -c N:0-7 you get all physical c [...]
+
+For applications where the first-touch policy on NUMA systems cannot be employed, <CODE>likwid-pin</CODE> can be used to turn on interleaved memory placement. This can significantly speed up the performance of memory-bound multi-threaded codes. All NUMA nodes the user pinned threads to are used for interleaving. An illustrative invocation is shown below.
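+<BR>
+For example (an illustrative invocation; <CODE>./a.out</CODE> stands for your multithreaded application):<BR>
+<CODE>likwid-pin -i -c 0-3 ./a.out</CODE><BR>
+pins the threads to CPUs 0,1,2,3 and enables interleaved memory placement over the NUMA domains these CPUs belong to.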
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Description</TH>
+</TR>
+<TR>
+ <TD>-h, --help</TD>
+ <TD>Print help message</TD>
+</TR>
+<TR>
+ <TD>-v, --version</TD>
+ <TD>Print version information</TD>
+</TR>
+<TR>
+ <TD>-V, --verbose <level></TD>
+ <TD>Verbose output during execution for debugging. Possible values for <level>:
+ <TABLE>
+ <TR>
+ <TD>0</TD>
+ <TD>Output only errors</TD>
+ </TR>
+ <TR>
+ <TD>1</TD>
+ <TD>Output some information</TD>
+ </TR>
+ <TR>
+ <TD>2</TD>
+ <TD>Output detailed information</TD>
+ </TR>
+ <TR>
+ <TD>3</TD>
+ <TD>Output developer information</TD>
+ </TR>
+ </TABLE>
+ </TD>
+</TR>
+<TR>
+ <TD>-c <arg></TD>
+ <TD>Define the CPUs that the application should be pinned on. LIKWID provides an intuitive and feature-rich syntax for CPU expressions.<BR>See section \ref CPU_expressions for details.</TD>
+</TR>
+<TR>
+ <TD>-S, --sweep</TD>
+ <TD>Sweep memory and clean LLC of NUMA domains used by the given CPU expression</TD>
+</TR>
+<TR>
+ <TD>-i</TD>
+ <TD>Activate interleaved memory policy for NUMA domains used by the given CPU expression</TD>
+</TR>
+<TR>
+ <TD>-p</TD>
+ <TD>Print the thread affinity domains. If -c is set on the command line, only the given CPUs are printed for each affinity domain.</TD>
+</TR>
+<TR>
+ <TD>-q, --quiet</TD>
+ <TD>Do not print information about the pinning process</TD>
+</TR>
+<TR>
+ <TD>-s, --skip <arg></TD>
+ <TD>'arg' must be a bitmask in hex. Threads whose ID corresponds to a set bit in the bitmask are skipped during pinning<BR>Example: 0x1 = Thread 0 is skipped.</TD>
+</TR>
+<TR>
+ <TD>-d</TD>
+ <TD>Set the delimiter for the output of -p. Default is ','</TD>
+</TR>
+</TABLE>
+
+\anchor thread_affinity_domains
+<H1>Affinity Domains</H1>
+While gathering the system topology, LIKWID groups the CPUs into so-called thread affinity domains. A thread affinity domain is a group of CPU IDs that are related to some kind of central entity of the system. The most common domain is the node domain (<CODE>N</CODE>) that contains all CPUs available in the system. Other domains group the CPUs according to socket, LLC or NUMA node relation. <CODE>likwid-pin</CODE> prints out all available affinity domains with the commandline option <COD [...]
+<TABLE>
+<TR>
+ <TH>Domain name</TH>
+ <TH>Description</TH>
+</TR>
+<TR>
+ <TD><CODE>N</CODE></TD>
+ <TD>Includes all CPUs in the system</TD>
+</TR>
+<TR>
+ <TD><CODE>S<number></CODE></TD>
+ <TD>Includes all CPUs that reside on CPU socket <CODE><number></CODE></TD>
+</TR>
+<TR>
+ <TD><CODE>C<number></CODE></TD>
+ <TD>Includes all CPUs that share the same LLC with ID <CODE><number></CODE>.<BR>This domain often contains the same CPUs as the <CODE>S<number></CODE> domain because many CPU sockets have an LLC shared by all CPUs of the socket</TD>
+</TR>
+<TR>
+ <TD><CODE>M<number></CODE></TD>
+ <TD>Includes all CPUs that are attached to NUMA memory domain <CODE><number></CODE></TD>
+</TR>
+</TABLE>
+
+\anchor CPU_expressions
+<H1>CPU expressions</H1>
+One outstanding feature of LIKWID is the CPU expressions, which are resolved to the CPUs of the actual system. There are multiple formats to choose from, each offering a convenient way to select the desired CPUs for execution or measurement. The CPU expressions are used by <CODE>likwid-pin</CODE> as well as \ref likwid-perfctr. This section introduces the 4 formats and gives examples.
+
+<H3>Physical numbering:</H3>
+The first and probably most natural way of defining a list of CPUs is the usage of the physical numbering, similar to the numbering of the operating system and the IDs printed by \ref likwid-topology. The desired CPU IDs can be set as a comma-separated list, as a range, or as a combination of both.
+<UL>
+<LI><CODE>-c 1</CODE><BR>
+Run only on CPU with ID 1
+</LI>
+<LI><CODE>-c 1,4</CODE><BR>
+Run on CPUs with ID 1 and 4
+</LI>
+<LI><CODE>-c 1-3</CODE><BR>
+Run on CPUs ranging from ID 1 to ID 3, hence CPUs 1,2,3
+</LI>
+<LI><CODE>-c 0,1-3</CODE><BR>
+Run on the CPU with ID 0 and the CPU range from ID 1 to ID 3, hence CPUs 0,1,2,3
+</LI>
+</UL>
+<H3>Logical numbering:</H3>
+Besides the enumeration of physical CPU IDs, LIKWID supports logical numbering inside an affinity domain. For logical selection, the indices inside the desired affinity domain have to be given on the command line. The logical numbering can be selected by prefixing the CPU expression with <CODE>L:</CODE>. The format is <CODE>L:<indices></CODE> assuming affinity domain <CODE>N</CODE> or <CODE>L:<affinity domain>:<indices></CODE>. Moreover, it is automatically act [...]
+<UL>
+<LI><CODE>-c L:0</CODE><BR>
+Run only on CPU 0, the first entry in the <B>sorted</B> affinity domain <CODE>N</CODE>
+</LI>
+<LI><CODE>-c L:0,4</CODE><BR>
+Run on the first and fifth entry in the <B>sorted</B> affinity domain <CODE>N</CODE>
+</LI>
+<LI><CODE>-c L:1-3</CODE><BR>
+Run on CPUs ranging from index 1 to index 3 in the <B>sorted</B> affinity domain <CODE>N</CODE>, hence CPUs 1,2,3.
+</LI>
+<LI><CODE>-c L:N:1,4-6</CODE><BR>
+Run on the CPU with index 1 and the range of indices from 4 to 6 in the <B>sorted</B> affinity domain <CODE>N</CODE>, hence CPUs 1,4,5,6.
+</LI>
+</UL>
+<H3>Numbering by expression:</H3>
+The most powerful format is probably the expression format. It combines the input values for a selection function in a convenient way. In order to activate the expression format, the CPU string must be prefixed with <CODE>E:</CODE>. The basic format is <CODE>E:<affinity domain>:<numberOfThreads></CODE>, which simply selects the first <CODE><numberOfThreads></CODE> CPUs in the supplied <CODE><affinity domain></CODE>. The extended format is <CODE>E:<affinity do [...]
+<UL>
+<LI><CODE>-c E:N:1</CODE><BR>
+Selects the first entry in the node affinity domain, thus CPU 0
+</LI>
+<LI><CODE>-c E:N:2</CODE><BR>
+Selects the first two entries in the node affinity domain, thus CPUs 0 and 4
+</LI>
+<LI><CODE>-c E:N:2:1:2</CODE><BR>
+Selects in total 2 CPUs, 1 in a row with a stride of 2, thus CPUs 0 and 1
+</LI>
+<LI><CODE>-c E:N:4:2:4</CODE><BR>
+Selects in total 4 CPUs, 2 in a row with a stride of 4, thus CPUs 0,4,2,6
+</LI>
+</UL>
+<H3>Scatter expression:</H3>
+The scatter expression distributes the threads evenly over the desired affinity domains. In contrast to the previous selection methods, the scatter expression schedules threads over multiple affinity domains. Although you can also select <CODE>N</CODE> as scatter domain, the intended domains are <CODE>S</CODE>, <CODE>C</CODE> and <CODE>M</CODE>. The scattering selects physical cores first. For the examples we assume that the socket affinity domain looks like this: <CODE>S0 = 0,4,1,5</COD [...]
+<UL>
+<LI><CODE>-c S:scatter</CODE><BR>
+The resulting CPU list is 0,2,1,3,4,6,5,7
+</LI>
+<LI><CODE>-c M:scatter</CODE><BR>
+Scatter the threads evenly over all NUMA memory domains. A kind of interleaved thread policy.
+</LI>
+</UL>
+*/
diff --git a/doc/applications/likwid-powermeter.md b/doc/applications/likwid-powermeter.md
new file mode 100644
index 0000000..489689d
--- /dev/null
+++ b/doc/applications/likwid-powermeter.md
@@ -0,0 +1,75 @@
+/*! \page likwid-powermeter <CODE>likwid-powermeter</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-powermeter</CODE> is a command line application to measure the energy consumption on Intel RAPL capable processors. Currently
+all Intel CPUs starting with Intel SandyBridge are supported. It also prints information about the TDP and the supported Turbo Mode steps.
+The Turbo Mode information works on all Turbo Mode enabled Intel processors. The tool can either be used in stethoscope mode for a specified duration or as a wrapper to your application, measuring the complete run. RAPL works on a per-package (socket) basis.
+Please note that the RAPL counters are also accessible as normal events within \ref likwid-perfctr.
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Description</TH>
+</TR>
+<TR>
+ <TD>-h, --help</TD>
+ <TD>Print help message</TD>
+</TR>
+<TR>
+ <TD>-v, --version</TD>
+ <TD>Print version information</TD>
+</TR>
+<TR>
+ <TD>-V, --verbose <level></TD>
+ <TD>Verbose output during execution for debugging. Possible values for <level>:
+ <TABLE>
+ <TR>
+ <TD>0</TD>
+ <TD>Output only errors</TD>
+ </TR>
+ <TR>
+ <TD>1</TD>
+ <TD>Output some information</TD>
+ </TR>
+ <TR>
+ <TD>2</TD>
+ <TD>Output detailed information</TD>
+ </TR>
+ <TR>
+ <TD>3</TD>
+ <TD>Output developer information</TD>
+ </TR>
+ </TABLE>
+ </TD>
+</TR>
+<TR>
+ <TD>-c <arg></TD>
+ <TD>Specify sockets to measure</TD>
+</TR>
+<TR>
+ <TD>-M <0|1></TD>
+ <TD>Set access mode to access MSRs. 0=direct, 1=accessDaemon</TD>
+</TR>
+<TR>
+ <TD>-s <time></TD>
+ <TD>Set the measurement duration in us, ms or s (default 2s)</TD>
+</TR>
+<TR>
+ <TD>-i, --info</TD>
+ <TD>Print information from <CODE>MSR_*_POWER_INFO</CODE> register and Turbo mode</TD>
+</TR>
+<TR>
+ <TD>-t</TD>
+ <TD>Print current temperatures of all CPU cores</TD>
+</TR>
+<TR>
+ <TD>-f</TD>
+ <TD>Print current temperatures of all CPU cores in Fahrenheit</TD>
+</TR>
+<TR>
+ <TD>-p</TD>
+ <TD>Print dynamic clocking and CPI values, uses \ref likwid-perfctr</TD>
+</TR>
+</TABLE>
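+
+<H1>Examples</H1>
+The following invocations are illustrative sketches based on the options above:
+<UL>
+<LI><CODE>likwid-powermeter -s 4s</CODE><BR>
+Measure the energy consumption for 4 seconds in stethoscope mode.
+</LI>
+<LI><CODE>likwid-powermeter ./a.out</CODE><BR>
+Use <CODE>likwid-powermeter</CODE> as a wrapper around <CODE>./a.out</CODE> and report the energy consumed during the complete run.
+</LI>
+</UL>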
+*/
diff --git a/doc/applications/likwid-setFreq.md b/doc/applications/likwid-setFreq.md
new file mode 100644
index 0000000..0db59e6
--- /dev/null
+++ b/doc/applications/likwid-setFreq.md
@@ -0,0 +1,13 @@
+/*! \page likwid-setFreq <CODE>likwid-setFreq</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-setFreq</CODE> is a command line application that mediates the actual setting of the CPU cores' frequency and governor for \ref likwid-setFrequencies. Since only users with root privileges are allowed to change the frequency of CPU cores, <CODE>likwid-setFreq</CODE> needs to be suid-root.
+
+<H1>Setup</H1>
+Setting the suid-root bit:<BR>
+<CODE>
+root: # chown root:root likwid-setFreq<BR>
+root: # chmod u+s likwid-setFreq
+</CODE>
+
+*/
diff --git a/doc/applications/likwid-setFrequencies.md b/doc/applications/likwid-setFrequencies.md
new file mode 100644
index 0000000..e753a9e
--- /dev/null
+++ b/doc/applications/likwid-setFrequencies.md
@@ -0,0 +1,50 @@
+/*! \page likwid-setFrequencies <CODE>likwid-setFrequencies</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-setFrequencies</CODE> is a command line application to set the clock frequency of CPU cores. Since only privileged users are allowed to change the frequency of CPU cores, the application works in combination with the daemon
+\ref likwid-setFreq. The daemon needs the suid permission bit to be set in order to manipulate the sysfs entries. With <CODE>likwid-setFrequencies</CODE> the clock of all cores inside the given cpu_list or affinity domain can be set to a specific frequency or governor at once.
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Description</TH>
+</TR>
+<TR>
+ <TD>-h, --help</TD>
+ <TD>Print help message.</TD>
+</TR>
+<TR>
+ <TD>-v, --version</TD>
+ <TD>Print version information.</TD>
+</TR>
+<TR>
+ <TD>-l</TD>
+ <TD>Print all configurable frequencies.</TD>
+</TR>
+<TR>
+ <TD>-p</TD>
+ <TD>Print the current frequencies for all CPU cores.</TD>
+</TR>
+<TR>
+ <TD>-m</TD>
+ <TD>Print all configurable governors.</TD>
+</TR>
+<TR>
+ <TD>-c <arg></TD>
+ <TD>Define the CPUs that should be modified. For information about the syntax see \ref CPU_expressions on the \ref likwid-pin page.</TD>
+</TR>
+<TR>
+ <TD>-f, --freq <arg></TD>
+ <TD>Specify the frequency for the selected CPUs.</TD>
+</TR>
+<TR>
+ <TD>-g <arg></TD>
+ <TD>Specify the governor for the selected CPUs.</TD>
+</TR>
+</TABLE>
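+
+<H1>Examples</H1>
+Illustrative invocations (the available frequencies and governors depend on the system, see the -l and -m options):
+<UL>
+<LI><CODE>likwid-setFrequencies -l</CODE><BR>
+List the frequencies that can be configured on the current system.
+</LI>
+<LI><CODE>likwid-setFrequencies -c 0-3 -g performance</CODE><BR>
+Set the 'performance' governor for CPUs 0,1,2,3, assuming the kernel offers this governor.
+</LI>
+</UL>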
+
+<H1>Notice</H1>
+Shortly before releasing the first version of LIKWID 4, the CPU frequency module and its behavior changed compared to the previous <B>cpufreq</B> module. It is not possible anymore to set the CPU clock to a fixed frequency; you can only define a performance level called a P-State. Inside that level, the CPU can vary its clock frequency. <CODE>likwid-setFrequencies</CODE> and its daemon \ref likwid-setFreq do not have support for the new kernel module <B>intel_pstate</B>. Therefore, th [...]
+
+*/
diff --git a/doc/applications/likwid-topology.md b/doc/applications/likwid-topology.md
new file mode 100644
index 0000000..f57a045
--- /dev/null
+++ b/doc/applications/likwid-topology.md
@@ -0,0 +1,68 @@
+/*! \page likwid-topology <CODE>likwid-topology</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-topology</CODE> is a command line application to print the thread and cache topology on multicore x86 processors. Used with monospaced fonts it can
+draw the processor topology of a machine in ASCII art. Beyond the topology, <CODE>likwid-topology</CODE> determines the nominal clock of a processor and prints detailed information about the cache hierarchy.<BR>
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Description</TH>
+</TR>
+<TR>
+ <TD>-h, --help</TD>
+ <TD>Print help message</TD>
+</TR>
+<TR>
+ <TD>-v, --version</TD>
+ <TD>Print version information</TD>
+</TR>
+<TR>
+ <TD>-V, --verbose <level></TD>
+ <TD>Verbose output during execution for debugging. Possible values for <level>:
+ <TABLE>
+ <TR>
+ <TD>0</TD>
+ <TD>Output only errors</TD>
+ </TR>
+ <TR>
+ <TD>1</TD>
+ <TD>Output some information</TD>
+ </TR>
+ <TR>
+ <TD>2</TD>
+ <TD>Output detailed information</TD>
+ </TR>
+ <TR>
+ <TD>3</TD>
+ <TD>Output developer information</TD>
+ </TR>
+ </TABLE>
+ </TD>
+</TR>
+<TR>
+ <TD>-c, --caches</TD>
+ <TD>Print detailed information about all cache levels</TD>
+</TR>
+<TR>
+ <TD>-C, --clock</TD>
+ <TD>Measure the nominal clock frequency and print it</TD>
+</TR>
+<TR>
+ <TD>-g</TD>
+ <TD>ASCII art output of the system's topology</TD>
+</TR>
+<TR>
+ <TD>-O</TD>
+ <TD>Print output in CSV format (conform to <A HREF="https://tools.ietf.org/html/rfc4180">RFC 4180</A>).</TD>
+</TR>
+<TR>
+ <TD>-o, --output <file></TD>
+ <TD>Write the output to file <file> instead of stdout. According to the filename suffix, LIKWID tries to reformat the output to the specified format.<BR>Currently, LIKWID ships with one filter script <CODE>xml</CODE> written in Perl and a Perl template for developing your own output scripts. If the suffix is <CODE>.csv</CODE>, the internal CSV printer is used for file output.<BR>If <CODE>\%h</CODE> is in the filename, it is replaced by the host name.</TD>
+</TR>
+</TABLE>
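+
+<H1>Examples</H1>
+Illustrative invocations (the output depends on the actual machine):
+<UL>
+<LI><CODE>likwid-topology -g -c</CODE><BR>
+Print the thread and cache topology including an ASCII art drawing of the machine and detailed cache information.
+</LI>
+<LI><CODE>likwid-topology -o topo.csv</CODE><BR>
+Write the output to the file <CODE>topo.csv</CODE>; the <CODE>.csv</CODE> suffix selects the internal CSV printer.
+</LI>
+</UL>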
+
+
+
+*/
diff --git a/doc/archs/atom.md b/doc/archs/atom.md
new file mode 100644
index 0000000..58a506c
--- /dev/null
+++ b/doc/archs/atom.md
@@ -0,0 +1,104 @@
+/*! \page atom Intel® Atom
+
+<P>The Intel® Atom performance monitoring counters are identical to those of the Intel® Core 2 microarchitecture, but the event set is different.</P>
+<H1>Available performance monitors for the Intel® Atom microarchitecture</H1>
+<UL>
+<LI>\ref ATOM_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref ATOM_PMC "General-purpose counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor ATOM_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>The Core2/Atom microarchitecture is the first architecture offering a set of fixed-purpose counters. Each counter can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>FIXC0</TD>
+ <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+ <TD>FIXC1</TD>
+ <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+ <TD>FIXC2</TD>
+ <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit (index*4) in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor ATOM_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Atom microarchitecture provides 2 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+
+
+
+*/
diff --git a/doc/archs/broadwell.md b/doc/archs/broadwell.md
new file mode 100644
index 0000000..ff207af
--- /dev/null
+++ b/doc/archs/broadwell.md
@@ -0,0 +1,203 @@
+/*! \page broadwell Intel® Broadwell
+
+<P>This page is valid for Broadwell, Broadwell single socket server (Xeon D) and Broadwell EP/EN/EX. There is no Uncore support for now; no documentation is available for the Uncore counters of Broadwell.</P>
+
+<H1>Available performance monitors for the Intel® Broadwell microarchitecture</H1>
+<UL>
+<LI>\ref BRD_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref BRD_PMC "General-purpose counters"</LI>
+<LI>\ref BRD_THERMAL "Thermal counters"</LI>
+<LI>\ref BRD_POWER "Power measurement counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor BRD_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>FIXC0</TD>
+ <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+ <TD>FIXC1</TD>
+ <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+ <TD>FIXC2</TD>
+ <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 2+(index*4) in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit (index*4) in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor BRD_PMC
+<H2>General-purpose counters</H2>
+<P>Commonly the Intel® Broadwell microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 21 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>in_transaction</TD>
+ <TD>N</TD>
+ <TD>Set bit 32 in config register</TD>
+ <TD>Only available if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+<TR>
+ <TD>in_transaction_aborted</TD>
+ <TD>N</TD>
+ <TD>Set bit 33 in config register</TD>
+ <TD>Only counter PMC2 and only if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Broadwell microarchitecture provides measuring of offcore events in the PMC counters. To this end, the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Broadwell microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although many bitmasks are possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with the [...]
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>16 bit hex value</TD>
+ <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/BDW">https://download.01.org/perfmon/BDW</A>.</TD>
+</TR>
+<TR>
+ <TD>match1</TD>
+ <TD>22 bit hex value</TD>
+ <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/BDW">https://download.01.org/perfmon/BDW</A>.</TD>
+</TR>
+</TABLE>
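+<P>As an illustration only: such a filtered offcore measurement could be requested on the command line in the same way as the counter options shown for \ref likwid-perfctr, e.g. with an event/counter/option string such as <CODE>OFFCORE_RESPONSE_0_OPTIONS:PMC0:MATCH0=<16 bit hex>:MATCH1=<22 bit hex></CODE>. The event name and the placeholder mask values are an assumed example; check the event list of your LIKWID installation and the referenced Intel® documentation for the valid names and bitmasks.</P>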
+<P>The event MEM_TRANS_RETIRED_LOAD_LATENCY is not available because it needs programming of PEBS registers. PEBS is a kernel-level measurement facility for performance monitoring. Although we can program it from user-space, the results are always 0.</P>
+
+\anchor BRD_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® Broadwell microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>TMP0</TD>
+ <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor BRD_POWER
+<H2>Power counter</H2>
+<P>The Intel® Broadwell microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PWR0</TD>
+ <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR1</TD>
+ <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR2</TD>
+ <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR3</TD>
+ <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/core2.md b/doc/archs/core2.md
new file mode 100644
index 0000000..679da04
--- /dev/null
+++ b/doc/archs/core2.md
@@ -0,0 +1,103 @@
+/*! \page core2 Intel® Core2
+
+<H1>Available performance monitors for the Intel® Core2 microarchitecture</H1>
+<UL>
+<LI>\ref FIXED "Fixed-purpose counters"</LI>
+<LI>\ref PMC "General-purpose counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor FIXED
+<H2>Fixed-purpose counters</H2>
+<P>The Intel Core2 microarchitecture is the first architecture offering a set of fixed-purpose counters. Each counter can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>FIXC0</TD>
+ <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+ <TD>FIXC1</TD>
+ <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+ <TD>FIXC2</TD>
+ <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit (index*4) in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Core2 microarchitecture provides 2 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+
+
+
+*/
diff --git a/doc/archs/haswell.md b/doc/archs/haswell.md
new file mode 100644
index 0000000..65836bd
--- /dev/null
+++ b/doc/archs/haswell.md
@@ -0,0 +1,203 @@
+/*! \page haswell Intel® Haswell
+
+<H1>Available performance monitors for the Intel® Haswell microarchitecture</H1>
+<UL>
+<LI>\ref HAS_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref HAS_PMC "General-purpose counters"</LI>
+<LI>\ref HAS_THERMAL "Thermal counters"</LI>
+<LI>\ref HAS_POWER "Power measurement counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor HAS_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>FIXC0</TD>
+ <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+ <TD>FIXC1</TD>
+ <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+ <TD>FIXC2</TD>
+ <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 2+(index*4) in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit (index*4) in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HAS_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Haswell microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 21 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>in_transaction</TD>
+ <TD>N</TD>
+ <TD>Set bit 32 in config register</TD>
+ <TD>Only available if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+<TR>
+ <TD>in_transaction_aborted</TD>
+ <TD>N</TD>
+ <TD>Set bit 33 in config register</TD>
+ <TD>Only counter PMC2 and only if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Haswell microarchitecture provides measuring of offcore events in the PMC counters. To this end, the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Haswell microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although many bitmasks are possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with the OFF [...]
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>16 bit hex value</TD>
+ <TD>Input value masked with 0x8077 and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/HSW">https://download.01.org/perfmon/HSW</A>.</TD>
+</TR>
+<TR>
+ <TD>match1</TD>
+ <TD>22 bit hex value</TD>
+ <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/HSW">https://download.01.org/perfmon/HSW</A>.</TD>
+</TR>
+</TABLE>
+<P>The event MEM_TRANS_RETIRED_LOAD_LATENCY is not available because it needs programming of PEBS registers. PEBS is a kernel-level measurement facility. Although we can program it from user-space, the results are always 0.</P>
+
+\anchor HAS_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® Haswell microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>TMP0</TD>
+ <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor HAS_POWER
+<H2>Power counter</H2>
+<P>The Intel® Haswell microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PWR0</TD>
+ <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR1</TD>
+ <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR2</TD>
+ <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR3</TD>
+ <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+
+*/
+
+
diff --git a/doc/archs/haswellep.md b/doc/archs/haswellep.md
new file mode 100644
index 0000000..9368c54
--- /dev/null
+++ b/doc/archs/haswellep.md
@@ -0,0 +1,896 @@
+/*! \page haswellep Intel® Haswell EP/EN/EX
+
+
+<H1>Available performance monitors for the Intel® Haswell EP/EN/EX microarchitecture</H1>
+<UL>
+<LI>\ref HASEP_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref HASEP_PMC "General-purpose counters"</LI>
+<LI>\ref HASEP_THERMAL "Thermal counters"</LI>
+<LI>\ref HASEP_POWER "Power measurement counters"</LI>
+<LI>\ref HASEP_BBOX "Home Agent counters"</LI>
+<LI>\ref HASEP_SBOX "Ring transfer counters"</LI>
+<LI>\ref HASEP_QBOX "Intel® QPI Link Layer counters"</LI>
+<LI>\ref HASEP_CBOX "Last Level cache counters"</LI>
+<LI>\ref HASEP_UBOX "Uncore management counters"</LI>
+<LI>\ref HASEP_WBOX "Power control unit counters"</LI>
+<LI>\ref HASEP_IBOX "Coherency for IIO traffic counters"</LI>
+<LI>\ref HASEP_MBOX "Integrated memory controller counters"</LI>
+<LI>\ref HASEP_RBOX "Ring-to-QPI interface counters"</LI>
+<LI>\ref HASEP_PBOX "Ring-to-PCIe interface counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor HASEP_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>FIXC0</TD>
+ <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+ <TD>FIXC1</TD>
+ <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+ <TD>FIXC2</TD>
+ <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 2+(index*4) in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit (index*4) in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 21 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>in_transaction</TD>
+ <TD>N</TD>
+ <TD>Set bit 32 in config register</TD>
+ <TD>Only available if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+<TR>
+ <TD>in_transaction_aborted</TD>
+ <TD>N</TD>
+ <TD>Set bit 33 in config register</TD>
+ <TD>Only counter PMC2 and only if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measuring of offcore events in the PMC counters. To this end, the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Haswell EP/EN/EX microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although many bitmasks are possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied wit [...]
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>16 bit hex value</TD>
+ <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/HSX">https://download.01.org/perfmon/HSX</A>.</TD>
+</TR>
+<TR>
+ <TD>match1</TD>
+ <TD>22 bit hex value</TD>
+ <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/HSX">https://download.01.org/perfmon/HSX</A>.</TD>
+</TR>
+</TABLE>
+<P>The event MEM_TRANS_RETIRED_LOAD_LATENCY is not available because it needs programming of PEBS registers. PEBS is a kernel-level measurement facility. Although we can program it from user-space, the results are always 0.</P>
+
+\anchor HASEP_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>TMP0</TD>
+ <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor HASEP_POWER
+<H2>Power counter</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PWR0</TD>
+ <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR1</TD>
+ <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR2</TD>
+ <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR3</TD>
+ <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+
+
+\anchor HASEP_BBOX
+<H2>Home Agent counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the Home Agent (HA) in the Uncore. The description from Intel®:<BR>
+<I>Each HA is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the HA is responsible for ordering memory reads/writes, coming in from the modular Ring, to a given address such that the IMC (memory controller).
+</I><BR>
+The Home Agent performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the HA. For systems where each socket has 12 or more cores, both HAs are available. The name BBOX originates from the Nehalem EX Uncore monitoring where this functional unit is called BBOX.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>BBOX<0,1>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>BBOX<0,1>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>BBOX<0,1>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>BBOX<0,1>C3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>opcode</TD>
+ <TD>6 bit hex value</TD>
+ <TD>Set bits 0-5 in PCI_UNC_HA_PMON_OPCODEMATCH register of PCI device</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>46 bit hex address</TD>
+ <TD>Extract bits 6-31 and set bits 6-31 in PCI_UNC_HA_PMON_ADDRMATCH0 register of PCI device<BR>Extract bits 32-45 and set bits 0-13 in PCI_UNC_HA_PMON_ADDRMATCH1 register of PCI device</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_SBOX
+<H2>Ring-to-Ring interface counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture manages the socket-internal traffic through ring-based networks. Depending on the system's configuration there are multiple rings in one socket. The SBOXes organize the traffic between the rings. The description from Intel®:<BR>
+<I>The SBox manages the interface between the two Rings.<BR>
+The processor is composed of two independent rings connected via two sets of bi-directional buffered switches. Each set of bi-directional buffered switches is partitioned into two ingress/egress pairs. Further, each ingress/egress pair is associated with a ring stop on adjacent rings. This ring stop is termed an Sbo. The processor has up to 4 SBos depending on SKU. The Sbo can be simply thought of as a conduit for the ring, but must also help maintain ordering of traffic to ensure functi [...]
+</I><BR>
+The SBOX hardware performance counters are exposed to the operating system through the MSR interface. There are at most four of those interfaces, but not all of them need to be present. The name SBOX originates from the Nehalem EX Uncore monitoring, where the functional unit to the QPI network is called SBOX but it had a different duty.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>SBOX<0-3>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0-3>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0-3>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0-3>C3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>tid</TD>
+ <TD>N</TD>
+ <TD>Set bit 19 in config register</TD>
+ <TD>This option has no real effect because TID filtering can be activated but there is no possibility to specify the TID somewhere.</TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_QBOX
+<H2>QPI interface counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the QPI Link layer (QPI) in the Uncore. The description from Intel®:<BR>
+<I>The Intel® QPI Link Layer is responsible for packetizing requests from the caching agent on the way out to the system interface. As such, it shares responsibility with the CBo(s) as the Intel® QPI caching agent(s). It is responsible for converting CBo requests to Intel® QPI messages (i.e. snoop generation and data response messages from the snoop response) as well as converting/forwarding ring messages to Intel® QPI packets and vice versa. On Intel® Xeon processor [...]
+</I><BR>
+The QPI hardware performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the QPI. The actual number of QBOX counters depends on the CPU core count of one socket. If your system does not have all interfaces and interface 0 does not work, try the other ones. The QBOX was introduced with the Haswell EP microarchitecture; for older Uncore-aware architectures the QBOX and the SBOX are the same.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>QBOX<0,1>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>QBOX<0,1>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>QBOX<0,1>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>QBOX<0,1>C3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>QBOX<0,1>FIX0</TD>
+ <TD>QPI_RATE</TD>
+</TR>
+<TR>
+ <TD>QBOX<0,1>FIX1</TD>
+ <TD>QPI_RX_IDLE</TD>
+</TR>
+<TR>
+ <TD>QBOX<0,1>FIX2</TD>
+ <TD>QPI_RX_LLR</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for QBOX<0,1>C<0,1,2,3> counters)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>32 bit hex address</TD>
+ <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_RX_MATCH_0 register of PCI device</TD>
+ <TD>This option matches the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+ <TD>match1</TD>
+ <TD>20 bit hex address</TD>
+ <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_RX_MATCH_1 register of PCI device</TD>
+ <TD>This option matches the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+ <TD>match2</TD>
+ <TD>32 bit hex address</TD>
+ <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_TX_MATCH_0 register of PCI device</TD>
+ <TD>This option matches the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+ <TD>match3</TD>
+ <TD>20 bit hex address</TD>
+ <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_TX_MATCH_1 register of PCI device</TD>
+ <TD>This option matches the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+ <TD>mask0</TD>
+ <TD>32 bit hex address</TD>
+ <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_RX_MASK_0 register of PCI device</TD>
+ <TD>This option masks the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+ <TD>mask1</TD>
+ <TD>20 bit hex address</TD>
+ <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_RX_MASK_1 register of PCI device</TD>
+ <TD>This option masks the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+ <TD>mask2</TD>
+ <TD>32 bit hex address</TD>
+ <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_TX_MASK_0 register of PCI device</TD>
+ <TD>This option masks the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+ <TD>mask3</TD>
+ <TD>20 bit hex address</TD>
+ <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_TX_MASK_1 register of PCI device</TD>
+ <TD>This option masks the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+</TABLE>
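+<P>Note that the 32-bit match/mask inputs are not written verbatim: LIKWID masks the value with 0x8003FFF8 first, so only bit 31 and bits 3-17 reach the register. A small standalone sketch of that masking (the function name is made up for this illustration):</P>
+\code{.c}
+#include <stdint.h>
+#include <stdio.h>
+
+/* Apply the documented input mask for the 32-bit QPI match/mask options. */
+static uint32_t qpi_match32(uint32_t user_value)
+{
+    return user_value & 0x8003FFF8u;  /* keeps bit 31 and bits 3-17 */
+}
+
+int main(void)
+{
+    printf("0x%08x -> 0x%08x\n", 0xFFFFFFFFu, qpi_match32(0xFFFFFFFFu));
+    return 0;
+}
+\endcode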
+
+\anchor HASEP_CBOX
+<H2>Last Level cache counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the LLC coherency engine in the Uncore. The description from Intel®:<BR>
+<I>The LLC coherence engine (CBo) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a CBo via the ring interconnect. The CBo is responsible for managing data delivery
+from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the LLC; generating snoops and collecting snoop responses from the local cores when the MESIF protocol requires it.
+</I><BR>
+The LLC hardware performance counters are exposed to the operating system through the MSR interface. The maximal number of supported coherency engines for the Intel® Haswell EP/EN/EX microarchitecture is 17. E7-8800 v2 systems have all 17 engines, the E5-2600 v2 only 10 of them and the E5-1600 v2 only 6. It may be that your system does not have all CBOXes; LIKWID will skip the unavailable ones in the setup phase. The name CBOX originates from the Nehalem EX Uncore monitoring [...]
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>CBOX<0-17>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-17>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-17>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-17>C3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>5 bit hex value</TD>
+ <TD>Set bits 24-28 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>tid</TD>
+ <TD>5 bit hex value</TD>
+ <TD>Set bits 0-4 in MSR_UNC_C<0-17>_PMON_BOX_FILTER register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>state</TD>
+ <TD>6 bit hex value</TD>
+ <TD>Set bits 17-22 in MSR_UNC_C<0-17>_PMON_BOX_FILTER register</TD>
+ <TD>M: 0x28, F: 0x10, M: 0x08, E: 0x04, S: 0x02, I: 0x01</TD>
+</TR>
+<TR>
+ <TD>nid</TD>
+ <TD>16 bit hex value</TD>
+ <TD>Set bits 0-15 in MSR_UNC_C<0-17>_PMON_BOX_FILTER1 register</TD>
+ <TD>Note: Node 0 has value 0x0001</TD>
+</TR>
+<TR>
+ <TD>opcode</TD>
+ <TD>9 bit hex value</TD>
+ <TD>Set bits 20-28 in MSR_UNC_C<0-17>_PMON_BOX_FILTER1 register</TD>
+ <TD>A list of valid opcodes can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>2 bit hex address</TD>
+ <TD>Set bits 30-31 in MSR_UNC_C<0-17>_PMON_BOX_FILTER1 register</TD>
+ <TD>See the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for more information.</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides an event LLC_LOOKUP which can be filtered with the 'state' option. If no 'state' is set, LIKWID sets the state to 0x1F, the default value to measure all lookups.</P>
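+<P>The 'state' argument is a bitmask built from the values listed in the option table, and 0x1F is the default LIKWID programs for LLC_LOOKUP. A small sketch of composing a custom mask from a subset of those values (the macro names are only for this illustration):</P>
+\code{.c}
+#include <stdio.h>
+
+/* State encodings taken from the filter table above (subset). */
+#define LLC_STATE_F 0x10
+#define LLC_STATE_E 0x04
+#define LLC_STATE_S 0x02
+#define LLC_STATE_I 0x01
+
+int main(void)
+{
+    /* Restrict LLC_LOOKUP to lines in E, S or I state. */
+    unsigned int state = LLC_STATE_E | LLC_STATE_S | LLC_STATE_I;
+    printf("value for the 'state' option: 0x%02x\n", state);
+    return 0;
+}
+\endcode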
+
+\anchor HASEP_UBOX
+<H2>Uncore management counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the management box in the Uncore. The description from Intel®:<BR>
+<I>The UBox serves as the system configuration controller within the physical processor. In this capacity, the UBox acts as the central unit for a variety of functions:
+<UL>
+<LI>The master for reading and writing physically distributed registers across Intel® Xeon processor E5 v3 family using the Message Channel.</LI>
+<LI>The UBox is the intermediary for interrupt traffic, receiving interrupts from the system and dispatching interrupts to the appropriate core.</LI>
+<LI>The UBox serves as the system lock master used when quiescing the platform (e.g., Intel® QPI bus lock).</LI>
+</UL>
+</I><BR>
+The Uncore management performance counters are exposed to the operating system through the MSR interface. The name UBOX originates from the Nehalem EX Uncore monitoring where those functional units are called UBOX.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>UBOX0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UBOX1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UBOXFIX</TD>
+ <TD>UBOX_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for UBOX<0,1> counters)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>5 bit hex value</TD>
+ <TD>Set bits 24-28 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_WBOX
+<H2>Power control unit counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the power control unit (PCU) in the Uncore. The description from Intel®:<BR>
+<I>The PCU is the primary Power Controller for the Intel® Xeon processor E5 v3 family. Intel® Xeon processor E5 v3 family uncore implements a power control unit acting as a core/uncore power and thermal manager. It runs its firmware on an internal micro-controller and coordinates the socket’s power states.
+</I><BR>
+The PCU performance counters are exposed to the operating system through the MSR interface. The name WBOX originates from the Nehalem EX Uncore monitoring where those functional units are called WBOX.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>WBOX0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX0FIX</TD>
+ <TD>CORES_IN_C3</TD>
+</TR>
+<TR>
+ <TD>WBOX1FIX</TD>
+ <TD>CORES_IN_C6</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>5 bit hex value</TD>
+ <TD>Set bits 24-28 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>32 bit hex value</TD>
+ <TD>Set bits 0-31 in MSR_UNC_PCU_PMON_BOX_FILTER register</TD>
+ <TD>Band0: bits 0-7, Band1: bits 8-15, Band2: bits 16-23, Band3: bits 24-31</TD>
+</TR>
+<TR>
+ <TD>occupancy</TD>
+ <TD>2 bit hex value</TD>
+ <TD>Set bit 14-15 in config register</TD>
+ <TD>Cores in C0: 0x1, in C3: 0x2, in C6: 0x3</TD>
+</TR>
+<TR>
+ <TD>occupancy_edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>occupancy_invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 30 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
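+<P>The 32-bit 'match0' filter value packs four independent 8-bit frequency bands as noted in the table. A sketch of that packing (the helper name and the band codes are placeholders for this illustration):</P>
+\code{.c}
+#include <stdint.h>
+#include <stdio.h>
+
+/* Pack four 8-bit bands into the PCU filter layout documented above:
+ * Band0 -> bits 0-7, Band1 -> bits 8-15, Band2 -> bits 16-23, Band3 -> bits 24-31. */
+static uint32_t pcu_band_filter(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3)
+{
+    return (uint32_t)b0 | ((uint32_t)b1 << 8) |
+           ((uint32_t)b2 << 16) | ((uint32_t)b3 << 24);
+}
+
+int main(void)
+{
+    printf("match0=0x%08x\n", pcu_band_filter(0x0A, 0x14, 0x1E, 0x28));
+    return 0;
+}
+\endcode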
+
+\anchor HASEP_IBOX
+<H2>IRP box counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the IRP box in the Uncore. The description from Intel®:<BR>
+<I>IRP is responsible for maintaining coherency for IIO traffic that needs to be coherent (e.g. cross-socket P2P).
+</I><BR>
+The IRP box counters are exposed to the operating system through the PCI interface. The IBOX was introduced with the Intel® IvyBridge EP/EN/EX microarchitecture.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>IBOX<0,1>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>IBOX<0,1>C1</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_MBOX
+<H2>Memory controller counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the integrated Memory Controllers (iMC) in the Uncore. The description from Intel®:<BR>
+<I>The Intel® Xeon processor E5 v3 family integrated Memory Controller provides the interface to DRAM and communicates to the rest of the uncore through the Home Agent (i.e. the IMC does not connect to the Ring).<BR>
+In conjunction with the HA, the memory controller also provides a variety of RAS features, such as ECC, lockstep, memory access retry, memory scrubbing, thermal throttling, mirroring, and rank sparing.
+</I><BR>
+The integrated Memory Controller performance counters are exposed to the operating system through PCI interfaces. There may be two memory controllers in the system (E7-8800 v2). There are 4 different PCI devices per memory controller, each covering one memory channel. Each channel has 4 different general-purpose counters and one fixed counter for the DRAM clock. The four channels of the first memory controller are MBOX0-3, the four channels of the second memory controller (if available) [...]
+</P>
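+<P>From the naming convention described above, the MBOX index can be read as the controller index times four plus the channel index; this is only an inference from the text, not a LIKWID API. A tiny sketch:</P>
+\code{.c}
+#include <stdio.h>
+
+/* MBOX0-3 are the channels of the first memory controller,
+ * MBOX4-7 (if present) the channels of the second one. */
+static int mbox_index(int controller, int channel)
+{
+    return controller * 4 + channel;
+}
+
+int main(void)
+{
+    printf("controller 1, channel 2 -> MBOX%d\n", mbox_index(1, 2));
+    return 0;
+}
+\endcode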
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>MBOX<0-7>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0-7>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0-7>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0-7>C3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0-7>FIX</TD>
+ <TD>DRAM_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for counter MBOX<0-7>C<0-3>)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_RBOX
+<H2>Ring-to-QPI counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the Ring-to-QPI (R3QPI) interface in the Uncore. The description from Intel®:<BR>
+<I>R3QPI is the interface between the Intel® QPI Link Layer, which packetizes requests, and the Ring.<BR>
+R3QPI is the interface between the ring and the Intel® QPI Link Layer. It is responsible for translating between ring protocol packets and flits that are used for transmitting data across the Intel® QPI interface. It performs credit checking between the local Intel® QPI LL, the remote Intel® QPI LL and other agents on the local ring.
+</I><BR>
+The Ring-to-QPI performance counters are exposed to the operating system through PCI interfaces. Since the RBOXes manage the traffic between the LLC-connecting ring interface on the socket and the QPI interfaces (SBOXes), their number matches the number of SBOXes. See the SBOX section for how many are available for which system configuration. The name RBOX originates from the Nehalem EX Uncore monitoring where those functional units are called RBOX.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>RBOX<0,1,2>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1,2>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1,2>C2</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_PBOX
+<H2>Ring-to-PCIe counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the Ring-to-PCIe (R2PCIe) interface in the Uncore. The description from Intel®:<BR>
+<I>R2PCIe represents the interface between the Ring and IIO traffic to/from PCIe.
+</I><BR>
+The Ring-to-PCIe performance counters are exposed to the operating system through a PCI interface. Independent of the system's configuration, there is only one Ring-to-PCIe interface per CPU socket.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PBOX0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PBOX1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PBOX2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PBOX3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/interlagos.md b/doc/archs/interlagos.md
new file mode 100644
index 0000000..cec7734
--- /dev/null
+++ b/doc/archs/interlagos.md
@@ -0,0 +1,107 @@
+/*! \page interlagos AMD® Interlagos
+
+<H1>Available performance monitors for the AMD® Interlagos microarchitecture</H1>
+<UL>
+<LI>\ref ILG_PMC "General-purpose counters"</LI>
+<LI>\ref ILG_UPMC "Northbridge general-purpose counters"</LI>
+</UL>
+
+
+\anchor ILG_PMC
+<H2>General-purpose counters</H2>
+<P>The AMD® Interlagos microarchitecture provides 6 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC4</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC5</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD>The value for threshold can range between 0x0 and 0x1F</TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor ILG_UPMC
+<H2>Northbridge general-purpose counters</H2>
+<P>The AMD® Interlagos microarchitecture provides 4 general-purpose counters for the Northbridge consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>UPMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/ivybridge.md b/doc/archs/ivybridge.md
new file mode 100644
index 0000000..3008475
--- /dev/null
+++ b/doc/archs/ivybridge.md
@@ -0,0 +1,190 @@
+/*! \page ivybridge Intel® IvyBridge
+
+<H1>Available performance monitors for the Intel® IvyBridge microarchitecture</H1>
+<UL>
+<LI>\ref IVB_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref IVB_PMC "General-purpose counters"</LI>
+<LI>\ref IVB_THERMAL "Thermal counters"</LI>
+<LI>\ref IVB_POWER "Power measurement counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor IVB_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>FIXC0</TD>
+ <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+ <TD>FIXC1</TD>
+ <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+ <TD>FIXC2</TD>
+ <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 2+(index*4) in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit (index*4) in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
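+<P>Both options of the fixed-purpose counters address a single shared config register, so the bit position depends on the counter index (FIXC0=0, FIXC1=1, FIXC2=2). A small sketch of the formulas from the table:</P>
+\code{.c}
+#include <stdio.h>
+
+/* Bit positions inside the shared fixed-counter config register:
+ * 'kernel' sets bit (index*4), 'anythread' sets bit 2+(index*4). */
+static int kernel_bit(int index)    { return index * 4; }
+static int anythread_bit(int index) { return 2 + index * 4; }
+
+int main(void)
+{
+    for (int i = 0; i < 3; i++)  /* FIXC0..FIXC2 */
+        printf("FIXC%d: kernel bit %d, anythread bit %d\n",
+               i, kernel_bit(i), anythread_bit(i));
+    return 0;
+}
+\endcode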
+
+\anchor IVB_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® IvyBridge microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 21 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® IvyBridge microarchitecture supports measuring offcore events with the PMC counters. To this end, the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® IvyBridge microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although many bitmasks are possible, LIKWID natively provides only the ones with response type ANY. Custom filtering can be applied with the [...]
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>16 bit hex value</TD>
+ <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/IVB">https://download.01.org/perfmon/IVB</A>.</TD>
+</TR>
+<TR>
+ <TD>match1</TD>
+ <TD>22 bit hex value</TD>
+ <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/IVB">https://download.01.org/perfmon/IVB</A>.</TD>
+</TR>
+</TABLE>
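+<P>The two option values end up in one OFFCORE_RESPONSE register: match0 is masked with 0x8FFF and placed in bits 0-15, match1 fills bits 16-37. The following sketch only illustrates how such a register image would be composed; LIKWID does this internally when the options are given, and the example values are placeholders, not real request/response encodings.</P>
+\code{.c}
+#include <stdint.h>
+#include <stdio.h>
+
+/* Compose an OFFCORE_RESPONSE register image from the two option values. */
+static uint64_t offcore_response(uint32_t match0, uint64_t match1)
+{
+    return (uint64_t)(match0 & 0x8FFFu)         /* bits 0-15 */
+         | ((match1 & 0x3FFFFFULL) << 16);      /* bits 16-37 */
+}
+
+int main(void)
+{
+    printf("OFFCORE_RESPONSE=0x%016llx\n",
+           (unsigned long long)offcore_response(0x00FF, 0x3F));
+    return 0;
+}
+\endcode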
+
+\anchor IVB_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® IvyBridge microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>TMP0</TD>
+ <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor IVB_POWER
+<H2>Power counter</H2>
+<P>The Intel® IvyBridge microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PWR0</TD>
+ <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR1</TD>
+ <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR2*</TD>
+ <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR3</TD>
+ <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+<P>*) The PWR2 counter is often not implemented on Intel® IvyBridge systems</P>
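+<P>The RAPL counters report consumed energy, so the average power over a measurement interval is the difference of two readings divided by the elapsed time. A minimal sketch with placeholder numbers; with the LIKWID API the readings would come from the PWR_PKG_ENERGY result and a timer.</P>
+\code{.c}
+#include <stdio.h>
+
+int main(void)
+{
+    /* Placeholder values: package energy in Joules at start/end of the
+     * measurement and the measurement duration in seconds. */
+    double e_start = 1200.5, e_end = 1260.1, seconds = 2.0;
+    double avg_watts = (e_end - e_start) / seconds;
+    printf("average package power: %.2f W\n", avg_watts);
+    return 0;
+}
+\endcode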
+*/
+
+
diff --git a/doc/archs/ivybridgeep.md b/doc/archs/ivybridgeep.md
new file mode 100644
index 0000000..09f0bcd
--- /dev/null
+++ b/doc/archs/ivybridgeep.md
@@ -0,0 +1,790 @@
+/*! \page ivybridgeep Intel® IvyBridge EP/EN/EX
+
+<H1>Available performance monitors for the Intel® IvyBridge microarchitecture</H1>
+<UL>
+<LI>\ref IVBEP_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref IVBEP_PMC "General-purpose counters"</LI>
+<LI>\ref IVBEP_THERMAL "Thermal counters"</LI>
+<LI>\ref IVBEP_POWER "Power measurement counters"</LI>
+<LI>\ref IVBEP_BBOX "Home Agent counters"</LI>
+<LI>\ref IVBEP_SBOX "Intel® QPI Link Layer counters"</LI>
+<LI>\ref IVBEP_CBOX "Last Level cache counters"</LI>
+<LI>\ref IVBEP_UBOX "Uncore management counters"</LI>
+<LI>\ref IVBEP_WBOX "Power control unit counters"</LI>
+<LI>\ref IVBEP_IBOX "Coherency for IIO traffic counters"</LI>
+<LI>\ref IVBEP_MBOX "Integrated memory controller counters"</LI>
+<LI>\ref IVBEP_RBOX "Ring-to-QPI interface counters"</LI>
+<LI>\ref IVBEP_PBOX "Ring-to-PCIe interface counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor IVBEP_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>FIXC0</TD>
+ <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+ <TD>FIXC1</TD>
+ <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+ <TD>FIXC2</TD>
+ <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 2+(index*4) in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit (index*4) in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 21 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture supports measuring offcore events with the PMC counters. To this end, the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® IvyBridge EP/EN/EX microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although many bitmasks are possible, LIKWID natively provides only the ones with response type ANY. Custom filtering can be applied with the [...]
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>16 bit hex value</TD>
+ <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/IVT">https://download.01.org/perfmon/IVT</A>.</TD>
+</TR>
+<TR>
+ <TD>match1</TD>
+ <TD>22 bit hex value</TD>
+ <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/IVT">https://download.01.org/perfmon/IVT</A>.</TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>TMP0</TD>
+ <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor IVBEP_POWER
+<H2>Power counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PWR0</TD>
+ <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR1</TD>
+ <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR2*</TD>
+ <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR3</TD>
+ <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+<P>*) The PWR2 counter is often not implemented on Intel® IvyBridge systems</P>
+
+\anchor IVBEP_BBOX
+<H2>Home Agent counters</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the Home Agent (HA) in the Uncore. The description from Intel®:<BR>
+<I>The HA is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the HA is responsible for ordering memory reads/writes, coming in from the modular Ring, to a given address such that the iMC (memory controller).</I><BR>
+The HA hardware performance counters are exposed to the operating system through PCI interfaces. There are two of these interfaces for the HA, but both are only available on E7-8800 v2 systems. The name BBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>BBOX<0,1>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>BBOX<0,1>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>BBOX<0,1>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>BBOX<0,1>C3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>opcode</TD>
+ <TD>6 bit hex value</TD>
+ <TD>Set bits 0-5 in PCI_UNC_HA_PMON_OPCODEMATCH register of PCI device</TD>
+ <TD>A table of all valid opcodes can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>46 bit hex address</TD>
+ <TD>Extract bits 6-31 and set bits 6-31 in PCI_UNC_HA_PMON_ADDRMATCH0 register of PCI device<BR>Extract bits 32-45 and set bits 0-13 in PCI_UNC_HA_PMON_ADDRMATCH1 register of PCI device</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_SBOX
+<H2>LLC-to-QPI interface counters</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the QPI Link layer (QPI) in the Uncore. The description from Intel®:<BR>
+<I>The Intel® QPI Link Layer is responsible for packetizing requests from the caching agent on the way out to the system interface. As such, it shares responsibility with the CBo(s) as the Intel® QPI caching agent(s). It is responsible for converting CBo requests to Intel® QPI messages (i.e. snoop generation and data response messages from the snoop response) as well as converting/forwarding ring messages to Intel® QPI packets and vice versa. On Ivy Bridge, Intel® QPI [...]
+</I><BR>
+The QPI hardware performance counters are exposed to the operating system through PCI interfaces. There are two of these interfaces for the QPI. If your system does not have all interfaces and interface 0 does not work, try the other one. The name SBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>SBOX<0,1,2>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0,1,2>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0,1,2>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0,1,2>C3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0,1,2>FIX</TD>
+ <TD>QPI_RATE</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for SBOX<0-2>C<0-3> counters)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>opcode</TD>
+ <TD>6 bit hex value</TD>
+ <TD>Set bits 0-5 in PCI_UNC_HA_PMON_OPCODEMATCH register of PCI device</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>32 bit hex address</TD>
+ <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_QPI_PMON_MATCH_0 register of PCI device</TD>
+ <TD>A description of matching capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+ <TD>match1</TD>
+ <TD>20 bit hex address</TD>
+ <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_QPI_PMON_MATCH_1 register of PCI device</TD>
+ <TD>A description of matching capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+ <TD>mask0</TD>
+ <TD>32 bit hex address</TD>
+ <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_QPI_PMON_MASK_0 register of PCI device</TD>
+ <TD>A description of masking capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+ <TD>mask1</TD>
+ <TD>20 bit hex address</TD>
+ <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_QPI_PMON_MASK_1 register of PCI device</TD>
+ <TD>A description of masking capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_CBOX
+<H2>CBOX counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the LLC coherency engine in the Uncore. The description from Intel®:<BR>
+<I>The LLC coherence engine (CBo) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a CBo via the ring interconnect. The CBo is responsible for managing data delivery from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the LLC;
+generating snoops and collecting snoop responses from the local cores when the MESIF protocol requires it.
+</I><BR>
+The LLC hardware performance counters are exposed to the operating system through the MSR interface. The maximal number of supported coherency engines for the Intel® IvyBridge EP/EN/EX microarchitecture is 15. E7-8800 v2 systems have all 15 engines, the E5-2600 v2 only 10 of them and the E5-1600 v2 only 6. It may be that your system does not have all CBOXes; LIKWID will skip the unavailable ones in the setup phase. The name CBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>CBOX<0-15>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-15>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-15>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-15>C3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>5 bit hex value</TD>
+ <TD>Set bits 24-28 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>tid</TD>
+ <TD>5 bit hex value</TD>
+ <TD>Set bits 0-4 in MSR_UNC_C<0-15>_PMON_BOX_FILTER register</TD>
+ <TD>A description of filter capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+ <TD>state</TD>
+ <TD>6 bit hex value</TD>
+ <TD>Set bits 17-22 in MSR_UNC_C<0-15>_PMON_BOX_FILTER register</TD>
+ <TD>M: 0x28, F: 0x10, M: 0x08, E: 0x04, S: 0x02, I: 0x01</TD>
+</TR>
+<TR>
+ <TD>nid</TD>
+ <TD>16 bit hex value</TD>
+ <TD>Set bits 0-15 in MSR_UNC_C<0-15>_PMON_BOX_FILTER1 register</TD>
+ <TD>Note: Node 0 has value 0x0001</TD>
+</TR>
+<TR>
+ <TD>opcode</TD>
+ <TD>9 bit hex value</TD>
+ <TD>Set bits 20-28 in MSR_UNC_C<0-15>_PMON_BOX_FILTER1 register</TD>
+ <TD>A table of all valid opcodes can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>2 bit hex address</TD>
+ <TD>Set bits 30-31 in MSR_UNC_C<0-15>_PMON_BOX_FILTER1 register</TD>
+ <TD>A description of matching capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides an event LLC_LOOKUP which can be filtered with the 'state' option. If no 'state' is set, LIKWID sets the state to 0x1F, the default value to measure all lookups.</P>
+
+
+\anchor IVBEP_UBOX
+<H2>Uncore management counters</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the management box in the Uncore. The description from Intel®:<BR>
+<I>
+The UBox serves as the system configuration controller within the physical processor. In this capacity, the UBox acts as the central unit for a variety of functions:
+<UL>
+<LI>The master for reading and writing physically distributed registers across physical processor using the Message Channel.</LI>
+<LI>The UBox is the intermediary for interrupt traffic, receiving interrupts from the system and dispatching interrupts to the appropriate core.</LI>
+<LI>The UBox serves as the system lock master used when quiescing the platform (e.g., Intel® QPI bus lock).</LI>
+</UL>
+</I><BR>
+The Uncore management performance counters are exposed to the operating system through the MSR interface. The name UBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>UBOX0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UBOX1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UBOXFIX</TD>
+ <TD>UBOX_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for UBOX<0,1> counters)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>5 bit hex value</TD>
+ <TD>Set bits 24-28 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_WBOX
+<H2>Power control unit counters</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the power control unit (PCU) in the Uncore. The description from Intel®:<BR>
+<I>The PCU is the primary Power Controller for the physical processor package. The uncore implements a power control unit acting as a core/uncore power and thermal manager. It runs its firmware on an internal micro-controller and coordinates the socket’s power states.
+</I><BR>
+The PCU performance counters are exposed to the operating system through the MSR interface. The name WBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>WBOX0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX0FIX</TD>
+ <TD>CORES_IN_C3</TD>
+</TR>
+<TR>
+ <TD>WBOX1FIX</TD>
+ <TD>CORES_IN_C6</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>5 bit hex value</TD>
+ <TD>Set bits 24-28 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>32 bit hex value</TD>
+ <TD>Set bits 0-31 in MSR_UNC_PCU_PMON_BOX_FILTER register</TD>
+ <TD>Band0: bits 0-7, Band1: bits 8-15, Band2: bits 16-23, Band3: bits 24-31</TD>
+</TR>
+<TR>
+ <TD>occupancy</TD>
+ <TD>2 bit hex value</TD>
+ <TD>Set bit 14-15 in config register</TD>
+ <TD>Cores in C0: 0x1, in C3: 0x2, in C6: 0x3</TD>
+</TR>
+<TR>
+ <TD>occupancy_edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>occupancy_invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 30 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_IBOX
+<H2>IBOX counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the IRP box in the Uncore. The description from Intel®:<BR>
+<I>IRP is responsible for maintaining coherency for IIO traffic that needs to be coherent (e.g. cross-socket P2P).
+</I><BR>
+The IRP box counters are exposed to the operating system through the PCI interface. The IBOX was introduced with the Intel® IvyBridge EP/EN/EX microarchitecture.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>IBOX<0,1>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>IBOX<0,1>C1</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_MBOX
+<H2>MBOX counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the integrated Memory Controllers (iMC) in the Uncore. The description from Intel®:<BR>
+<I>The integrated Memory Controller provides the interface to DRAM and communicates to the rest of the uncore through the Home Agent (i.e. the iMC does not connect to the Ring).<BR>
+In conjunction with the HA, the memory controller also provides a variety of RAS features, such as ECC, lockstep, memory access retry, memory scrubbing, thermal throttling, mirroring, and rank sparing.
+</I><BR>
+The integrated Memory Controller performance counters are exposed to the operating system through PCI interfaces. There may be two memory controllers in the system (E7-8800 v2). There are 4 different PCI devices per memory controller, each covering one memory channel. Each channel has 4 different general-purpose counters and one fixed counter for the DRAM clock. The four channels of the first memory controller are MBOX0-3, the four channels of the second memory controller (if available) are named M [...]
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>MBOX<0-7>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0-7>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0-7>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0-7>C3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0-7>FIX</TD>
+ <TD>DRAM_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for counter MBOX<0-7>C<0-3>)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_RBOX
+<H2>RBOX counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the Ring-to-QPI (R3QPI) interface in the Uncore. The description from Intel®:<BR>
+<I>R3QPI is the interface between the Intel® QPI Link Layer, which packetizes requests, and the Ring.<BR>
+R3QPI is the interface between the ring and the Intel® QPI Link Layer. It is responsible for translating between ring protocol packets and flits that are used for transmitting data across the Intel® QPI interface. It performs credit checking between the local Intel® QPI LL, the remote Intel® QPI LL and other agents on the local ring.
+</I><BR>
+The R3QPI performance counters are exposed to the operating system through PCI interfaces. Since the RBOXes manage the traffic between the LLC-connecting ring interface on the socket and the QPI interfaces (SBOXes), their number matches the number of SBOXes. See the SBOX section for how many are available for which system configuration. The name RBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>RBOX<0,1,2>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1,2>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1,2>C2</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_PBOX
+<H2>PBOX counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the Ring-to-PCIe (R2PCIe) interface in the Uncore. The description from Intel®:<BR>
+<I>R2PCIe represents the interface between the Ring and IIO traffic to/from PCIe.
+</I><BR>
+The R2PCIe performance counters are exposed to the operating system through a PCI interface. Independent of the system's configuration, there is only one Ring-to-PCIe interface.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PBOX0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PBOX1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PBOX2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PBOX3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/k10.md b/doc/archs/k10.md
new file mode 100644
index 0000000..a5ab582
--- /dev/null
+++ b/doc/archs/k10.md
@@ -0,0 +1,68 @@
+/*! \page k10 AMD® K10
+
+<H1>Available performance monitors for the AMD® K10 microarchitecture</H1>
+<UL>
+<LI>\ref K10_PMC "General-purpose counters"</LI>
+</UL>
+
+\anchor K10_PMC
+<H2>General-purpose counters</H2>
+<P>The AMD® K10 microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD>The value for threshold can range between 0x0 and 0x3</TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/k8.md b/doc/archs/k8.md
new file mode 100644
index 0000000..5bcdcce
--- /dev/null
+++ b/doc/archs/k8.md
@@ -0,0 +1,68 @@
+/*! \page k8 AMD® K8
+
+<H1>Available performance monitors for the AMD® K8 microarchitecture</H1>
+<UL>
+<LI>\ref K8_PMC "General-purpose counters"</LI>
+</UL>
+
+\anchor K8_PMC
+<H2>General-purpose counters</H2>
+<P>The AMD® K8 microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD>The value for threshold can range between 0x0 and 0x3</TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/kabini.md b/doc/archs/kabini.md
new file mode 100644
index 0000000..41824cc
--- /dev/null
+++ b/doc/archs/kabini.md
@@ -0,0 +1,162 @@
+/*! \page kabini AMD® Kabini
+
+<H1>Available performance monitors for the AMD® Kabini microarchitecture</H1>
+<UL>
+<LI>\ref KAB_PMC "General-purpose counters"</LI>
+<LI>\ref KAB_CPMC "L2 cache general-purpose counters"</LI>
+<LI>\ref KAB_UPMC "Northbridge general-purpose counters"</LI>
+</UL>
+
+
+\anchor KAB_PMC
+<H2>General-purpose counters</H2>
+<P>The AMD® Kabini microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD>The value for threshold can range between 0x0 and 0x3</TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+
+<H1>Counters available for one hardware thread per shared L2 cache</H1>
+\anchor KAB_CPMC
+<H2>L2 general-purpose counters</H2>
+<P>The AMD® Kabini microarchitecture provides 4 general-purpose counters for measuring L2 cache events. They consist of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>CPMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CPMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CPMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CPMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD>The value for threshold can range between 0x0 and 0x3</TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>tid</TD>
+ <TD>4 bit hex value</TD>
+ <TD>Set bits 56-59 in config register</TD>
+ <TD>If bit equals 0, the events of the thread are counted. See <A HREF="http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2012/10/48751_16h_bkdg.pdf">BIOS and Kernel Developer’s Guide (BKDG) for AMD Family 16h Processors</A> for details.</TD>
+</TR>
+<TR>
+ <TD>nid</TD>
+ <TD>4 bit hex value</TD>
+ <TD>Set bits 48-51 in config register</TD>
+ <TD>If bit equals 0, the events of the thread are counted. See <A HREF="http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2012/10/48751_16h_bkdg.pdf">BIOS and Kernel Developer’s Guide (BKDG) for AMD Family 16h Processors</A> for details.</TD>
+</TR>
+</TABLE>
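+<P>The 'tid' and 'nid' filters sit in the upper half of the 64-bit config register, so the 4-bit values have to be shifted to bits 56-59 and 48-51 respectively. A sketch of that placement (the helper name is made up; LIKWID does the shifting when the options are given):</P>
+\code{.c}
+#include <stdint.h>
+#include <stdio.h>
+
+/* Place the 4-bit filter values at the documented positions:
+ * tid -> bits 56-59, nid -> bits 48-51 of the config register. */
+static uint64_t l2_filter_bits(uint64_t tid, uint64_t nid)
+{
+    return ((tid & 0xFULL) << 56) | ((nid & 0xFULL) << 48);
+}
+
+int main(void)
+{
+    printf("filter bits: 0x%016llx\n",
+           (unsigned long long)l2_filter_bits(0x3, 0x1));
+    return 0;
+}
+\endcode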
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor KAB_UPMC
+<H2>Northbridge general-purpose counters</H2>
+<P>The AMD® Kabini microarchitecture provides 4 general-purpose counters for the Northbridge consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>UPMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/nehalem.md b/doc/archs/nehalem.md
new file mode 100644
index 0000000..b2d45b8
--- /dev/null
+++ b/doc/archs/nehalem.md
@@ -0,0 +1,237 @@
+/*! \page nehalem Intel® Nehalem
+
+<H1>Available performance monitors for the Intel® Nehalem microarchitecture</H1>
+<UL>
+<LI>\ref NEH_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref NEH_PMC "General-purpose counters"</LI>
+<LI>\ref NEH_UNCORE "General-purpose counters for the Uncore"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor NEH_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>FIXC0</TD>
+ <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+ <TD>FIXC1</TD>
+ <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+ <TD>FIXC2</TD>
+ <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 2+(index*4) in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit (index*4) in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor NEH_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Nehalem microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 21 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
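+<P>A minimal sketch of how the listed option bits combine into one general-purpose config-register value (illustrative only: the helper name and the event/umask values are placeholders, bits 16 and 22 follow the standard architectural layout and are not options from the table, and LIKWID performs the real register programming itself):</P>
+\code{.c}
+#include <stdint.h>
+
+/* Sketch only: compose a PMC config value from the options above. */
+static uint64_t pmc_config(uint8_t event, uint8_t umask, int edgedetect,
+                           int kernel, int anythread, int invert,
+                           uint8_t threshold)
+{
+    uint64_t cfg = (uint64_t)event | ((uint64_t)umask << 8);
+    cfg |= 1ULL << 16;                         /* user-mode counting (assumed)  */
+    if (kernel)     cfg |= 1ULL << 17;         /* option: kernel                */
+    if (edgedetect) cfg |= 1ULL << 18;         /* option: edgedetect            */
+    if (anythread)  cfg |= 1ULL << 21;         /* option: anythread             */
+    cfg |= 1ULL << 22;                         /* enable counter (assumed)      */
+    if (invert)     cfg |= 1ULL << 23;         /* option: invert                */
+    cfg |= (uint64_t)threshold << 24;          /* option: threshold, bits 24-31 */
+    return cfg;
+}
+\endcode
+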
+<H3>Special handling for events</H3>
+<P>The Intel® Nehalem microarchitecture supports measuring offcore events in the PMC counters. To do so, the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers; the Intel® Nehalem microarchitecture has one of these registers. Custom filtering can be applied with the OFFCORE_RESPONSE_0_OPTIONS event. Only for this event two more counter options are available:</P>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Input value masked with 0xFF and written to bits 0-7 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/NHM-EP">https://download.01.org/perfmon/NHM-EP</A>.</TD>
+</TR>
+<TR>
+ <TD>match1</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Input value masked with 0xF7 and written to bits 8-15 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/NHM-EP">https://download.01.org/perfmon/NHM-EP</A>.</TD>
+</TR>
+</TABLE>
+
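+<P>The two options simply fill the low bytes of the OFFCORE_RESPONSE register; a small sketch of the mapping described in the table (illustrative only, the helper name is hypothetical and LIKWID writes the register itself):</P>
+\code{.c}
+#include <stdint.h>
+
+/* Sketch only: place match0 and match1 into the OFFCORE_RESPONSE
+ * register as described in the table above. */
+static uint64_t offcore_response(uint8_t match0, uint8_t match1)
+{
+    uint64_t reg = (uint64_t)(match0 & 0xFF);        /* match0 -> bits 0-7  */
+    reg |= (uint64_t)(match1 & 0xF7) << 8;           /* match1 -> bits 8-15 */
+    return reg;
+}
+\endcode
+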
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor NEH_UNCORE
+<H2>Uncore counters</H2>
+<P>The Intel® Nehalem microarchitecture provides 8 general-purpose counters consisting of a config and a counter register. Moreover, there is a fixed-purpose counter to measure the clock of the Uncore.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>UPMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC4</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC5</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC6</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC7</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMCFIX</TD>
+ <TD>UNCORE_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+<H3>Available Options (Only for UPMC<0-7> counters)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 21 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>opcode</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 40-47 in MSR_UNCORE_ADDR_OPCODE_MATCH register</TD>
+ <TD>The option is documented here, but the register is only available in the Westmere microarchitecture. A list of valid opcodes can be found in the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A>.</TD>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>40 bit physical memory address</TD>
+ <TD>Extract bits 3-39 from address and write them to bits 3-39 in MSR_UNCORE_ADDR_OPCODE_MATCH register</TD>
+ <TD>The option is documented here, but the register is only available in the Westmere microarchitecture.</TD>
+</TR>
+</TABLE>
+
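+<P>For the opcode and match0 options, the resulting MSR_UNCORE_ADDR_OPCODE_MATCH content would look roughly like this (a sketch only with a hypothetical helper name; as noted above, the register is only present on Westmere):</P>
+\code{.c}
+#include <stdint.h>
+
+/* Sketch only: bits 3-39 of the physical address stay in place,
+ * the opcode is placed in bits 40-47 (see table above). */
+static uint64_t uncore_addr_opcode_match(uint64_t addr, uint8_t opcode)
+{
+    uint64_t reg = addr & 0x000000FFFFFFFFF8ULL;     /* keep bits 3-39    */
+    reg |= (uint64_t)opcode << 40;                   /* opcode bits 40-47 */
+    return reg;
+}
+\endcode
+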
+*/
diff --git a/doc/archs/nehalemex.md b/doc/archs/nehalemex.md
new file mode 100644
index 0000000..8bbb735
--- /dev/null
+++ b/doc/archs/nehalemex.md
@@ -0,0 +1,554 @@
+/*! \page nehalemex Intel® Nehalem EX
+
+<H1>Available performance monitors for the Intel® Nehalem EX microarchitecture</H1>
+<UL>
+<LI>\ref NEHEX_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref NEHEX_PMC "General-purpose counters"</LI>
+<LI>\ref NEHEX_MBOX "Memory controller counters"</LI>
+<LI>\ref NEHEX_BBOX "Home Agent counters"</LI>
+<LI>\ref NEHEX_RBOX "Crossbar router counters"</LI>
+<LI>\ref NEHEX_CBOX "Last Level cache counters"</LI>
+<LI>\ref NEHEX_SBOX "LLC-to-QPI interface counters"</LI>
+<LI>\ref NEHEX_WBOX "Power control unit counters"</LI>
+<LI>\ref NEHEX_UBOX "Uncore management counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor NEHEX_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>FIXC0</TD>
+ <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+ <TD>FIXC1</TD>
+ <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+ <TD>FIXC2</TD>
+ <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 2+(index*4) in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit (index*4) in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Nehalem EX microarchitecture supports measuring offcore events in the PMC counters. To do so, the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers; the Intel® Nehalem EX microarchitecture has two of those registers. Custom filtering can be applied with the OFFCORE_RESPONSE_0_OPTIONS event. Only for those events two more counter options are available:</P>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Input value masked with 0xFF and written to bits 0-7 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/NHM-EX">https://download.01.org/perfmon/NHM-EX</A>.</TD>
+</TR>
+<TR>
+ <TD>match1</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Input value masked with 0xF7 and written to bits 8-15 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/NHM-EX">https://download.01.org/perfmon/NHM-EX</A>.</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor NEHEX_MBOX
+<H2>MBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the memory controllers in the Uncore. The description from Intel®:<BR>
+<I>The memory controller interfaces to the Intel® 7500 Scalable Memory Buffers and translates read and write commands into specific Intel® Scalable Memory Interconnect (Intel® SMI) operations. Intel SMI is based on the FB-DIMM architecture, but the Intel 7500 Scalable Memory Buffer is not an AMB2 device and has significant exceptions to the FB-DIMM2 architecture. The memory controller also provides a variety of RAS features, such as ECC, memory scrubbing, thermal throttling, [...]
+</I><BR>
+The Intel® Nehalem EX microarchitecture has 2 memory controllers, each with 6 general-purpose counters. They are exposed through the MSR interface to the operating system kernel. The MBOX and RBOX setup routines are taken from Likwid 3; they are not as flexible as the newer setup routines, but programming the MBOXes and RBOXes is tedious for Westmere EX. It is not possible to specify a FVID (Fill Victim Index) for the MBOXes or an IPERF option for the RBOXes.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>MBOX<0,1>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0,1>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0,1>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0,1>C3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0,1>C4</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0,1>C5</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Special handling for events</H3>
+<P>For the events DRAM_CMD_ALL and DRAM_CMD_ILLEGAL two counter options are available:</P>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>34 bit address</TD>
+ <TD>Set bits 0-33 in MSR_M<0,1>_PMON_ADDR_MATCH register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>mask0</TD>
+ <TD>60 bit hex value</TD>
+ <TD>Extract bits 6-33 from address and set bits 0-27 in MSR_M<0,1>_PMON_ADDR_MASK register</TD>
+ <TD></TD>
+</TR>
+</TABLE><BR>
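+<P>A sketch of the address handling described above (illustrative only; the helper names are hypothetical and LIKWID writes the registers itself):</P>
+\code{.c}
+#include <stdint.h>
+
+/* Sketch only: match0 takes the 34-bit address directly, while for
+ * mask0 bits 6-33 of the address are moved into bits 0-27. */
+static uint64_t mbox_addr_match(uint64_t addr)
+{
+    return addr & 0x3FFFFFFFFULL;                    /* bits 0-33         */
+}
+
+static uint64_t mbox_addr_mask(uint64_t addr)
+{
+    return (addr >> 6) & 0x0FFFFFFFULL;              /* bits 6-33 -> 0-27 */
+}
+\endcode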
+<P>For the events THERM_TRP_DN and THERM_TRP_UP you cannot measure events for all DIMMs and for one specific DIMM simultaneously, because both settings program the same filter register MSR_M<0,1>_PMON_MSC_THR with contrary configurations.</P>
+<P>Although the events FVC_EV<0-3> can measure multiple memory events, some of them overlap and cannot be measured simultaneously because they program the same filter register MSR_M<0,1>_PMON_ZDP with contrary configurations. One example is the pair FVC_EV<0-3>_BBOX_CMDS_READS and FVC_EV<0-3>_BBOX_CMDS_WRITES, which measures memory reads or writes but cannot be counted at the same time.</P>
+
+
+
+\anchor NEHEX_BBOX
+<H2>BBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the Home Agent in the Uncore. The description from Intel®:<BR>
+<I>The B-Box is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the B-Box is responsible for ordering memory reads/writes to a given address such that the M-Box does not have to perform this conflict checking. All requests for memory attached to the coupled M-Box must first be ordered through the B-Box.
+</I><BR>
+The memory traffic in an Intel® Nehalem EX system is controlled by the Home Agents. Each MBOX has a corresponding BBOX. Each BBOX offers 4 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>BBOX<0,1>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>BBOX<0,1>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>BBOX<0,1>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>BBOX<0,1>C3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Special handling for events</H3>
+<P>For the matching events MSG_IN_MATCH, MSG_ADDR_IN_MATCH, MSG_OPCODE_ADDR_IN_MATCH, MSG_OPCODE_IN_MATCH, MSG_OPCODE_OUT_MATCH, MSG_OUT_MATCH, OPCODE_ADDR_IN_MATCH, OPCODE_IN_MATCH, OPCODE_OUT_MATCH and ADDR_IN_MATCH two counter options are available:</P>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>60 bit hex value</TD>
+ <TD>Set bits 0-59 in MSR_B<0,1>_PMON_MATCH register</TD>
+ <TD>For register layout and valid settings see <A HREF="http://www.intel.com/Assets/en_US/PDF/designguide/323535.pdf">Intel® Xeon® Processor 7500 Series Uncore Programming Guide</A></TD>
+</TR>
+<TR>
+ <TD>mask0</TD>
+ <TD>60 bit hex value</TD>
+ <TD>Set bits 0-59 in MSR_B<0,1>_PMON_MASK register</TD>
+ <TD>For register layout and valid settings see <A HREF="http://www.intel.com/Assets/en_US/PDF/designguide/323535.pdf">Intel® Xeon® Processor 7500 Series Uncore Programming Guide</A></TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_RBOX
+<H2>RBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the crossbar router in the Uncore. The description from Intel®:<BR>
+<I>The Crossbar Router (R-Box) is a 8 port switch/router implementing the Intel® QuickPath Interconnect Link and Routing layers. The R-Box is responsible for routing and transmitting all intra- and inter-processor communication.
+</I><BR>
+The Intel® Nehalem EX microarchitecture has two interfaces to the RBOX although each socket contains only one crossbar router. Each RBOX offers 8 general-purpose counters. They are exposed through the MSR interface to the operating system kernel. The RBOX setup routine is taken from Likwid 3.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C4</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C5</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C6</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C7</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_CBOX
+<H2>CBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the LLC coherency engine in the Uncore. The description from Intel®:<BR>
+<I>For the Intel Xeon Processor 7500 Series, the LLC coherence engine (C-Box) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a C-Box via the ring interconnect. The C-Box is responsible for managing data delivery from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the LLC; generating snoops and collecting snoop responses from the local cores when the MESIF protocol requires it.
+The C-Box is also the gate keeper for all Intel® QuickPath Interconnect (Intel® QPI) messages that originate in the core and is responsible for ensuring that all Intel QuickPath Interconnect messages that pass through the socket’s LLC remain coherent.
+</I><BR>
+The Intel® Nehalem EX microarchitecture has 8 CBOX instances. Each CBOX offers 6 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>CBOX<0-7>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-7>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-7>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-7>C3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-7>C4</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-7>C5</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>5 bit hex value</TD>
+ <TD>Set bits 24-28 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_SBOX
+<H2>SBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the LLC-to-QPI interface in the Uncore. The description from Intel®:<BR>
+<I>The S-Box represents the interface between the last level cache and the system interface. It manages flow control between the C and R & B-Boxes. The S-Box is broken into system bound (ring to Intel® QPI) and ring bound (Intel® QPI to ring) connections.<BR>
+As such, it shares responsibility with the C-Box(es) as the Intel® QPI caching agent(s). It is responsible for converting C-box requests to Intel® QPI messages (i.e. snoop generation and data response messages from the snoop response) as well as converting/forwarding ring messages to Intel® QPI packets and vice versa.
+</I><BR>
+The Intel® Nehalem EX microarchitecture has 2 SBOX instances. Each SBOX offers 4 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>SBOX<0,1>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0,1>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0,1>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0,1>C3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>Only for the TO_R_PROG_EV events two counter options are available:</P>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>64 bit hex value</TD>
+ <TD>Set bits 0-63 in MSR_S<0,1>_PMON_MATCH register</TD>
+ <TD>For register layout and valid settings see <A HREF="http://www.intel.com/Assets/en_US/PDF/designguide/323535.pdf">Intel® Xeon® Processor 7500 Series Uncore Programming Guide</A></TD>
+</TR>
+<TR>
+ <TD>mask0</TD>
+ <TD>39 bit hex value</TD>
+ <TD>Set bits 0-38 in MSR_S<0,1>_PMON_MASK register</TD>
+ <TD>For register layout and valid settings see <A HREF="http://www.intel.com/Assets/en_US/PDF/designguide/323535.pdf">Intel® Xeon® Processor 7500 Series Uncore Programming Guide</A></TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_WBOX
+<H2>WBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the power controller in the Uncore. The description from Intel®:<BR>
+<I>The W-Box is the primary Power Controller for the Intel® Xeon® Processor 7500 Series.
+</I><BR>
+The Intel® Nehalem EX microarchitecture has one WBOX and it offers 4 general-purpose counters and one fixed counter. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>WBOX0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOXFIX</TD>
+ <TD>UNCORE_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_UBOX
+<H2>UBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the system configuration controller in the Uncore. The description from Intel®:<BR>
+<I>The U-Box serves as the system configuration controller for the Intel® Xeon® Processor E7 Family.
+</I><BR>
+The Intel® Nehalem EX microarchitecture has one UBOX and it offers a single general-purpose counter. It is exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>UBOX0</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options (Only for the UBOX0 counter)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+*/
diff --git a/doc/archs/pentiumm.md b/doc/archs/pentiumm.md
new file mode 100644
index 0000000..8ebc46d
--- /dev/null
+++ b/doc/archs/pentiumm.md
@@ -0,0 +1,63 @@
+/*! \page pentiumm Intel® Pentium M
+
+<H1>Available performance monitors for the Intel® Pentium M microarchitecture</H1>
+<UL>
+<LI>\ref PM_PMC "General-purpose counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor PM_PMC
+<H2>PMC counters</H2>
+<P>The Intel® Pentium M microarchitecture provides 2 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/phi.md b/doc/archs/phi.md
new file mode 100644
index 0000000..ac256c8
--- /dev/null
+++ b/doc/archs/phi.md
@@ -0,0 +1,78 @@
+/*! \page phi Intel® Xeon Phi
+
+<P>To use LIKWID you have to turn off power management on the MIC. LIKWID relies on
+RDTSC for wallclock time. On the MIC this is only reliable if power
+management is turned off. This can be configured in
+<CODE>/etc/sysconfig/mic/default.conf</CODE>.<BR>
+
+At the end of this file the power management is configured. The following
+configuration worked:<BR>
+<CODE>PowerManagement "cpufreq_off;corec6_off;pc3_off;pc6_off"</CODE>
+</P>
+
+<H1>Available performance monitors for the Intel® Xeon Phi microarchitecture</H1>
+<UL>
+<LI>\ref PHI_PMC "General-purpose counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor PHI_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Xeon Phi microarchitecture provides 2 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 21 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/sandybridge.md b/doc/archs/sandybridge.md
new file mode 100644
index 0000000..385a724
--- /dev/null
+++ b/doc/archs/sandybridge.md
@@ -0,0 +1,189 @@
+/*! \page sandybridge Intel® SandyBridge
+
+<H1>Available performance monitors for the Intel® SandyBridge microarchitecture</H1>
+<UL>
+<LI>\ref SNB_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref SNB_PMC "General-purpose counters"</LI>
+<LI>\ref SNB_THERMAL "Thermal counters"</LI>
+<LI>\ref SNB_POWER "Power measurement counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor SNB_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>FIXC0</TD>
+ <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+ <TD>FIXC1</TD>
+ <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+ <TD>FIXC2</TD>
+ <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 2+(index*4) in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit (index*4) in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNB_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® SandyBridge microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 21 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® SandyBridge microarchitecture supports measuring offcore events in the PMC counters. To do so, the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® SandyBridge microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although many bitmasks are possible, LIKWID natively provides only the ones with response type ANY. Custom filtering can be applied with the OFFCORE_RESPONSE_0_OPTIONS and OFFCORE_RESPONSE_1_OPTIONS events. Only for those events two more counter options are available:
+</P>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>16 bit hex value</TD>
+ <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/SNB">https://download.01.org/perfmon/SNB</A>.</TD>
+</TR>
+<TR>
+ <TD>match1</TD>
+ <TD>22 bit hex value</TD>
+ <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/SNB">https://download.01.org/perfmon/SNB</A>.</TD>
+</TR>
+</TABLE>
+
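+<P>A sketch of how the two options fill the OFFCORE_RESPONSE register on SandyBridge (illustrative only; the helper name is hypothetical, the bit semantics are described in the documents referenced above, and LIKWID writes the register itself):</P>
+\code{.c}
+#include <stdint.h>
+
+/* Sketch only: match0 (masked with 0x8FFF) fills bits 0-15,
+ * match1 fills bits 16-37 of the OFFCORE_RESPONSE register. */
+static uint64_t snb_offcore_response(uint16_t match0, uint32_t match1)
+{
+    uint64_t reg = (uint64_t)(match0 & 0x8FFF);          /* bits 0-15  */
+    reg |= ((uint64_t)match1 & 0x3FFFFFULL) << 16;       /* bits 16-37 */
+    return reg;
+}
+\endcode
+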
+\anchor SNB_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® SandyBridge microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>TMP0</TD>
+ <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor SNB_POWER
+<H2>Power counter</H2>
+<P>The Intel® SandyBridge microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PWR0</TD>
+ <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR1</TD>
+ <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR2*</TD>
+ <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR3</TD>
+ <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+<P>*) The PWR2 counter is often not implemented on Intel® SandyBridge systems.</P>
+*/
diff --git a/doc/archs/sandybridgeep.md b/doc/archs/sandybridgeep.md
new file mode 100644
index 0000000..ce98c8a
--- /dev/null
+++ b/doc/archs/sandybridgeep.md
@@ -0,0 +1,775 @@
+/*! \page sandybridgeep Intel® SandyBridge EP/EN
+
+<H1>Available performance monitors for the Intel® SandyBridge EP/EN microarchitecture</H1>
+<UL>
+<LI>\ref SNBEP_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref SNBEP_PMC "General-purpose counters"</LI>
+<LI>\ref SNBEP_THERMAL "Thermal counters"</LI>
+<LI>\ref SNBEP_POWER "Power measurement counters"</LI>
+<LI>\ref SNBEP_MBOX "Integrated memory controller counters"</LI>
+<LI>\ref SNBEP_CBOX "Last Level cache counters"</LI>
+<LI>\ref SNBEP_UBOX "Uncore management counters"</LI>
+<LI>\ref SNBEP_SBOX "Intel® QPI Link Layer counters"</LI>
+<LI>\ref SNBEP_BBOX "Home Agent counters"</LI>
+<LI>\ref SNBEP_WBOX "Power control unit counters"</LI>
+<LI>\ref SNBEP_RBOX "Ring-to-QPI interface counters"</LI>
+<LI>\ref SNBEP_PBOX "Ring-to-PCIe interface counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor SNBEP_FIXED
+<H2>Fixed counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>FIXC0</TD>
+ <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+ <TD>FIXC1</TD>
+ <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+ <TD>FIXC2</TD>
+ <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 2+(index*4) in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit (index*4) in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_PMC
+<H2>PMC counters</H2>
+<P>The Intel® SandyBridge microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 21 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® SandyBridge microarchitecture supports measuring offcore events in the PMC counters. To do so, the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® SandyBridge microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although many bitmasks are possible, LIKWID natively provides only the ones with response type ANY. Custom filtering can be applied with the OFFCORE_RESPONSE_0_OPTIONS and OFFCORE_RESPONSE_1_OPTIONS events. Only for those events two more counter options are available:</P>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>16 bit hex value</TD>
+ <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/JKT">https://download.01.org/perfmon/JKT</A>.</TD>
+</TR>
+<TR>
+ <TD>match1</TD>
+ <TD>22 bit hex value</TD>
+ <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/JKT">https://download.01.org/perfmon/JKT</A>.</TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® SandyBridge microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>TMP0</TD>
+ <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor SNBEP_POWER
+<H2>Power counter</H2>
+<P>The Intel® SandyBridge microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PWR0</TD>
+ <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR1</TD>
+ <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR2</TD>
+ <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR3</TD>
+ <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_MBOX
+<H2>Memory controller counters</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the integrated Memory Controllers (iMC) in the uncore. The description from Intel®:<BR>
+<I>The integrated Memory Controller provides the interface to DRAM and communicates to the rest of the uncore through the Home Agent (i.e. the iMC does not connect to the Ring).<BR>
+In conjunction with the HA, the memory controller also provides a variety of RAS features, such as ECC, lockstep, memory access retry, memory scrubbing, thermal throttling, mirroring, and rank sparing.
+</I><BR>
+The memory controller performance counters are exposed to the operating system through PCI interfaces. All SandyBridge-based systems have one memory controller. There are 4 different PCI devices per memory controller, each covering one memory channel. Each channel has 4 different general-purpose counters and one fixed counter for the DRAM clock. The name MBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>MBOX<0-3>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0-3>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0-3>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0-3>C3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0-3>FIX</TD>
+ <TD>DRAM_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for counter MBOX<0-3>C<0-3>)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_CBOX
+<H2>Last Level cache counters</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the LLC coherency engine in the uncore. The description from Intel®:<BR>
+<I>The LLC coherence engine (CBo) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a CBo via the ring interconnect. The CBo is responsible for managing data delivery from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the
+LLC; generating snoops and collecting snoop responses from the local cores when the MESIF protocol requires it.
+</I><BR>
+The Last Level cache performance counters are exposed to the operating system through the MSR interface. SandyBridge EN/EP systems have at most 8 CBOXes, each with 4 general-purpose counters. The name CBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>CBOX<0-7>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-7>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-7>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-7>C3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>opcode</TD>
+ <TD>9 bit opcode identifier, see uncore performance monitoring guide for SandyBridge</TD>
+ <TD>Set bits 23-31 in CBOX filter register MSR_UNC_C<0-7>_PMON_BOX_FILTER</TD>
+ <TD>LIKWID checks whether the given value is a valid opcode. A list of all valid opcodes can be found in the <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A></TD>
+</TR>
+<TR>
+ <TD>state</TD>
+ <TD>5 bit state representation</TD>
+ <TD>Set bits 18-22 in CBOX filter register MSR_UNC_C<0-7>_PMON_BOX_FILTER</TD>
+ <TD>F: 0x10,<BR>M: 0x08,<BR>E: 0x04,<BR>S: 0x02,<BR>I: 0x01</TD>
+</TR>
+<TR>
+ <TD>nid</TD>
+ <TD>8 bit node ID</TD>
+ <TD>Set bits 10-17 in CBOX filter register MSR_UNC_C<0-7>_PMON_BOX_FILTER</TD>
+ <TD>Note that for Node ID 0 the hex value should be 0x01.</TD>
+</TR>
+<TR>
+ <TD>tid</TD>
+ <TD>5 bit thread ID value</TD>
+ <TD>Set bits 0-4 in CBOX filter register MSR_UNC_C<0-7>_PMON_BOX_FILTER</TD>
+ <TD>Bit 0 means physical or logical thread, bits 1-3 the core ID</TD>
+</TR>
+</TABLE>
+
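+<P>The four filter options all end up in the same CBOX filter register; a sketch of the packing listed above (illustrative only; the helper name and values are placeholders and LIKWID writes the register itself):</P>
+\code{.c}
+#include <stdint.h>
+
+/* Sketch only: pack the CBOX filter fields from the table above into
+ * MSR_UNC_C<0-7>_PMON_BOX_FILTER (tid 0-4, nid 10-17, state 18-22,
+ * opcode 23-31). */
+static uint32_t cbox_filter(uint8_t tid, uint8_t nid,
+                            uint8_t state, uint16_t opcode)
+{
+    uint32_t filter = (uint32_t)(tid & 0x1F);            /* bits 0-4   */
+    filter |= (uint32_t)nid << 10;                       /* bits 10-17 */
+    filter |= (uint32_t)(state & 0x1F) << 18;            /* bits 18-22 */
+    filter |= (uint32_t)(opcode & 0x1FF) << 23;          /* bits 23-31 */
+    return filter;
+}
+\endcode
+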
+<H3>Special handling for events</H3>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides an event LLC_LOOKUP which can be filtered with the 'state' option. If no 'state' is set, LIKWID sets the state to 0x1F, the default value to measure all lookups.</P>
+
+\anchor SNBEP_UBOX
+<H2>Uncore management counters</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the management box in the uncore. The description from Intel®:<BR>
+<I>The UBox serves as the system configuration controller for the Intel® Xeon Processor E5-2600 family uncore.<BR>
+In this capacity, the UBox acts as the central unit for a variety of functions:<BR>
+<UL>
+<LI>The master for reading and writing physically distributed registers across the uncore using the Message Channel.</LI>
+<LI>The UBox is the intermediary for interrupt traffic, receiving interrupts from the system and dispatching interrupts to the appropriate core.</LI>
+<LI>The UBox serves as the system lock master used when quiescing the platform (e.g., Intel® QPI bus lock).</LI>
+</UL>
+</I><BR>
+The uncore management performance counters are exposed to the operating system through the MSR interface. The name UBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>UBOX0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UBOX1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UBOXFIX</TD>
+ <TD>UBOX_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for counter UBOX<0,1>)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>5 bit hex value</TD>
+ <TD>Set bits 24-28 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_SBOX
+<H2>Intel® QPI Link Layer counters</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the QPI Link layer (QPI) in the uncore. The description from Intel®:<BR>
+<I>The Intel® QPI Link Layer is responsible for packetizing requests from the caching agent on the way out to the system interface. As such, it shares responsibility with the CBo(s) as the Intel® QPI caching agent(s). It is responsible for converting CBo requests to Intel® QPI messages (i.e. snoop generation and data response messages from the snoop response) as well as converting/forwarding ring
+messages to Intel® QPI packets and vice versa.<BR>
+The Intel® QPI is split into two separate layers. The Intel® QPI LL (link layer) is responsible for generating, transmitting, and receiving packets with the Intel® QPI link.
+</I><BR>
+The QPI hardware performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the QPI. If your system does not have all interfaces and interface 0 does not work, try the other one. The name SBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>SBOX<0,1>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0,1>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0,1>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0,1>C3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0,1>FIX</TD>
+ <TD>QPI_RATE, QPI_SLOW_MODE</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for counter SBOX<0,1>C<0-3>)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>32 bit hex address</TD>
+ <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_QPI_PMON_MATCH_0 register of PCI device</TD>
+ <TD>Only if corresponding device available. See <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A> for fields in PCI_UNC_QPI_PMON_MATCH_0</TD>
+</TR>
+<TR>
+ <TD>match1</TD>
+ <TD>20 bit hex address</TD>
+ <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_QPI_PMON_MATCH_1 register of PCI device</TD>
+ <TD>Only if corresponding device available. See <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A> for fields in PCI_UNC_QPI_PMON_MATCH_1</TD>
+</TR>
+<TR>
+ <TD>mask0</TD>
+ <TD>32 bit hex address</TD>
+ <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_QPI_PMON_MASK_0 register of PCI device</TD>
+ <TD>Only if corresponding device available. See <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A> for fields in PCI_UNC_QPI_PMON_MASK_0</TD>
+</TR>
+<TR>
+ <TD>mask1</TD>
+ <TD>20 bit hex address</TD>
+ <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_QPI_PMON_MASK_1 register of PCI device</TD>
+ <TD>Only if corresponding device available. See <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A> for fields in PCI_UNC_QPI_PMON_MASK_1</TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_BBOX
+<H2>BBOX counter</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the Home Agent (HA) in the uncore. The description from Intel®:<BR>
+<I>The HA is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the HA is responsible for ordering memory reads/writes, coming in from the modular Ring, to a given address such that the iMC (memory controller).<BR>
+In other words, it is the coherency agent responsible for guarding the memory controller. All requests for memory attached to the coupled iMC must first be ordered through the HA.
+</I><BR>
+The HA hardware performance counters are exposed to the operating system through PCI interfaces. The name BBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>BBOX0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>BBOX1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>BBOX2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>BBOX3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>opcode</TD>
+ <TD>6 bit hex value</TD>
+ <TD>Set bits 0-5 in PCI_UNC_HA_PMON_OPCODEMATCH register of PCI device</TD>
+ <TD>A table of all valid opcodes can be found in the <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A>.</TD>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>46 bit hex address</TD>
+ <TD>Extract bits 6-31 and set bits 6-31 in PCI_UNC_HA_PMON_ADDRMATCH0 register of PCI device<BR>Extract bits 32-45 and set bits 0-13 in PCI_UNC_HA_PMON_ADDRMATCH1 register of PCI device</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
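+<P>A sketch of how the 46-bit match0 address is split across the two HA match registers as described above (illustrative only; the helper name is hypothetical and LIKWID performs the PCI writes itself):</P>
+\code{.c}
+#include <stdint.h>
+
+/* Sketch only: bits 6-31 of the address stay in place in
+ * PCI_UNC_HA_PMON_ADDRMATCH0, bits 32-45 move to bits 0-13 of
+ * PCI_UNC_HA_PMON_ADDRMATCH1 (see table above). */
+static void ha_addr_match(uint64_t addr, uint32_t *match0, uint32_t *match1)
+{
+    *match0 = (uint32_t)(addr & 0xFFFFFFC0ULL);          /* bits 6-31          */
+    *match1 = (uint32_t)((addr >> 32) & 0x3FFFULL);      /* bits 32-45 -> 0-13 */
+}
+\endcode
+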
+\anchor SNBEP_WBOX
+<H2>WBOX counter</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the power control unit (PCU) in the uncore. The description from Intel®:<BR>
+<I>The PCU is the primary Power Controller for the physical processor package.<BR>
+The uncore implements a power control unit acting as a core/uncore power and thermal manager. It runs its firmware on an internal micro-controller and coordinates the socket’s power states.
+</I><BR>
+The PCU performance counters are exposed to the operating system through the MSR interface. The name WBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>WBOX0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX0FIX</TD>
+ <TD>CORES_IN_C3</TD>
+</TR>
+<TR>
+ <TD>WBOX1FIX</TD>
+ <TD>CORES_IN_C6</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>5 bit hex value</TD>
+ <TD>Set bits 24-28 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>32 bit hex value</TD>
+ <TD>Set bits 0-31 in MSR_UNC_PCU_PMON_BOX_FILTER register</TD>
+ <TD>Band0: bits 0-7,<BR>Band1: bits 8-15,<BR>Band2: bits 16-23,<BR>Band3: bits 24-31</TD>
+</TR>
+<TR>
+ <TD>occupancy</TD>
+ <TD>2 bit hex value</TD>
+ <TD>Set bit 14-15 in config register</TD>
+ <TD>Cores<BR>in C0: 0x1,<BR>in C3: 0x2,<BR>in C6: 0x3</TD>
+</TR>
+<TR>
+ <TD>occupancy_edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>occupancy_invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 30 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
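+<P>The match0 option packs the four PCU frequency bands into one filter value; a sketch of the layout from the table above (illustrative only, with a hypothetical helper name):</P>
+\code{.c}
+#include <stdint.h>
+
+/* Sketch only: one byte per frequency band in
+ * MSR_UNC_PCU_PMON_BOX_FILTER (Band0 bits 0-7 ... Band3 bits 24-31). */
+static uint32_t pcu_band_filter(uint8_t band0, uint8_t band1,
+                                uint8_t band2, uint8_t band3)
+{
+    return (uint32_t)band0 | ((uint32_t)band1 << 8) |
+           ((uint32_t)band2 << 16) | ((uint32_t)band3 << 24);
+}
+\endcode
+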
+\anchor SNBEP_RBOX
+<H2>RBOX counter</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the Ring-to-QPI (R3QPI) interface in the uncore. The description from Intel®:<BR>
+<I>R3QPI is the interface between the Intel® QPI Link Layer, which packetizes requests, and the Ring.<BR>
+R3QPI is the interface between the ring and the Intel® QPI Link Layer. It is responsible for translating between ring protocol packets and flits that are used for transmitting data across the Intel® QPI interface. It performs credit checking between the local Intel® QPI LL, the remote Intel® QPI LL and other agents on the local ring.
+</I><BR>
+The R3QPI performance counters are exposed to the operating system through PCI interfaces. Since the RBOXes manage the traffic between the LLC-connecting ring interface on the socket and the QPI interfaces (SBOXes), their number matches the number of SBOXes. See the SBOX section for how many are available in which system configuration. The name RBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C2</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_PBOX
+<H2>PBOX counter</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the Ring-to-PCIe (R2PCIe) interface in the uncore. The description from Intel®:<BR>
+<I>R2PCIe represents the interface between the Ring and IIO traffic to/from PCIe.
+</I><BR>
+The R2PCIe performance counters are exposed to the operating system through a PCI interface. Independent of the system's configuration, there is only one Ring-to-PCIe interface. The name PBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PBOX0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PBOX1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PBOX2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PBOX3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Operation</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/silvermont.md b/doc/archs/silvermont.md
new file mode 100644
index 0000000..af22e32
--- /dev/null
+++ b/doc/archs/silvermont.md
@@ -0,0 +1,175 @@
+/*! \page silvermont Intel® Silvermont/Airmont
+
+<H1>Available performance monitors for the Intel® Silvermont microarchitecture</H1>
+<UL>
+<LI>\ref SVM_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref SVM_PMC "General-purpose counters"</LI>
+<LI>\ref SVM_THERMAL "Thermal counters"</LI>
+<LI>\ref SVM_POWER "Power measurement counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor SVM_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>FIXC0</TD>
+ <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+ <TD>FIXC1</TD>
+ <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+ <TD>FIXC2</TD>
+ <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 2+(index*4) in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit (index*4) in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SVM_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Silvermont microarchitecture provides 2 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Silvermont microarchitecture supports measuring offcore events in the PMC counters. To do so, the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Silvermont microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although many bitmasks are possible, LIKWID natively provides only the ones with response type ANY. Custom filtering can be applied with the OFFCORE_RESPONSE_0_OPTIONS and OFFCORE_RESPONSE_1_OPTIONS events. Only for those events two more counter options are available:
+</P>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>16 bit hex value</TD>
+ <TD>Input value masked with 0xFFFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/SLM">https://download.01.org/perfmon/SLM</A>.</TD>
+</TR>
+<TR>
+ <TD>match1</TD>
+ <TD>22 bit hex value</TD>
+ <TD>Input value is written to bits 16-38 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/SLM">https://download.01.org/perfmon/SLM</A>.</TD>
+</TR>
+</TABLE>
+
+\anchor SVM_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® Silvermont microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>TMP0</TD>
+ <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor SVM_POWER
+<H2>Power counters</H2>
+<P>The Intel® Silvermont microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PWR0</TD>
+ <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR1</TD>
+ <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR2*</TD>
+ <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+ <TD>PWR3*</TD>
+ <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+<P>*) The PWR2 and PWR3 counters are commonly not implemented on Intel® Silvermont systems.</P>
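+<P>As a usage sketch (assuming likwid-perfctr's EVENT:COUNTER syntax and the S0:0 thread-domain notation), the package energy could be read on the first hardware thread of socket 0 like this:</P>
+<PRE>likwid-perfctr -C S0:0 -g PWR_PKG_ENERGY:PWR0 ./a.out</PRE>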
+*/
diff --git a/doc/archs/westmere.md b/doc/archs/westmere.md
new file mode 100644
index 0000000..3371c20
--- /dev/null
+++ b/doc/archs/westmere.md
@@ -0,0 +1,239 @@
+/*! \page westmere Intel® Westmere
+
+<P>The Intel® Westmere microarchitecture has the same features as the Intel® Nehalem architecture. There are some additional features like a second OFFCORE_RESPONSE register and an addr/opcode matching unit for general-purpose counters in the Uncore.</P>
+
+<H1>Available performance monitors for the Intel® Westmere microarchitecture</H1>
+<UL>
+<LI>\ref WES_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref WES_PMC "General-purpose counters"</LI>
+<LI>\ref WES_UNCORE "Uncore counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor WES_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>FIXC0</TD>
+ <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+ <TD>FIXC1</TD>
+ <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+ <TD>FIXC2</TD>
+ <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 2+(index*4) in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit (index*4) in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor WES_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Westmere microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 21 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Westmere microarchitecture allows measuring offcore events in the PMC counters. To do so, the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers; the Intel® Westmere microarchitecture has two of these registers. Custom filtering can be applied with the OFFCORE_RESPONSE_0_OPTIONS and OFFCORE_RESPONSE_1_OPTIONS events. Only for those events, two more counter options are available:</P>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Input value masked with 0xFF and written to bits 0-7 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/WSM-EP-SP">https://download.01.org/perfmon/WSM-EP-SP</A>.</TD>
+</TR>
+<TR>
+ <TD>match1</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Input value masked with 0xF7 and written to bits 8-15 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/WSM-EP-SP">https://download.01.org/perfmon/WSM-EP-SP</A>.</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor WES_UNCORE
+<H2>Uncore counters</H2>
+<P>The Intel® Westmere microarchitecture provides 8 general-purpose counters for the uncore, each consisting of a config and a counter register. Moreover, there is a fixed-purpose counter to measure the clock of the uncore.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>UPMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC4</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC5</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC6</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMC7</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>UPMCFIX</TD>
+ <TD>UNCORE_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+<H3>Available Options (Only for UPMC<0-7> counters)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 21 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>opcode</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 40-47 in MSR_UNCORE_ADDR_OPCODE_MATCH register</TD>
+ <TD>A list of valid opcodes can be found in the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A>.</TD>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>40 bit physical memory address</TD>
+ <TD>Extract bits 3-39 from address and write them to bits 3-39 in MSR_UNCORE_ADDR_OPCODE_MATCH register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/westmereex.md b/doc/archs/westmereex.md
new file mode 100644
index 0000000..ce37674
--- /dev/null
+++ b/doc/archs/westmereex.md
@@ -0,0 +1,555 @@
+/*! \page westmereex Intel® Westmere EX
+
+<P>The Intel® Westmere EX microarchitecture has the same features as the Intel® Westmere architecture. There are some additional features like a second OFFCORE_RESPONSE register and an addr/opcode matching unit for general-purpose counters in the uncore.</P>
+
+<H1>Available performance monitors for the Intel® Westmere EX microarchitecture</H1>
+<UL>
+<LI>\ref WESEX_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref WESEX_PMC "General-purpose counters"</LI>
+<LI>\ref WESEX_MBOX "Memory controller counters"</LI>
+<LI>\ref WESEX_BBOX "Home Agent counters"</LI>
+<LI>\ref WESEX_RBOX "Crossbar router counters"</LI>
+<LI>\ref WESEX_CBOX "Last Level cache counters"</LI>
+<LI>\ref WESEX_SBOX "LLC-to-QPI interface counters"</LI>
+<LI>\ref WESEX_WBOX "Power control unit counters"</LI>
+<LI>\ref WESEX_UBOX "Uncore management counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor WESEX_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>FIXC0</TD>
+ <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+ <TD>FIXC1</TD>
+ <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+ <TD>FIXC2</TD>
+ <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>anythread</TD>
+ <TD>N</TD>
+ <TD>Set bit 2+(index*4) in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit (index*4) in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides 4 general-purpose counters consisting of a config and a counter register. They are core-local, hence each hardware thread has its own set of general-purpose counters.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>PMC0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>PMC3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>kernel</TD>
+ <TD>N</TD>
+ <TD>Set bit 17 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Westmere EX microarchitecture allows measuring offcore events in the PMC counters. To do so, the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers; the Intel® Westmere EX microarchitecture has two of these registers. LIKWID defines some events that perform the filtering according to the event name. Although many bitmasks are possible, LIKWID natively provides only the ones with response type ANY. Custom filtering can be applied with the OFFCORE_RESPONSE_0_OPTIONS and OFFCORE_RESPONSE_1_OPTIONS events. Only for those events, two more counter options are available:</P>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Input value masked with 0xFF and written to bits 0-7 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and the event files at <A HREF="https://download.01.org/perfmon/WSM-EX">https://download.01.org/perfmon/WSM-EX</A>.</TD>
+</TR>
+<TR>
+ <TD>match1</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Input value masked with 0xF7 and written to bits 8-15 in the OFFCORE_RESPONSE register</TD>
+ <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and the event files at <A HREF="https://download.01.org/perfmon/WSM-EX">https://download.01.org/perfmon/WSM-EX</A>.</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor WESEX_MBOX
+<H2>MBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the memory controllers in the uncore. The description from Intel®:<BR>
+<I>The memory controller interfaces to the Intel® 7500 Scalable Memory Buffers and translates read and write commands into specific Intel® Scalable Memory Interconnect (Intel® SMI) operations. Intel SMI is based on the FB-DIMM architecture, but the Intel 7500 Scalable Memory Buffer is not an AMB2 device and has significant exceptions to the FB-DIMM2 architecture. The memory controller also provides a variety of RAS features, such as ECC, memory scrubbing, thermal throttling, [...]
+</I><BR>
+The Intel® Westmere EX microarchitecture has 2 memory controllers, each with 6 general-purpose counters. They are exposed through the MSR interface to the operating system kernel. The MBOX and RBOX setup routines are taken from LIKWID 3; they are not as flexible as the newer setup routines, but programming of the MBOXes and RBOXes on Westmere EX is tedious. It is not possible to specify a FVID (Fill Victim Index) for the MBOX or the IPERF option for the RBOXes.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>MBOX<0,1>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0,1>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0,1>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0,1>C3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0,1>C4</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>MBOX<0,1>C5</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Special handling for events</H3>
+<P>For the events DRAM_CMD_ALL and DRAM_CMD_ILLEGAL two counter options are available:</P>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>34 bit address</TD>
+ <TD>Set bits 0-33 in MSR_M<0,1>_PMON_ADDR_MATCH register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>mask0</TD>
+ <TD>60 bit hex value</TD>
+ <TD>Extract bits 6-33 from address and set bits 0-27 in MSR_M<0,1>_PMON_ADDR_MASK register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+<P>For the events THERM_TRP_DN and THERM_TRP_UP you cannot simultaneously measure events for all DIMMs and for one specific DIMM, because they program the same filter register MSR_M<0,1>_PMON_MSC_THR with contrary configurations.</P>
+<P>Although the events FVC_EV<0-3> are available to measure multiple memory events, some of them overlap and cannot be measured simultaneously, because they program the same filter register MSR_M<0,1>_PMON_ZDP with contrary configurations. One example is the pair FVC_EV<0-3>_BBOX_CMDS_READS and FVC_EV<0-3>_BBOX_CMDS_WRITES, which measure memory reads and writes but cannot be measured at the same time.</P>
+
+
+
+\anchor WESEX_BBOX
+<H2>BBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the Home Agent in the uncore. The description from Intel®:<BR>
+<I>The B-Box is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the B-Box is responsible for ordering memory reads/writes to a given address such that the M-Box does not have to perform this conflict checking. All requests for memory attached to the coupled M-Box must first be ordered through the B-Box.
+</I><BR>
+The memory traffic in an Intel® Westmere EX system is controlled by the Home Agents. Each MBOX has a corresponding BBOX. Each BBOX offers 4 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>BBOX<0,1>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>BBOX<0,1>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>BBOX<0,1>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>BBOX<0,1>C3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Special handling for events</H3>
+<P>For the matching events MSG_IN_MATCH, MSG_ADDR_IN_MATCH, MSG_OPCODE_ADDR_IN_MATCH, MSG_OPCODE_IN_MATCH, MSG_OPCODE_OUT_MATCH, MSG_OUT_MATCH, OPCODE_ADDR_IN_MATCH, OPCODE_IN_MATCH, OPCODE_OUT_MATCH and ADDR_IN_MATCH two counter options are available:</P>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>60 bit hex value</TD>
+ <TD>Set bits 0-59 in MSR_B<0,1>_PMON_MATCH register</TD>
+ <TD>For register layout and valid settings see <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e7-family-uncore-performance-programming-guide.html">Intel® Xeon® Processor E7 Family uncore Performance Monitoring Guide</A></TD>
+</TR>
+<TR>
+ <TD>mask0</TD>
+ <TD>60 bit hex value</TD>
+ <TD>Set bits 0-59 in MSR_B<0,1>_PMON_MASK register</TD>
+ <TD>For register layout and valid settings see <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e7-family-uncore-performance-programming-guide.html">Intel® Xeon® Processor E7 Family uncore Performance Monitoring Guide</A></TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_RBOX
+<H2>RBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the crossbar router in the uncore. The description from Intel®:<BR>
+<I>The Crossbar Router (R-Box) is a 8 port switch/router implementing the Intel® QuickPath Interconnect Link and Routing layers. The R-Box is responsible for routing and transmitting all intra- and inter-processor communication.
+</I><BR>
+The Intel® Westmere EX microarchitecture has two interfaces to the RBOX although each socket contains only one crossbar router: RBOX0 covers the left part and RBOX1 the right part of the single RBOX. Each RBOX side offers 8 general-purpose counters. They are exposed through the MSR interface to the operating system kernel. The MBOX and RBOX setup routines are taken from LIKWID 3; they are not as flexible as the newer setup routines, but programming of the MBOXes and RBOXes on Westmere EX is tedious.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C4</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C5</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C6</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>RBOX<0,1>C7</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_CBOX
+<H2>CBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the LLC coherency engine in the uncore. The description from Intel®:<BR>
+<I>For the Intel Xeon Processor 7500 Series, the LLC coherence engine (C-Box) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a C-Box via the ring interconnect. The C-Box is responsible for managing data delivery from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the LLC; generating snoops and collecting snoop res [...]
+The C-Box is also the gate keeper for all Intel® QuickPath Interconnect (Intel® QPI) messages that originate in the core and is responsible for ensuring that all Intel QuickPath Interconnect messages that pass through the socket’s LLC remain coherent.
+</I><BR>
+The Intel® Westmere EX microarchitecture has 10 CBOX instances. Each CBOX offers 6 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>CBOX<0-9>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-9>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-9>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-9>C3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-9>C4</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>CBOX<0-9>C5</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>5 bit hex value</TD>
+ <TD>Set bits 24-28 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_SBOX
+<H2>SBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the LLC-to-QPI interface in the uncore. The description from Intel®:<BR>
+<I>The S-Box represents the interface between the last level cache and the system interface. It manages flow control between the C and R & B-Boxes. The S-Box is broken into system bound (ring to Intel® QPI) and ring bound (Intel® QPI to ring) connections.<BR>
+As such, it shares responsibility with the C-Box(es) as the Intel® QPI caching agent(s). It is responsible for converting C-box requests to Intel® QPI messages (i.e. snoop generation and data response messages from the snoop response) as well as converting/forwarding ring messages to Intel® QPI packets and vice versa.
+</I><BR>
+The Intel® Westmere EX microarchitecture has 2 SBOX instances. Each SBOX offers 4 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>SBOX<0,1>C0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0,1>C1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0,1>C2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>SBOX<0,1>C3</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>Only for the TO_R_PROG_EV events two counter options are available:</P>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>match0</TD>
+ <TD>64 bit hex value</TD>
+ <TD>Set bit 0-63 in MSR_S<0,1>_PMON_MATCH register</TD>
+ <TD>For register layout and valid settings see <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e7-family-uncore-performance-programming-guide.html">Intel® Xeon® Processor E7 Family uncore Performance Monitoring Guide</A></TD>
+</TR>
+<TR>
+ <TD>mask0</TD>
+ <TD>39 bit hex value</TD>
+ <TD>Set bit 0-38 in MSR_S<0,1>_PMON_MASK register</TD>
+ <TD>For register layout and valid settings see <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e7-family-uncore-performance-programming-guide.html">Intel® Xeon® Processor E7 Family uncore Performance Monitoring Guide</A></TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_WBOX
+<H2>WBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the power controller in the uncore. The description from Intel®:<BR>
+<I>The W-Box is the primary Power Controller for the Intel® Xeon® Processor 7500 Series.
+</I><BR>
+The Intel® Westmere EX microarchitecture has one WBOX and it offers 4 general-purpose counters and one fixed counter. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>WBOX0</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX1</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX2</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOX3</TD>
+ <TD>*</TD>
+</TR>
+<TR>
+ <TD>WBOXFIX</TD>
+ <TD>UNCORE_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>threshold</TD>
+ <TD>8 bit hex value</TD>
+ <TD>Set bits 24-31 in config register</TD>
+ <TD></TD>
+</TR>
+<TR>
+ <TD>invert</TD>
+ <TD>N</TD>
+ <TD>Set bit 23 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_UBOX
+<H2>UBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the system configuration controller in the uncore. The description from Intel®:<BR>
+<I>The U-Box serves as the system configuration controller for the Intel® Xeon® Processor E7 Family.
+</I><BR>
+The Intel® Westmere EX microarchitecture has one UBOX and it offers a single general-purpose counter. It is exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+ <TH>Counter name</TH>
+ <TH>Event name</TH>
+</TR>
+<TR>
+ <TD>UBOX0</TD>
+ <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Argument</TH>
+ <TH>Description</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>edgedetect</TD>
+ <TD>N</TD>
+ <TD>Set bit 18 in config register</TD>
+ <TD></TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/bstrlib.txt b/doc/bstrlib.txt
new file mode 100644
index 0000000..d0f02f7
--- /dev/null
+++ b/doc/bstrlib.txt
@@ -0,0 +1,3201 @@
+Better String library
+---------------------
+
+by Paul Hsieh
+
+The bstring library is an attempt to provide improved string processing
+functionality to the C and C++ language. At the heart of the bstring library
+(Bstrlib for short) is the management of "bstring"s which are a significant
+improvement over '\0' terminated char buffers.
+
+===============================================================================
+
+Motivation
+----------
+
+The standard C string library has serious problems:
+
+ 1) Its use of '\0' to denote the end of the string means knowing a
+ string's length is O(n) when it could be O(1).
+ 2) It imposes an interpretation for the character value '\0'.
+ 3) gets() always exposes the application to a buffer overflow.
+ 4) strtok() modifies the string it is parsing and thus may not be usable in
+ programs which are re-entrant or multithreaded.
+ 5) fgets has the unusual semantic of ignoring '\0's that occur before
+ '\n's are consumed.
+ 6) There is no memory management, and actions performed such as strcpy,
+ strcat and sprintf are common places for buffer overflows.
+ 7) strncpy() doesn't '\0' terminate the destination in some cases.
+ 8) Passing NULL to C library string functions causes an undefined NULL
+ pointer access.
+ 9) Parameter aliasing (overlapping, or self-referencing parameters)
+ within most C library functions has undefined behavior.
+ 10) Many C library string function calls take integer parameters with
+ restricted legal ranges. Parameters passed outside these ranges are
+ not typically detected and cause undefined behavior.
+
+So the desire is to create an alternative string library that does not suffer
+from the above problems and adds in the following functionality:
+
+ 1) Incorporate string functionality seen from other languages.
+ a) MID$() - from BASIC
+ b) split()/join() - from Python
+ c) string/char x n - from Perl
+ 2) Implement analogs to functions that combine stream IO and char buffers
+ without creating a dependency on stream IO functionality.
+ 3) Implement the basic text editor-style functions insert, delete, find,
+ and replace.
+ 4) Implement reference based sub-string access (as a generalization of
+ pointer arithmetic.)
+ 5) Implement runtime write protection for strings.
+
+There is also a desire to avoid "API-bloat". So functionality that can be
+implemented trivially in other functionality is omitted. So there is no
+left$() or right$() or reverse() or anything like that as part of the core
+functionality.
+
+Explaining Bstrings
+-------------------
+
+A bstring is basically a header which wraps a pointer to a char buffer. Let's
+start with the declaration of a struct tagbstring:
+
+ struct tagbstring {
+ int mlen;
+ int slen;
+ unsigned char * data;
+ };
+
+This definition is considered exposed, not opaque (though it is neither
+necessary nor recommended that low level maintenance of bstrings be performed
+whenever the abstract interfaces are sufficient). The mlen field (usually)
+describes a lower bound for the memory allocated for the data field. The
+slen field describes the exact length for the bstring. The data field is a
+single contiguous buffer of unsigned chars. Note that the existence of a '\0'
+character in the unsigned char buffer pointed to by the data field does not
+necessarily denote the end of the bstring.
+
+To be a well formed modifiable bstring the mlen field must be at least the
+length of the slen field, and slen must be non-negative. Furthermore, the
+data field must point to a valid buffer in which access to the first mlen
+characters has been acquired. So the minimal check for correctness is:
+
+ (slen >= 0 && mlen >= slen && data != NULL)
+
+bstrings returned by bstring functions can be assumed to be either NULL or
+satisfy the above property. (When bstrings are only readable, the mlen >=
+slen restriction is not required; this is discussed later in this section.)
+A bstring itself is just a pointer to a struct tagbstring:
+
+ typedef struct tagbstring * bstring;
+
+Note that use of the prefix "tag" in struct tagbstring is required to work
+around the inconsistency between C and C++'s struct namespace usage. This
+definition is also considered exposed.
+
+Bstrlib basically manages bstrings allocated as a header and an associated
+data-buffer. Since the implementation is exposed, they can also be
+constructed manually. Functions which mutate bstrings assume that the header
+and data buffer have been malloced; the bstring library may perform free() or
+realloc() on both the header and data buffer of any bstring parameter.
+Functions which return bstring's create new bstrings. The string memory is
+freed by a bdestroy() call (or using the bstrFree macro).
+
+The following related typedef is also provided:
+
+ typedef const struct tagbstring * const_bstring;
+
+which is also considered exposed. These are directly bstring compatible (no
+casting required) but are just used for parameters which are meant to be
+non-mutable. So in general, bstring parameters which are read as input but
+not meant to be modified will be declared as const_bstring, and bstring
+parameters which may be modified will be declared as bstring. This convention
+is recommended for user written functions as well.
+
+Since bstrings maintain interoperability with C library char-buffer style
+strings, all functions which modify, update or create bstrings also append a
+'\0' character into the position slen + 1. This trailing '\0' character is
+not required for bstrings input to the bstring functions; this is provided
+solely as a convenience for interoperability with standard C char-buffer
+functionality.
+
+Analogs for the ANSI C string library functions have been created when they
+are necessary, but have also been left out when they are not. In particular
+there are no functions analogous to fwrite, or puts just for the purposes of
+bstring. The ->data member of any string is exposed, and therefore can be
+used just as easily as char buffers for C functions which read strings.
+
+For those that wish to hand construct bstrings, the following should be kept
+in mind:
+
+ 1) While bstrlib can accept constructed bstrings without terminating
+ '\0' characters, the rest of the C language string library will not
+ function properly on such non-terminated strings. This is obvious
+ but must be kept in mind.
+ 2) If it is intended that a constructed bstring be written to by the
+ bstring library functions then the data portion should be allocated
+ by the malloc function and the slen and mlen fields should be entered
+ properly. The struct tagbstring header is not reallocated, and only
+ freed by bdestroy.
+ 3) Writing arbitrary '\0' characters at various places in the string
+ will not modify its length as perceived by the bstring library
+ functions. In fact, '\0' is a legitimate non-terminating character
+ for a bstring to contain.
+ 4) For read only parameters, bstring functions do not check the mlen.
+ I.e., the minimal correctness requirements are reduced to:
+
+ (slen >= 0 && data != NULL)
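+
+As a hand-construction sketch consistent with the rules above (the buffer
+name and sizes are arbitrary):
+
+     unsigned char buf[16] = "Hello";
+     struct tagbstring t;
+     t.mlen = 16;   /* capacity actually reserved in buf */
+     t.slen = 5;    /* length of the string "Hello" */
+     t.data = buf;
+     /* &t may now be passed as a read-only bstring parameter; since buf was
+        not obtained from malloc, it must not be passed as a writable
+        parameter (see point 2 above). */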
+
+Better pointer arithmetic
+-------------------------
+
+One built-in feature of '\0' terminated char * strings is that it's very easy
+and fast to obtain a reference to the tail of any string using pointer
+arithmetic. Bstrlib does one better by providing a way to get a reference to
+any substring of a bstring (or any other length delimited block of memory.)
+So rather than just having pointer arithmetic, with bstrlib one essentially
+has segment arithmetic. This is achieved using the macro blk2tbstr() which
+builds a reference to a block of memory and the macro bmid2tbstr() which
+builds a reference to a segment of a bstring. Bstrlib also includes
+functions for direct consumption of memory blocks into bstrings, namely
+bcatblk () and blk2bstr ().
+
+One scenario where this can be extremely useful is when a string contains many
+substrings which one would like to pass as read-only reference parameters to
+some string consuming function without the need to allocate entire new
+containers for the string data. More concretely, imagine parsing a command
+line string whose parameters are space delimited. This can only be done for
+tails of the string with '\0' terminated char * strings.
+
+Improved NULL semantics and error handling
+------------------------------------------
+
+Unless otherwise noted, if a NULL pointer is passed as a bstring or any other
+detectably illegal parameter, the called function will return with an error
+indicator (either NULL or BSTR_ERR) rather than simply performing a NULL
+pointer access, or having undefined behavior.
+
+To illustrate the value of this, consider the following example:
+
+ strcpy (p = malloc (13 * sizeof (char)), "Hello,");
+ strcat (p, " World");
+
+This is not correct because malloc may return NULL (due to an out of memory
+condition), and the behaviour of strcpy is undefined if either of its
+parameters are NULL. However:
+
+ bstrcat (p = bfromcstr ("Hello,"), q = bfromcstr (" World"));
+ bdestroy (q);
+
+is well defined, because if either p or q are assigned NULL (indicating a
+failure to allocate memory) both bstrcat and bdestroy will recognize it and
+perform no detrimental action.
+
+Note that it is not necessary to check any of the members of a returned
+bstring for internal correctness (in particular the data member does not need
+to be checked against NULL when the header is non-NULL), since this is
+assured by the bstring library itself.
+
+bStreams
+--------
+
+In addition to the bgets and bread functions, bstrlib can abstract streams
+with a high performance read only stream called a bStream. In general, the
+idea is to open a core stream (with something like fopen) then pass its
+handle as well as a bNread function pointer (like fread) to the bsopen
+function which will return a handle to an open bStream. Then the functions
+bsread, bsreadln or bsreadlns can be called to read portions of the stream.
+Finally, the bsclose function is called to close the bStream -- it will
+return a handle to the original (core) stream. So bStreams, essentially,
+wrap other streams.
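+
+A minimal sketch of that sequence (with <stdio.h> and bstrlib.h included),
+assuming fread as the bNread function and "input.txt" as a placeholder file
+name:
+
+     FILE * fp = fopen ("input.txt", "rb");
+     if (fp != NULL) {
+         struct bStream * s = bsopen ((bNread) fread, fp);
+         bstring line = bfromcstr ("");
+         while (BSTR_ERR != bsreadln (line, s, '\n')) {
+             /* one line (terminator included) is in line->data / line->slen */
+         }
+         bdestroy (line);
+         fclose ((FILE *) bsclose (s));  /* bsclose hands back the core stream */
+     }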
+
+The bStreams have two main advantages over the bgets and bread (as well as
+fgets/ungetc) paradigms:
+
+1) Improved functionality via the bunread function which allows a stream to
+ unread characters, giving the bStream stack-like functionality if so
+ desired.
+2) A very high performance bsreadln function. The C library function fgets()
+ (and the bgets function) can typically be written as a loop on top of
+ fgetc(), thus paying all of the overhead costs of calling fgetc on a per
+ character basis. bsreadln will read blocks at a time, thus amortizing the
+ overhead of fread calls over many characters at once.
+
+However, clearly bStreams are suboptimal or unusable for certain kinds of
+streams (stdin) or certain usage patterns (a few spotty, or non-sequential
+reads from a slow stream.) For those situations, using bgets will be more
+appropriate.
+
+The semantics of bStreams allows practical construction of layerable data
+streams. What this means is that by writing a bNread compatible function on
+top of a bStream, one can construct a new bStream on top of it. This can be
+useful for writing multi-pass parsers that don't actually read the entire
+input more than once and don't require the use of intermediate storage.
+
+Aliasing
+--------
+
+Aliasing occurs when a function is given two parameters which point to data
+structures which overlap in the memory they occupy. While this does not
+disturb read only functions, for many libraries this can make functions that
+write to these memory locations malfunction. This is a common problem of the
+C standard library and especially the string functions in the C standard
+library.
+
+The C standard string library is entirely char by char oriented (as is
+bstring) which makes conforming implementations alias safe for some
+scenarios. However no actual detection of aliasing is typically performed,
+so it is easy to find cases where the aliasing will cause anomalous or
+undesirable behaviour (consider: strcat (p, p).) The C99 standard includes
+the "restrict" pointer modifier which allows the compiler to document and
+assume a no-alias condition on usage. However, only the most trivial cases
+can be caught (if at all) by the compiler at compile time, and thus there is
+no actual enforcement of non-aliasing.
+
+Bstrlib, by contrast, permits aliasing and is completely aliasing safe, in
+the C99 sense of aliasing. That is to say, under the assumption that
+pointers of incompatible types from distinct objects can never alias, bstrlib
+is completely aliasing safe. (In practice this means that the data buffer
+portion of any bstring and header of any bstring are assumed to never alias.)
+With the exception of the reference building macros, the library behaves as
+if all read-only parameters are first copied and replaced by temporary
+non-aliased parameters before any writing to any output bstring is performed
+(though actual copying is extremely rarely ever done.)
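+
+As a small illustration (using bfromcstr, bconcat and bdestroy from the
+function reference further below):
+
+     bstring b = bfromcstr ("abc");
+     if (b) {
+         bconcat (b, b);     /* self-aliasing is safe: b now holds "abcabc" */
+         bdestroy (b);
+     }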
+
+Besides being a useful safety feature, bstring searching/comparison
+functions can improve to O(1) execution when aliasing is detected.
+
+Note that aliasing detection and handling code in Bstrlib is generally
+extremely cheap. There is almost never any appreciable performance penalty
+for using aliased parameters.
+
+Reentrancy
+----------
+
+Nearly every function in Bstrlib is a leaf function, and is completely
+reentrant, with the exception of writing to common bstrings. The split
+functions, which use a callback mechanism, require only that the source string
+not be destroyed by the callback function unless the callback function returns
+with an error status (note that Bstrlib functions which return an error do
+not modify the string in any way.) The string can in fact be modified by the
+callback and the behaviour is deterministic. See the documentation of the
+various split functions for more details.
+
+Undefined scenarios
+-------------------
+
+One of the basic important premises for Bstrlib is not to increase the
+propagation of undefined situations from parameters that are otherwise legal
+in and of themselves. In particular, except for extremely marginal cases, usages
+of bstrings that use the bstring library functions alone cannot lead to any
+undefined action. But due to C/C++ language and library limitations, there
+is no way to define a non-trivial library that is completely without
+undefined operations. All such possible undefined operations are described
+below:
+
+1) bstrings or struct tagbstrings that are not explicitly initialized cannot
+ be passed as a parameter to any bstring function.
+2) The members of the NULL bstring cannot be accessed directly. (Though all
+ APIs and macros detect the NULL bstring.)
+3) A bstring whose data member has not been obtained from a malloc or
+ compatible call and which is write accessible passed as a writable
+ parameter will lead to undefined results. (i.e., do not writeAllow any
+ constructed bstrings unless the data portion has been obtained from the
+ heap.)
+4) If the headers of two strings alias but are not identical (which can only
+ happen via a defective manual construction), then passing them to a
+ bstring function in which one is writable is not defined.
+5) If the mlen member is larger than the actual accessible length of the data
+ member for a writable bstring, or if the slen member is larger than the
+ readable length of the data member for a readable bstring, then the
+ corresponding bstring operations are undefined.
+6) Any bstring definition whose header or accessible data portion has been
+ assigned to inaccessible or otherwise illegal memory clearly cannot be
+ acted upon by the bstring library in any way.
+7) Destroying the source of an incremental split from within the callback
+ and not returning with a negative value (indicating that it should abort)
+ will lead to undefined behaviour. (Though *modifying* or adjusting the
+ state of the source data, even if those modifications fail within the
+ bstrlib API, has well defined behavior.)
+8) Modifying a bstring which is write protected by direct access has
+ undefined behavior.
+
+While this may seem like a long list, with the exception of invalid uses of
+the writeAllow macro, and source destruction during an iterative split
+without an accompanying abort, no usage of the bstring API alone can cause
+any undefined scenario to occur. I.e., the policy of restricting usage of
+bstrings to the bstring API can significantly reduce the risk of runtime
+errors (in practice it should eliminate them) related to string manipulation
+due to undefined action.
+
+C++ wrapper
+-----------
+
+A C++ wrapper has been created to enable bstring functionality for C++ in the
+most natural (for C++ programmers) way possible. The mandate for the C++
+wrapper is different from the base C bstring library. Since the C++ language
+has far more abstracting capabilities, the CBString structure is considered
+fully abstracted -- i.e., hand generated CBStrings are not supported (though
+conversion from a struct tagbstring is allowed) and all detectable errors are
+manifest as thrown exceptions.
+
+- The C++ class definitions are all under the namespace Bstrlib. bstrwrap.h
+ enables this namespace (with a using namespace Bstrlib; directive at the
+ end) unless the macro BSTRLIB_DONT_ASSUME_NAMESPACE has been defined before
+ it is included.
+
+- Erroneous accesses result in an exception being thrown. The exception
+ parameter is of type "struct CBStringException" which is derived from
+ std::exception if STL is used. A verbose description of the error message
+ can be obtained from the what() method.
+
+- CBString is a C++ structure derived from a struct tagbstring. An address
+ of a CBString cast to a bstring must not be passed to bdestroy. The bstring
+ C API has been made C++ safe and can be used directly in a C++ project.
+
+- It includes constructors which can take a char, '\0' terminated char
+ buffer, tagbstring, (char, repeat-value), a length delimited buffer or a
+ CBStringList to initialize it.
+
+- Concatenation is performed with the + and += operators. Comparisons are
+ done with the ==, !=, <, >, <= and >= operators. Note that == and != use
+ the biseq call, while <, >, <= and >= use bstrcmp.
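+
+  For instance, a short sketch assuming only the operators listed above:
+
+      CBString a ("Hello");
+      CBString b ("World");
+      CBString c = a + ", " + b;      // c holds "Hello, World"
+      c += "!";                       // c holds "Hello, World!"
+      if (c == "Hello, World!") {     // == follows biseq semantics
+          /* ... */
+      }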
+
+- CBString's can be directly cast to const character buffers.
+
+- CBString's can be directly cast to double, float, int or unsigned int so
+ long as the CBString is a decimal representation of that type (otherwise
+ an exception will be thrown). Converting the other way should be done with
+ the format(a) method(s).
+
+- CBString contains the length, character and [] accessor methods. The
+ character and [] accessors are aliases of each other. If the bounds for
+ the string are exceeded, an exception is thrown. To avoid the overhead for
+ this check, first cast the CBString to a (const char *) and use [] to
+ dereference the array as normal. Note that the character and [] accessor
+ methods allow both reading and writing of individual characters.
+
+- The methods: format, formata, find, reversefind, findcaseless,
+ reversefindcaseless, midstr, insert, insertchrs, replace, findreplace,
+ findreplacecaseless, remove, findchr, nfindchr, alloc, toupper, tolower,
+ gets, read are analogous to the functions that can be found in the C API.
+
+- The caselessEqual and caselessCmp methods are analogous to biseqcaseless
+ and bstricmp functions respectively.
+
+- Note that just like the bformat function, the format and formata methods do
+ not automatically cast CBStrings into char * strings for "%s"-type
+ substitutions:
+
+ CBString w("world");
+ CBString h("Hello");
+ CBString hw;
+
+ /* The casts are necessary */
+ hw.format ("%s, %s", (const char *)h, (const char *)w);
+
+- The methods trunc and repeat have been added instead of using pattern.
+
+- ltrim, rtrim and trim methods have been added. These remove characters
+ from a given character string set (defaulting to the whitespace characters)
+ from either the left, right or both ends of the CBString, respectively.
+
+- The method setsubstr is also analogous in functionality to bsetstr, except
+ that it cannot be passed NULL. Instead the method fill and the fill-style
+ constructor have been supplied to enable this functionality.
+
+- The writeprotect(), writeallow() and iswriteprotected() methods are
+ analogous to the bwriteprotect(), bwriteallow() and biswriteprotected()
+ macros in the C API. Write protection semantics in CBString are stronger
+ than with the C API in that indexed character assignment is checked for
+ write protection. However, unlike with the C API, a write protected
+ CBString can be destroyed by the destructor.
+
+- CBStream is a C++ structure which wraps a struct bStream (it's not derived
+ from it, since destruction is slightly different). It is constructed by
+ passing in a bNread function pointer and a stream parameter cast to void *.
+ This structure includes methods for detecting eof, setting the buffer
+ length, reading the whole stream or reading entries line by line or block
+ by block, an unread function, and a peek function.
+
+- If STL is available, the CBStringList structure is derived from a vector of
+ CBString with various split methods. The split method has been overloaded
+ to accept either a character or CBString as the second parameter (when the
+ split parameter is a CBString any character in that CBString is used as a
+ separator). The splitstr method takes a CBString as a substring separator.
+ Joins can be performed via a CBString constructor which takes a
+ CBStringList as a parameter, or just using the CBString::join() method.
+
+- If there is proper support for std::iostreams, then the >> and << operators
+ and the getline() function have been added (with semantics the same as
+ those for std::string).
+
+Multithreading
+--------------
+
+A mutable bstring is kind of analogous to a small (two entry) linked list
+allocated by malloc, with all aliasing completely under programmer control.
+I.e., manipulation of one bstring will never affect any other distinct
+bstring unless explicitly constructed to do so by the programmer via hand
+construction or via building a reference. Bstrlib also does not use any
+static or global storage, so there are no hidden unremovable race conditions.
+Bstrings are also clearly not inherently thread local. So just like
+char *'s, bstrings can be passed around from thread to thread and shared and
+so on, so long as modifications to a bstring correspond to some kind of
+exclusive access lock as should be expected (or if the bstring is read-only,
+which can be enforced by bstring write protection) for any sort of shared
+object in a multithreaded environment.
+
+Bsafe module
+------------
+
+For convenience, a bsafe module has been included. The idea is that if this
+module is included, inadvertent usage of the most dangerous C functions will
+be overridden and lead to an immediate run time abort. Of course, it should
+be emphasized that usage of this module is completely optional. The
+intention is essentially to provide an option for creating project safety
+rules which can be enforced mechanically rather than socially. This is
+useful for larger, or open development projects where it's more difficult to
+enforce social rules or "coding conventions".
+
+Problems not solved
+-------------------
+
+Bstrlib is written for the C and C++ languages, which have inherent weaknesses
+that cannot be easily solved:
+
+1. Memory leaks: Forgetting to call bdestroy on a bstring that is about to be
+ unreferenced, just as forgetting to call free on a heap buffer that is
+ about to be unreferenced. (Bstrlib itself, however, is leak free.)
+2. Read before write usage: In C, declaring an auto bstring does not
+ automatically fill it with legal/valid contents. This problem has been
+ somewhat mitigated in C++. (The bstrDeclare and bstrFree macros from
+ bstraux can be used to help mitigate this problem.)
+
+Other problems not addressed:
+
+3. Built-in mutex usage to automatically avoid all bstring internal race
+ conditions in multitasking environments: The problem with trying to
+ implement such things at this low a level is that it is typically more
+ efficient to use locks in higher level primitives. There is also no
+ platform independent way to implement locks or mutexes.
+4. Unicode/widecharacter support.
+
+Note that except for spotty support of wide characters, the default C
+standard library does not address any of these problems either.
+
+Configurable compilation options
+--------------------------------
+
+All configuration options are meant solely for the purpose of compiler
+compatibility. Configuration options are not meant to change the semantics
+or capabilities of the library, except where it is unavoidable.
+
+Since some C++ compilers don't include the Standard Template Library and some
+have the option of disabling exception handling, a number of macros can be
+used to conditionally compile support for each of these:
+
+BSTRLIB_CAN_USE_STL
+
+ - defining this will enable the use of the Standard Template Library.
+ Defining BSTRLIB_CAN_USE_STL overrides the BSTRLIB_CANNOT_USE_STL macro.
+
+BSTRLIB_CANNOT_USE_STL
+
+ - defining this will disable the use of the Standard Template Library.
+ Defining BSTRLIB_CAN_USE_STL overrides the BSTRLIB_CANNOT_USE_STL macro.
+
+BSTRLIB_CAN_USE_IOSTREAM
+
+ - defining this will enable the use of streams from namespace std. Defining
+ BSTRLIB_CAN_USE_IOSTREAM overrides the BSTRLIB_CANNOT_USE_IOSTREAM macro.
+
+BSTRLIB_CANNOT_USE_IOSTREAM
+
+ - defining this will disable the use of streams from namespace std. Defining
+ BSTRLIB_CAN_USE_IOSTREAM overrides the BSTRLIB_CANNOT_USE_IOSTREAM macro.
+
+BSTRLIB_THROWS_EXCEPTIONS
+
+ - defining this will enable the exception handling within bstring.
+ Defining BSTRLIB_THROWS_EXCEPTIONS overrides the
+ BSTRLIB_DOESNT_THROW_EXCEPTIONS macro.
+
+BSTRLIB_DOESNT_THROW_EXCEPTIONS
+
+ - defining this will disable the exception handling within bstring.
+ Defining BSTRLIB_THROWS_EXCEPTIONS overrides the
+ BSTRLIB_DOESNT_THROW_EXCEPTIONS macro.
+
+Note that these macros must be defined consistently throughout all modules
+that use CBStrings including bstrwrap.cpp.
+
+Some older C compilers do not support functions such as vsnprintf. This is
+handled by the following macro variables:
+
+BSTRLIB_NOVSNP
+
+ - defining this indicates that the compiler does not support vsnprintf.
+ This will cause bformat and bformata to not be declared. Note that
+ for some compilers, such as Turbo C, this is set automatically.
+ Defining BSTRLIB_NOVSNP overrides the BSTRLIB_VSNP_OK macro.
+
+BSTRLIB_VSNP_OK
+
+ - defining this will disable the autodetection of compilers that do not
+ support vsnprintf.
+ Defining BSTRLIB_NOVSNP overrides the BSTRLIB_VSNP_OK macro.
+
+Semantic compilation options
+----------------------------
+
+Bstrlib comes with very few compilation options for changing the semantics of
+the library. These are described below.
+
+BSTRLIB_DONT_ASSUME_NAMESPACE
+
+ - Defining this before including bstrwrap.h will disable the automatic
+ enabling of the Bstrlib namespace for the C++ declarations.
+
+BSTRLIB_DONT_USE_VIRTUAL_DESTRUCTOR
+
+ - Defining this will make the CBString destructor non-virtual.
+
+BSTRLIB_MEMORY_DEBUG
+
+ - Defining this will cause the bstrlib modules bstrlib.c and bstrwrap.cpp
+ to invoke a #include "memdbg.h". memdbg.h has to be supplied by the user.
+
+Note that these macros must be defined consistently throughout all modules
+that use bstrings or CBStrings including bstrlib.c, bstraux.c and
+bstrwrap.cpp.
+
+===============================================================================
+
+Files
+-----
+
+bstrlib.c - C implementation of bstring functions.
+bstrlib.h - C header file for bstring functions.
+bstraux.c - C example that implements trivial additional functions.
+bstraux.h - C header for bstraux.c
+bstest.c - C unit/regression test for bstrlib.c
+
+bstrwrap.cpp - C++ implementation of CBString.
+bstrwrap.h - C++ header file for CBString.
+test.cpp - C++ unit/regression test for bstrwrap.cpp
+
+bsafe.c - C runtime stubs to abort usage of unsafe C functions.
+bsafe.h - C header file for bsafe.c functions.
+
+C projects need only include bstrlib.h and compile/link bstrlib.c to use the
+bstring library. C++ projects need to additionally include bstrwrap.h and
+compile/link bstrwrap.cpp. For both, there may be a need to make choices
+about feature configuration as described in the "Configurable compilation
+options" section above.
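+
+For example, with a typical Unix-like C toolchain this might look as follows
+(myprog.c stands in for your own source file):
+
+    cc -c bstrlib.c
+    cc -c myprog.c
+    cc myprog.o bstrlib.o -o myprog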
+
+Other files that are included in this archive are:
+
+license.txt - The BSD license for Bstrlib
+gpl.txt - The GPL version 2
+security.txt - A security statement useful for auditing Bstrlib
+porting.txt - A guide to porting Bstrlib
+bstrlib.txt - This file
+
+===============================================================================
+
+The functions
+-------------
+
+ extern bstring bfromcstr (const char * str);
+
+ Take a standard C library style '\0' terminated char buffer and generate
+ a bstring with the same contents as the char buffer. If an error occurs
+ NULL is returned.
+
+ So for example:
+
+ bstring b = bfromcstr ("Hello");
+ if (!b) {
+ fprintf (stderr, "Out of memory");
+ } else {
+ puts ((char *) b->data);
+ }
+
+ ..........................................................................
+
+ extern bstring bfromcstralloc (int mlen, const char * str);
+
+ Create a bstring which contains the contents of the '\0' terminated
+ char * buffer str. The memory buffer backing the bstring is at least
+ mlen characters in length. If an error occurs NULL is returned.
+
+ So for example:
+
+ bstring b = bfromcstralloc (64, someCstr);
+ if (b) b->data[63] = 'x';
+
+ The idea is that this will set the 64th character of b to 'x' if it is at
+ least 64 characters long otherwise do nothing. And we know this is well
+ defined so long as b was successfully created, since it will have been
+ allocated with at least 64 characters.
+
+ ..........................................................................
+
+ extern bstring blk2bstr (const void * blk, int len);
+
+ Create a bstring whose contents are described by the contiguous buffer
+ pointed to by blk with a length of len bytes. Note that this function
+ creates a copy of the data in blk, rather than simply referencing it.
+ Compare with the blk2tbstr macro. If an error occurs NULL is returned.
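+
+ So, as an illustrative sketch:
+
+     unsigned char raw[4] = { 'a', 'b', '\0', 'c' };
+     bstring b = blk2bstr (raw, 4);   /* b->slen == 4; the embedded '\0' is kept */
+     if (b) bdestroy (b);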
+
+ ..........................................................................
+
+ extern char * bstr2cstr (const_bstring s, char z);
+
+ Create a '\0' terminated char buffer which contains the contents of the
+ bstring s, except that any contained '\0' characters are converted to the
+ character in z. The returned value should be freed by the caller with
+ bcstrfree (). If an error occurs NULL is returned.
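+
+ So, as an illustrative sketch (pairing it with bcstrfree, described below):
+
+     bstring b = bfromcstr ("Hi");
+     char * c = bstr2cstr (b, '?');   /* any embedded '\0' would become '?' */
+     if (c) {
+         puts (c);
+         bcstrfree (c);
+     }
+     bdestroy (b);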
+
+ ..........................................................................
+
+ extern int bcstrfree (char * s);
+
+ Frees a C-string generated by bstr2cstr (). This is normally unnecessary
+ since it just wraps a call to free (). However, if malloc () and free ()
+ have been redefined as macros within the bstrlib module (via the memdbg.h
+ backdoor) with behaviour that differs from the standard library functions,
+ then this provides a correct way of freeing the memory and keeps higher
+ level code independent of these macro redefinitions.
+
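+ For example, a minimal sketch of the bstr2cstr () / bcstrfree () pairing
+ (assuming b is a valid bstring and <stdio.h> is included):
+
+     char * s = bstr2cstr (b, '?');    /* embedded '\0's become '?' */
+     if (s) {
+         puts (s);
+         bcstrfree (s);                /* pair with bcstrfree, not free () */
+     }
+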
+ ..........................................................................
+
+ extern bstring bstrcpy (const_bstring b1);
+
+ Make a copy of the passed in bstring. The copied bstring is returned if
+ there is no error, otherwise NULL is returned.
+
+ ..........................................................................
+
+ extern int bassign (bstring a, const_bstring b);
+
+ Overwrite the bstring a with the contents of bstring b. Note that the
+ bstring a must be a well defined and writable bstring. If an error
+ occurs BSTR_ERR is returned and a is not overwritten.
+
+ ..........................................................................
+
+ int bassigncstr (bstring a, const char * str);
+
+ Overwrite the string a with the contents of char * string str. Note that
+ the bstring a must be a well defined and writable bstring. If an error
+ occurs BSTR_ERR is returned and a may be partially overwritten.
+
+ ..........................................................................
+
+ int bassignblk (bstring a, const void * s, int len);
+
+ Overwrite the string a with the contents of the block (s, len). Note that
+ the bstring a must be a well defined and writable bstring. If an error
+ occurs BSTR_ERR is returned and a is not overwritten.
+
+ ..........................................................................
+
+ extern int bassignmidstr (bstring a, const_bstring b, int left, int len);
+
+ Overwrite the bstring a with the middle of the contents of bstring b
+ starting from position left and running for a length len. left and
+ len are clamped to the ends of b as with the function bmidstr. Note that
+ the bstring a must be a well defined and writable bstring. If an error
+ occurs BSTR_ERR is returned and a is not overwritten.
+
+ ..........................................................................
+
+ extern bstring bmidstr (const_bstring b, int left, int len);
+
+ Create a bstring which is the substring of b starting from position left
+ and running for a length len (clamped by the end of the bstring b.) If
+ there was no error, the value of this constructed bstring is returned
+ otherwise NULL is returned.
+
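+ For example, a minimal sketch (error handling omitted):
+
+     bstring b = bfromcstr ("Hello, world");
+     bstring mid = bmidstr (b, 7, 5);  /* mid contains "world" */
+     bdestroy (mid);
+     bdestroy (b);
+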
+ ..........................................................................
+
+ extern int bdelete (bstring s1, int pos, int len);
+
+ Removes characters from pos to pos+len-1 and shifts the tail of the
+ bstring starting from pos+len to pos. len must be positive for this call
+ to have any effect. The section of the bstring described by (pos, len)
+ is clamped to the boundaries of the bstring s1. The value BSTR_OK is returned
+ if the operation is successful, otherwise BSTR_ERR is returned.
+
+ ..........................................................................
+
+ extern int bconcat (bstring b0, const_bstring b1);
+
+ Concatenate the bstring b1 to the end of bstring b0. The value BSTR_OK
+ is returned if the operation is successful, otherwise BSTR_ERR is
+ returned.
+
+ ..........................................................................
+
+ extern int bconchar (bstring b, char c);
+
+ Concatenate the character c to the end of bstring b. The value BSTR_OK
+ is returned if the operation is successful, otherwise BSTR_ERR is
+ returned.
+
+ ..........................................................................
+
+ extern int bcatcstr (bstring b, const char * s);
+
+ Concatenate the char * string s to the end of bstring b. The value
+ BSTR_OK is returned if the operation is successful, otherwise BSTR_ERR is
+ returned.
+
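+ For example, a minimal sketch combining the concatenation calls above
+ (error checks omitted):
+
+     bstring b = bfromcstr ("Hello");
+     bconchar (b, ',');
+     bcatcstr (b, " world");           /* b is now "Hello, world" */
+     bdestroy (b);
+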
+ ..........................................................................
+
+ extern int bcatblk (bstring b, const void * s, int len);
+
+ Concatenate a fixed length buffer (s, len) to the end of bstring b. The
+ value BSTR_OK is returned if the operation is successful, otherwise
+ BSTR_ERR is returned.
+
+ ..........................................................................
+
+ extern int biseq (const_bstring b0, const_bstring b1);
+
+ Compare the bstring b0 and b1 for equality. If the bstrings differ, 0
+ is returned, if the bstrings are the same, 1 is returned, if there is an
+ error, -1 is returned. If the length of the bstrings are different, this
+ function has O(1) complexity. Contained '\0' characters are not treated
+ as a termination character.
+
+ Note that the semantics of biseq are not completely compatible with
+ bstrcmp because of its different treatment of the '\0' character.
+
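+ For example, a sketch of how biseq differs from bstrcmp (assuming the
+ allocations succeed):
+
+     bstring a = blk2bstr ("a\0b", 3);
+     bstring b = blk2bstr ("a\0c", 3);
+     /* biseq (a, b) == 0 : the embedded '\0' does not end the comparison */
+     /* bstrcmp (a, b) == 0 : bstrcmp stops at the '\0', so they compare equal */
+     bdestroy (a);
+     bdestroy (b);
+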
+ ..........................................................................
+
+ extern int bisstemeqblk (const_bstring b, const void * blk, int len);
+
+ Compare the beginning of the bstring b with a block of memory of length
+ len for equality. If the beginning of b differs from the memory block (or
+ if b is too short), 0 is returned, if the bstrings are the same, 1 is returned,
+ if there is an error, -1 is returned.
+
+ ..........................................................................
+
+ extern int biseqcaseless (const_bstring b0, const_bstring b1);
+
+ Compare two bstrings for equality without differentiating between case.
+ If the bstrings differ other than in case, 0 is returned, if the bstrings
+ are the same, 1 is returned, if there is an error, -1 is returned. If
+ the lengths of the bstrings are different, this function is O(1). '\0'
+ termination characters are not treated in any special way.
+
+ ..........................................................................
+
+ extern int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len);
+
+ Compare the beginning of the bstring b0 with a block of memory of length
+ len for equality, without differentiating between case. If the beginning of b0
+ differs from the memory block other than in case (or if b0 is too short),
+ 0 is returned, if the bstrings are the same, 1 is returned, if there is an
+ error, -1 is returned.
+
+ ..........................................................................
+
+ extern int biseqcstr (const_bstring b, const char *s);
+
+ Compare the bstring b and the char * string s. The C string s must be '\0'
+ terminated at exactly the length of the bstring b, and the contents
+ between the two must be identical with the bstring b with no '\0'
+ characters for the two contents to be considered equal. This is
+ equivalent to the condition that their current contents will always be
+ equal when comparing them in the same format after converting one or the
+ other. If they are equal 1 is returned, if they are unequal 0 is
+ returned and if there is a detectable error BSTR_ERR is returned.
+
+ ..........................................................................
+
+ extern int biseqcstrcaseless (const_bstring b, const char *s);
+
+ Compare the bstring b and char * string s. The C string s must be '\0'
+ terminated at exactly the length of the bstring b, and the contents
+ between the two must be identical except for case with the bstring b with
+ no '\0' characters for the two contents to be considered equal. This is
+ equivalent to the condition that their current contents will always be
+ equal ignoring case when comparing them in the same format after
+ converting one or the other. If they are equal, except for case, 1 is
+ returned, if they are unequal regardless of case 0 is returned and if
+ there is a detectable error BSTR_ERR is returned.
+
+ ..........................................................................
+
+ extern int bstrcmp (const_bstring b0, const_bstring b1);
+
+ Compare the bstrings b0 and b1 for ordering. If there is an error,
+ SHRT_MIN is returned, otherwise a value less than or greater than zero,
+ indicating that the bstring pointed to by b0 is lexicographically less
+ than or greater than the bstring pointed to by b1 is returned. If the
+ bstring lengths are unequal but the characters up until the length of the
+ shorter are equal then a value less than, or greater than zero,
+ indicating that the bstring pointed to by b0 is shorter or longer than the
+ bstring pointed to by b1 is returned. 0 is returned if and only if the
+ two bstrings are the same. If the lengths of the bstrings are different,
+ this function is O(n). Like its standard C library counterpart, the
+ comparison does not proceed past any '\0' termination characters
+ encountered.
+
+ The seemingly odd error return value merely provides slightly more
+ granularity than the undefined situation given in the C library function
+ strcmp. The function otherwise behaves very much like strcmp().
+
+ Note that the semantics of bstrcmp are not completely compatible with
+ biseq because of its different treatment of the '\0' termination
+ character.
+
+ ..........................................................................
+
+ extern int bstrncmp (const_bstring b0, const_bstring b1, int n);
+
+ Compare the bstrings b0 and b1 for ordering for at most n characters. If
+ there is an error, SHRT_MIN is returned, otherwise a value is returned as
+ if b0 and b1 were first truncated to at most n characters then bstrcmp
+ was called with these new bstrings as parameters. If the lengths of the
+ bstrings are different, this function is O(n). Like its standard C
+ library counterpart, the comparison does not proceed past any '\0'
+ termination characters encountered.
+
+ The seemingly odd error return value merely provides slightly more
+ granularity than the undefined situation given in the C library function
+ strncmp. The function otherwise behaves very much like strncmp().
+
+ ..........................................................................
+
+ extern int bstricmp (const_bstring b0, const_bstring b1);
+
+ Compare two bstrings without differentiating between case. The return
+ value is the difference of the values of the characters where the two
+ bstrings first differ, otherwise 0 is returned indicating that the
+ bstrings are equal. If the lengths are different, then a difference from
+ 0 is given, but if the first extra character is '\0', then it is taken to
+ be the value UCHAR_MAX+1.
+
+ ..........................................................................
+
+ extern int bstrnicmp (const_bstring b0, const_bstring b1, int n);
+
+ Compare two bstrings without differentiating between case for at most n
+ characters. If the position where the two bstrings first differ is
+ before the nth position, the return value is the difference of the values
+ of the characters, otherwise 0 is returned. If the lengths are different
+ and less than n characters, then a difference from 0 is given, but if the
+ first extra character is '\0', then it is taken to be the value
+ UCHAR_MAX+1.
+
+ ..........................................................................
+
+ extern int bdestroy (bstring b);
+
+ Deallocate the bstring passed. Passing NULL in as a parameter will have
+ no effect. Note that both the header and the data portion of the bstring
+ will be freed. No other bstring function which modifies one of its
+ parameters will free or reallocate the header. Because of this, in
+ general, bdestroy cannot be called on any declared struct tagbstring even
+ if it is not write protected. A bstring which is write protected cannot
+ be destroyed via the bdestroy call. Any attempt to do so will result in
+ no action taken, and BSTR_ERR will be returned.
+
+ Note to C++ users: Passing in a CBString cast to a bstring will lead to
+ undefined behavior (free will be called on the header, rather than the
+ CBString destructor.) Instead just use the ordinary C++ language
+ facilities to deallocate a CBString.
+
+ ..........................................................................
+
+ extern int binstr (const_bstring s1, int pos, const_bstring s2);
+
+ Search for the bstring s2 in s1 starting at position pos and looking in a
+ forward (increasing) direction. If it is found then it returns with the
+ first position at or after pos where it is found, otherwise it returns BSTR_ERR.
+ The algorithm used is brute force; O(m*n).
+
+ ..........................................................................
+
+ extern int binstrr (const_bstring s1, int pos, const_bstring s2);
+
+ Search for the bstring s2 in s1 starting at position pos and looking in a
+ backward (decreasing) direction. If it is found then it returns with the
+ last position at or before pos where it is found, otherwise it returns BSTR_ERR.
+ Note that the current position at pos is tested as well -- so to be
+ disjoint from a previous forward search it is recommended that the
+ position be backed up (decremented) by one position. The algorithm used
+ is brute force; O(m*n).
+
+ ..........................................................................
+
+ extern int binstrcaseless (const_bstring s1, int pos, const_bstring s2);
+
+ Search for the bstring s2 in s1 starting at position pos and looking in a
+ forward (increasing) direction but without regard to case. If it is
+ found then it returns with the first position at or after pos where it is
+ found, otherwise it returns BSTR_ERR. The algorithm used is brute force;
+ O(m*n).
+
+ ..........................................................................
+
+ extern int binstrrcaseless (const_bstring s1, int pos, const_bstring s2);
+
+ Search for the bstring s2 in s1 starting at position pos and looking in a
+ backward (decreasing) direction but without regard to case. If it is
+ found then it returns with the last position at or before pos where it is
+ found, otherwise it returns BSTR_ERR. Note that the current position at pos
+ is tested as well -- so to be disjoint from a previous forward search it
+ is recommended that the position be backed up (decremented) by one
+ position. The algorithm used is brute force; O(m*n).
+
+ ..........................................................................
+
+ extern int binchr (const_bstring b0, int pos, const_bstring b1);
+
+ Search for the first position in b0 starting from pos or after, in which
+ one of the characters in b1 is found. This function has an execution
+ time of O(b0->slen + b1->slen). If such a position does not exist in b0,
+ then BSTR_ERR is returned.
+
+ ..........................................................................
+
+ extern int binchrr (const_bstring b0, int pos, const_bstring b1);
+
+ Search for the last position in b0 no greater than pos, in which one of
+ the characters in b1 is found. This function has an execution time
+ of O(b0->slen + b1->slen). If such a position does not exist in b0,
+ then BSTR_ERR is returned.
+
+ ..........................................................................
+
+ extern int bninchr (const_bstring b0, int pos, const_bstring b1);
+
+ Search for the first position in b0 starting from pos or after, in which
+ none of the characters in b1 is found and return it. This function has
+ an execution time of O(b0->slen + b1->slen). If such a position does
+ not exist in b0, then BSTR_ERR is returned.
+
+ ..........................................................................
+
+ extern int bninchrr (const_bstring b0, int pos, const_bstring b1);
+
+ Search for the last position in b0 no greater than pos, in which none of
+ the characters in b1 is found and return it. This function has an
+ execution time of O(b0->slen + b1->slen). If such a position does not
+ exist in b0, then BSTR_ERR is returned.
+
+ ..........................................................................
+
+ extern int bstrchr (const_bstring b, int c);
+
+ Search for the character c in the bstring b forwards from the start of
+ the bstring. Returns the position of the found character or BSTR_ERR if
+ it is not found.
+
+ NOTE: This has been implemented as a macro on top of bstrchrp ().
+
+ ..........................................................................
+
+ extern int bstrrchr (const_bstring b, int c);
+
+ Search for the character c in the bstring b backwards from the end of the
+ bstring. Returns the position of the found character or BSTR_ERR if it is
+ not found.
+
+ NOTE: This has been implemented as a macro on top of bstrrchrp ().
+
+ ..........................................................................
+
+ extern int bstrchrp (const_bstring b, int c, int pos);
+
+ Search for the character c in b forwards from the position pos
+ (inclusive). Returns the position of the found character or BSTR_ERR if
+ it is not found.
+
+ ..........................................................................
+
+ extern int bstrrchrp (const_bstring b, int c, int pos);
+
+ Search for the character c in the bstring b backwards from the position pos
+ (inclusive). Returns the position of the found character or BSTR_ERR if
+ it is not found.
+
+ ..........................................................................
+
+ extern int bsetstr (bstring b0, int pos, const_bstring b1, unsigned char fill);
+
+ Overwrite the bstring b0 starting at position pos with the bstring b1. If
+ the position pos is past the end of b0, then the character "fill" is
+ appended as necessary to make up the gap between the end of b0 and pos.
+ If b1 is NULL, it behaves as if it were a 0-length bstring. The value
+ BSTR_OK is returned if the operation is successful, otherwise BSTR_ERR is
+ returned.
+
+ ..........................................................................
+
+ extern int binsert (bstring s1, int pos, const_bstring s2, unsigned char fill);
+
+ Inserts the bstring s2 into s1 at position pos. If the position pos is
+ past the end of s1, then the character "fill" is appended as necessary to
+ make up the gap between the end of s1 and pos. The value BSTR_OK is
+ returned if the operation is successful, otherwise BSTR_ERR is returned.
+
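+ For example, a minimal sketch of the fill behaviour (error checks
+ omitted):
+
+     bstring b = bfromcstr ("ab");
+     bstring t = bfromcstr ("xy");
+     binsert (b, 4, t, '.');           /* b is now "ab..xy" */
+     bdestroy (t);
+     bdestroy (b);
+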
+ ..........................................................................
+
+ extern int binsertch (bstring s1, int pos, int len, unsigned char fill);
+
+ Inserts the character fill repeatedly into s1 at position pos for a
+ length len. If the position pos is past the end of s1, then the
+ character "fill" is appended as necessary to make up the gap between the
+ end of s1 and the position pos + len (exclusive). The value BSTR_OK is
+ returned if the operation is successful, otherwise BSTR_ERR is returned.
+
+ ..........................................................................
+
+ extern int breplace (bstring b1, int pos, int len, const_bstring b2,
+ unsigned char fill);
+
+ Replace a section of a bstring from pos for a length len with the bstring
+ b2. If the position pos is past the end of b1 then the character "fill"
+ is appended as necessary to make up the gap between the end of b1 and
+ pos.
+
+ ..........................................................................
+
+ extern int bfindreplace (bstring b, const_bstring find,
+ const_bstring replace, int position);
+
+ Replace all occurrences of the find substring with a replace bstring
+ after a given position in the bstring b. The find bstring must have a
+ length > 0 otherwise BSTR_ERR is returned. This function does not
+ perform recursive per character replacement; that is to say successive
+ searches resume at the position after the last replace.
+
+ So for example:
+
+ bfindreplace (a0 = bfromcstr("aabaAb"), a1 = bfromcstr("a"),
+ a2 = bfromcstr("aa"), 0);
+
+ Should result in changing a0 to "aaaabaaAb".
+
+ This function performs exactly (b->slen - position) bstring comparisons,
+ and data movement is bounded above by character volume equivalent to size
+ of the output bstring.
+
+ ..........................................................................
+
+ extern int bfindreplacecaseless (bstring b, const_bstring find,
+ const_bstring replace, int position);
+
+ Replace all occurrences of the find substring, ignoring case, with a
+ replace bstring after a given position in the bstring b. The find bstring
+ must have a length > 0 otherwise BSTR_ERR is returned. This function
+ does not perform recursive per character replacement; that is to say
+ successive searches resume at the position after the last replace.
+
+ So for example:
+
+ bfindreplacecaseless (a0 = bfromcstr("AAbaAb"), a1 = bfromcstr("a"),
+ a2 = bfromcstr("aa"), 0);
+
+ Should result in changing a0 to "aaaabaaaab".
+
+ This function performs exactly (b->slen - position) bstring comparisons,
+ and data movement is bounded above by character volume equivalent to size
+ of the output bstring.
+
+ ..........................................................................
+
+ extern int balloc (bstring b, int length);
+
+ Increase the allocated memory backing the data buffer for the bstring b
+ to a length of at least length. If the memory backing the bstring b is
+ already large enough, no action is performed. This has no effect on the
+ bstring b that is visible to the bstring API. Usually this function will
+ only be used when a minimum buffer size is required coupled with a direct
+ access to the ->data member of the bstring structure.
+
+ Be warned that like any other bstring function, the bstring must be well
+ defined upon entry to this function. I.e., doing something like:
+
+ b->slen *= 2; /* ?? Most likely incorrect */
+ balloc (b, b->slen);
+
+ is invalid, and should be implemented as:
+
+ int t;
+ if (BSTR_OK == balloc (b, t = (b->slen * 2))) b->slen = t;
+
+ This function will return with BSTR_ERR if b is not detected as a valid
+ bstring or length is not greater than 0, otherwise BSTR_OK is returned.
+
+ ..........................................................................
+
+ extern int ballocmin (bstring b, int length);
+
+ Change the amount of memory backing the bstring b to at least length.
+ This operation will never truncate the bstring data including the
+ extra terminating '\0' and thus will not decrease the length to less than
+ b->slen + 1. Note that repeated use of this function may cause
+ performance problems (realloc may be called on the bstring more than
+ O(log(INT_MAX)) times). This function will return with BSTR_ERR if b
+ is not detected as a valid bstring or length is not greater than 0,
+ otherwise BSTR_OK is returned.
+
+ So for example:
+
+ if (BSTR_OK == ballocmin (b, 64)) b->data[63] = 'x';
+
+ The idea is that this will set the 64th character of b to 'x' if it is at
+ least 64 characters long, and otherwise do nothing. And we know this is
+ well defined so long as the ballocmin call was successful, since it will
+ ensure that b has been allocated with at least 64 characters.
+
+ ..........................................................................
+
+ int btrunc (bstring b, int n);
+
+ Truncate the bstring to at most n characters. This function will return
+ with BSTR_ERR if b is not detected as a valid bstring or n is less than
+ 0, otherwise BSTR_OK is returned.
+
+ ..........................................................................
+
+ extern int bpattern (bstring b, int len);
+
+ Replicate the starting bstring, b, end to end repeatedly until it
+ surpasses len characters, then chop the result to exactly len characters.
+ This function operates in-place. This function will return with BSTR_ERR
+ if b is NULL or of length 0, otherwise BSTR_OK is returned.
+
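+ For example, a minimal sketch (error checks omitted):
+
+     bstring b = bfromcstr ("ab");
+     bpattern (b, 7);                  /* b is now "abababa" */
+     bdestroy (b);
+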
+ ..........................................................................
+
+ extern int btoupper (bstring b);
+
+ Convert contents of bstring to upper case. This function will return with
+ BSTR_ERR if b is NULL or of length 0, otherwise BSTR_OK is returned.
+
+ ..........................................................................
+
+ extern int btolower (bstring b);
+
+ Convert contents of bstring to lower case. This function will return with
+ BSTR_ERR if b is NULL or of length 0, otherwise BSTR_OK is returned.
+
+ ..........................................................................
+
+ extern int bltrimws (bstring b);
+
+ Delete whitespace contiguous from the left end of the bstring. This
+ function will return with BSTR_ERR if b is NULL or of length 0, otherwise
+ BSTR_OK is returned.
+
+ ..........................................................................
+
+ extern int brtrimws (bstring b);
+
+ Delete whitespace contiguous from the right end of the bstring. This
+ function will return with BSTR_ERR if b is NULL or of length 0, otherwise
+ BSTR_OK is returned.
+
+ ..........................................................................
+
+ extern int btrimws (bstring b);
+
+ Delete whitespace contiguous from both ends of the bstring. This function
+ will return with BSTR_ERR if b is NULL or of length 0, otherwise BSTR_OK
+ is returned.
+
+ ..........................................................................
+
+ extern struct bstrList* bstrListCreate (void);
+
+ Create an empty struct bstrList. The struct bstrList output structure is
+ declared as follows:
+
+     struct bstrList {
+         int qty, mlen;
+         bstring * entry;
+     };
+
+ The entry field is an array containing qty entries. The mlen field
+ records the maximum number of bstrings for which there is memory in the
+ entry array.
+
+ The Bstrlib API does *NOT* include a comprehensive set of functions for
+ full management of struct bstrList in an abstracted way. The reason for
+ this is that the aliasing semantics of the list are best left to the user
+ of this function, and performance varies wildly depending on the
+ assumptions made. For a complete list-of-bstring data type it is
+ recommended that the C++ std::vector<CBString> be used, since its
+ semantics and usage are more standard.
+
+ ..........................................................................
+
+ extern int bstrListDestroy (struct bstrList * sl);
+
+ Destroy a struct bstrList structure that was returned by the bsplit
+ function. Note that this will destroy each bstring in the ->entry array
+ as well. See bstrListCreate() above for structure of struct bstrList.
+
+ ..........................................................................
+
+ extern int bstrListAlloc (struct bstrList * sl, int msz);
+
+ Ensure that there is memory for at least msz number of entries for the
+ list.
+
+ ..........................................................................
+
+ extern int bstrListAllocMin (struct bstrList * sl, int msz);
+
+ Try to allocate the minimum amount of memory for the list to include at
+ least msz entries or sl->qty whichever is greater.
+
+ ..........................................................................
+
+ extern struct bstrList * bsplit (bstring str, unsigned char splitChar);
+
+ Create an array of sequential substrings from str divided by the
+ character splitChar. Successive occurrences of the splitChar will be
+ divided by empty bstring entries, following the semantics from the Python
+ programming language. To reclaim the memory from this output structure,
+ bstrListDestroy () should be called. See bstrListCreate() above for
+ structure of struct bstrList.
+
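+ For example, a minimal sketch that splits on ',' and walks the entries
+ (assuming <stdio.h> is included; error checks omitted):
+
+     bstring b = bfromcstr ("one,two,,three");
+     struct bstrList * l = bsplit (b, ',');
+     int i;
+     for (i = 0; i < l->qty; i++) {
+         printf ("[%s]\n", bdata (l->entry[i]));  /* the empty field prints as [] */
+     }
+     bstrListDestroy (l);
+     bdestroy (b);
+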
+ ..........................................................................
+
+ extern struct bstrList * bsplits (bstring str, const_bstring splitStr);
+
+ Create an array of sequential substrings from str divided by any
+ character contained in splitStr. An empty splitStr causes a single entry
+ bstrList containing a copy of str to be returned. See bstrListCreate()
+ above for structure of struct bstrList.
+
+ ..........................................................................
+
+ extern struct bstrList * bsplitstr (bstring str, const_bstring splitStr);
+
+ Create an array of sequential substrings from str divided by the entire
+ substring splitStr. An empty splitStr causes a single entry bstrList
+ containing a copy of str to be returned. See bstrListCreate() above for
+ structure of struct bstrList.
+
+ ..........................................................................
+
+ extern bstring bjoin (const struct bstrList * bl, const_bstring sep);
+
+ Join the entries of a bstrList into one bstring by sequentially
+ concatenating them with the sep bstring in between. If sep is NULL, it
+ is treated as if it were the empty bstring. Note that:
+
+ bjoin (l = bsplit (b, s->data[0]), s);
+
+ should result in a copy of b, if s->slen is 1. If there is an error NULL
+ is returned, otherwise a bstring with the correct result is returned.
+ See bstrListCreate() above for structure of struct bstrList.
+
+ ..........................................................................
+
+ extern int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
+ int (* cb) (void * parm, int ofs, int len), void * parm);
+
+ Iterate the set of disjoint sequential substrings over str starting at
+ position pos divided by the character splitChar. The parm passed to
+ bsplitcb is passed on to cb. If the function cb returns a value < 0,
+ then further iterating is halted and this value is returned by bsplitcb.
+
+ Note: Non-destructive modification of str from within the cb function
+ while performing this split is well defined. bsplitcb behaves in
+ sequential lock step with calls to cb. I.e., after returning from a cb
+ that returns a non-negative integer, bsplitcb continues from the position
+ 1 character after the last detected split character and it will halt
+ immediately if the length of str falls below this point. However, if the
+ cb function destroys str, then it *must* return with a negative value,
+ otherwise bsplitcb will continue in an undefined manner.
+
+ This function is provided as an incremental alternative to bsplit that is
+ abortable and which does not impose additional memory allocation.
+
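+ For example, a minimal sketch that counts the fields of str (the names
+ countcb and nfields are illustrative, not part of the API):
+
+     static int countcb (void * parm, int ofs, int len) {
+         (void) ofs; (void) len;       /* offsets are not needed here */
+         *(int *) parm += 1;
+         return 0;                     /* non-negative: keep iterating */
+     }
+
+ and then at the call site:
+
+     int nfields = 0;
+     bsplitcb (str, ',', 0, countcb, &nfields);
+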
+ ..........................................................................
+
+ extern int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
+ int (* cb) (void * parm, int ofs, int len), void * parm);
+
+ Iterate the set of disjoint sequential substrings over str starting at
+ position pos divided by any of the characters in splitStr. An empty
+ splitStr causes the whole str to be iterated once. The parm passed to
+ bsplitcb is passed on to cb. If the function cb returns a value < 0,
+ then further iterating is halted and this value is returned by bsplitcb.
+
+ Note: Non-destructive modification of str from within the cb function
+ while performing this split is well defined. bsplitscb behaves in
+ sequential lock step with calls to cb. I.e., after returning from a cb
+ that returns a non-negative integer, bsplitscb continues from the position
+ 1 character after the last detected split character and it will halt
+ immediately if the length of str falls below this point. However, if the
+ cb function destroys str, then it *must* return with a negative value,
+ otherwise bsplitscb will continue in an undefined manner.
+
+ This function is provided as an incremental alternative to bsplits that
+ is abortable and which does not impose additional memory allocation.
+
+ ..........................................................................
+
+ extern int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
+ int (* cb) (void * parm, int ofs, int len), void * parm);
+
+ Iterate the set of disjoint sequential substrings over str starting at
+ position pos divided by the entire substring splitStr. An empty splitStr
+ causes each character of str to be iterated. The parm passed to bsplitcb
+ is passed on to cb. If the function cb returns a value < 0, then further
+ iterating is halted and this value is returned by bsplitcb.
+
+ Note: Non-destructive modification of str from within the cb function
+ while performing this split is well defined. bsplitstrcb behaves in
+ sequential lock step with calls to cb. I.e., after returning from a cb
+ that returns a non-negative integer, bsplitstrcb continues from the position
+ 1 character after the last detected split character and it will halt
+ immediately if the length of str falls below this point. However, if the
+ cb function destroys str, then it *must* return with a negative value,
+ otherwise bsplitstrcb will continue in an undefined manner.
+
+ This function is provided as an incremental alternative to bsplitstr that
+ is abortable and which does not impose additional memory allocation.
+
+ ..........................................................................
+
+ extern bstring bformat (const char * fmt, ...);
+
+ Takes the same parameters as printf (), but rather than outputting
+ results to stdio, it forms a bstring which contains what would have been
+ output. Note that if there is an early generation of a '\0' character,
+ the bstring will be truncated to this end point.
+
+ Note that %s format tokens correspond to '\0' terminated char * buffers,
+ not bstrings. To print a bstring, first dereference the data element of
+ the bstring:
+
+ /* b1->data needs to be '\0' terminated, so tagbstrings generated
+ by blk2tbstr () might not be suitable. */
+ b0 = bformat ("Hello, %s", b1->data);
+
+ Note that if the BSTRLIB_NOVSNP macro has been set when bstrlib has been
+ compiled the bformat function is not present.
+
+ ..........................................................................
+
+ extern int bformata (bstring b, const char * fmt, ...);
+
+ In addition to the initial output buffer b, bformata takes the same
+ parameters as printf (), but rather than outputting results to stdio, it
+ appends the results to the initial bstring parameter. Note that if
+ there is an early generation of a '\0' character, the bstring will be
+ truncated to this end point.
+
+ Note that %s format tokens correspond to '\0' terminated char * buffers,
+ not bstrings. To print a bstring, first dereference the data element of
+ the bstring:
+
+ /* b1->data needs to be '\0' terminated, so tagbstrings generated
+ by blk2tbstr () might not be suitable. */
+ bformata (b0 = bfromcstr ("Hello"), ", %s", b1->data);
+
+ Note that if the BSTRLIB_NOVSNP macro has been set when bstrlib has been
+ compiled the bformata function is not present.
+
+ ..........................................................................
+
+ extern int bassignformat (bstring b, const char * fmt, ...);
+
+ After the first parameter, it takes the same parameters as printf (), but
+ rather than outputting results to stdio, it outputs the results to
+ the bstring parameter b. Note that if there is an early generation of a
+ '\0' character, the bstring will be truncated to this end point.
+
+ Note that %s format tokens correspond to '\0' terminated char * buffers,
+ not bstrings. To print a bstring, first dereference the data element of
+ the bstring:
+
+ /* b1->data needs to be '\0' terminated, so tagbstrings generated
+ by blk2tbstr () might not be suitable. */
+ bassignformat (b0 = bfromcstr ("Hello"), ", %s", b1->data);
+
+ Note that if the BSTRLIB_NOVSNP macro has been set when bstrlib has been
+ compiled the bassignformat function is not present.
+
+ ..........................................................................
+
+ extern int bvcformata (bstring b, int count, const char * fmt, va_list arglist);
+
+ The bvcformata function formats data under control of the format control
+ string fmt and attempts to append the result to b. The fmt parameter is
+ the same as that of the printf function. The variable argument list is
+ replaced with arglist, which has been initialized by the va_start macro.
+ The size of the output is upper bounded by count. If the required output
+ exceeds count, the string b is not augmented with any contents and a value
+ below BSTR_ERR is returned. If a value below -count is returned then it
+ is recommended that the negative of this value be used as an update to the
+ count in a subsequent pass. On other errors, such as running out of
+ memory, parameter errors or numeric wrap around BSTR_ERR is returned.
+ BSTR_OK is returned when the output is successfully generated and
+ appended to b.
+
+ Note: There is no sanity checking of arglist, and this function is
+ destructive of the contents of b from the b->slen point onward. If there
+ is an early generation of a '\0' character, the bstring will be truncated
+ to this end point.
+
+ Although this function is part of the external API for Bstrlib, the
+ interface and semantics (length limitations, and unusual return codes)
+ are fairly atypical. The real purpose for this function is to provide an
+ engine for the bvformata macro.
+
+ Note that if the BSTRLIB_NOVSNP macro has been set when bstrlib has been
+ compiled the bvcformata function is not present.
+
+ ..........................................................................
+
+ extern bstring bread (bNread readPtr, void * parm);
+ typedef size_t (* bNread) (void *buff, size_t elsize, size_t nelem,
+ void *parm);
+
+ Read an entire stream into a bstring, verbatim. The readPtr function
+ pointer is compatible with fread semantics, except that it need not obtain
+ the stream data from a file. The intention is that parm would contain
+ the stream data context/state required (similar to the role of the FILE*
+ I/O stream parameter of fread.)
+
+ Abstracting the block read function allows for block devices other than
+ file streams to be read if desired. Note that there is an ANSI
+ compatibility issue if "fread" is used directly; see the ANSI issues
+ section below.
+
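+ For example, a minimal sketch reading a whole FILE * into a bstring (the
+ file name is illustrative; see the ANSI issues section below regarding
+ the cast of fread):
+
+     FILE * fp = fopen ("input.txt", "rb");
+     if (fp) {
+         bstring b = bread ((bNread) fread, fp);
+         fclose (fp);
+         /* ... use b ... */
+         bdestroy (b);
+     }
+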
+ ..........................................................................
+
+ extern int breada (bstring b, bNread readPtr, void * parm);
+
+ Read an entire stream and append it to a bstring, verbatim. Behaves
+ like bread, except that it appends its results to the bstring b.
+ BSTR_ERR is returned on error, otherwise 0 is returned.
+
+ ..........................................................................
+
+ extern bstring bgets (bNgetc getcPtr, void * parm, char terminator);
+ typedef int (* bNgetc) (void * parm);
+
+ Read a bstring from a stream. As many bytes as necessary are read
+ until the terminator is consumed or no more characters are available from
+ the stream. If read from the stream, the terminator character will be
+ appended to the end of the returned bstring. The getcPtr function must
+ have the same semantics as the fgetc C library function (i.e., returning
+ an integer whose value is negative when there are no more characters
+ available, otherwise the value of the next available unsigned character
+ from the stream.) The intention is that parm would contain the stream
+ data context/state required (similar to the role of the FILE* I/O stream
+ parameter of fgets.) If no characters are read, or there is some other
+ detectable error, NULL is returned.
+
+ bgets will never call the getcPtr function more often than necessary to
+ construct its output (including a single call, if required, to determine
+ that the stream contains no more characters.)
+
+ Abstracting the character stream function and terminator character allows
+ for different stream devices and string formats other than '\n'
+ terminated lines in a file if desired (consider \032 terminated email
+ messages, in a UNIX mailbox for example.)
+
+ For files, this function can be used analogously as fgets as follows:
+
+ fp = fopen ( ... );
+ if (fp) b = bgets ((bNgetc) fgetc, fp, '\n');
+
+ (Note that only one terminator character can be used, and that '\0' is
+ not assumed to terminate the stream in addition to the terminator
+ character. This is consistent with the semantics of fgets.)
+
+ ..........................................................................
+
+ extern int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator);
+
+ Read from a stream and concatenate to a bstring. Behaves like bgets,
+ except that it appends its results to the bstring b. The value 1 is
+ returned if no characters are read before a negative result is returned
+ from getcPtr. Otherwise BSTR_ERR is returned on error, and 0 is returned
+ in other normal cases.
+
+ ..........................................................................
+
+ extern int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator);
+
+ Read from a stream and assign to a bstring. Behaves like bgets,
+ except that it assigns the results to the bstring b. The value 1 is
+ returned if no characters are read before a negative result is returned
+ from getcPtr. Otherwise BSTR_ERR is returned on error, and 0 is returned
+ in other normal cases.
+
+ ..........................................................................
+
+ extern struct bStream * bsopen (bNread readPtr, void * parm);
+
+ Wrap a given open stream (described by a fread compatible function
+ pointer and stream handle) into an open bStream suitable for the bstring
+ library streaming functions.
+
+ ..........................................................................
+
+ extern void * bsclose (struct bStream * s);
+
+ Close the bStream, and return the handle to the stream that was
+ originally used to open the given stream. If s is NULL or detectably
+ invalid, NULL will be returned.
+
+ ..........................................................................
+
+ extern int bsbufflength (struct bStream * s, int sz);
+
+ Set the length of the buffer used by the bStream. If sz is the macro
+ BSTR_BS_BUFF_LENGTH_GET (which is 0), the length is not set. If s is
+ NULL or sz is negative, the function will return with BSTR_ERR, otherwise
+ this function returns with the previous length.
+
+ ..........................................................................
+
+ extern int bsreadln (bstring r, struct bStream * s, char terminator);
+
+ Read a bstring terminated by the terminator character or the end of the
+ stream from the bStream (s) and return it into the parameter r. The
+ matched terminator, if found, appears at the end of the line read. If
+ the stream has been exhausted of all available data, before any can be
+ read, BSTR_ERR is returned. This function may read additional characters
+ into the stream buffer from the core stream that are not returned, but
+ will be retained for subsequent read operations. When reading from high
+ speed streams, this function can perform significantly faster than bgets.
+
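+ For example, a minimal sketch reading a file line by line through a
+ bStream (the file name is illustrative; error checks abbreviated):
+
+     FILE * fp = fopen ("input.txt", "rb");
+     if (fp) {
+         struct bStream * s = bsopen ((bNread) fread, fp);
+         bstring line = bfromcstr ("");
+         while (BSTR_OK == bsreadln (line, s, '\n')) {
+             /* ... process line, including its trailing '\n' if present ... */
+         }
+         bdestroy (line);
+         fclose ((FILE *) bsclose (s));
+     }
+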
+ ..........................................................................
+
+ extern int bsreadlna (bstring r, struct bStream * s, char terminator);
+
+ Read a bstring terminated by the terminator character or the end of the
+ stream from the bStream (s) and concatenate it to the parameter r. The
+ matched terminator, if found, appears at the end of the line read. If
+ the stream has been exhausted of all available data, before any can be
+ read, BSTR_ERR is returned. This function may read additional characters
+ into the stream buffer from the core stream that are not returned, but
+ will be retained for subsequent read operations. When reading from high
+ speed streams, this function can perform significantly faster than bgets.
+
+ ..........................................................................
+
+ extern int bsreadlns (bstring r, struct bStream * s, bstring terminators);
+
+ Read a bstring terminated by any character in the terminators bstring or
+ the end of the stream from the bStream (s) and return it into the
+ parameter r. This function may read additional characters from the core
+ stream that are not returned, but will be retained for subsequent read
+ operations.
+
+ ..........................................................................
+
+ extern int bsreadlnsa (bstring r, struct bStream * s, bstring terminators);
+
+ Read a bstring terminated by any character in the terminators bstring or
+ the end of the stream from the bStream (s) and concatenate it to the
+ parameter r. If the stream has been exhausted of all available data,
+ before any can be read, BSTR_ERR is returned. This function may read
+ additional characters from the core stream that are not returned, but
+ will be retained for subsequent read operations.
+
+ ..........................................................................
+
+ extern int bsread (bstring r, struct bStream * s, int n);
+
+ Read a bstring of length n (or, if fewer bytes remain, as many bytes as
+ remain) from the bStream. This function will read the minimum
+ required number of additional characters from the core stream. When the
+ stream is at the end of the file BSTR_ERR is returned, otherwise BSTR_OK
+ is returned.
+
+ ..........................................................................
+
+ extern int bsreada (bstring r, struct bStream * s, int n);
+
+ Read a bstring of length n (or, if fewer bytes remain, as many bytes as
+ remain) from the bStream and concatenate it to the parameter r. This
+ function will read the minimum required number of additional characters
+ from the core stream. When the stream is at the end of the file BSTR_ERR
+ is returned, otherwise BSTR_OK is returned.
+
+ ..........................................................................
+
+ extern int bsunread (struct bStream * s, const_bstring b);
+
+ Insert a bstring into the bStream at the current position. These
+ characters will be read prior to those that actually come from the core
+ stream.
+
+ ..........................................................................
+
+ extern int bspeek (bstring r, const struct bStream * s);
+
+ Return the number of currently buffered characters from the bStream that
+ will be read prior to reads from the core stream, and append it to the
+ parameter r.
+
+ ..........................................................................
+
+ extern int bssplitscb (struct bStream * s, const_bstring splitStr,
+ int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
+
+ Iterate the set of disjoint sequential substrings over the stream s
+ divided by any character from the bstring splitStr. The parm passed to
+ bssplitscb is passed on to cb. If the function cb returns a value < 0,
+ then further iterating is halted and this return value is returned by
+ bssplitscb.
+
+ Note: At the point of calling the cb function, the bStream pointer is
+ pointed exactly at the position right after having read the split
+ character. The cb function can act on the stream by causing the bStream
+ pointer to move, and bssplitscb will continue by starting the next split
+ at the position of the pointer after the return from cb.
+
+ However, if the cb causes the bStream s to be destroyed then the cb must
+ return with a negative value, otherwise bssplitscb will continue in an
+ undefined manner.
+
+ This function is provided as a way to incrementally parse through a file
+ or other generic stream that in total size may otherwise exceed the
+ practical or desired memory available. As with the other split callback
+ based functions this is abortable and does not impose additional memory
+ allocation.
+
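+ For example, a minimal sketch that writes each line of an open bStream s
+ to stdout (printcb is an illustrative name; assumes <stdio.h> is
+ included):
+
+     static int printcb (void * parm, int ofs, const_bstring entry) {
+         (void) parm; (void) ofs;
+         fwrite (entry->data, 1, entry->slen, stdout);
+         fputc ('\n', stdout);
+         return 0;                     /* keep iterating */
+     }
+
+ and then at the call site:
+
+     struct tagbstring eol = bsStatic ("\r\n");
+     bssplitscb (s, &eol, printcb, NULL);
+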
+ ..........................................................................
+
+ extern int bssplitstrcb (struct bStream * s, const_bstring splitStr,
+ int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
+
+ Iterate the set of disjoint sequential substrings over the stream s
+ divided by the entire substring splitStr. The parm passed to
+ bssplitstrcb is passed on to cb. If the function cb returns a
+ value < 0, then further iterating is halted and this return value is
+ returned by bssplitstrcb.
+
+ Note: At the point of calling the cb function, the bStream pointer is
+ pointed exactly at the position right after having read the split
+ character. The cb function can act on the stream by causing the bStream
+ pointer to move, and bssplitstrcb will continue by starting the next
+ split at the position of the pointer after the return from cb.
+
+ However, if the cb causes the bStream s to be destroyed then the cb must
+ return with a negative value, otherwise bssplitstrcb will continue in an
+ undefined manner.
+
+ This function is provided as a way to incrementally parse through a file
+ or other generic stream that in total size may otherwise exceed the
+ practical or desired memory available. As with the other split callback
+ based functions this is abortable and does not impose additional memory
+ allocation.
+
+ ..........................................................................
+
+ extern int bseof (const struct bStream * s);
+
+ Return the defacto "EOF" (end of file) state of a stream (1 if the
+ bStream is in an EOF state, 0 if not, and BSTR_ERR if stream is closed or
+ detectably erroneous.) When the readPtr callback returns a value <= 0
+ the stream reaches its "EOF" state. Note that bunread with non-empty
+ content will essentially turn off this state, and the stream will not be
+ in its "EOF" state so long as its possible to read more data out of it.
+
+ Also note that the semantics of bseof() are slightly different from
+ something like feof(). I.e., reaching the end of the stream does not
+ necessarily guarantee that bseof() will return with a value indicating
+ that this has happened. bseof() will only return indicating that it has
+ reached the "EOF" and an attempt has been made to read past the end of
+ the bStream.
+
+The macros
+----------
+
+ The macros described below are shown in a prototype form indicating their
+ intended usage. Note that the parameters passed to these macros will be
+ referenced multiple times. As with all macros, programmer care is
+ required to guard against unintended side effects.
+
+ int blengthe (const_bstring b, int err);
+
+ Returns the length of the bstring. If the bstring is NULL err is
+ returned.
+
+ ..........................................................................
+
+ int blength (const_bstring b);
+
+ Returns the length of the bstring. If the bstring is NULL, the length
+ returned is 0.
+
+ ..........................................................................
+
+ int bchare (const_bstring b, int p, int c);
+
+ Returns the p'th character of the bstring b. If the position p refers to
+ a position that does not exist in the bstring or the bstring is NULL,
+ then c is returned.
+
+ ..........................................................................
+
+ char bchar (const_bstring b, int p);
+
+ Returns the p'th character of the bstring b. If the position p refers to
+ a position that does not exist in the bstring or the bstring is NULL,
+ then '\0' is returned.
+
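+ For example, a minimal sketch iterating over the characters of a bstring
+ with the macros above (assuming b is a valid bstring):
+
+     int i;
+     for (i = 0; i < blength (b); i++) {
+         if (bchar (b, i) == ' ') { /* ... found a space ... */ }
+     }
+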
+ ..........................................................................
+
+ char * bdatae (bstring b, char * err);
+
+ Returns the char * data portion of the bstring b. If b is NULL, err is
+ returned.
+
+ ..........................................................................
+
+ char * bdata (bstring b);
+
+ Returns the char * data portion of the bstring b. If b is NULL, NULL is
+ returned.
+
+ ..........................................................................
+
+ char * bdataofse (bstring b, int ofs, char * err);
+
+ Returns the char * data portion of the bstring b offset by ofs. If b is
+ NULL, err is returned.
+
+ ..........................................................................
+
+ char * bdataofs (bstring b, int ofs);
+
+ Returns the char * data portion of the bstring b offset by ofs. If b is
+ NULL, NULL is returned.
+
+ ..........................................................................
+
+ struct tagbstring var = bsStatic ("...");
+
+ The bsStatic macro allows for static declarations of literal string
+ constants as struct tagbstring structures. The resulting tagbstring does
+ not need to be freed or destroyed. Note that this macro is only well
+ defined for string literal arguments. For more general string pointers,
+ use the btfromcstr macro.
+
+ The resulting struct tagbstring is permanently write protected. Attempts
+ to write to this struct tagbstring from any bstrlib function will lead to
+ BSTR_ERR being returned. Invoking the bwriteallow macro onto this struct
+ tagbstring has no effect.
+
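+ For example, a minimal sketch (error checks omitted):
+
+     struct tagbstring greeting = bsStatic ("Hello");
+     bstring b = bstrcpy (&greeting);  /* writable copy of the constant */
+     bcatcstr (b, ", world");
+     bdestroy (b);                     /* greeting itself is never destroyed */
+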
+ ..........................................................................
+
+ <void * blk, int len> <- bsStaticBlkParms ("...")
+
+ The bsStaticBlkParms macro emits a pair of comma separated parameters
+ corresponding to the block parameters for the block functions in Bstrlib
+ (i.e., blk2bstr, bcatblk, blk2tbstr, bisstemeqblk, bisstemeqcaselessblk.)
+ Note that this macro is only well defined for string literal arguments.
+
+ Examples:
+
+ bstring b = blk2bstr (bsStaticBlkParms ("Fast init. "));
+ bcatblk (b, bsStaticBlkParms ("No frills fast concatenation."));
+
+ These are faster than using bfromcstr() and bcatcstr() respectively
+ because the length of the inline string is known as a compile time
+ constant. Also note that separate struct tagbstring declarations for
+ holding the output of a bsStatic() macro are not required.
+
+ ..........................................................................
+
+ void btfromcstr (struct tagbstring& t, const char * s);
+
+ Fill in the tagbstring t with the '\0' terminated char buffer s. This
+ action is purely reference oriented; no memory management is done. The
+ data member is just assigned s, and slen is assigned the strlen of s.
+ The s parameter is accessed exactly once in this macro.
+
+ The resulting struct tagbstring is initially write protected. Attempts
+ to write to this struct tagbstring in a write protected state from any
+ bstrlib function will lead to BSTR_ERR being returned. Invoke the
+ bwriteallow on this struct tagbstring to make it writeable (though this
+ requires that s be obtained from a function compatible with malloc.)
+
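+ For example, a minimal sketch passing an ordinary C string to a bstring
+ API call without allocating a copy (assuming argv[1] is a valid '\0'
+ terminated string and haystack is a valid bstring):
+
+     struct tagbstring t;
+     btfromcstr (t, argv[1]);          /* reference only; no allocation */
+     if (binstr (haystack, 0, &t) != BSTR_ERR) { /* ... found ... */ }
+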
+ ..........................................................................
+
+ void btfromblk (struct tagbstring& t, void * s, int len);
+
+ Fill in the tagbstring t with the data buffer s with length len. This
+ action is purely reference oriented; no memory management is done. The
+ data member of t is just assigned s, and slen is assigned len. Note that
+ the buffer is not appended with a '\0' character. The s and len
+ parameters are accessed exactly once each in this macro.
+
+ The resulting struct tagbstring is initially write protected. Attempts
+ to write to this struct tagbstring in a write protected state from any
+ bstrlib function will lead to BSTR_ERR being returned. Invoke the
+ bwriteallow on this struct tagbstring to make it writeable (though this
+ requires that s be obtained from a function compatible with malloc.)
+
+ ..........................................................................
+
+ void btfromblkltrimws (struct tagbstring& t, void * s, int len);
+
+ Fill in the tagbstring t with the data buffer s with length len after it
+ has been left trimmed. This action is purely reference oriented; no
+ memory management is done. The data member of t is just assigned to a
+ pointer inside the buffer s. Note that the buffer is not appended with a
+ '\0' character. The s and len parameters are accessed exactly once each
+ in this macro.
+
+ The resulting struct tagbstring is permanently write protected. Attempts
+ to write to this struct tagbstring from any bstrlib function will lead to
+ BSTR_ERR being returned. Invoking the bwriteallow macro onto this struct
+ tagbstring has no effect.
+
+ ..........................................................................
+
+ void btfromblkrtrimws (struct tagbstring& t, void * s, int len);
+
+ Fill in the tagbstring t with the data buffer s with length len after it
+ has been right trimmed. This action is purely reference oriented; no
+ memory management is done. The data member of t is just assigned to a
+ pointer inside the buffer s. Note that the buffer is not appended with a
+ '\0' character. The s and len parameters are accessed exactly once each
+ in this macro.
+
+ The resulting struct tagbstring is permanently write protected. Attempts
+ to write to this struct tagbstring from any bstrlib function will lead to
+ BSTR_ERR being returned. Invoking the bwriteallow macro onto this struct
+ tagbstring has no effect.
+
+ ..........................................................................
+
+ void btfromblktrimws (struct tagbstring& t, void * s, int len);
+
+ Fill in the tagbstring t with the data buffer s with length len after it
+ has been left and right trimmed. This action is purely reference
+ oriented; no memory management is done. The data member of t is just
+ assigned to a pointer inside the buffer s. Note that the buffer is not
+ appended with a '\0' character. The s and len parameters are accessed
+ exactly once each in this macro.
+
+ The resulting struct tagbstring is permanently write protected. Attempts
+ to write to this struct tagbstring from any bstrlib function will lead to
+ BSTR_ERR being returned. Invoking the bwriteallow macro onto this struct
+ tagbstring has no effect.
+
+ ..........................................................................
+
+ void bmid2tbstr (struct tagbstring& t, bstring b, int pos, int len);
+
+ Fill the tagbstring t with the substring from b, starting from position
+ pos with a length len. The segment is clamped by the boundaries of
+ the bstring b. This action is purely reference oriented; no memory
+ management is done. Note that the buffer is not appended with a '\0'
+ character. Note that the t parameter to this macro may be accessed
+ multiple times. Note that the contents of t will become undefined
+ if the contents of b change or are destroyed.
+
+ The resulting struct tagbstring is permanently write protected. Attempts
+ to write to this struct tagbstring in a write protected state from any
+ bstrlib function will lead to BSTR_ERR being returned. Invoking the
+ bwriteallow macro on this struct tagbstring will have no effect.
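+
+ For example, a minimal sketch:
+
+     struct tagbstring t;
+     bstring b = bfromcstr ("Hello, world");
+     bmid2tbstr (t, b, 7, 5);    /* t references the "world" segment of b; no copy */
+     /* ... read-only use of &t ... */
+     bdestroy (b);               /* the contents of t are now undefined */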
+
+ ..........................................................................
+
+ void bvformata (int& ret, bstring b, const char * format, lastarg);
+
+ Append the bstring b with printf like formatting with the format control
+ string, and the arguments taken from the ... list of arguments after
+ lastarg passed to the containing function. If the containing function
+ does not have ... parameters or lastarg is not the last named parameter
+ before the ... then the results are undefined. If successful, the
+ results are appended to b and BSTR_OK is assigned to ret. Otherwise
+ BSTR_ERR is assigned to ret.
+
+ Example:
+
+ void dbgerror (FILE * fp, const char * fmt, ...) {
+     int ret;
+     bstring b;
+     bvformata (ret, b = bfromcstr ("DBG: "), fmt, fmt);
+     if (BSTR_OK == ret) fputs ((char *) bdata (b), fp);
+     bdestroy (b);
+ }
+
+ Note that if the BSTRLIB_NOVSNP macro was set when bstrlib was compiled,
+ the bvformata macro is not available and attempts to use it will fail to
+ link properly.
+
+ ..........................................................................
+
+ void bwriteprotect (struct tagbstring& t);
+
+ Disallow bstring from being written to via the bstrlib API. Attempts to
+ write to the resulting tagbstring from any bstrlib function will lead to
+ BSTR_ERR being returned.
+
+ Note: bstrings which are write protected cannot be destroyed via bdestroy.
+
+ Note to C++ users: Setting a CBString as write protected will not prevent
+ it from being destroyed by the destructor.
+
+ ..........................................................................
+
+ void bwriteallow (struct tagbstring& t);
+
+ Allow bstring to be written to via the bstrlib API. Note that such an
+ action makes the bstring both writable and destroyable. If the bstring is
+ not legitimately writable (as is the case for struct tagbstrings
+ initialized with a bsStatic value), the results of this are undefined.
+
+ Note that invoking the bwriteallow macro may increase the number of
+ reallocs by one more than necessary for every call to bwriteallow
+ interleaved with any bstring API which writes to this bstring.
+
+ ..........................................................................
+
+ int biswriteprotected (struct tagbstring& t);
+
+ Returns 1 if the bstring is write protected, otherwise 0 is returned.
+
+===============================================================================
+
+The bstest module
+-----------------
+
+The bstest module is just a unit test for the bstrlib module. For correct
+implementations of bstrlib, it should execute with 0 failures being reported.
+This test should be utilized if modifications/customizations to bstrlib have
+been performed. It tests each core bstrlib function with bstrings of every
+mode (read-only, NULL, static and mutable) and ensures that the expected
+semantics are observed (including results that should indicate an error). It
+also tests for aliasing support. Passing bstest is a necessary but not a
+sufficient condition for ensuring the correctness of the bstrlib module.
+
+
+The test module
+---------------
+
+The test module is just a unit test for the bstrwrap module. For correct
+implementations of bstrwrap, it should execute with 0 failures being
+reported. This test should be utilized if modifications/customizations to
+bstrwrap have been performed. It tests each core bstrwrap function with
+CBStrings write protected or not and ensures that the expected semantics are
+observed (including expected exceptions.) Note that exceptions cannot be
+disabled to run this test. Passing test is a necessary but not a sufficient
+condition for ensuring the correctness of the bstrwrap module.
+
+===============================================================================
+
+Using Bstring and CBString as an alternative to the C library
+-------------------------------------------------------------
+
+First let us give a table of C library functions and the alternative bstring
+functions and CBString methods that should be used instead of them.
+
+C-library   Bstring alternative        CBString alternative
+---------   -------------------        --------------------
+gets        bgets                      ::gets
+strcpy      bassign                    = operator
+strncpy     bassignmidstr              ::midstr
+strcat      bconcat                    += operator
+strncat     bconcat + btrunc           += operator + ::trunc
+strtok      bsplit, bsplits            ::split
+sprintf     b(assign)format            ::format
+snprintf    b(assign)format + btrunc   ::format + ::trunc
+vsprintf    bvformata                  bvformata
+
+vsnprintf   bvformata + btrunc         bvformata + btrunc
+vfprintf    bvformata + fputs          use bvformata + fputs
+strcmp      biseq, bstrcmp             comparison operators
+strncmp     bstrncmp, memcmp           bstrncmp, memcmp
+strlen      ->slen, blength            ::length
+strdup      bstrcpy                    constructor
+strset      bpattern                   ::fill
+strstr      binstr                     ::find
+strpbrk     binchr                     ::findchr
+stricmp     bstricmp                   cast & use bstricmp
+strlwr      btolower                   cast & use btolower
+strupr      btoupper                   cast & use btoupper
+strrev      bReverse (aux module)      cast & use bReverse
+strchr      bstrchr                    cast & use bstrchr
+strspnp     use strspn                 use strspn
+ungetc      bsunread                   bsunread
+
+The top 9 C functions listed here are troublesome in that they impose memory
+management in the calling function. The Bstring and CBstring interfaces have
+built-in memory management, so there is far less code with far less potential
+for buffer overrun problems. strtok can only be reliably called as a "leaf"
+calculation, since it (quite bizarrely) maintains hidden internal state. And
+gets is inherently unsafe, since it gives the caller no way to bound the
+amount of input read. The Bstrlib alternatives do not suffer from those
+sorts of problems.
+
+The substitute for strncat can be performed with higher performance by using
+the blk2tbstr macro to create a presized second operand for bconcat.
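+
+As a rough sketch (dst is assumed to be a bstring, src a char buffer and n the
+number of bytes to append):
+
+    struct tagbstring t;
+    blk2tbstr (t, src, n);   /* O(1): reference n bytes of src without copying */
+    bconcat (dst, &t);       /* append them to dst, much like strncat (dst, src, n) */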
+
+C-library   Bstring alternative   CBString alternative
+---------   -------------------   --------------------
+strspn      strspn acceptable     strspn acceptable
+strcspn     strcspn acceptable    strcspn acceptable
+strnset     strnset acceptable    strnset acceptable
+printf      printf acceptable     printf acceptable
+puts        puts acceptable       puts acceptable
+fprintf     fprintf acceptable    fprintf acceptable
+fputs       fputs acceptable      fputs acceptable
+memcmp      memcmp acceptable     memcmp acceptable
+
+Remember that Bstring (and CBstring) functions will automatically append the
+'\0' character to the character data buffer. So by simply accessing the data
+buffer directly, ordinary C string library functions can be called directly
+on them. Note that bstrcmp is not the same as memcmp in exactly the same way
+that strcmp is not the same as memcmp.
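+
+For example (the file name is illustrative; bdata gives access to the '\0'
+terminated buffer):
+
+    bstring fname = bfromcstr ("config.txt");
+    FILE * fp = fopen ((char *) bdata (fname), "r");  /* buffer is '\0' terminated */
+    if (fp != NULL) fclose (fp);
+    bdestroy (fname);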
+
+C-library   Bstring alternative   CBString alternative
+---------   -------------------   --------------------
+fread       balloc + fread        ::alloc + fread
+fgets       balloc + fgets        ::alloc + fgets
+
+These are odd ones because of the exact buffer sizing they require. The
+Bstring and CBString alternatives require that the buffer first be forced to
+hold at least the prescribed length, after which fread or fgets can be used
+on it directly. In practice, however, the automatic memory management of
+Bstring and CBString makes reading specifically sized strings with fgets and
+fread largely unnecessary.
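+
+A minimal sketch of the fread case, assuming an already opened FILE * fp:
+
+    bstring b = bfromcstr ("");
+    if (b != NULL && BSTR_OK == balloc (b, 1024)) {   /* room for 1024 bytes */
+        b->slen = (int) fread (b->data, 1, 1023, fp); /* read into the buffer */
+        b->data[b->slen] = '\0';                      /* keep '\0' termination */
+    }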
+
+Implementation Choices
+----------------------
+
+Overhead:
+.........
+
+The bstring library has more overhead versus straight char buffers for most
+functions. This overhead is essentially just the memory management and
+string header allocation. This overhead usually only shows up for small
+string manipulations. The performance loss has to be considered in
+light of the following:
+
+1) What would be the performance loss of trying to write this management
+ code in one's own application?
+2) Since the bstring library source code is given, a sufficiently powerful
+ modern inlining globally optimizing compiler can remove function call
+ overhead.
+
+Since the data type is exposed, a developer can replace any unsatisfactory
+function with their own inline implementation. And that is beside the main
+point of what the better string library is meant to provide. Any overhead
+lost has to be weighed against the value of the safe abstraction that couples
+memory management with string functionality.
+
+Performance of the C interface:
+...............................
+
+The algorithms used have performance advantages versus the analogous C
+library functions. For example:
+
+1. bfromcstr/blk2str/bstrcpy versus strcpy/strdup. By using memmove instead
+ of strcpy, the break condition of the copy loop is based on an independent
+ counter (that should be allocated in a register) rather than having to
+ check the results of the load. Modern out-of-order executing CPUs can
+ parallelize the final branch mis-predict penalty with the loading of the
+ source string. Some CPUs will also tend to have better built-in hardware
+ support for counted memory moves than load-compare-store. (This is a
+ minor, but non-zero gain.)
+2. biseq versus strcmp. If the strings are unequal in length, biseq will
+ return in O(1) time. If the strings are aliased, or have aliased data
+ buffers, biseq will return in O(1) time. strcmp will always be O(k),
+ where k is the length of the common prefix or the whole string if they are
+ identical.
+3. ->slen versus strlen. ->slen is obviously always O(1), while strlen is
+ always O(n) where n is the length of the string.
+4. bconcat versus strcat. Both rely on precomputing the length of the
+ destination string argument, which will favor the bstring library. On
+ iterated concatenations the performance difference can be enormous.
+5. bsreadln versus fgets. The bsreadln function reads large blocks at a time
+ from the given stream, then parses out lines from the buffers directly.
+ Some C libraries will implement fgets as a loop over single fgetc calls.
+ Testing indicates that the bsreadln approach can be several times faster
+ for fast stream devices (such as a file that has been entirely cached.)
+6. bsplits/bsplitscb versus strspn. Accelerators for the set of match
+ characters are generated only once.
+7. binstr versus strstr. The binstr implementation unrolls the loops to
+ help reduce loop overhead. This will matter if the target string is
+ long and source string is not found very early in the target string.
+ With strstr, while it is possible to unroll the source contents, it is
+ not possible to do so with the destination contents in a way that is
+ effective because every destination character must be tested against
+ '\0' before proceeding to the next character.
+8. bReverse versus strrev. The C function must find the end of the string
+ first before swapping character pairs.
+9. bstrrchr versus no comparable C function. It's not hard to write some C
+ code to search for a character from the end going backwards. But there
+ is no way to do this without computing the length of the string with
+ strlen.
+
+Practical testing indicates that in general Bstrlib is never significantly
+slower than the C library for common operations, while very often having a
+performance advantage that ranges from significant to massive. Even for
+functions like b(n)inchr versus str(c)spn() (where, in theory, there is no
+advantage for the Bstrlib architecture) the performance of Bstrlib is vastly
+superior to most tested C library implementations.
+
+Some of Bstrlib's extra functionality also leads to inevitable performance
+advantages over typical C solutions. For example, using the blk2tbstr macro,
+one can (in O(1) time) generate an internal substring by reference while not
+disturbing the original string. If disturbing the original string is not an
+option, typically, a comparable char * solution would have to make a copy of
+the substring to provide similar functionality. Another example is reverse
+character set scanning -- the str(c)spn functions only scan in a forward
+direction which can complicate some parsing algorithms.
+
+Where high performance char * based algorithms are available, Bstrlib can
+still leverage them by accessing the ->data field on bstrings. So
+realistically Bstrlib can never be significantly slower than any standard
+'\0' terminated char * based solutions.
+
+Performance of the C++ interface:
+.................................
+
+The C++ interface has been designed with an emphasis on abstraction and safety
+first. However, since it is substantially a wrapper for the C bstring
+functions, for longer strings the performance comments described in the
+"Performance of the C interface" section above still apply. Note that the
+(CBString *) type can be directly cast to a (bstring) type, and passed as
+parameters to the C functions (though a CBString must never be passed to
+bdestroy.)
+
+Probably the most controversial choice is performing full bounds checking on
+the [] operator. This decision was made because 1) the fast alternative of
+not bounds checking is still available by first casting the CBString to a
+(const char *) buffer or to a (struct tagbstring) then dereferencing .data and
+2) because the lack of bounds checking is seen as one of the main weaknesses
+of C/C++ versus other languages. Performing this check on every access makes
+individual character extraction slower than in other languages in this one
+respect (other languages' compilers will normally dedicate more resources to
+hoisting or removing bounds checks as necessary) but otherwise brings C++ up
+to the level of other languages in terms of functionality.
+
+It is common for other C++ libraries to leverage the abstractions provided by
+C++ to use reference counting and "copy on write" policies. While these
+techniques can speed up some scenarios, they impose a problem with respect to
+thread safety. bstrings and CBStrings can be properly protected with
+"per-object" mutexes, meaning that two bstrlib calls can be made and execute
+simultaneously, so long as the bstrings and CBstrings are distinct. With a
+reference count and alias before copy on write policy, global mutexes are
+required that prevent multiple calls to the strings library from executing
+simultaneously regardless of whether or not the strings represent the same
+string.
+
+One interesting trade off in CBString is that the default constructor is not
+trivial. I.e., it always prepares a ready to use memory buffer. The purpose
+is to ensure that there is a uniform internal composition for any functioning
+CBString that is compatible with bstrings. It also means that the other
+methods in the class are not forced to perform "late initialization" checks.
+In the end it means that construction of CBStrings is slower than for other
+comparable C++ string classes. Initial testing, however, indicates that
+CBString outperforms std::string and MFC's CString, for example, in all other
+operations. So to work around this weakness it is recommended that CBString
+declarations be pushed outside of inner loops.
+
+Practical testing indicates that with the exception of the caveats given
+above (constructors and safe index character manipulations) the C++ API for
+Bstrlib generally outperforms popular standard C++ string classes. Amongst
+the standard libraries and compilers, the quality of concatenation operations
+varies wildly and very little care has gone into search functions. Bstrlib
+dominates those performance benchmarks.
+
+Memory management:
+..................
+
+The bstring functions which write and modify bstrings will automatically
+reallocate the backing memory for the char buffer whenever it is required to
+grow. The resizing algorithm chosen is to snap the buffer up to the smallest
+power of two that is sufficient to hold the intended new size. Memory
+reallocation is not performed when the required size of the buffer is
+decreased. This behavior can be relied on, and is necessary to make the
+behaviour of balloc deterministic. This trades off additional memory usage
+for decreasing the frequency for required reallocations:
+
+1. For any bstring whose size never exceeds n, its buffer is not ever
+ reallocated more than log_2(n) times for its lifetime.
+2. For any bstring whose size never exceeds n, its buffer is never more than
+ 2*(n+1) in length. (The extra characters beyond 2*n are to allow for the
+ implicit '\0' which is always added by the bstring modifying functions.)
+
+Decreasing the buffer size when the string decreases in size would violate 1)
+above and in real world cases lead to pathological heap thrashing. Similarly,
+allocating more tightly than "least power of 2 greater than necessary" would
+lead to a violation of 1) and have the same potential for heap thrashing.
+
+Property 2) needs emphasizing. Although the memory allocated is always a
+power of 2, for a bstring that grows linearly in size, its buffer memory also
+grows linearly, not exponentially. The reason is that the amount of extra
+space increases with each reallocation, which decreases the frequency of
+future reallocations.
+
+Obviously, given that bstring writing functions may reallocate the data
+buffer backing the target bstring, one should not attempt to cache the data
+buffer address and use it after such bstring functions have been called.
+This includes making reference struct tagbstrings which alias to a writable
+bstring.
+
+balloc or bfromcstralloc can be used to preallocate the minimum amount of
+space used for a given bstring. This will reduce even further the number of
+times the data portion is reallocated. If the length of the string is never
+more than one less than the memory length then there will be no further
+reallocations.
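+
+For example (the 4096 byte figure is arbitrary):
+
+    bstring b = bfromcstralloc (4096, "");  /* empty string with a 4096 byte buffer */
+    /* appends that keep blength (b) below 4096 will never cause a reallocation */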
+
+Note that invoking the bwriteallow macro may increase the number of reallocs
+by one more than necessary for every call to bwriteallow interleaved with any
+bstring API which writes to this bstring.
+
+The library does not use any mechanism for automatic clean up for the C API.
+Thus explicit clean up via calls to bdestroy() are required to avoid memory
+leaks.
+
+Constant and static tagbstrings:
+................................
+
+A struct tagbstring can be write protected from any bstrlib function using
+the bwriteprotect macro. A write protected struct tagbstring can then be
+reset to being writable via the bwriteallow macro. There is, of course, no
+protection from attempts to directly access the bstring members. Modifying a
+bstring which is write protected by direct access has undefined behavior.
+
+static struct tagbstrings can be declared via the bsStatic macro. They are
+considered permanently unwritable. Such struct tagbstrings are declared
+such that attempts to write to them are not well defined. Invoking either
+bwriteallow or bwriteprotect on static struct tagbstrings has no effect.
+
+struct tagbstring's initialized via btfromcstr or blk2tbstr are protected by
+default but can be made writeable via the bwriteallow macro. If bwriteallow
+is called on such struct tagbstring's, it is the programmer's responsibility
+to ensure that:
+
+1) the buffer supplied was allocated from the heap.
+2) bdestroy is not called on this tagbstring (unless the header itself has
+ also been allocated from the heap.)
+3) free is called on the buffer to reclaim its memory.
+
+bwriteallow and bwriteprotect can be invoked on ordinary bstrings (they have
+to be dereferenced with the (*) operator to get the levels of indirection
+correct) to give them write protection.
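+
+For example, a minimal sketch:
+
+    bstring b = bfromcstr ("example");
+    bwriteprotect (*b);    /* writes via the bstrlib API now return BSTR_ERR */
+    /* ... hand b to code that should only read it ... */
+    bwriteallow (*b);      /* writable (and destroyable) again */
+    bdestroy (b);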
+
+Buffer declaration:
+...................
+
+The memory buffer is actually declared "unsigned char *" instead of "char *".
+The reason for this is to trigger compiler warnings whenever uncasted char
+buffers are assigned to the data portion of a bstring. This will draw more
+diligent programmers into taking a second look at the code where they
+have carelessly left off the typically required cast. (Research from
+AT&T/Lucent indicates that additional programmer eyeballs are one of the most
+effective mechanisms at ferreting out bugs.)
+
+Function pointers:
+..................
+
+The bgets, bread and bStream functions use function pointers to obtain
+strings from data streams. The function pointer declarations have been
+specifically chosen to be compatible with the fgetc and fread functions.
+While this may seem to be a convoluted way of implementing fgets and fread
+style functionality, it has been specifically designed this way to ensure
+that there is no dependency on a single narrowly defined set of device
+interfaces, such as just stream I/O. In the embedded world, it's quite
+possible to have environments where such interfaces may not exist in the
+standard C library form. Furthermore, the generalization that this opens up
+allows for more sophisticated uses for these functions (performing an fgets
+like function on a socket, for example.) By using function pointers, it also
+allows such abstract stream interfaces to be created using the bstring library
+itself while not creating a circular dependency.
+
+Use of int's for sizes:
+.......................
+
+This is just a recognition that 16bit platforms with requirements for strings
+that are larger than 64K and 32bit+ platforms with requirements for strings
+that are larger than 4GB are pretty marginal. The main focus is for 32bit
+platforms, and emerging 64bit platforms with reasonable < 4GB string
+requirements. Using ints allows for negative values, which have meaning
+internally to bstrlib.
+
+Semantic consideration:
+.......................
+
+Certain care needs to be taken when copying and aliasing bstrings. A bstring
+is essentially a pointer type which points to a multipart abstract data
+structure. Thus the usage and lifetime of bstrings have semantics that follow
+these considerations. For example:
+
+ bstring a, b;
+ struct tagbstring t;
+
+ a = bfromcstr("Hello"); /* Create new bstring and copy "Hello" into it. */
+ b = a; /* Alias b to the contents of a. */
+ t = *a; /* Create a current instance pseudo-alias of a. */
+ bconcat (a, b); /* Double a and b, t is now undefined. */
+ bdestroy (a); /* Destroy the contents of both a and b. */
+
+Variables of type bstring are really just references that point to real
+bstring objects. The equal operator (=) creates aliases, and the asterisk
+dereference operator (*) creates a kind of alias to the current instance (which
+is generally not useful for any purpose.) Using bstrcpy() is the correct way
+of creating duplicate instances. The ampersand operator (&) is useful for
+creating aliases to struct tagbstrings (remembering that constructed struct
+tagbstrings are not writable by default.)
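+
+For example:
+
+    bstring a = bfromcstr ("Hello");
+    bstring c = bstrcpy (a);   /* c is an independent duplicate, not an alias */
+    bdestroy (a);              /* c remains valid and must be destroyed separately */
+    bdestroy (c);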
+
+CBStrings use complete copy semantics for the equal operator (=), and thus do
+not have these sorts of issues.
+
+Debugging:
+..........
+
+Bstrings have a simple, exposed definition and construction, and the library
+itself is open source. So most debugging is going to be fairly straight-
+forward. But the memory for bstrings comes from the heap, which can often be
+corrupted indirectly, and it might not be obvious what has happened even from
+direct examination of the contents in a debugger or a core dump. There are
+some tools such as Purify, Insure++ and Electric Fence which can help solve
+such problems, however another common approach is to directly instrument the
+calls to malloc, realloc, calloc, free, memcpy, memmove and/or other calls
+by overriding them with macro definitions.
+
+Although the user could hack on the Bstrlib sources directly as necessary to
+perform such an instrumentation, Bstrlib comes with a built-in mechanism for
+doing this. By defining the macro BSTRLIB_MEMORY_DEBUG and providing an
+include file named memdbg.h, the core Bstrlib modules will attempt to include
+this file. In such a file, macros can be defined which override Bstrlib's
+usage of the C standard library.
+
+Rather than calling malloc, realloc, free, memcpy or memmove directly, Bstrlib
+emits the macros bstr__alloc, bstr__realloc, bstr__free, bstr__memcpy and
+bstr__memmove in their place respectively. By default these macros are simply
+assigned to be equivalent to their corresponding C standard library function
+call. However, if they are given earlier macro definitions (via the back
+door include file) they will not be given their default definition. In this
+way Bstrlib's interface to the standard library can be changed but without
+having to directly redefine or link standard library symbols (both of which
+are not strictly ANSI C compliant.)
+
+An example definition might include:
+
+ #define bstr__alloc(sz) X_malloc ((sz), __LINE__, __FILE__)
+
+which might help contextualize heap entries in a debugging environment.
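+
+A hypothetical memdbg.h along these lines (X_malloc and X_free are illustrative
+instrumented wrappers, not part of Bstrlib) might look like:
+
+    #ifndef MEMDBG_H
+    #define MEMDBG_H
+    #include <stdlib.h>
+    void * X_malloc (size_t sz, int line, const char * file);
+    void X_free (void * p, int line, const char * file);
+    #define bstr__alloc(sz) X_malloc ((sz), __LINE__, __FILE__)
+    #define bstr__free(p) X_free ((p), __LINE__, __FILE__)
+    #endif
+
+Any bstr__ macro left undefined in such a file simply falls back to its C
+standard library default.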
+
+The NULL parameter and sanity checking of bstrings is part of the Bstrlib
+API, and thus Bstrlib itself does not present any different modes which would
+correspond to "Debug" or "Release" modes. Bstrlib always contains mechanisms
+which one might think of as debugging features, but retains the performance
+and small memory footprint one would normally associate with release mode
+code.
+
+Integration with Microsoft's Visual Studio debugger:
+....................................................
+
+Microsoft's Visual Studio debugger has a capability of customizable mouse
+float over data type descriptions. This is accomplished by editing the
+AUTOEXP.DAT file to include the following:
+
+ ; new for CBString
+ tagbstring =slen=<slen> mlen=<mlen> <data,st>
+ Bstrlib::CBStringList =count=<size()>
+
+In Visual C++ 6.0 this file is located in the directory:
+
+ C:\Program Files\Microsoft Visual Studio\Common\MSDev98\Bin
+
+and in Visual Studio .NET 2003 it is located here:
+
+ C:\Program Files\Microsoft Visual Studio .NET 2003\Common7\Packages\Debugger
+
+This will improve the ability of debugging with Bstrlib under Visual Studio.
+
+Security
+--------
+
+Bstrlib does not come with explicit security features outside of its fairly
+comprehensive error detection, coupled with its strict semantic support.
+That is to say that certain common security problems, such as buffer overrun,
+constant overwrite, arbitrary truncation etc, are far less likely to happen
+inadvertently. Where it does help, Bstrlib maximizes its advantage by
+providing developers a simple adoption path that lets them leave less secure
+string mechanisms behind. The library will not leave developers wanting, so
+they will be less likely to add new code using a less secure string library
+to add functionality that might be missing from Bstrlib.
+
+That said, there are a number of security ideas not addressed by Bstrlib:
+
+1. Race condition exploitation (i.e., verifying a string's contents, then
+raising the privilege level and executing it as a shell command as two
+non-atomic steps) is well beyond the scope of what Bstrlib can provide. It
+should be noted that MFC's built-in string mutex actually does not solve this
+problem either -- it just removes immediate data corruption as a possible
+outcome of such exploit attempts (it can be argued that this is worse, since
+it will leave no trace of the exploitation). In general, race conditions have
+to be dealt with by careful design and implementation; a string library cannot
+assist with them.
+
+2. Any kind of access control or security attributes to prevent usage in
+dangerous interfaces such as system(). Perl includes a "trust" attribute
+which can be endowed upon strings that are intended to be passed to such
+dangerous interfaces. However, Perl's solution reflects its own limitations
+-- notably that it is not a strongly typed language. In the example code for
+Bstrlib, there is a module called taint.cpp. It demonstrates how to write a
+simple wrapper class for managing "untainted" or trusted strings using the
+type system to prevent questionable mixing of ordinary untrusted strings with
+untainted ones then passing them to dangerous interfaces. In this way the
+security correctness of the code reduces to auditing the direct usages of
+dangerous interfaces or promotions of tainted strings to untainted ones.
+
+3. Encryption of string contents is way beyond the scope of Bstrlib.
+Maintaining encrypted string contents in the futile hopes of thwarting things
+like using system-level debuggers to examine sensitive string data is likely
+to be a wasted effort (imagine a debugger that runs at a higher level than a
+virtual processor where the application runs). For more standard encryption
+usages, since the bstring contents are simply binary blocks of data, this
+should pose no problem for usage with other standard encryption libraries.
+
+Compatibility
+-------------
+
+The Better String Library is known to compile and function correctly with the
+following compilers:
+
+ - Microsoft Visual C++
+ - Watcom C/C++
+ - Intel's C/C++ compiler (Windows)
+ - The GNU C/C++ compiler (cygwin and Linux on PPC64)
+ - Borland C
+ - Turbo C
+
+Setting of configuration options should be unnecessary for these compilers
+(unless exceptions are being disabled or STLport has been added to WATCOM
+C/C++). Bstrlib has been developed with an emphasis on portability. As such
+porting it to other compilers should be straight forward. This package
+includes a porting guide (called porting.txt) which explains what issues may
+exist for porting Bstrlib to different compilers and environments.
+
+ANSI issues
+-----------
+
+1. The function pointer types bNgetc and bNread have prototypes which are very
+similar to, but not exactly the same as fgetc and fread respectively.
+Basically the FILE * parameter is replaced by void *. The purpose of this
+was to allow one to create other functions with fgetc and fread like
+semantics without being tied to ANSI C's file streaming mechanism. I.e., one
+could very easily adapt it to sockets, or simply reading a block of memory,
+or procedurally generated strings (for fractal generation, for example.)
+
+The problem is that invoking the functions (bNgetc)fgetc and (bNread)fread is
+not technically legal in ANSI C. The reason is that the compiler is only
+able to coerce the function pointers themselves into the target type; it
+is unable to perform any cast (implicit or otherwise) on the parameters
+passed once invoked. I.e., if internally void * and FILE * need some kind of
+mechanical coercion, the compiler will not properly perform this conversion
+and thus lead to undefined behavior.
+
+Apparently a platform from Data General called "Eclipse" and another from
+Tandem called "NonStop" have a different representation for pointers to bytes
+and pointers to words, for example, where coercion via casting is necessary.
+(Actual confirmation of the existence of such machines is hard to come by, so
+it is prudent to be skeptical about this information.) However, this is not
+an issue for any known contemporary platforms. One may conclude that such
+platforms are effectively apocryphal even if they do exist.
+
+To correctly work around this problem to the satisfaction of the ANSI
+limitations, one needs to create wrapper functions for fgetc and/or
+fread with the prototypes of bNgetc and/or bNread respectively which perform
+no other action other than to explicitly cast the void * parameter to a
+FILE *, and simply pass the remaining parameters straight to the function
+pointer call.
+
+The wrappers themselves are trivial:
+
+ size_t freadWrap (void * buff, size_t esz, size_t eqty, void * parm) {
+     return fread (buff, esz, eqty, (FILE *) parm);
+ }
+
+ int fgetcWrap (void * parm) {
+     return fgetc ((FILE *) parm);
+ }
+
+These have not been supplied in bstrlib or bstraux to prevent unnecessary
+linking with file I/O functions.
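+
+For example, assuming the fgetcWrap wrapper above, a single line can be read
+from stdin (a sketch; bgets takes the function pointer, an opaque parameter and
+a terminator character):
+
+    bstring line = bgets (fgetcWrap, stdin, '\n');   /* read one line from stdin */
+    if (line != NULL) {
+        /* ... use line ... */
+        bdestroy (line);
+    }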
+
+2. vsnprintf is not available on all compilers. Because of this, the bformat
+and bformata functions (and format and formata methods) are not guaranteed to
+work properly. For those compilers that don't have vsnprintf, the
+BSTRLIB_NOVSNP macro should be set before compiling bstrlib, and the format
+functions/methods will be disabled.
+
+The more recent ANSI C standards have specified the required inclusion of a
+vsnprintf function.
+
+3. The bstrlib function names are not unique in the first 6 characters. This
+is only an issue for older C compiler environments which do not store more
+than 6 characters for function names.
+
+4. The bsafe module defines macros and function names which are part of the
+C library. This simply overrides the definition as expected on all platforms
+tested, however it is not sanctioned by the ANSI standard. This module is
+clearly optional and should be omitted on platforms which disallow its
+undefined semantics.
+
+In practice the real issue is that some compilers in some modes of operation
+can/will inline these standard library functions on a module by module basis
+as they appear in each. The linker will thus have no opportunity to override
+the implementation of these functions for those cases. This can lead to
+inconsistent behaviour of the bsafe module on different platforms and
+compilers.
+
+===============================================================================
+
+Comparison with Microsoft's CString class
+-----------------------------------------
+
+Although developed independently, CBStrings have very similar functionality to
+Microsoft's CString class. However, the bstring library has significant
+advantages over CString:
+
+1. Bstrlib is a C-library as well as a C++ library (using the C++ wrapper).
+
+ - Thus it is compatible with more programming environments and
+ available to a wider population of programmers.
+
+2. The internal structure of a bstring is considered exposed.
+
+ - A single contiguous block of data can be cut into read-only pieces by
+ simply creating headers, without allocating additional memory to create
+ reference copies of each of these sub-strings.
+ - In this way, using bstrings in a totally abstracted way becomes a choice
+ rather than an imposition. Further this choice can be made differently
+ at different layers of applications that use it.
+
+3. Static declaration support precludes the need for constructor
+ invocation.
+
+ - Allows for static declarations of constant strings that have no
+ additional constructor overhead.
+
+4. Bstrlib is not attached to another library.
+
+ - Bstrlib is designed to be easily plugged into any other library
+ collection, without dependencies on other libraries or paradigms (such
+ as "MFC".)
+
+The bstring library also comes with a few additional functions that are not
+available in the CString class:
+
+ - bsetstr
+ - bsplit
+ - bread
+ - breplace (this is different from CString::Replace())
+ - Writable indexed characters (for example a[i]='x')
+
+Interestingly, although Microsoft did implement mid$(), left$() and right$()
+functional analogues (these are functions from GWBASIC), they seem to have
+forgotten that mid$() could also be used to write into the middle of a string.
+This functionality exists in Bstrlib with the bsetstr() and breplace()
+functions.
+
+Among the disadvantages of Bstrlib is that there is no special support for
+localization or wide characters. Such things are considered beyond the scope
+of what bstrings are trying to deliver. CString essentially supports the
+older UCS-2 version of Unicode via wchar_t as an application-wide compile
+time switch.
+
+CStrings also use built-in mechanisms for ensuring thread safety under all
+situations. While this makes writing thread safe code that much easier, this
+built-in safety feature has a price -- the inner loop of each CString method
+runs in its own critical section (grabbing and releasing a lightweight mutex
+on every operation.) The usual way to decrease the impact of a critical
+section performance penalty is to amortize more operations per critical
+section. But since the implementation of CStrings is fixed as a one critical
+section per-operation cost, there is no way to leverage this common
+performance enhancing idea.
+
+The search facilities in Bstrlib are comparable to those in MFC's CString
+class, though it is missing locale specific collation. But because Bstrlib
+is interoperable with C's char buffers, it will allow programmers to write
+their own string searching mechanism (such as Boyer-Moore), or be able to
+choose from a variety of available existing string searching libraries (such
+as those for regular expressions) without difficulty.
+
+Microsoft used a very non-ANSI conforming trick in its implementation to
+allow printf() to use the "%s" specifier to output a CString correctly. This
+can be convenient, but it is inherently not portable. CBString requires an
+explicit cast, while bstring requires the data member to be dereferenced.
+Microsoft's own documentation recommends casting, instead of relying on this
+feature.
+
+Comparison with C++'s std::string
+---------------------------------
+
+This is the C++ language's standard STL based string class.
+
+1. There is no C implementation.
+2. The [] operator is not bounds checked.
+3. Missing a lot of useful functions like printf-like formatting.
+4. Some sub-standard std::string implementations (SGI) are necessarily unsafe
+ to use with multithreading.
+5. Limited by STL's std::iostream which in turn is limited by ifstream which
+ can only take input from files. (Compare to CBStream's API which can take
+ abstracted input.)
+6. Extremely uneven performance across implementations.
+
+Comparison with ISO C TR 24731 proposal
+---------------------------------------
+
+Following the ISO C99 standard, Microsoft has proposed a group of C library
+extensions which are supposedly "safer and more secure". This proposal is
+expected to be adopted by the ISO C standard which follows C99.
+
+The proposal reveals itself to be very similar to Microsoft's "StrSafe"
+library. The functions are basically the same as other standard C library
+string functions except that destination parameters are paired with an
+additional length parameter of type rsize_t. rsize_t is the same as size_t,
+however, the range is checked to make sure it's between 1 and RSIZE_MAX. Like
+Bstrlib, the functions perform a "parameter check". Unlike Bstrlib, when a
+parameter check fails, rather than simply outputting accumulatable error
+statuses, they call a user settable global error function handler, and upon
+return of control performs no (additional) detrimental action. The proposal
+covers basic string functions as well as a few non-reentrant functions
+(asctime, ctime, and strtok).
+
+1. Still based solely on char * buffers (and therefore strlen() and strcat()
+ are still O(n), and there are no faster streq() comparison functions.)
+2. No growable string semantics.
+3. Requires manual buffer length synchronization in the source code.
+4. No attempt to enhance functionality of the C library.
+5. Introduces a new error scenario (strings exceeding RSIZE_MAX length).
+
+The hope is that by exposing the buffer length requirements there will be
+fewer buffer overrun errors. However, the error modes are really just
+transformed, rather than removed. The real problem of buffer overflows is
+that they all happen as a result of erroneous programming. So forcing
+programmers to manually deal with buffer limits will make them more aware of
+the problem but doesn't remove the possibility of erroneous programming. So
+a programmer that erroneously mixes up the rsize_t parameters is no better off
+than a programmer that introduces potential buffer overflows through other
+more typical lapses. So at best this may reduce the rate of erroneous
+programming, rather than making any attempt at removing failure modes.
+
+The error handler can discriminate between types of failures, but does not
+take into account any callsite context. So the problem is that the error is
+going to be manifest in a piece of code, but there is no pointer to that
+code. It would seem that passing in the call site __FILE__, __LINE__ as
+parameters would be very useful, but the API clearly doesn't support such a
+thing (it would increase code bloat even more than the extra length
+parameter does, and would require macro tricks to implement).
+
+The Bstrlib C API takes the position that error handling needs to be done at
+the callsite, and just tries to make it as painless as possible. Furthermore,
+error modes are removed by supporting auto-growing strings and aliasing. For
+capturing errors in more central code fragments, Bstrlib's C++ API uses
+exception handling extensively, which is superior to the leaf-only error
+handler approach.
+
+Comparison with Managed String Library CERT proposal
+----------------------------------------------------
+
+The main webpage for the managed string library:
+http://www.cert.org/secure-coding/managedstring.html
+
+Robert Seacord at CERT has proposed a C string library that he calls the
+"Managed String Library" for C. Like Bstrlib, it introduces a new type
+which is called a managed string. The structure of a managed string
+(string_m) is like a struct tagbstring but missing the length field. This
+internal structure is considered opaque. The length is, like the C standard
+library, always computed on the fly by searching for a terminating NUL on
+every operation that requires it. So it suffers from every performance
+problem that the C standard library suffers from. Interoperating with C
+string APIs (like printf, fopen, or anything else that takes a string
+parameter) requires copying to additionally allocated buffers that have to
+be manually freed -- this makes this library probably slower and more
+cumbersome than any other string library in existence.
+
+The library gives a fully populated error status as the return value of every
+string function. The hope is to be able to diagnose all problems
+specifically from the return code alone. Comparing this to Bstrlib, which
+always returns one consistent error message, might make it seem that Bstrlib
+would be harder to debug; but this is not true. With Bstrlib, if an error
+occurs there is always enough information from just knowing there was an error
+and examining the parameters to deduce exactly what kind of error has
+happened. The managed string library thus gives up the ability to nest
+function calls while achieving little real benefit, whereas Bstrlib does not.
+
+One interesting feature that "managed strings" has is the idea of data
+sanitization via character set whitelisting. That is to say, a globally
+definable filter that makes any attempt to put invalid characters into strings
+lead to an error and not modify the string. The author gives the following
+example:
+
+ // create valid char set
+ if (retValue = strcreate_m(&str1, "abc") ) {
+     fprintf(
+         stderr,
+         "Error %d from strcreate_m.\n",
+         retValue
+     );
+ }
+ if (retValue = setcharset(str1)) {
+     fprintf(
+         stderr,
+         "Error %d from setcharset().\n",
+         retValue
+     );
+ }
+ if (retValue = strcreate_m(&str1, "aabbccabc")) {
+     fprintf(
+         stderr,
+         "Error %d from strcreate_m.\n",
+         retValue
+     );
+ }
+ // create string with invalid char set
+ if (retValue = strcreate_m(&str1, "abbccdabc")) {
+     fprintf(
+         stderr,
+         "Error %d from strcreate_m.\n",
+         retValue
+     );
+ }
+
+Which we can compare with a more Bstrlib way of doing things:
+
+ bstring bCreateWithFilter (const char * cstr, const_bstring filter) {
+     bstring b = bfromcstr (cstr);
+     if (BSTR_ERR != bninchr (b, filter) && NULL != b) {
+         fprintf (stderr, "Filter violation.\n");
+         bdestroy (b);
+         b = NULL;
+     }
+     return b;
+ }
+
+ struct tagbstring charFilter = bsStatic ("abc");
+ bstring str1 = bCreateWithFilter ("aabbccabc", &charFilter);
+ bstring str2 = bCreateWithFilter ("aabbccdabc", &charFilter);
+
+The first thing we should notice is that with the Bstrlib approach you can
+have different filters for different strings if necessary. Furthermore,
+selecting a charset filter in the Managed String Library is uni-contextual.
+That is to say, there can only be one such filter active for the entire
+program, which means its usage is not well defined for intermediate library
+usage (a library that uses it will interfere with user code that uses it, and
+vice versa.) It is also likely to be poorly defined in multi-threading
+environments.
+
+There is also a question as to whether the data sanitization filter is checked
+on every operation, or just on creation operations. Since the charset can be
+set arbitrarily at run time, it might be set *after* some managed strings have
+been created. This would seem to imply that all functions should run this
+additional check every time if there is an attempt to enforce this. This
+would make things tremendously slow. On the other hand, if it is assumed that
+only creates and other operations that take char *'s as input need be checked
+because the charset was only supposed to be set once, before any
+other managed string was created, then one can see that it's easy to cover
+Bstrlib with equivalent functionality via a few wrapper calls such as the
+example given above.
+
+And finally we have to question the value of sanitization in the first place.
+For example, for httpd servers, there is generally a requirement that the
+URLs parsed have some form that avoids undesirable translation to local file
+system filenames or resources. The problem is that the way URLs can be
+encoded, it must be completely parsed and translated to know if it is using
+certain invalid character combinations. That is to say, merely filtering
+each character one at a time is not necessarily the right way to ensure that
+a string has safe contents.
+
+In the article that describes this proposal, it is claimed that it fairly
+closely approximates the existing C API semantics. On this point we should
+compare this "closeness" with Bstrlib:
+
+                    Bstrlib                Managed String Library
+                    -------                ----------------------
+
+Pointer arithmetic  Segment arithmetic     N/A
+
+Use in C Std lib    ->data, or bdata{e}    getstr_m(x,*) ... free(x)
+
+String literals     bsStatic, bsStaticBlk  strcreate_m()
+
+Transparency        Complete               None
+
+It's pretty clear that the semantic mapping from C strings to Bstrlib is fairly
+straightforward, and that in general semantic capabilities are the same or
+superior in Bstrlib. On the other hand the Managed String Library is either
+missing semantics or changes things fairly significantly.
+
+Comparison with Annexia's c2lib library
+---------------------------------------
+
+This library is available at:
+http://www.annexia.org/freeware/c2lib
+
+1. Still based solely on char * buffers (and therefore strlen() and strcat()
+ are still O(n), and there are no faster streq() comparison functions.)
+ Their suggestion that alternatives which wrap the string data type (such as
+ bstring does) impose a difficulty in interoperating with the C language's
+ ordinary C string library is unfounded.
+2. Introduction of memory (and vector?) abstractions imposes a learning
+ curve, and some kind of memory usage policy that is outside of the strings
+ themselves (and therefore must be maintained by the developer.)
+3. The API is massive, and filled with all sorts of trivial (pjoin) and
+ controversial (pmatch -- regular expressions are not sufficiently
+ standardized, and there is a very large difference in performance between
+ compiled and non-compiled REs) functions. Bstrlib takes a decidedly
+ minimal approach -- none of the functionality in c2lib is difficult or
+ challenging to implement on top of Bstrlib (except the regex stuff, which
+ is going to be difficult, and controversial no matter what.)
+4. Understanding why c2lib is the way it is pretty much requires a working
+ knowledge of Perl. bstrlib requires only knowledge of the C string library
+ while providing just a very select few worthwhile extras.
+5. It is attached to a lot of cruft like a matrix math library (that doesn't
+ include any functions for getting the determinant, eigenvectors,
+ eigenvalues, the matrix inverse, test for singularity, test for
+ orthogonality, a Gram-Schmidt orthogonalization, LU decomposition ... I
+ mean why bother?)
+
+Convincing a development house to use c2lib is likely quite difficult. It
+introduces too much, while not being part of any kind of standards body. The
+code must therefore be trusted, or maintained by those that use it. While
+bstring offers nothing more on this front, since its so much smaller, covers
+far less in terms of scope, and will typically improve string performance,
+the barrier to usage should be much smaller.
+
+Comparison with stralloc/qmail
+------------------------------
+
+More information about this library can be found here:
+http://www.canonical.org/~kragen/stralloc.html or here:
+http://cr.yp.to/lib/stralloc.html
+
+1. Library is very very minimal. A little too minimal.
+2. Untargeted source parameters are not declared const.
+3. Slightly different expected emphasis (like _cats function which takes an
+ ordinary C string char buffer as a parameter.) Its clear that the
+ remainder of the C string library is still required to perform more
+ useful string operations.
+
+The struct declaration for their string header is essentially the same as that
+for bstring. But it's clear that this was a quickly written hack whose goals
+are clearly a subset of what Bstrlib supplies. For anyone who is served by
+stralloc, Bstrlib is a complete substitute that just adds more functionality.
+
+stralloc actually uses the interesting policy that a NULL data pointer
+indicates an empty string. In this way, non-static empty strings can be
+declared without construction. This advantage is minimal, since static empty
+bstrings can be declared inline without construction, and if the string needs
+to be written to it should be constructed from an empty string (or its first
+initializer) in any event.
+
+wxString class
+--------------
+
+This is the string class used in the wxWindows project. A description of
+wxString can be found here:
+http://www.wxwindows.org/manuals/2.4.2/wx368.htm#wxstring
+
+This C++ library is similar to CBString. However, it is littered with
+trivial functions (IsAscii, UpperCase, RemoveLast etc.)
+
+1. There is no C implementation.
+2. The memory management strategy is to allocate a bounded fixed amount of
+ additional space on each resize, meaning that it does not have the
+ log_2(n) property that Bstrlib has (it will thrash very easily, cause
+ massive fragmentation in common heap implementations, and can easily be a
+ common source of performance problems).
+3. The library uses a "copy on write" strategy, meaning that it has to deal
+ with multithreading problems.
+
+Vstr
+----
+
+This is a highly orthogonal C string library with an emphasis on
+networking/realtime programming. It can be found here:
+http://www.and.org/vstr/
+
+1. The convoluted internal structure does not contain a '\0' char * compatible
+ buffer, so interoperability with the C library is a non-starter.
+2. The API and implementation is very large (owing to its orthogonality) and
+ can lead to difficulty in understanding its exact functionality.
+3. An obvious dependency on gnu tools (confusing make configure step.)
+4. Uses a reference counting system, meaning that it is not likely to be
+ thread safe.
+
+The implementation has an extreme emphasis on performance for nontrivial
+actions (adds, inserts and deletes are all constant or roughly O(#operations)
+time) following the "zero copy" principle. This trades off performance of
+trivial functions (character access, char buffer access/coercion, alias
+detection) which becomes significantly slower, as well as incremental
+accumulative costs for its searching/parsing functions. Whether or not Vstr
+wins any particular performance benchmark will depend a lot on the benchmark,
+but it should handily win on some, while losing dreadfully on others.
+
+The learning curve for Vstr is very steep, and it doesn't come with any
+obvious way to build for Windows or other platforms without gnu tools. At
+least one mechanism (the iterator) introduces a new undefined scenario
+(writing to a Vstr while iterating through it.) Vstr has a very large
+footprint, and is very ambitious in its total functionality. Vstr has no C++
+API.
+
+Vstr usage requires context initialization via vstr_init() which must be run
+in a thread-local context. Given the totally reference based architecture
+this means that sharing Vstrings across threads is not well defined, or at
+least not safe from race conditions. This API is clearly geared to the older
+standard of fork() style multitasking in UNIX, and is not safely transportable
+to modern shared memory multithreading available in Linux and Windows. There
+is no portable external solution making the library thread safe (since it
+requires a mutex around each Vstr context -- not each string.)
+
+In the documentation for this library, a big deal is made of its self hosted
+s(n)printf-like function. This is an issue for older compilers that don't
+include vsnprintf(), but also an issue because Vstr has a slow conversion to
+'\0' terminated char * mechanism. That is to say, using "%s" to format data
+that originates from Vstr would be slow without some sort of native function
+to do so. Bstrlib sidesteps the issue by relying on what snprintf-like
+functionality does exist and having a high performance conversion to a char *
+compatible string so that "%s" can be used directly.
+
+Str Library
+-----------
+
+This is a fairly extensive string library that includes full unicode support
+and is targeted at the goal of outperforming MFC and STL. The architecture,
+similarly to MFC's CStrings, is a copy on write reference counting mechanism.
+
+http://www.utilitycode.com/str/default.aspx
+
+1. Commercial.
+2. C++ only.
+
+This library, like Vstr, uses a ref counting system. There is only so deeply
+I can analyze it, since I don't have a license for it. However, performance
+improvements over MFC and STL don't seem like a sufficient reason to
+move your source base to it. For example, in the future, Microsoft may
+improve the performance of CString.
+
+It should be pointed out that performance testing of Bstrlib has indicated
+that its relative performance advantage versus MFC's CString and STL's
+std::string is at least as high as that for the Str library.
+
+libmib astrings
+---------------
+
+A handful of functional extensions to the C library that add dynamic string
+functionality.
+http://www.mibsoftware.com/libmib/astring/
+
+This package basically references strings through char ** pointers and assumes
+they are pointing to the top of an allocated heap entry (or NULL, in which
+case memory will be newly allocated from the heap.) So it's still up to the user
+to mix and match the older C string functions with these functions whenever
+pointer arithmetic is used (i.e., there is no leveraging of the type system
+to assert semantic differences between references and base strings as Bstrlib
+does since no new types are introduced.) Unlike Bstrlib, exact string length
+meta data is not stored, thus requiring a strlen() call on *every* string
+writing operation. The library is very small, covering only a handful of C's
+functions.
+
+While this is better than nothing, it is clearly slower than even the
+standard C library, less safe and less functional than Bstrlib.
+
+To explain the advantage of using libmib, their website shows an example of
+how dangerous C code:
+
+ char buf[256];
+ char *pszExtraPath = ";/usr/local/bin";
+
+ strcpy(buf,getenv("PATH")); /* oops! could overrun! */
+ strcat(buf,pszExtraPath); /* Could overrun as well! */
+
+ printf("Checking...%s\n",buf); /* Some printfs overrun too! */
+
+is avoided using libmib:
+
+ char *pasz = 0; /* Must initialize to 0 */
+ char *paszOut = 0;
+ char *pszExtraPath = ";/usr/local/bin";
+
+ if (!astrcpy(&pasz,getenv("PATH"))) /* malloc error */ exit(-1);
+ if (!astrcat(&pasz,pszExtraPath)) /* malloc error */ exit(-1);
+
+ /* Finally, a "limitless" printf! we can use */
+ asprintf(&paszOut,"Checking...%s\n",pasz);fputs(paszOut,stdout);
+
+ astrfree(&pasz); /* Can use free(pasz) also. */
+ astrfree(&paszOut);
+
+However, compare this to Bstrlib:
+
+ bstring b, out;
+
+ bcatcstr (b = bfromcstr (getenv ("PATH")), ";/usr/local/bin");
+ out = bformat ("Checking...%s\n", bdatae (b, "<Out of memory>"));
+ /* if (out && b) */ fputs (bdatae (out, "<Out of memory>"), stdout);
+ bdestroy (b);
+ bdestroy (out);
+
+Besides being shorter, we can see that error handling can be deferred right
+to the very end. Also, unlike the above two versions, if getenv() returns
+NULL, the Bstrlib version will not exhibit undefined behavior.
+Initialization starts with the relevant content rather than an extra
+autoinitialization step.
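+
+If the caller prefers to detect the failure rather than print the placeholder
+text, a single check of both handles at the end is enough (a sketch, not part
+of the original example; bdestroy () accepts NULL without harm):
+
+    if (b == NULL || out == NULL) {
+        /* some allocation above failed */
+        fprintf (stderr, "out of memory\n");
+    }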
+
+libclc
+------
+
+An attempt to add to the standard C library with a number of common useful
+functions, including additional string functions.
+http://libclc.sourceforge.net/
+
+1. Uses standard char * buffer, and adopts C 99's usage of "restrict" to pass
+ the responsibility to guard against aliasing to the programmer.
+2. Adds no safety or memory management whatsoever.
+3. Most of the supplied string functions are completely trivial.
+
+The goals of libclc and Bstrlib are clearly quite different.
+
+fireString
+----------
+
+http://firestuff.org/
+
+1. Uses standard char * buffer, and adopts C 99's usage of "restrict" to pass
+ the responsibility to guard against aliasing to the programmer.
+2. Mixes char * and length wrapped buffers (estr) functions, doubling the API
+ size, with safety limited to only half of the functions.
+
+Firestring was originally just a wrapper of char * functionality with extra
+length parameters. However, it has been augmented with the inclusion of the
+estr type which has similar functionality to stralloc. But firestring does
+not nearly cover the functional scope of Bstrlib.
+
+Safe C String Library
+---------------------
+
+A library written for the purpose of increasing safety and power to C's string
+handling capabilities.
+http://www.zork.org/safestr/safestr.html
+
+1. While the safestr_* functions are safe in and of themselves, interoperating
+   with char * strings has dangerous, unsafe modes of operation.
+2. The architecture of safestr causes the base pointer to change. Thus,
+   it's not practical/safe to store a safestr in multiple locations if any
+   single instance can be manipulated.
+3. Dependent on an additional error handling library.
+4. Uses reference counting, meaning that it is either not thread safe or
+ slow and not portable.
+
+I think the idea of reallocating (and hence potentially changing) the base
+pointer is a serious design flaw that is fatal to this architecture. True
+safety is obtained by having automatic handling of all common scenarios
+without creating implicit constraints on the user.
+
+Because of its automatic temporary clean up system, it cannot use "const"
+semantics on input arguments. Interesting anomalies such as:
+
+ safestr_t s, t;
+ s = safestr_replace (t = SAFESTR_TEMP ("This is a test"),
+ SAFESTR_TEMP (" "), SAFESTR_TEMP ("."));
+ /* t is now undefined. */
+
+are possible. If one defines a function which takes a safestr_t as a
+parameter, then the function would not know whether or not the safestr_t is
+defined after it passes it to a safestr library function. The author's
+recommended method for working around this problem is to examine the
+attributes of the safestr_t within the function which is to modify any of
+its parameters and play games with its reference count. I think, therefore,
+that the whole SAFESTR_TEMP idea is also fatally broken.
+
+The library implements immutability, optional non-resizability, and a "trust"
+flag. This trust flag is interesting, and suggests that applying any
+arbitrary sequence of safestr_* function calls on any set of trusted strings
+will result in a trusted string. It seems to me, however, that if one wanted
+to implement a trusted string semantic, one might do so by actually creating
+a different *type* and only implement the subset of string functions that are
+deemed safe (i.e., user input would be excluded, for example.) This, in
+essence, would allow the compiler to enforce trust propagation at compile
+time rather than run time. Non-resizability is also interesting; however,
+it seems marginal (i.e., to want a string that cannot be resized, yet can be
+modified and yet where a fixed sized buffer is undesirable.)
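+
+For illustration only (this sketch is not part of Bstrlib or of safestr), such
+a compile time distinction could be as simple as wrapping the string handle in
+a distinct struct type and implementing the "safe" subset of functions against
+that type only:
+
+    #include "bstrlib.h"
+
+    struct taintstr { bstring s; };  /* may hold arbitrary user input     */
+    struct truststr { bstring s; };  /* only produced by vetted functions */
+
+    /* Accepts trusted strings only; passing a struct taintstr here is a
+       compile time type error, so trust propagation is checked by the
+       compiler rather than at run time. */
+    extern int run_query (struct truststr q);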
+
+===============================================================================
+
+Examples
+--------
+
+ Dumping a line numbered file:
+
+ FILE * fp;
+ int i, ret;
+ struct bstrList * lines;
+ struct tagbstring prefix = bsStatic ("-> ");
+
+ if (NULL != (fp = fopen ("bstrlib.txt", "rb"))) {
+ bstring b = bread ((bNread) fread, fp);
+ fclose (fp);
+ if (NULL != (lines = bsplit (b, '\n'))) {
+ for (i=0; i < lines->qty; i++) {
+ binsert (lines->entry[i], 0, &prefix, '?');
+ printf ("%04d: %s\n", i, bdatae (lines->entry[i], "NULL"));
+ }
+ bstrListDestroy (lines);
+ }
+ bdestroy (b);
+ }
+
+For numerous other examples, see bstraux.c, bstraux.h and the example archive.
+
+===============================================================================
+
+License
+-------
+
+The Better String Library is available under either the BSD license (see the
+accompanying license.txt) or the Gnu Public License version 2 (see the
+accompanying gpl.txt) at the option of the user.
+
+===============================================================================
+
+Acknowledgements
+----------------
+
+The following individuals have made significant contributions to the design
+and testing of the Better String Library:
+
+Bjorn Augestad
+Clint Olsen
+Darryl Bleau
+Fabian Cenedese
+Graham Wideman
+Ignacio Burgueno
+International Business Machines Corporation
+Ira Mica
+John Kortink
+Manuel Woelker
+Marcel van Kervinck
+Michael Hsieh
+Richard A. Smith
+Simon Ekstrom
+Wayne Scott
+
+===============================================================================
diff --git a/doc/likwid-accessD.1 b/doc/likwid-accessD.1
index 7d444af..7285772 100644
--- a/doc/likwid-accessD.1
+++ b/doc/likwid-accessD.1
@@ -1,7 +1,7 @@
.TH LIKWID-ACCESSD 1 <DATE> likwid\-<VERSION>
.SH NAME
likwid-accessD \- This tool forwards the access operations from LIKWID PerfMon tools
-to the MSR device files
+to the MSR and PCI device files
.SH DESCRIPTION
.B likwid-accessD
is a command line application that opens a UNIX file socket and waits for access
@@ -9,14 +9,14 @@ operations from LIKWID tools that require access to the MSR and PCI device
files. The MSR and PCI device files are only accessible for users with root
privileges, therefore
.B likwid-accessD
-requires the suid-bit set.
+requires the suid-bit set or a suitable libcap setting.
Depending on the current system architecture,
.B likwid-accessD
permits only access to registers defined for the architecture.
.SH AUTHOR
-Written by Thomas Roehl <thomas.roehl at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
.SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
.SH "SEE ALSO"
-likwid-perfctr(1), likwid-powermeter(1), likwid-features(1), likwid-pin(1), likwid-topology(1),
+likwid-perfctr(1), likwid-powermeter(1), likwid-features(1)
diff --git a/doc/likwid-agent.1 b/doc/likwid-agent.1
new file mode 100644
index 0000000..f50dbca
--- /dev/null
+++ b/doc/likwid-agent.1
@@ -0,0 +1,94 @@
+.TH LIKWID-AGENT 1 <DATE> likwid\-VERSION
+.SH NAME
+likwid-agent \- monitoring daemon for hardware performance counters
+.SH SYNOPSIS
+.B likwid-agent <config_file>
+.SH DESCRIPTION
+.B likwid-agent
+is a daemon application that uses
+.B likwid-perfctr(1)
+to measure hardware performance counters. The basic configuration is in a global configuration file. The configuration of the hardware event sets is done with extra files suitable for each architecture. Besides the hardware event configuration, the raw data can be transformed into metrics of interest using formulas. In order not to output too much data, the data can be further filtered or aggregated.
+.B likwid-agent
+provides multiple store backends like logfiles, RRD (Round Robin Database) or gmetric (Ganglia Monitoring System).
+
+.SH CONFIG FILE
+The global configuration file has the following options:
+.TP
+.B GROUPPATH <path>
+Path to the group files containing event set and output definitions. See section
+.B GROUP FILES
+for information.
+.TP
+.B EVENTSET <group1> <group2> ...
+Space separated list of groups (without .txt) that should be monitored.
+.TP
+.B DURATION <time>
+Measurement duration in seconds.
+.TP
+.B LOGPATH <path>
+Specify a logfile.
+.TP
+.B GMETRIC <True/False>
+Activates the output to gmetric.
+.TP
+.B GMETRICPATH <path>
+Set path to the gmetric executable.
+.TP
+.B GMETRICCONFIG <path>
+Set a custom configuration file if needed for gmetric.
+.TP
+.B RRD <True/False>
+Activates the output to RRD files (Round Robin Database).
+.TP
+.B RRDPATH <path>
+Output path for the RRD files. The files are named according to the group and each output metric is saved as DS with function GAUGE. The RRD is configured with RRA entries to store average, minimum and maximum of 10 minutes for one hour, of 60 min for one day and daily data for one month.
+.TP
+.B SYSLOG <True/False>
+Activates the output to system log using logger.
+.TP
+.B SYSLOGPRIO <prio>
+Set the priority string for logger, default is 'local0.notice'.
+
+.SH GROUP FILES
+The group files are adapted performance group files as used by
+.B likwid-perfctr(1).
+This makes it easy to use the predefined and often used performance groups as a basis for the monitoring. The folder structure for the groups is
+.B <GROUPPATH>/<SHORT_ARCH_NAME>/
+with
+.B <SHORT_ARCH_NAME>
+similar to the ones for the performance groups, like 'sandybridge' or 'haswellEP'.
+.TP
+.B SHORT <string>
+Short descriptive information about the group.
+.TP
+.B EVENTSET
+.TP
+.B <counter1> <event1>
+.TP
+.B <counter2>:<option> <event2>
+Definition of the event set, similar to the performance groups.
+.TP
+.B METRICS
+.TP
+.B <metricname> <formula>
+.TP
+.B <filter> <metricname> <formula>
+Definition of the output metrics. The syntax follows the
+.B METRICS
+defintion of the performance groups as used by
+.B likwid-perfctr(1).
+If no function is set at the beginning of the line,
+.B <formula>
+is evaluated for every CPU and sent to the output backends. The
+.B <metricname>
+gets the prefix "T<cpuid> ". To avoid writing too much data to the backends, the data can be reduced by
+.B <filter>.
+The possible filter options are MIN, MAX, AVG, SUM and ONCE. The ONCE filter sends only the data of the first CPU to the output backends; it is commonly used for metrics like the measurement duration.
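+.PP
+A minimal group file following this syntax could look as follows (the event,
+counter and metric names are only illustrative; see the performance groups
+shipped with
+.B likwid-perfctr(1)
+for definitions valid on your architecture):
+.nf
+SHORT Cycles per instruction
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+METRICS
+AVG CPI FIXC1/FIXC0
+.fi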
+
+
+.SH AUTHOR
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
+.SH BUGS
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
+.SH "SEE ALSO"
+likwid-perfctr(1), rrdtool(1), gmetric(1)
diff --git a/doc/likwid-bench.1 b/doc/likwid-bench.1
index 45d0f6c..3a1d719 100644
--- a/doc/likwid-bench.1
+++ b/doc/likwid-bench.1
@@ -5,29 +5,31 @@ likwid-bench \- low-level benchmark suite and microbenchmarking framework
.SH SYNOPSIS
.B likwid-bench
.RB [\-hap]
-.RB [ \-l
-.IR <testname> ]
-.RB [ \-i
-.IR <iterations> ]
-.RB [ \-g
-.IR <number_of_workgroups> ]
.RB [ \-t
.IR <testname> ]
+.RB [ \-s
+.IR <min_time> ]
.RB [ \-w
.IR <workgroup_expression> ]
+.RB [ \-l
+.IR <testname> ]
+.RB [ \-d
+.IR <delimiter> ]
+.RB [ \-i
+.IR <iterations> ]
.SH DESCRIPTION
.B likwid-bench
is a benchmark suite for low-level (assembly) benchmarks to measure bandwidths and instruction throughput for specific instruction code on x86 systems. The currently included benchmark codes include common data access patterns like load and store but also calculations like vector triad and sum.
.B likwid-bench
-includes architecture specific benchmarks for x86, x86_64 and x86 for Intel Xeon Phi coprocessors. The performance values can either be calculated by
-.B likwid-bench
-or measured using performance counters by using.
+includes architecture specific benchmarks for x86, x86_64 and x86 for Intel Xeon Phi coprocessors. The performance values can either be calculated by
+.B likwid-bench
+or measured using performance counters by using
.B likwid-perfctr
as a wrapper to
.B likwid-bench.
This requires to build
-.B likwid-bench.
-with Instrumentation which can be enabled in config.mk.
+.B likwid-bench
+with instrumentation enabled in config.mk.
.SH OPTIONS
.TP
.B \-\^h
@@ -39,77 +41,130 @@ list available benchmark codes for the current system.
.B \-\^p
list available thread domains.
.TP
-.B \-\^l " <testname>"
-list properties of a benchmark code.
-.TP
-.B \-\^i " <iterations>"
-number of iterations to perform inside the benchmark code.
+.B \-\^s <min_time>
+Run the benchmark for at least
+.B <min_time> seconds.
+The number of iterations is determined using this value. Default: 1 second.
.TP
-.B \-\^t " <testname>"
+.B \-\^t <testname>
Name of the benchmark code to run (mandatory).
.TP
-.B \-\^g " <number_of_workgroups>"
-specify the number of workgroups to perform the benchmark code on (mandatory).
-.TP
-.B \-\^w " <workgroup_expression>"
+.B \-\^w <workgroup_expression>
Specify the affinity domain, thread count and data set size for the current benchmarking run (mandatory).
+.TP
+.B \-\^l <testname>
+list properties of a benchmark code.
+.TP
+.B \-\^i <iterations>
+Set the number of iterations per thread (optional)
.SH WORKGROUP SYNTAX
.B <thread_domain>:<size> [:<num_threads>[:<chunk_size>:<stride>]] [-<streamId>:<domain_id>]
-with size in kB, MB or GB. Where thread domain is where threads are placed. Size is the total data set size for the benchmark. num_threads specifies how many threads are used. Threads are always placed using a compact policy in
+with size in kB, MB or GB. The
+.B <thread_domain>
+defines where the threads are placed.
+.B <size>
+is the total data set size for the benchmark, the allocated vectors in memory sum up to this size.
+.B <num_threads>
+specifies how many threads are used in the
+.B <thread_domain>.
+Threads are always placed using a compact policy in
.B likwid-bench.
This means that per default all SMT threads are used. Optionally similar a the expression based syntax in
.B likwid-pin
-a chunk size and stride can be provided. Optionally for every stream means array the placement can be controlled. Per default all arrays are placed in the same thread domain the threads are running in. To place the data in a different domain for every stream of a benchmark case (the total number of streams can be aquired by the \-l option) the domain to place the data in can be specified. Multiple streams are comma separated. Either the placement is provided or all streams have to be ex [...]
+a
+.B <chunk_size>
+and
+.B <stride>
+can be provided. Optionally for every stream (array, vector) the placement can be controlled. Per default all arrays are placed in the same
+.B <thread_domain>
+the threads are running in. To place the data in a different domain for every stream of a benchmark case (the total number of streams can be acquired by the
+.B \-l
+option) the domain to place the data in can be specified. Multiple streams are comma separated. Either the placement is provided or all streams have to be explicitly placed. Please refer to the Wiki pages on
.B http://code.google.com/p/likwid/wiki/LikwidBench
for further details and examples on usage.
.SH EXAMPLE
.IP 1. 4
-Run the copy benchmark with 1000 iterations on socket 0 with a total data set size of 100kB.
+Run the
+.B copy
+benchmark on socket 0 (
+.B S0
+) with a total data set size of
+.B 100kB.
.TP
-.B likwid-bench -t copy -i 1000 -g 1 -w S0:100kB
+.B likwid-bench -t copy -w S0:100kB
.PP
Since no
-.B num_thread
-is given in the workload expression, each core of socket 0 gets one thread. The workload is split up between all threads.
+.B <num_threads>
+is given in the workload expression, each core of socket 0 gets one thread. The workload is split up between all threads and the number of iterations is determined automatically.
.IP 2. 4
-Run the triad benchmark code with 100 iterations with 2 threads on the socket 0 and a data size of 1 GB.
+Run the
+.B triad
+benchmark code with explicitly
+.B 100
+iterations per thread with
+.B 2
+threads on the socket 0 (
+.B S0
+) and a data size of
+.B 1GB.
.TP
-.B likwid-bench -t triad -i 100 -g 1 -w S0:1GB:2:1:2
+.B likwid-bench -t triad -i 100 -w S0:1GB:2:1:2
.PP
-Assuming socket 0 has 4 SMT threads, one thread is assigned to each physical core of socket 0.
+Assuming socket 0 (
+.B S0
+) has 2 physical cores with SMT enabled, hence in total 4 hardware threads, one thread is assigned to each physical core of socket 0.
.IP 3. 4
-Run the update benchmark with 1000 iterations on socket 0 with a workload of 100kB and on socket 1 with the same workload.
+Run the
+.B update
+benchmark on socket 0 (
+.B S0
+) with a workload of
+.B 100kB
+and on socket 1 (
+.B S1
+) with the same workload.
.TP
-.B likwid-bench -t update -i 1000 -g 2 -w S0:100kB -w S1:100kB
+.B likwid-bench -t update -w S0:100kB -w S1:100kB
.PP
The results of both workgroups are combinded for the output. Hence the workload in each workgroup expression should have the same size.
.IP 4. 4
-Run the copy benchmark but measure the memory traffic with
+Run the
+.B copy
+benchmark but measure the memory traffic with
.B likwid-perfctr.
-The option INSTRUMENT_BENCH in config.mk needs to be true at compile time to use that feature.
+The option
+.B INSTRUMENT_BENCH
+in
+.B config.mk
+needs to be true at compile time to use that feature.
.TP
-.B likwid-perfctr -C E:S0:4 -g MEM -m likwid-bench -t update -i 1000 -g 1 -w S0:100kB
+.B likwid-perfctr -c E:S0:4 -g MEM -m likwid-bench -t update -w S0:100kB
.PP
-.B likwid-perfctr
-will configure and start the performance counters on socket 0 with 4 threads prior to the execution of
+.B likwid-perfctr
+will configure and start the performance counters on socket 0 (
+.B S0
+) with 4 threads prior to the execution of
.B likwid-bench.
-The performance counters are read right before and after running the benchmarking code to
-minimize the interferences of the measurement.
+The performance counters are read right before and after running the benchmarking code to minimize the interferences of the measurement.
.IP 5. 4
-Run the copy benchmark and place the data on other socket
+Run the
+.B copy
+benchmark and place the data on another socket
.TP
-.B likwid-bench -t copy -i 50 -g 1 -w S0:1GB:10:1:2-0:S1,1:S1
+.B likwid-bench -t copy -w S0:1GB:10:1:2-0:S1,1:S1
.PP
-Stream id 0 and 1 are placed in thread domains S1, which is socket 1. This can be verified as the initialization threads output where they are running.
+Stream ids 0 and 1 are placed in thread domain
+.B S1,
+which is socket 1. This can be verified as the initialization threads output where they are running.
.SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
.SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
.SH SEE ALSO
-likwid-perfctr(1), likwid-pin(1), likwid-topology(1), likwid-features(1), likwid-setFrequencies(1)
+likwid-perfctr(1), likwid-pin(1), likwid-topology(1), likwid-setFrequencies(1)
diff --git a/doc/likwid-doxygen.md b/doc/likwid-doxygen.md
new file mode 100644
index 0000000..37d505e
--- /dev/null
+++ b/doc/likwid-doxygen.md
@@ -0,0 +1,262 @@
+/*! \mainpage LIKWID - Like I Knew What I Am Doing
+
+\section Introduction
+This is an effort to develop easy to use yet powerful performance tools for the GNU/Linux operating system. While the focus of LIKWID is on x86 processors, some of the tools are portable and not limited to any specific architecture. LIKWID follows the philosophy:
+- Simple
+- Efficient
+- Portable
+- Extensible
+
+\ref build
+
+\ref faq
+
+\section Tools LIKWID Tools
+- \ref likwid-topology : A tool to display the thread and cache topology on multicore/multisocket computers.
+- \ref likwid-pin : A tool to pin your threaded application without changing your code. Works for pthreads and OpenMP.
+- \ref likwid-perfctr : A tool to measure hardware performance counters on recent Intel and AMD processors. It can be used as wrapper application without modifying the profiled code or with a marker API to measure only parts of the code.
+- \ref likwid-powermeter : A tool for accessing RAPL counters and querying Turbo mode steps on Intel processors. RAPL counters are also available in \ref likwid-perfctr.
+- \ref likwid-setFrequencies : A tool to print and manage the clock frequency of CPU cores.
+- \ref likwid-agent : A monitoring agent for LIKWID with multiple output backends.
+- \ref likwid-memsweeper : A tool to cleanup ccNUMA domains and LLC caches to get a clean environment for benchmarks.
+- \ref likwid-bench : A benchmarking framework for streaming benchmark kernels written in assembly.
+- \ref likwid-genTopoCfg : A config file writer that reads the system topology and writes it to a file for faster LIKWID startup.
+<!-- - \ref likwid-features : A tool to toggle the prefetchers on Core 2 processors.-->
+
+Wrapper scripts using the basic likwid tools:
+- \ref likwid-mpirun : A wrapper script enabling simple and flexible pinning of MPI and MPI/threaded hybrid applications. With integrated \ref likwid-perfctr support.
+- \ref likwid-perfscope : A frontend application for the timeline mode of \ref likwid-perfctr that performs live plotting using gnuplot.
+
+In most environments, LIKWID requires a daemon application to perform its operations with higher privileges:
+- \ref likwid-accessD : Daemon to perform MSR and PCI read/write operations with higher privileges.
+- \ref likwid-setFreq : Daemon to set the CPU frequencies with higher privileges.
+
+Optionally, a global configuration file \ref likwid.cfg can be given to modify some basic run time parameters of LIKWID.
+
+\section Library LIKWID Library
+\subsection C_Interface C/C++ Interface
+- \ref MarkerAPI
+- \ref AccessClient
+- \ref Config
+- \ref CPUTopology
+- \ref NumaTopology
+- \ref AffinityDomains
+- \ref PerfMon
+- \ref PowerMon
+- \ref ThermalMon
+- \ref TimerMon
+- \ref Daemon
+- \ref MemSweep
+
+\subsection Lua_Interface Lua Interface
+- \ref lua_Info
+- \ref lua_InputOutput
+- \ref lua_Config
+- \ref lua_Access
+- \ref lua_CPUTopology
+- \ref lua_NumaInfo
+- \ref lua_AffinityInfo
+- \ref lua_Perfmon
+- \ref lua_PowerInfo
+- \ref lua_ThermalInfo
+- \ref lua_Timer
+- \ref lua_MemSweep
+- \ref lua_Misc (Some functionality not provided by Lua natively)
+
+\subsection Fortran90_Interface Fortran90 Interface
+- \ref Fortran_Interface
+
+\section Architectures Supported Architectures
+\subsection Architectures_Intel Intel®
+- \subpage pentiumm
+- \subpage core2
+- \subpage atom
+- \subpage nehalem
+- \subpage nehalemex
+- \subpage westmere
+- \subpage westmereex
+- \subpage phi
+- \subpage silvermont
+- \subpage sandybridge
+- \subpage sandybridgeep
+- \subpage ivybridge
+- \subpage ivybridgeep
+- \subpage haswell
+- \subpage haswellep
+- \subpage broadwell
+
+\subsection Architectures_AMD AMD®
+- \subpage k8
+- \subpage k10
+- \subpage interlagos
+- \subpage kabini
+
+\section Examples Example Codes
+Using the Likwid API:
+- \ref C-likwidAPI-code
+- \ref Lua-likwidAPI-code
+
+Using the Marker API:
+- \ref C-markerAPI-code
+- \ref F-markerAPI-code
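+
+A very short sketch of what an instrumented region looks like in C (the complete, buildable example is \ref C-markerAPI-code; this fragment assumes compilation with <CODE>-DLIKWID_PERFMON</CODE> and linking against the LIKWID library):
+\code
+#include <likwid.h>
+
+int main(void)
+{
+    double sum = 0.0;
+    LIKWID_MARKER_INIT;              // set up the Marker API once per process
+    LIKWID_MARKER_START("calc");     // start the named region "calc"
+    for (int i = 0; i < 1000000; i++)
+        sum += 0.5 * i;
+    LIKWID_MARKER_STOP("calc");      // stop the region; results accumulate over repeated calls
+    LIKWID_MARKER_CLOSE;             // write out the results for likwid-perfctr -m
+    return sum > 0.0 ? 0 : 1;
+}
+\endcode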
+
+If you have problems with LIKWID:<BR>
+GitHub: <A HREF="https://github.com/RRZE-HPC/likwid">https://github.com/RRZE-HPC/likwid</A><BR>
+Bugs: <A HREF="https://github.com/RRZE-HPC/likwid/issues">https://github.com/RRZE-HPC/likwid/issues</A><BR>
+Mailinglist: <A HREF="http://groups.google.com/group/likwid-users">http://groups.google.com/group/likwid-users</A><BR>
+*/
+
+
+/*! \page build Build and install instructions
+\section allg Introduction
+Likwid is built using GNU make and Perl. Besides the Linux kernel and the standard C library, all required dependencies are shipped with the archive (<A HREF="http://www.lua.org/">Lua</A> and <A HREF="http://www.open-mpi.org/projects/hwloc/">hwloc</A>).
+It should build on any Linux distribution with a recent GCC or CLANG compiler and a 2.6 or newer kernel without any changes.
+
+There is one generic top level Makefile and one .mk configuration file for each
+compiler (at the moment GCC, CLANG and ICC). Please note that we test LIKWID only with GCC. CLANG and ICC are only tested for basic functionality.
+
+There is one exception: If you want to use LIKWID on an Intel Xeon Phi card you have to choose MIC as the compiler in config.mk, which is based on the Intel ICC compiler.
+
+\subsection directory Directory structure
+All source files are in the src/ directory. All header files are located in
+src/includes/ . Lua application source files are in src/applications/. All external tools, namely HWLOC and Lua, are located in ext/. The bench/ folder contains all files of the benchmarking suite of LIKWID.
+
+All build products are generated in the directory ./TAG, where TAG is the compiler configuration, default ./GCC.
+
+\subsection config Configuration
+Usually the only thing you have to configure is the PREFIX install path in the build config file config.mk in the top directory.
+
+\subsubsection color Changing color of <CODE>likwid-pin</CODE> output
+Depending on the background of your terminal window you can choose a color for <CODE>likwid-pin</CODE> output.
+
+\subsubsection accessD Usage of the access daemon likwid-accessD
+Usually on your own system, you can use LIKWID with direct access to the MSR files. If you install LIKWID on a shared system such as an HPC compute cluster, you may consider using the access daemon. This is a proxy application which was implemented with security in mind and performs address checks for allowed access. Using the access daemon, the measurements involve more overhead, especially if you use \ref likwid-perfctr in timeline mode or with the marker API.
+
+To enable using the access daemon, configure in config.mk:
+ - Set BUILDDAEMON to true
+ - Configure the path to the accessDaemon binary at ACCESSDAEMON
+ - Set the ACCESSMODE to accessdaemon
+
+ACCESSMODE can be direct, accessdaemon and sysdaemon (not yet officially supported). You can overwrite the default setting on the command line using the -M switch.
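+
+A corresponding config.mk fragment might look like this (the daemon path is only an example, adjust it to your installation):
+\code
+BUILDDAEMON = true
+ACCESSDAEMON = $(PREFIX)/sbin/likwid-accessD
+ACCESSMODE = accessdaemon
+\endcode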
+
+If you want to access Uncore performance counters that are located in the PCI memory range, like they are implemented in Intel SandyBridge EP and IvyBridge EP, you have to use the access daemon or have root privileges because access to the PCI space is only permitted for highly privileged users.
+
+\subsubsection setfreqinstall Usage of frequency daemon likwid-setFreq
+The application \ref likwid-setFrequencies uses another daemon to modify the frequency of CPUs. The daemon is built and later installed if BUILDFREQ is set to true in config.mk.
+
+\subsubsection sharedlib Build Likwid as shared library
+Per default the LIKWID library is built as a shared library. You need the library if you want to use the Marker API. You can also use the LIKWID modules like <I>perfmon</I> directly. This is still not officially supported at the moment. In some settings it is necessary to build LIKWID as a shared library. To do so set SHARED_LIBRARY to true.
+
+\subsubsection instr_bench Instrument likwid-bench for usage with likwid-perfctr
+\ref likwid-bench is instrumented for use with \ref likwid-perfctr. This allows you to measure various metrics of your \ref likwid-bench kernels. Enable instrumentation by setting INSTRUMENT_BENCH to true in config.mk.
+
+\subsubsection fortran Enabling Fortran interface for marker API
+If you want to use the Marker API in Fortran programs LIKWID offers a native Fortran90 interface. To enable it set FORTRAN_INTERFACE to true in config.mk.
+
+\subsection targets Build targets
+You have to edit config.mk to configure your build and install path.
+
+The following make targets are available:
+
+- <B>make</B> - Build everything
+- <B>make likwid-bench</B> - Build likwid-bench
+- <B>make likwid-accessD</B> - Build likwid-accessD
+- <B>make likwid-setFreq</B> - Build likwid-setFreq
+- <B>make docs</B> - Create HTML documentation using doxygen
+- <B>make clean</B> - Remove the object file directory *./GCC*, keep the executables
+- <B>make distclean</B> - Remove all generated files
+- <B>make local</B> - Adjust paths in Lua scripts to work from the build directory. Requires the daemons and the pinning library to be already installed. Mainly used for testing.
+
+The build system has working dependency tracking; therefore <B>make clean</B> is only needed if you change the Makefile configuration.
+
+\subsection installtargets Installing
+
+NOTE: The pinning functionality and the daemons only work if configured in config.mk and
+installed with <B>make install</B>. If you do not use the pinning functionality the tools
+can be used without installation.
+
+ - <B>make install</B> - Installs the executables, libraries, man pages and headers to the path you configured in config.mk.
+ - <B>make uninstall</B> - Delete all installed files.
+
+\subsection accessD Setting up access for hardware performance monitoring
+Hardware performance monitoring on x86 is enabled using model-specific registers (MSR). MSR registers are special registers not part of the instruction set architecture. To read and write to these registers the x86 ISA provides special instructions. These instructions can only be executed in protected mode or in other words only kernel code can execute these instructions. Fortunately, any Linux kernel 2.6 or newer provides access to these registers via a set of device files. This allows [...]
+
+Per default only root has read/write access to these msr device files. In order to use the LIKWID tools, which need access to these files (likwid-perfctr, likwid-powermeter and likwid-agent) as standard user, you need to setup access rights to these files.
+
+likwid-perfctr, likwid-powermeter and likwid-features require the Linux <CODE>msr</CODE> kernel module. This module is part of most standard distro kernels. You have to be root to do the initial setup.
+
+ - Check if the <CODE>msr</CODE> module is loaded with <CODE>lsmod | grep msr</CODE>. There should be an output.
+ - If the module is not loaded, load it with <CODE>modprobe msr</CODE>. For automatic loading at startup consult your distro's documentation on how to do so.
+ - Adjust the access rights on the MSR device files for normal users. To grant access to anyone, you can use <CODE>chmod o+rw /dev/cpu/*/msr</CODE>. This is only recommended on single-user desktop systems.
+
+As general access to MSRs is not desired on security-sensitive systems, you can implement more sophisticated access rights settings with e.g. setgid. A common solution used for many other device files, e.g. for audio, is to introduce a group and <CODE>chown</CODE> the msr device files to that group. If you then execute likwid-perfctr with setgid on that group, the executing user can use the tool but cannot directly read or write the MSR device files.
+
+Some distributions backported the capabilities check for the msr device to older kernels. If there are problems with accessing the msr device on older kernels with file system permissions set to read&write, please check your kernel code (<CODE>arch/x86/kernel/msr.c</CODE>) for the backport and set the MSR capabilities if necessary.
+
+A secure solution is to use the access daemon \ref likwid-accessD, which encapsulates the access to the MSR device files and performs an address check for allowed registers.
+
+Some newer kernels implement the so-called capabilities, a fine-grained permission system that can allow access to the MSR files for common users. On the downside, it may no longer be enough to set the suid-root flag for the access daemon; the executable must be registered with <CODE>libcap</CODE>:
+
+<CODE>sudo setcap cap_sys_rawio+ep EXECUTABLE</CODE>
+
+This is only possible on local file systems. A feasible way is to use the \ref likwid-accessD for all accesses and just enable the capabilities for this one binary. This will enable the usage for all LIKWID tools and also for all instrumented binaries. If \ref likwid-perfctr utility should only be used in wrapper mode, it is suitable to set the capabilities for \ref likwid-perfctr only. Please remember to set the file permission of the MSR device files to read/write for all users, even i [...]
+
+\subsubsection depends Dependencies
+Although we tried to minimize the external dependencies of LIKWID, some advanced tools or only specific tool options require external packages.<BR>
+\ref likwid-perfscope uses the Perl script <A HREF="https://github.com/dkogan/feedgnuplot">feedGnuplot</A> to forward the real-time data to gnuplot. <A HREF="https://github.com/dkogan/feedgnuplot">feedGnuplot</A> is included into LIKWID, but <A HREF="http://www.gnuplot.info/">gnuplot</A> itself is not.<BR>
+\ref likwid-agent provides multiple backends to output the periodically measured data. The syslog backend requires the shell tool \a logger to be installed. The <A HREF="https://oss.oetiker.ch/rrdtool/">RRD</A> backend requires \a rrdtool and the GMetric backend the \a gmetric tool, part of the <A HREF="http://ganglia.sourceforge.net/">Ganglia Monitoring System</A>.<BR>
+In order to create the HTML documentation of LIKWID, the tool <A HREF="www.doxygen.org">Doxygen</A> is required.
+*/
+
+/*! \page C-markerAPI-code Marker API in a C/C++ application
+\include C-markerAPI.c
+*/
+
+/*! \page F-markerAPI-code Marker API in a Fortran90 application
+\include F-markerAPI.F90
+*/
+
+/*! \page C-likwidAPI-code LIKWID API in a C/C++ application
+\include C-likwidAPI.c
+*/
+/*! \page Lua-likwidAPI-code LIKWID API in a Lua application
+\include Lua-likwidAPI.lua
+*/
+
+/*! \page faq FAQ
+\section faq1 Which architectures are supported?
+LIKWID supports a range of x86 CPU architectures but likely not all. We concentrated the development effort on Intel and AMD machines. Almost all architecture code is tested. For a list of architectures see section \ref Architectures or call <CODE>likwid-perfctr -i</CODE>.
+
+\section faq2 Are all hardware events supported?
+LIKWID offers almost all events that are defined in the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual</A> and the <A HREF="http://developer.amd.com/resources/documentation-articles/developer-guides-manuals/">AMD® BIOS and Kernel Developer’s Guides</A>. Some may be missing caused by special handling likely with additional registers. But, LIKWID also provides some events [...]
+
+\section faq3 Does LIKWID support Intel's PEBS?
+No, PEBS is an interface that must be initialized at kernel level. Since LIKWID is a user-space tool, there is no possibility to maintain PEBS.
+
+\section faq4 Which unit does LIKWID use internally for B, kB, MB, GB?
+As the units imply, you get from one unit to the other by multiplying or dividing by 1000, e.g. 1kB = 1000B. kiB and MiB are not supported at the moment.
+
+\section faq5 Does LIKWID support power capping (Intel only)?
+No, at the moment LIKWID does not support limiting the power consumption of your machine using the RAPL interface. We added some functions but they are not exported because they need to be rechecked.
+
+\section faq6 Is LIKWID case-sensitive?
+Yes, all strings are case-sensitive. The only exception are the event options, which are case-insensitive. In upcoming versions we will change to case-insensitive parsing for all strings where possible.
+
+\section faq7 I have given multiple eventsets on the command line but the values are too low? Are they multiplexed?
+LIKWID does not support multiplexing of eventsets. It rotates through its eventset list and measures each for a specific amount of time. The output contains the results of all measurements of that eventset, no interpolation to the complete runtime is done. Since most other tools that support multiplexing use linear interpolation, you can scale the results yourself with <CODE>(1.0 - (measurement_time/all_time)) * result</CODE>. As you can see, the calculation is pretty simple, but it intr [...]
+
+\section faq8 Are there plans to port LIKWID to other operating systems?
+We do not really plan to port LIKWID to other operating systems. We come from the HPC world and there the main operating systems are based on the Linux kernel. The latest Top500 list contains 13 systems using Unix and 1 system with Microsoft® Windows.
+
+\section faq9 Are there plans to port LIKWID to other CPU architectures?
+We would like to port LIKWID to other CPU architectures that support hardware performance measurements, but currently there is no time for that and we do not have architectures other than x86 in-house. We follow the developments and if an architecture becomes HPC-relevant, we will likely port LIKWID to make it work. ARM has the highest probability, and with lower probability we will include SPARC.
+
+\section faq10 Do you plan to introduce a graphical frontend for LIKWID?
+No, we do not!
+
+\section faq12 Why does the startup of likwid-perfctr take so long?
+In order to get reliable time measurements, LIKWID must determine the base clock frequency of your CPU. This is done by a measurement loop that takes about 1 second. You can avoid the measurement loop by creating a topology configuration file with \ref likwid-genTopoCfg.
+
+\section faq13 I want to help, where do I start?
+The best way is to talk to us at the <A HREF="http://groups.google.com/group/likwid-users">mailing list</A>. There are a bunch of small work packages on our ToDo list that can be used as a good starting point for learning how LIKWID works. If you are not a programmer but you have a good idea, let us know and we will discuss it.
+*/
diff --git a/doc/likwid-features.1 b/doc/likwid-features.1
index e67cf44..c73caa9 100644
--- a/doc/likwid-features.1
+++ b/doc/likwid-features.1
@@ -1,35 +1,35 @@
.TH LIKWID-FEATURES 1 <DATE> likwid\-<VERSION>
.SH NAME
-likwid-features \- print and toggle the flags of the MSR_IA32_MISC_ENABLE model specific register
+likwid-features \- print and manipulate cpu features like hardware prefetchers
.SH SYNOPSIS
.B likwid-features
-.RB [ \-vh ]
+.RB [ \-vhal ]
.RB [ \-c
-.IR <coreId> ]
-.RB [ \-s
-.IR <prefetcher_tag> ]
-.RB [ \-u
-.IR <prefetcher_tag> ]
+.IR cpus ]
+.RB [ \-e
+.IR taglist ]
+.RB [ \-d
+.IR taglist ]
.SH DESCRIPTION
.B likwid-features
is a command line application to print the flags in the model
-specific register (MSR) MSR_IA32_MISC_ENABLE on Intel x86 processors. On Core2 processors
+specific register (MSR) MSR_IA32_MISC_ENABLE on Intel x86 processors. On Core2 and later processors
it can be used to toggle the hardware prefetch flags. It does not work on AMD processors.
For a documentation what flags are supported on which processor refer to the Intel
-Software Developer's Manual Volume 3B, Table B.2. The MSR are set individually for every core.
+Software Developer's Manual Volume 3B, Table B.2 and https://software.intel.com/en-us/articles/disclosure-of-hw-prefetcher-control-on-some-intel-processors. The MSR are set individually for every core.
The following hardware prefetchers can be toggled:
-.IP \[bu]
+.IP \[bu]
.B HW_PREFETCHER:
Hardware prefetcher.
-.IP \[bu]
+.IP \[bu]
.B CL_PREFETCHER:
Adjacent cache line prefetcher.
-.IP \[bu]
+.IP \[bu]
.B DCU_PREFETCHER:
When the DCU prefetcher detects multiple loads from the same line done within a
time limit, the DCU prefetcher assumes the next line will be required. The next
line is prefetched in to the L1 data cache from memory or L2.
-.IP \[bu]
+.IP \[bu]
.B IP_PREFETCHER:
The IP prefetcher is an L1 data cache prefetcher. The IP prefetcher looks for
sequential load history to determine whether to prefetch the next expected data
@@ -43,18 +43,28 @@ prints version information to standard output, then exits.
.B \-\^h
prints a help message to standard output, then exits.
.TP
-.B \-\^c " <coreId>"
-set on which processor core the MSR should be read
+.B \-\^a
+List out the names of all detected features
.TP
-.B \-\^u " <HW_PREFETCHER | CL_PREFETCHER | DCU_PREFETCHER | IP_PREFETCHER>"
-specify which prefetcher to unset
+.B \-\^l
+Print the state of all features for the given CPUs
.TP
-.B \-\^s " <HW_PREFETCHER | CL_PREFETCHER | DCU_PREFETCHER | IP_PREFETCHER>"
-specify which prefetcher to set
+.B \-\^c " cpus"
+set on which processor cores the MSR should be read and written. Syntax according to
+.B likwid-pin(1)
+.TP
+.B \-\^d " HW_PREFETCHER | CL_PREFETCHER | DCU_PREFETCHER | IP_PREFETCHER"
+specify which prefetcher should be disabled. Argument can be a comma-separated list.
+.TP
+.B \-\^e " HW_PREFETCHER | CL_PREFETCHER | DCU_PREFETCHER | IP_PREFETCHER"
+specify which prefetcher should be enabled. Argument can be a comma-separated list.
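+.SH EXAMPLE
+An illustrative invocation composed from the options above, which disables the
+hardware prefetcher and the adjacent cache line prefetcher on the first four
+CPU cores:
+.TP
+.B likwid-features -c 0-3 -d HW_PREFETCHER,CL_PREFETCHER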
.SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Röhl <thomas.Roehl at gmail.com>.
.SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
.SH "SEE ALSO"
-likwid-perfctr(1), likwid-pin(1), likwid-powermeter(1), likwid-topology(1), likwid-setFrequencies(1)
+likwid-pin(1), likwid-topology(1), likwid-perfctr(1)
+
+
+
diff --git a/doc/likwid-genCfg.1 b/doc/likwid-genCfg.1
deleted file mode 100644
index 8b7632f..0000000
--- a/doc/likwid-genCfg.1
+++ /dev/null
@@ -1,30 +0,0 @@
-.TH LIKWID-GENCFG 1 <DATE> likwid\-<VERSION>
-.SH NAME
-likwid-genCfg \- Get system topology and write them to file for faster LIKWID startup
-.SH SYNOPSIS
-.B likwid-genCfg
-.RB [\-hv]
-.RB [ \-o
-.IR <filename>]
-.SH DESCRIPTION
-.B likwid-genCfg
-is a command line application that stores the system's CPU and NUMA topology to
-file. LIKWID applications use this file to read in the topology fast instead of
-re-gathering all values. The default output path is /etc/likwid.cfg.
-.SH OPTIONS
-.TP
-.B \-h
-prints a help message to standard output, then exits.
-.TP
-.B \-v
-prints a version message to standard output, then exits.
-.TP
-.B \-\^o " <filename>
-sets output file path (optional)
-
-.SH AUTHOR
-Written by Thomas Roehl <thomas.roehl at gmail.com>.
-.SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
-.SH "SEE ALSO"
-likwid-topology(1), likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1)
diff --git a/doc/likwid-genTopoCfg.1 b/doc/likwid-genTopoCfg.1
new file mode 100644
index 0000000..6d0e8b2
--- /dev/null
+++ b/doc/likwid-genTopoCfg.1
@@ -0,0 +1,30 @@
+.TH LIKWID-GENTOPOCFG 1 <DATE> likwid\-<VERSION>
+.SH NAME
+likwid-genTopoCfg \- Get the system topology and write it to a file for faster LIKWID startup
+.SH SYNOPSIS
+.B likwid-genTopoCfg
+.RB [\-hv]
+.RB [ \-o
+.IR <filename>]
+.SH DESCRIPTION
+.B likwid-genTopoCfg
+is a command line application that stores the system's CPU and NUMA topology to
+file. LIKWID applications use this file to read in the topology fast instead of
+re-gathering all values.
+.SH OPTIONS
+.TP
+.B \-h, \-\-\^help
+prints a help message to standard output, then exits.
+.TP
+.B \-v, \-\-\^version
+prints a version message to standard output, then exits.
+.TP
+.B \-\^o, \-\-\^output <filename>
+sets output file path (Default: /etc/likwid-topo.cfg)
+
+.SH AUTHOR
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
+.SH BUGS
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
+.SH "SEE ALSO"
+likwid-topology(1), likwid-perfctr(1), likwid-pin(1)
diff --git a/doc/likwid-lua.1 b/doc/likwid-lua.1
new file mode 100644
index 0000000..411531b
--- /dev/null
+++ b/doc/likwid-lua.1
@@ -0,0 +1,111 @@
+.TH LUA 1 "$Date: 2014/12/10 15:55:45 $"
+.SH NAME
+lua \- Lua interpreter
+.SH SYNOPSIS
+.B lua
+[
+.I options
+]
+[
+.I script
+[
+.I args
+]
+]
+.SH DESCRIPTION
+.B lua
+is the standalone Lua interpreter.
+It loads and executes Lua programs,
+either in textual source form or
+in precompiled binary form.
+(Precompiled binaries are output by
+.BR luac ,
+the Lua compiler.)
+.B lua
+can be used as a batch interpreter and also interactively.
+.LP
+The given
+.I options
+are handled in order and then
+the Lua program in file
+.I script
+is loaded and executed.
+The given
+.I args
+are available to
+.I script
+as strings in a global table named
+.BR arg .
+If no options or arguments are given,
+then
+.B "\-v \-i"
+is assumed when the standard input is a terminal;
+otherwise,
+.B "\-"
+is assumed.
+.LP
+In interactive mode,
+.B lua
+prompts the user,
+reads lines from the standard input,
+and executes them as they are read.
+If the line contains an expression or list of expressions,
+then the line is evaluated and the results are printed.
+If a line does not contain a complete statement,
+then a secondary prompt is displayed and
+lines are read until a complete statement is formed or
+a syntax error is found.
+.LP
+At the very start,
+before even handling the command line,
+.B lua
+checks the contents of the environment variables
+.B LUA_INIT_5_3
+or
+.BR LUA_INIT ,
+in that order.
+If the contents is of the form
+.RI '@ filename ',
+then
+.I filename
+is executed.
+Otherwise, the string is assumed to be a Lua statement and is executed.
+.SH OPTIONS
+.TP
+.BI \-e " stat"
+execute statement
+.IR stat .
+.TP
+.B \-i
+enter interactive mode after executing
+.IR script .
+.TP
+.BI \-l " name"
+execute the equivalent of
+.IB name =require(' name ')
+before executing
+.IR script .
+.TP
+.B \-v
+show version information.
+.TP
+.B \-E
+ignore environment variables.
+.TP
+.B \-\-
+stop handling options.
+.TP
+.B \-
+stop handling options and execute the standard input as a file.
+.SH "SEE ALSO"
+.BR luac (1)
+.br
+The documentation at lua.org,
+especially section 7 of the reference manual.
+.SH DIAGNOSTICS
+Error messages should be self explanatory.
+.SH AUTHORS
+R. Ierusalimschy,
+L. H. de Figueiredo,
+W. Celes
+.\" EOF
diff --git a/doc/likwid-memsweeper.1 b/doc/likwid-memsweeper.1
index f474360..fda87f4 100644
--- a/doc/likwid-memsweeper.1
+++ b/doc/likwid-memsweeper.1
@@ -5,24 +5,24 @@ likwid-memsweeper \- A tool to clean up NUMA memory domains and last level cache
.B likwid-memsweeper
.RB [\-hv]
.RB [ \-c
-.IR <NUMA_ID> ]
+.IR <node_list> ]
.SH DESCRIPTION
.B likwid-memsweeper
-is a command line application to shrink the file buffer cache by filling the NUMA domain with random pages. Moreover the tool invalidates all cachelines in the LLC for 64 bit x86 systems. If no NUMA domain is specified, all are sweept.
+is a command line application to shrink the file buffer cache by filling the NUMA domain with random pages. Moreover, the tool invalidates all cachelines in the LLC.
.SH OPTIONS
.TP
-.B \-h
+.B \-h, \-\-\^help
prints a help message to standard output, then exits.
.TP
-.B \-v
+.B \-v, \-\-\^version
prints a version message to standard output, then exits.
.TP
-.B \-\^c " <NUMA_ID>
+.B \-\^c <node_list>
set the NUMA domain for sweeping.
.SH AUTHOR
-Written by Thomas Roehl <thomas.roehl at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
.SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
.SH "SEE ALSO"
-likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1), likwid-topology(1),
+likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1),
diff --git a/doc/likwid-mpirun.1 b/doc/likwid-mpirun.1
index 765b0c8..e3db441 100644
--- a/doc/likwid-mpirun.1
+++ b/doc/likwid-mpirun.1
@@ -3,7 +3,9 @@
likwid-mpirun \- A tool to start and monitor MPI applications with LIKWID
.SH SYNOPSIS
.B likwid-memsweeper
-.RB [\-hd]
+.RB [\-hvdOm]
+.RB [ \-n
+.IR number_of_processes ]
.RB [ \-hostfile
.IR filename ]
.RB [ \-nperdomain
@@ -11,9 +13,11 @@ likwid-mpirun \- A tool to start and monitor MPI applications with LIKWID
.RB [ \-pin
.IR expression ]
.RB [ \-omp
-.IR expression ]
+.IR omptype ]
.RB [ \-mpi
-.IR expression ]
+.IR mpitype ]
+.RB [ \-g
+.IR eventset ]
.RB [\-\-]
.SH DESCRIPTION
.B likwid-mpirun
@@ -22,32 +26,51 @@ is a command line application that wraps the vendor-specific mpirun tool and add
to the execution string. The user-given application is ran, measured and the results returned to the staring node.
.SH OPTIONS
.TP
-.B \-h
-prints a help message to standard output, then exits.
+.B \-\^h,\-\-\^help
+prints a help message to standard output, then exits
+.TP
+.B \-\^v,\-\-\^version
+prints version information to standard output, then exits
.TP
-.B \-d
-prints debug messages to standard output.
+.B \-\^d,\-\-\^debug
+prints debug messages to standard output
.TP
-.B \-\^hostfile " filename
-specifies the nodes to schedule the MPI processes on
+.B \-\^n,\-\^np,\-\-\^n,\-\-\^np <number_of_processes>
+specifies how many MPI processes should be started
.TP
-.B \-\^nperdomain " number_of_processes_in_domain
+.B \-\^hostfile <filename>
+specifies the nodes to schedule the MPI processes on. If not given, the environment variables PBS_NODEFILE, LOADL_HOSTFILE and SLURM_HOSTFILE are checked.
+.TP
+.B \-\^nperdomain <number_of_processes_in_domain>
specifies the processes per affinity domain (see
.B likwid-pin
for info about affinity domains)
.TP
-.B \-\^pin " expression
+.B \-\^pin <expression>
specifies the pinning for hybrid execution (see
.B likwid-pin
for info about affinity domains)
.TP
-.B \-\^omp " expression
-enables hybrid setup. Can only be used in combination with
-.B -pin.
-The only possible value is: intel
+.B \-\^s, \-\-\^skip <mask>
+Specify skip mask as HEX number. For each set bit the corresponding thread is skipped.
+.TP
+.B \-\^omp <omptype>
+enables hybrid setup. Likwid tries to determine the OpenMP type automatically. The possible values are
+.B intel
+and
+.B gnu
.TP
-.B \-\^mpi " expression
-specifies the MPI implementation that should be used by the wrapper. Possible values are intelmpi, openmpi and mvapich2
+.B \-\^mpi <mpitype>
+specifies the MPI implementation that should be used by the wrapper. Possible values are
+.B intelmpi, openmpi
+and
+.B mvapich2
+.TP
+.B \-\^m,\-\-\^marker
+activates the Marker API for the executed MPI processes
+.TP
+.B \-\^O
+prints output in CSV not ASCII tables
.TP
.B \-\-
stops parsing arguments for likwid-mpirun, in order to set options for underlying MPI implementation after \-\-.
@@ -56,26 +79,32 @@ stops parsing arguments for likwid-mpirun, in order to set options for underlyin
.IP 1. 4
For standard application:
.TP
-.B likwid-mpirun -np 32 ./myApp
+.B likwid-mpirun -np 32 ./myApp
.PP
Will run 32 MPI processes, each host is filled with as much processes as written in ppn
.IP 2. 4
With pinning:
.TP
-.B likwid-mpirun -np 32 -nperdomain S:2 ./myApp
+.B likwid-mpirun -np 32 -nperdomain S:2 ./myApp
.PP
Will start 32 MPI processes with 2 processes per socket.
.IP 3. 4
For hybrid runs:
.TP
-.B likwid-mpirun -np 32 -pin M0:0-3_M1:0-3 ./myApp
+.B likwid-mpirun -np 32 -pin M0:0-3_M1:0-3 ./myApp
.PP
Will start 32 MPI processes with 2 processes per node. Threads of the first process are pinned to the cores 0-3 in NUMA domain 0 (M0). The OpenMP threads of the second process are pinned to the first four cores in NUMA domain 1 (M1)
-
+.SH BUGS
+When measuring Uncore events it is not possible to select a cpu pin expression
+that covers multiple sockets, e.g. S0:0-1_S0:2 at S1:2. This runs two processes,
+each running on two CPUs. But since the first CPU of the second expression is on
+socket 0, which is already handled by S0:0-1, the second MPI process gets an
+event set that does not contain Uncore counters although the second part of the
+second expression would measure the Uncore counters on socket 1.
.SH AUTHOR
-Written by Thomas Roehl <thomas.roehl at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
.SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
.SH "SEE ALSO"
-likwid-pin(1), likwid-perfctr(1), likwid-features(1), likwid-powermeter(1), likwid-topology(1),
+likwid-pin(1), likwid-perfctr(1), likwid-powermeter(1)
diff --git a/doc/likwid-perfctr.1 b/doc/likwid-perfctr.1
index ea3e4f3..321da34 100644
--- a/doc/likwid-perfctr.1
+++ b/doc/likwid-perfctr.1
@@ -1,152 +1,110 @@
.TH LIKWID-PERFCTR 1 <DATE> likwid\-<VERSION>
.SH NAME
-likwid-perfctr \- configure and read out hardware performance counters on x86 cpus
+likwid-perfctr \- configure and read out hardware performance counters on x86 CPUs
.SH SYNOPSIS
.B likwid-perfctr
-.RB [\-vhHVmaeiMoO]
-.RB [ \-c/\-C
-.IR <core_list> ]
+.RB [\-vhHmaief]
+.RB [ \-c
+.IR core_list ]
+.RB [ \-C
+.IR core_list_for_pinning ]
.RB [ \-g
-.IR <performance_group>
+.IR performance_group
or
-.IR <performance_event_string> ]
+.IR performance_event_string ]
.RB [ \-t
-.IR <frequency> ]
+.IR timeline_frequency ]
.RB [ \-S
-.IR <time> ]
-.RB [ \-s
-.IR <skip_mask> ]
+.IR monitoring_time ]
+.RB [ \-T
+.IR group_switch_frequency ]
+.RB [ \-V
+.IR verbosity ]
+.RB [ \-M
+.IR access_mode ]
.RB [ \-o
-.IR <output_file> ]
+.IR output_file ]
+.RB [ \-s
+.IR skip_mask ]
+.RB [ \-E
+.IR search_str ]
.SH DESCRIPTION
.B likwid-perfctr
is a lightweight command line application to configure and read out hardware performance monitoring data
on supported x86 processors. It can measure either as wrapper without changing the measured application
or with marker API functions inside the code, which will turn on and off the counters. There are preconfigured
-groups with useful event sets and derived metrics. Additonally arbitrary events can be measured with
-custom event sets. The marker API can measure mulitple named regions. Results are accumulated on multiple calls.
-The following x86 processors are supported:
-.IP \[bu]
-.B Intel Core 2:
-all variants. Counters:
-.I PMC[0-1], FIXC[0-2]
-.IP \[bu]
-.B Intel Nehalem:
-Counters:
-.I PMC[0-3], FIXC[0-2], UPMC[0-7]
-.IP \[bu]
-.B Intel Nehalem EX:
-Counters:
-.I PMC[0-3], FIXC[0-2], MBOX[0-1]C[0-5], BBOX[0-1]C[0-3], RBOX[0-1]C[0-7], WBOX[0-5], UBOX0, SBOX[0-1]C[0-3], CBOX[0-9]C[0-4]
-.IP \[bu]
-.B Intel Westmere:
- Counters:
-.I PMC[0-3], FIXC[0-2], UPMC[0-7]
-.IP \[bu]
-.B Intel Westmere EX:
-Counters:
-.I PMC[0-3], FIXC[0-2], MBOX[0-1]C[0-5], BBOX[0-1]C[0-3], RBOX[0-1]C[0-7], WBOX[0-5], UBOX0, SBOX[0-1]C[0-3], CBOX[0-9]C[0-4]
-.IP \[bu]
-.B Intel Sandy Bridge:
-full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3]
-.IP \[bu]
-.B Intel Sandy Bridge EP:
-partial support for uncore, full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3]. MBOX[0-3]C[0-3]
-.IP \[bu]
-.B Intel Ivy Bridge:
-full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3]
-.IP \[bu]
-.B Intel Ivy Bridge EP:
-partial support for uncore, full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3], CBOX[0-9]C[0-3], MBOX[0-3]C[0-3], MBOX[0-3]FIX
-.IP \[bu]
-.B Intel Haswell:
-full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3]
-.IP \[bu]
-.B Intel Haswell EP:
-no uncore support, full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3]
-.IP \[bu]
-.B Intel Atom Silvermont:
-full RAPL support. Counters:
-.I PMC[0-1], FIXC[0-2], PWR[0-1]
-.IP \[bu]
-.B Intel Pentium M:
-Banias and Dothan variants. Counters:
-.I PMC[0-1]
-.IP \[bu]
-.B Intel P6:
-Tested on P3.
-.IP \[bu]
-.B AMD K8:
-all variants. Counters:
-.I PMC[0-3]
-.IP \[bu]
-.B AMD K10:
-Barcelona, Shanghai, Istanbul, MagnyCours based processors. Counters:
-.I PMC[0-3]
+performance groups with useful event sets and derived metrics. Additionally, arbitrary events can be measured with
+custom event sets. The marker API can measure multiple named regions and the results are accumulated over multiple region calls.
.SH OPTIONS
.TP
-.B \-\^v
+.B \-\^v, \-\-\^version
prints version information to standard output, then exits.
.TP
-.B \-\^h
+.B \-\^h, \-\-\^help
prints a help message to standard output, then exits.
.TP
.B \-\^H
prints group help message (use together with -g switch).
.TP
-.B \-\^V
-verbose output during execution for debugging.
+.B \-\^V <level>, \-\-\^verbose <level>
+verbose output during execution for debugging. 0 for only errors, 1 for informational output, 2 for detailed output and 3 for developer output
.TP
.B \-\^m
run in marker API mode
.TP
.B \-\^a
-print available performance groups for current processor.
+print available performance groups for current processor, then exit.
.TP
.B \-\^e
print available counters and performance events of current processor.
.TP
-.B \-\^o " <filename>
+.B \-\^o, \-\-\^output <filename>
store all ouput to a file instead of stdout. For the filename the following placeholders are supported:
-%j for PBS_JOBID, %r for MPI RANK (only Intel MPI at the moment), %h hostname and %p for process pid.
+%j for PBS_JOBID, %r for MPI RANK (only Intel MPI at the moment), %h host name and %p for process pid.
The placeholders must be separated by underscore as, e.g., -o test_%h_%p. You must specify a suffix to
the filename. For txt the output is printed as is to the file. Other suffixes trigger a filter on the output.
Available filters are csv (comma separated values) and xml at the moment.
.TP
.B \-\^O
-Do not print tables for results, use easily parseable CSV instead.
+print output in CSV format (conform to RFC 4180, see
+.I https://tools.ietf.org/html/rfc4180
+for details).
.TP
-.B \-\^i
-print cpuid information about processor and on Intel Performance Monitoring features, then exit.
+.B \-\^i, \-\-\^info
+print cpuid information about processor and about Intel Performance Monitoring features, then exit.
.TP
-.B \-\^c " <processor_list>"
+.B \-\^c <cpu expression>
specify a numerical list of processors. The list may contain multiple
items, separated by comma, and ranges. For example 0,3,9-11.
.TP
-.B \-\^C " <processor_list>"
+.B \-\^C <cpu expression>
specify a numerical list of processors. The list may contain multiple
items, separated by comma, and ranges. For example 0,3,9-11. This variant will
also pin the threads to the cores. Also logical numberings can be used.
.TP
-.B \-\^g " <performance group> or <performance event set string>"
+.B \-\^g, \-\-\^group <performance group> or <performance event set string>
specify which performance group to measure. This can be one of the tags output with the -a flag.
Also a custom event set can be specified by a comma separated list of events. Each event has the format
eventId:register, with the register being one of the architecture's supported performance counter registers.
.TP
-.B \-\^t " <frequency of measurements>"
-timeline mode for time resolved measurements, possible suffixes 's' and 'ms' like 100ms. The output has the format:
+.B \-\^t <frequency of measurements>
+timeline mode for time-resolved measurements. The time unit must be given on the command line, e.g. 4s, 500ms or 900us.
+.TP
+.B \-\^S <waittime between measurements>
+End-to-end measurement using likwid-perfctr that sleeps for the given time instead of executing an application. The time unit must be given on the command line, e.g. 4s, 500ms or 900us.
+.TP
+.B \-\^T <time between group switches>
+Time between group switches if multiple groups are given on the command line, default is 2s. The value is ignored for a single event set, where a default period of 30s is used to catch counter overflows. The time unit must be given on the command line, e.g. 4s, 500ms or 900us.
+.TP
+.B \-\^s, \-\-\^skip <mask>
+Specify skip mask as HEX number. For each set bit the corresponding thread is skipped.
.TP
-.B <Event> <Timestamp> <Result thread0> <Result thread1> ...
+.B \-\^f, \-\-\^force
+Force writing of registers even if they are in use.
.TP
-.B \-\^S " <time_in_seconds>"
-stethoscope mode with duration in senconds. Can be used to measure an application from the outside.
+.B \-\^E <search_str>
+Print only events and corresponding counters matching <search_str>
.SH EXAMPLE
Because
@@ -163,7 +121,7 @@ The parent process is pinned to processor 0, Thread 0 to processor 1 and Thread
.IP 2. 4
As wrapper with custom event set on AMD:
.TP
-.B likwid-perfctr -C 0-4 -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./myApp
+.B likwid-perfctr -C 0-4 -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./cacheBench
.PP
It is specified that the event
.B INSTRUCTIONS_RETIRED_SSE
@@ -173,84 +131,116 @@ and the event
.B CPU_CLOCKS_UNHALTED
on counter
.B PMC3.
-It is possible calculate the runtime of all threads based on the
+It is possible to calculate the run time of all threads based on the
.B CPU_CLOCKS_UNHALTED
event. If you want this you have to include this event in your custom event string as shown above.
.IP 3. 4
As wrapper with custom event set on Intel:
.TP
-.B likwid-perfctr -C 0 -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1 ./myApp
+.B likwid-perfctr -C 0 -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,UNC_L3_LINES_IN_ANY:UPMC0 ./stream-icc
.PP
On Intel processors fixed events are measured on dedicated counters. These are
.B INSTR_RETIRED_ANY
-,
-.B CPU_CLK_UNHALTED_CORE.
and
-.B CPU_CLK_UNHALTED_REF
+.B CPU_CLK_UNHALTED_CORE.
If you configure these fixed counters,
.B likwid-perfctr
-will calculate the runtime and CPI metrics for your run.
+will calculate the run time and CPI metrics for your run.
.IP 4. 4
Using the marker API to measure only parts of your code (this can be used both with groups or custom event sets):
.TP
.B likwid-perfctr -m -C 0-4 -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./cacheBench
.PP
-You have to link you code against liblikwid.a/.so and use the marker API calls.
+You have to link your code against liblikwid.so and use the marker API calls.
+Examples can be found in the examples folder <INSTALLEDPREFIX>/share/likwid/examples.
The following code snippet shows the necessary calls:
.nf
#include <likwid.h>
/* only one thread calls init */
-if (threadId == 0)
-{
- likwid_markerInit();
-}
-/* if you want to measure an threaded application
- * you have to call likwid_markerThreadInit() for
- * preparation, example with OpenMP */
-#pragma omp parallel
-{
- likwid_markerThreadInit();
-}
-BARRIER;
-likwid_markerStartRegion("Benchmark");
-/* your code to be measured is here.*/
+LIKWID_MARKER_INIT;
+
+/* Must be called by each thread that should
+ * perform measurements.
+ * If you place it in the same parallel
+ * region as LIKWID_MARKER_START, perform a
+ * barrier between the statements to avoid
+ * timing problems.
+ */
+LIKWID_MARKER_THREADINIT;
+
+/* If you run the code region only once, register
+ * the region tag beforehand to reduce the overhead
+ * of the START and STOP calls. Call it once for each
+ * thread in the parallel environment.
+ * Note: No whitespace characters are allowed in the region tags
+ * This call is optional, START will do the same operations.
+ */
+LIKWID_MARKER_REGISTER("name");
-likwid_markerStopRegion("Benchmark");
-BARRIER;
-/* again only one thread can close the markers */
-if (threadId == 0)
-{
- likwid_markerClose();
-}
+/* Start measurement
+ * Note: No whitespace characters are allowed in the region tags
+ */
+LIKWID_MARKER_START("name");
+/*
+ * Your code to be measured is here
+ * You can also nest named regions
+ * No whitespaces are allowed in the region names!
+ */
+LIKWID_MARKER_STOP("name");
+
+/* If you want to measure multiple groups/event sets
+ * Switches through groups in round-robin fashion
+ */
+LIKWID_MARKER_SWITCH;
+
+/* Finally */
+LIKWID_MARKER_CLOSE;
.fi
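+.PP
+To activate the marker API calls, the application has to be compiled with the define
+.B LIKWID_PERFMON
+and linked against the LIKWID library. A possible invocation (adapt the include and library paths to your installation) is:
+.TP
+.B cc -DLIKWID_PERFMON -I<INSTALLEDPREFIX>/include -L<INSTALLEDPREFIX>/lib cacheBench.c -o cacheBench -llikwid
+.PP
+Without this define the LIKWID_MARKER_* macros expand to nothing, so the instrumentation can remain in the code.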
.IP 5. 4
Using likwid in timeline mode:
.TP
-.B likwid-perfctr -c 0-3 -g FLOPS_DP -t 300ms ./myApp > out.txt
+.B likwid-perfctr -c 0-3 -g FLOPS_DP -t 300ms ./cacheBench > out.txt
.PP
This will read out the counters every 300ms on physical cores 0-3 and write the results to out.txt.
-For timeline mode there is a frontend application likwid-scope, which enables live plotting of selected events.
-For more code examples have a look at the likwid WIKI pages. The processes are
-.B not
-pinned to the CPUs 0-3.
+The application is not pinned to the CPUs. The output syntax of the timeline
+mode for custom event sets is:
+
+.B <groupID> <numberOfEvents> <numberOfThreads> <Timestamp> <Event1_Thread1> <Event2_Thread1> ... <Event1_Thread2> ... <EventN_ThreadM>
+
+For performance groups with metrics:
+.B <groupID> <numberOfMetrics> <numberOfThreads> <Timestamp> <Metric1_Thread1> <Metric2_Thread1> ... <Metric1_Thread2> ...<MetricN_ThreadM>
+
+For timeline mode there is a frontend application likwid-perfscope(1), which enables live plotting of selected events. Please be aware that with high frequencies (<100ms), the values may deviate from the real results, but their qualitative behavior remains valid.
.IP 6. 4
Using likwid in stethoscope mode:
.TP
.B likwid-perfctr -c 0-3 -g FLOPS_DP -S 2s
.PP
-This will start the counters and read them out after 2s on physical cores 0-3 and write the results to stdout. The processes are
-.B not
-pinned to the CPUs 0-3.
+This will start the counters and read them out after 2s on physical cores 0-3 and write the results to stdout.
+
+.IP 7. 4
+Using likwid with counter options:
+.TP
+.B likwid-perfctr -c S0:1 at S1:1 -g LLC_LOOKUPS_DATA_READ:CBOX0C0:STATE=0x9 ./cacheBench
+.PP
+This will program the counter
+.B CBOX0C0
+(the counter 0 of the LLC cache box 0) to measure the event
+.B LLC_LOOKUPS_DATA_READ
+and filter the increments by the state of a cacheline.
+.B STATE=0x9
+for this event means all <invalid> and <modified> cachelines. Which options are allowed for which box is listed in LIKWID's HTML documentation. The values for the options can be found in the vendor's performance monitoring documentation. Likwid measures the first CPU of socket 0 and the first CPU of socket 1. See likwid-pin(1) for details regarding the cpu expressions.
+For more code examples have a look at the likwid WIKI pages and LIKWID's HTML documentation.
.SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
.SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
.SH SEE ALSO
-likwid-topology(1), likwid-features(1), likwid-pin(1), likwid-bench(1)
+likwid-topology(1), likwid-perfscope(1), likwid-pin(1), likwid-bench(1)
diff --git a/doc/likwid-perfscope.1 b/doc/likwid-perfscope.1
index 2d48e21..19886a8 100644
--- a/doc/likwid-perfscope.1
+++ b/doc/likwid-perfscope.1
@@ -1,55 +1,177 @@
.TH LIKWID-PERFSCOPE 1 <DATE> likwid\-<VERSION>
.SH NAME
likwid-perfscope \- Frontend for the timeline mode of
-.N likwid-perfctr(1)
-that on-the-fly generates pictures from the measurements
+.B likwid-perfctr(1)
+that generates pictures on-the-fly from the measurements
.SH SYNOPSIS
-.B likwid-perfscope
-.RB [\-h]
-.RB [ \-cores
+.B likwid-perfscope
+.RB [\-hvadp]
+.RB [ \-c
+.IR <cpu_list> ]
+.RB [ \-C
.IR <cpu_list> ]
-.RB [ \-freq
+.RB [ \-t
.IR <frequency> ]
-.RB [ \-group
-.IR <eventset> ]
+.RB [ \-r
+.IR <value> ]
+.RB [ \-g
+.IR <eventset_and_plotconfig> ]
+.RB [ \-\-\^host
+.IR <hostname> ]
+.B <executable>
+
.SH DESCRIPTION
.B likwid-perfscope
-is a command line application written in Perl that uses the timeline daemon mode of
+is a command line application written in Lua that uses the timeline daemon mode of
.B likwid-perfctr(1)
to create on-the-fly pictures with the current measurements. It uses the
.B feedGnuplot(1)
-script to send the current data to gnuplot.
+script to send the current data to gnuplot. Since the plot windows are normally closed directly after the execution of the monitored applications,
+.B likwid-perfscope
+waits until Ctrl+c is pressed.
.SH OPTIONS
.TP
-.B \-h
-prints a help message to standard output, then exits.
+.B \-\^h,\-\-\^help
+Prints a help message to standard output, then exits.
.TP
-.B \-\^cores " <cpu_list>
-measures the given group on given CPUs in <cpu_list>
+.B \-\^v,\-\-\^version
+Prints version information to standard output, then exits.
+.TP
+.B \-\^c " <cpu_list>
+Measures on given CPUs in <cpu_list>. See
+.B likwid-pin(1)
+for further information about the syntax.
+.TP
+.B \-\^C " <cpu_list>
+Measures the given group on given CPUs in <cpu_list>. See
+.B likwid-pin(1)
+for further information about the syntax. The application is pinned to these cores.
+.TP
+.B \-\^a,\-\-\^all
+List preconfigured event and plot configurations
+.TP
+.B \-\^d,\-\-\^dump
+Print the measurements of
+.B likwid-perfctr(1)
+to stdout.
.TP
-.B \-\^freq " <frequency>
-reads the current performance values every <frequency>. Available suffixes are 's' and 'ms', e.g. 500ms. Default value is 1s
+.B \-\^t,\-\-\^time " <frequency>
+Reads the current performance values every <frequency>. Available suffixes are 's', 'ms' or 'us', e.g. 500ms. Default value is 1s.
.TP
-.B \-\^group " <eventset>
-defines the events and counters that should be read. Possible values can be gathered from
+.B \-\^g,\-\-\^group " <eventset_and_plotconfig>
+Defines the events and counters that should be read. Possible values can be gathered from
.B likwid-perfctr(1).
-Default is group 'FLOPS_DP'
+You can give multiple
+.B \-\^g
+options on the commandline. They will be measured in a round-robin fashion and one plot generated per option. Moreover, the
+.B \-\^g
+option accepts config options for
+.B feedGnuplot(1),
+see section
+.B EVENTSETS
+.TP
+.B \-\^r,\-\-\^range " <value>
+Plot only the last <value> values. Often referred to as a sliding window.
+.TP
+.B \-\^p,\-\-\^plotdump
+Use the dumping feature of feedGnuplot to print out the plot configuration and its data at each timestep.
+Can be used to create file-based plots afterwards.
+.TP
+.B \-\-\^host " <hostname>
+Instead of running likwid-perfctr on the local machine, execute it on a remote machine and plot the data locally. This uses ssh, so you probably need to enter a password before starting. You can also give something like user@host.
+
+
+.SH EVENTSETS
+In contrast to the \-\^g option for
+.B likwid-perfctr
+the \-\^g option for
+.B likwid-perfscope
+is extended to accept configuration options for
+.B feedGnuplot.
+There are some predefined plot configurations embedded into
+.B likwid-perfscope
+which can be listed with the
+.B \-\^a
+command line option. They are filtered to show only configs that are available for your current system.
+If you need to measure and plot custom events you can set the plotting options as the last entry in your event set. The plotting config options can be set as a ':' separated list. If you select a preconfigured group, you can overwrite single fields in the config like changing the title or the matching. The following options are available:
-.SH EXAMPLE
.IP 1. 4
-Monitor double precision floating-point operations:
+.B title=<string>, TITLE=<string>
.TP
-.B likwid-perfscope -group FLOPS_DP -cores 0-3 -freq 500ms
+Use the given title for the plot, use "" to enclose text with spaces and escape characters which could be interpreted by the shell. ':' are not allowed!
+.PP
+.IP 2. 4
+.B xtitle=<string>, XTITLE=<string>
+.TP
+Use the given title for the x-axis of the plot, use "" to enclose text with spaces and escape characters which could be interpreted by the shell. ':' are not allowed!
+.PP
+.IP 3. 4
+.B ytitle=<string>, YTITLE=<string>
+.TP
+Use the given title for the left y-axis of the plot, use "" to enclose text with spaces and escape characters which could be interpreted by the shell. ':' are not allowed!
+.PP
+.IP 4. 4
+.B <string>=<string>
+.TP
+All option string items that are not recognized as a keyword like TITLE are used as formulas for the output. You can set multiple of these items in one option string. Each is calculated and integrated into the output plot. The first <string> is used as the legend entry; the second <string> is the formula for the function.
+.PP
+.IP 5. 4
+.B y2title=<string>, Y2TITLE=<string>, y2title=<id-string>, Y2TITLE=<id-string>
+.TP
+Use the given title for the right y-axis of the plot. If no id is set, the y2-axis is related to the last formula. If an id is set, the formula with that id is used for the y2-axis. The id starts with index 1 for the first formula. Use "" to enclose text with spaces and escape characters which could be interpreted by the shell with '\'. ':' are not allowed!
+.PP
+
+.SH EXAMPLE
+.IP 1. 5
+Measure and print a preconfigured plotting configuration:
+.TP
+.B likwid-perfscope -g L3 -C 0-2 -t 1s ./a.out
+.PP
+This measures the L3 bandwidth with likwid-perfctr every second on CPU cores 0,1,2 and uses the plotting configuration L3. The plot will have a title and the axes are labeled properly.
+.IP 2. 5
+Measure and print a preconfigured plotting configuration:
+.TP
+.B likwid-perfscope -g L2:TITLE="My Title" -C 0 -t 1s ./a.out
+.PP
+This measures the L2 bandwidth with likwid-perfctr every second on CPU core 0 and uses the plotting configuration L2. The title of the output plot is changed to the custom title "My Title".
+.IP 3. 5
+Custom event set with plotting configuration:
+.TP
+.B likwid-perfscope -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,CPI=FIXC0/FIXC1:YTITLE="Cycles per Instruction" -C 0 --time 500ms ./a.out
.PP
Executes
.B likwid-perfctr
-on the first four cores. The values are read every 500ms are forwarded to gnuplot using the
-.B feedGnuplot
-script.
+on the first core. The values for the events
+.B INSTR_RETIRED_ANY
+and
+.B CPU_CLK_UNHALTED_CORE
+are read every 500ms. The raw values are transformed using the formula
+.B FIXC0/FIXC1
+and forwarded to gnuplot using the
+.B feedGnuplot(1)
+script with the curve name 'CPI' in the legend. The y-axis is labeled with the string "Cycles per Instruction".
+.IP 4. 5
+Custom event set with plotting configuration:
+.TP
+.B likwid-perfscope -g L3,CPI=FIXC0/FIXC1:Y2TITLE="2-Cycles per Instruction" -C 0 --time 500ms ./a.out
+.PP
+This measures the L3 bandwidth for CPU 0 every 500 ms. Additionally, a second curve is plotted with the function
+.B FIXC0/FIXC1
+with the legend entry
+.B CPI.
+The right y-axis is labeled with
+.B 'Cycles per Instruction'
+and is associated to the second formula. The first formula is hidden in the
+.B L3
+plot group. Since the
+.B CPI
+formula is the last in the list, the curve id is not needed in the
+.B Y2TITLE
+as this is the default behavior.
.SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
.SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
.SH "SEE ALSO"
-likwid-perfctr(1), feedGnuplot(1), likwid-pin(1), likwid-powermeter(1), likwid-setFrequencies(1)
+likwid-perfctr(1), feedGnuplot(1)
diff --git a/doc/likwid-pin.1 b/doc/likwid-pin.1
index efea873..4822f8d 100644
--- a/doc/likwid-pin.1
+++ b/doc/likwid-pin.1
@@ -2,138 +2,182 @@
.SH NAME
likwid-pin \- pin a sequential or threaded application to dedicated processors
.SH SYNOPSIS
-.B likwid-pin
-.RB [\-vhqipS]
+.B likwid-pin
+.RB [\-vhSpqi]
+.RB [ \-V
+.IR verbosity ]
.RB [ \-c
-.IR <core_list> ]
+.IR corelist ]
.RB [ \-s
-.IR <skip_mask> ]
-.RB [ \-d
-.IR <delimiter> ]
+.IR skip_mask ]
.SH DESCRIPTION
.B likwid-pin
-is a command line application to pin a sequential or multithreaded
-applications to dedicated processors. It can be used as replacement for
-.B taskset(1).
+is a command line application to pin a sequential or multithreaded
+application to dedicated processors. It can be used as replacement for taskset.
In contrast to taskset, single processors instead of an affinity mask are specified.
-For multithreaded applications based on the pthread library the
-.I pthread_create
+For multithreaded applications based on the pthread library the
+.B pthread_create
library call is overloaded through LD_PRELOAD and each created thread is pinned
-to a dedicated processor as specified in
-.I core_list
-.
+to a dedicated processor as specified in
+.I core_list .
.PP
-Per default every generated thread is pinned to the core in the order of calls
-to
-.I pthread_create.
-It is possible to skip single threads using -s commandline option.
+Per default the generated threads are pinned to the cores in the order of their calls to
+.B pthread_create .
+It is possible to skip single threads with the \-s command line option.
.PP
-For OpenMP implementations gcc and icc compilers are explicitly supported. Others may also work.
+The OpenMP implementations of GCC and ICC compilers are explicitly supported.
+Clang's OpenMP backend should also work as it is built on top of Intel's OpenMP runtime library.
+Others may also work.
.B likwid-pin
-sets the environment variable OMP_NUM_THREADS for you if not already present.
-It will set as many threads as present in the pin expression. Be aware that
+sets the environment variable
+.B OMP_NUM_THREADS
+for you if not already present.
+It will set as many threads as present in the pin expression. Be aware that
with pthreads the parent thread is always pinned. If you create for example 4
threads with
-.I pthread_create
-and do not use the parent process as worker you
-still have to provide num_threads+1 processor ids.
+.B pthread_create
+and do not use the parent process as worker you still have to provide
+.B num_threads+1
+processor ids.
.PP
.B likwid-pin
-supports different numberings for pinning. Per default physical numbering of
-the cores is used. This is the numbering also
-.B likwid-topology(1)
-reports. But also logical numbering inside the node or the sockets can be used. If using
-with a N (e.g. -c N:0-6) the cores are logical numbered over the whole node.
-Physical cores come first. If a system e.g. has 8 cores with 16 SMT threads
-with -c N:0-7 you get all physical cores. If you specify -c N:0-15 you get all
-physical cores and all SMT threads. With S you can specify logical numberings
-inside sockets, again physical cores come first. You can mix different domains
-separated with @. E.g. -c S0:0-3 at S2:2-3 you pin thread 0-3 to logical cores 0-3 on socket 0
-and threads 4-5 on logical cores 2-3 on socket 2.
+supports different numberings for pinning. See section
+.B CPU EXPRESSION
+for details.
.PP
-For applications where first touch policy on numa systems cannot be employed
+For applications where first touch policy on NUMA systems cannot be employed
.B likwid-pin
can be used to turn on interleave memory placement. This can significantly
-speed up the performance of memory bound multithreaded codes. All numa nodes
+speed up the performance of memory bound multithreaded codes. All NUMA nodes
the user pinned threads to are used for interleaving.
.SH OPTIONS
.TP
-.B \-\^v
+.B \-\^h,\-\-\^help
+prints a help message to standard output, then exits.
+.TP
+.B \-\^v,\-\-\^version
prints version information to standard output, then exits.
.TP
-.B \-\^h
-prints a help message to standard output, then exits.
+.B \-\^V, \-\-\^verbose <level>
+verbose output during execution for debugging. 0 for only errors, 1 for informational output, 2 for detailed output and 3 for developer output
.TP
-.B \-\^c " <processor_list> OR <thread_expression> OR <scatter policy> "
-specify a numerical list of processors. The list may contain multiple
-items, separated by comma, and ranges. For example 0,3,9-11. You can also use
-logical numberings, either within a node (N), a socket (S<id>) or a numa domain (M<id>).
-likwid-pin also supports logical pinning within a cpuset with a L prefix. If you ommit this option
-likwid-pin will pin the threads to the processors on the node with physical cores first.
-See below for details on using a thread expression or scatter policy
+.B \-\^c <cpu expression>
+specify a numerical list of processors. The list may contain multiple items, separated by comma, and ranges. For example 0,3,9-11. Other formats are available; see the
+.B CPU EXPRESSION
+section.
.TP
-.B \-\^s " <skip_mask>
+.B \-\^s, \-\-\^skip <mask>
Specify skip mask as HEX number. For each set bit the corresponding thread is skipped.
.TP
-.B \-\^S
-All ccNUMA memory domains belonging to the specified threadlist will be cleaned before the run. Can solve file buffer cache problems on Linux.
+.B \-\^S,\-\-\^sweep
+All ccNUMA memory domains belonging to the specified thread list will be cleaned before the run. Can solve file buffer cache problems on Linux.
.TP
.B \-\^p
-prints the available thread domains for logical pinning. If used in combination with -c, the physical processor IDs are printed to stdout.
+prints the available thread domains for logical pinning
.TP
.B \-\^i
-set numa memory policy to interleave spanning all numa nodes involved in pinning
+set the NUMA memory policy to interleave across all NUMA nodes involved in the pinning
.TP
-.B \-\^q
+.B \-\^q,\-\-\^quiet
silent execution without output
-.TP
-.B \-\^d " <delimiter>
-set delimiter used to output the physical processor list (-p & -c)
+.SH CPU EXPRESSION
+.IP 1. 4
+The most intuitive CPU selection method is a comma-separated list of physical CPU IDs. An example for this is
+.B 0,2
+which schedules the threads on CPU cores
+.B 0
+and
+.B 2.
+The physical numbering also allows the usage of ranges like
+.B 0-2
+which results in the list
+.B 0,1,2.
+.IP 2. 4
+The CPUs can be selected by their indices inside of an affinity domain. The affinity domain is optional and if not given, Likwid assumes the domain
+.B 'N'
+for the whole node. The format is
+.B L:<indexlist>
+for selecting the CPUs inside of domain
+.B 'N'
+or
+.B L:<domain>:<indexlist>
+for selecting the CPUs inside the given domain. Assume a virtual affinity domain
+.B 'P'
+that contains the CPUs
+.B 0,4,1,5,2,6,3,7.
+After sorting it so that physical cores come first, we get:
+.B 0,1,2,3,4,5,6,7.
+The logical numbering
+.B L:P:0-2
+results in the selection
+.B 0,1,2
+from the physical cores first list.
+.IP 3. 4
+The expression syntax enables the selection according to a selection function with variable input parameters. The format is either
+.B E:<affinity domain>:<numberOfThreads>
+to use the first <numberOfThreads> threads in affinity domain <affinity domain> or
+.B E:<affinity domain>:<numberOfThreads>:<chunksize>:<stride>
+to use <numberOfThreads> threads, where <chunksize> threads are selected in a row and the next chunk starts <stride> threads after the beginning of the previous one, all inside affinity domain <affinity domain>. Examples are
+.B E:N:4:1:2
+for selecting the first four physical CPUs on a system with 2 SMT threads per core or
+.B E:P:4:2:4
+for choosing the first two threads in affinity domain
+.B P,
+skipping 2 threads and selecting again two threads. The resulting CPU list for virtual affinity domain
+.B P
+is
+.B 0,4,2,6
+.IP 4. 4
+The last format schedules the threads not only in a single affinity domain but distributes them evenly over all available affinity domains of the same kind. In contrast to the other formats, the selection is done using the physical cores first and then the SMT threads. The format is
+.B <affinity domain without number>:scatter
+like
+.B M:scatter
+to schedule the threads evenly in all available memory affinity domains. Assuming the two socket domains
+.B S0 = 0,4,1,5
+and
+.B S1 = 2,6,3,7
+the expression
+.B S:scatter
+results in the CPU list
+.B 0,2,1,3,4,6,5,7
.SH EXAMPLE
-.IP 1. 4
+.IP 1. 5
For standard pthread application:
.TP
-.B likwid-pin -c 0,2,4-6 ./myApp
+.B likwid-pin -c 0,2,4-6 ./myApp
.PP
-The parent process is pinned to processor 0. Thread 0 to processor 2, thread
-1 to processor 4, thread 2 to processor 5 and thread 3 to processor 6. If more threads
-are created than specified in the processor list, these threads are pinned to processor 0
-as fallback.
-.IP 2. 4
-For gcc OpenMP as many ids must be specified in processor list as there are threads:
-.TP
-.B OMP_NUM_THREADS=4; likwid-pin -c 0,2,1,3 ./myApp
-.IP 3. 4
-Full control over the pinning can be achieved by specifying a skip mask.
-For example the following command skips the pinning of thread 1:
+The parent process is pinned to processor 0 which is likely to be thread 0 in
+.B ./myApp.
+Thread 1 is pinned to processor 2, thread 2 to processor 4, thread 3 to processor 5 and thread 4 to processor 6. If more threads
+are created than specified in the processor list, these threads are pinned to processor 0 as fallback.
+.IP 2. 5
+For selection of CPUs inside of a CPUset only the logical numbering is allowed. Assuming CPUset
+.B 0,4,1,5:
.TP
-.B OMP_NUM_THREADS=4; likwid-pin -s 0x1 -c 0,2,1,3 ./myApp
-.IP 4. 4
-The -c switch supports the definition of threads in a specific affinity domain like
-NUMA node or cache group. The available affinity domains can be retrieved with the -p switch
-and no further option on the commandline. The common affinity domains are N (whole Node),
-SX (socket X), CX (cache group X) and MX (memory group X). Multiple affinity domains
-can be set separated by @. In order to pin 2 threads on each socket of a 2-socket system:
-.TP
-.B OMP_NUM_THREADS=4; likwid-pin -c S0:0-1 at S1:0-1 ./myApp
-.IP 5. 4
-Another argument definition of the -c switch allows the threads to be pinned according
-to an expression like E:N:4:1:2. The syntax is E:<thread domain>:<number of threads>(:<chunk size>:<stride>).
-The example pins 8 threads with 2 SMT threads per core on a SMT 4 machine:
-.TP
-.B OMP_NUM_THREADS=4; likwid-pin -c E:N:8:2:4 ./myApp
-.IP 6. 4
-The last alternative for the -c switch is the automatic scattering of threads on affinity domains.
-For example to scatter the threads over all memory domains in a system:
+.B likwid-pin -c L:1,3 ./myApp
+.PP
+This command pins
+.B ./myApp
+on CPU
+.B 4
+and the thread started by
+.B ./myApp
+on CPU
+.B 5.
+.IP 3. 5
+A common use-case for the numbering by expression is pinning of an application on the Intel Xeon Phi coprocessor with its 60 cores each having 4 SMT threads.
.TP
-.B OMP_NUM_THREADS=4; likwid-pin -c M:scatter ./myApp
+.B likwid-pin -c E:N:60:1:4 ./myApp
+.PP
+This command schedules one thread per physical CPU core for
+.B ./myApp.
.SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
.SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
.SH "SEE ALSO"
-taskset(1), likwid-perfctr(1), likwid-features(1), likwid-powermeter(1), likwid-setFrequencies(1)
+taskset(1), likwid-perfctr(1), likwid-features(1), likwid-topology(1)
diff --git a/doc/likwid-powermeter.1 b/doc/likwid-powermeter.1
index f4a3ba2..9f35ceb 100644
--- a/doc/likwid-powermeter.1
+++ b/doc/likwid-powermeter.1
@@ -3,49 +3,72 @@
likwid-powermeter \- A tool to print power and clocking information on Intel CPUs
.SH SYNOPSIS
.B likwid-powermeter
-.RB [ \-vhip ]
+.RB [ \-vhpitf ]
+.RB [ \-V
+.IR verbosity_level ]
.RB [ \-c
-.IR <socket_list> ]
+.IR socket_list ]
.RB [ \-s
-.IR <duration_in_seconds> ]
+.IR duration ]
.RB [ \-M
-.IR <access_mode>]
+.IR <0|1> ]
.SH DESCRIPTION
.B likwid-powermeter
-is a command line application to get the energy comsumption of Intel RAPL capable processors.
-It also prints information about TDP and Turbo Mode steps supported.
+is a command line application to get the energy consumption of Intel RAPL capable processors
+(Intel SandyBridge and newer). It also prints information about TDP and supported Turbo Mode steps.
The Turbo Mode information works on all Turbo mode enabled Intel processors. The tool can be either used
in stethoscope mode for a specified duration or as a wrapper to your application measuring your complete
run. RAPL works on a per package (socket) base.
-Please note that the RAPL counters are also accessible as normal events within
-.B likwid-perfctr.
+Please note that the RAPL counters are also accessible as normal events within likwid-perfctr.
.SH OPTIONS
.TP
-.B \-\^v
+.B \-\^h,\-\-\^help
+prints a help message to standard output, then exits.
+.TP
+.B \-\^v,\-\-\^version
prints version information to standard output, then exits.
.TP
-.B \-\^h
-prints a help message to standard output, then exits.
+.B \-\^V, \-\-\^verbose <level>
+verbose output during execution for debugging. 0 for only errors, 1 for informational output, 2 for detailed output and 3 for developer output
+.TP
+.B \-\^c <socket_list>
+set on which socket(s) the RAPL interface is accessed. A list of sockets like 0,1,2 or a range like 0-2 is allowed.
.TP
-.B \-\^c " <socket_list>"
-set on which sockets the RAPL interface is accessed. comma-separated list of socket IDs
+.B \-\^M <0|1>
+set how MSR registers are accessed, 0=direct, 1=accessDaemon.
+.TP
+.B \-\^s <duration>
+set the measurement duration in us, ms or s (default 2s).
.TP
.B \-\^p
-prints out information about dynamic clocks and CPI information on the socket measured. Uses likwid-perfctr internally.
+prints out information about dynamic clocks and CPI on the socket(s) measured.
.TP
-.B \-\^i
-prints out information TDP and Turbo mode steps
+.B \-\^i,\-\-\^info
+prints out information about TDP and Turbo mode steps of all RAPL domains supporting it.
.TP
-.B \-\^M " <access_mode>"
-set the access method. 0 for direct access to MSR/RAPL registers, 1 for using the accessDaemon.
+.B \-\^t
+prints out the temperature of all CPUs in the system.
.TP
-.B \-\^s " <duration_in_seconds>
-measure the power for a specific time (default 2s)
+.B \-\^f
+prints out the temperature like
+.B \-\^t
+but uses Fahrenheit as the temperature unit.
+.SH EXAMPLE
+.IP 1. 3
+Measure the power consumption for 4 seconds on socket 1
+.TP
+.B likwid-powermeter -s 4 -c 1
+.PP
+.IP 2. 3
+Use it as a wrapper for an application to measure the energy for the whole execution
+.TP
+.B likwid-powermeter -c 1 ./a.out
+.PP
.SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
.SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
.SH "SEE ALSO"
-likwid-topology(1), likwid-perfctr(1), likwid-pin(1), likwid-features(1), likwid-setFrequencies(1)
+likwid-topology(1), likwid-perfctr(1), likwid-pin(1)
diff --git a/doc/likwid-setFreq.1 b/doc/likwid-setFreq.1
index 87054c7..1ef598c 100644
--- a/doc/likwid-setFreq.1
+++ b/doc/likwid-setFreq.1
@@ -4,7 +4,7 @@ likwid-setFreq \- Mediator for
.B likwid-setFrequencies(1)
that performs the actual setting of CPU cores' frequency and governor.
.SH SYNOPSIS
-.B likwid-setFreq
+.B likwid-setFreq
.IR <coreId>
.IR <frequency>
.IR [<governor>]
@@ -14,11 +14,13 @@ that performs the actual setting of CPU cores' frequency and governor.
is a command line application that mediates the request from
.B likwid-setFrequencies(1)
because setting a CPU core's frequency and/or governor requires root privileges. This executable must be suid-root.
+.B likwid-setFreq
+works only with the kernel module acpi-cpufreq. The recent intel_pstate module does not allow setting fixed frequencies.
.SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
.SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
.SH "SEE ALSO"
-likwid-setFrequencies(1), likwid-perfctr(1), feedGnuplot(1), likwid-pin(1), likwid-powermeter(1)
+likwid-setFrequencies(1)
diff --git a/doc/likwid-setFrequencies.1 b/doc/likwid-setFrequencies.1
index b268280..b45fcbe 100644
--- a/doc/likwid-setFrequencies.1
+++ b/doc/likwid-setFrequencies.1
@@ -3,20 +3,30 @@
likwid-setFrequencies \- print and manage the clock frequency of CPU cores
.SH SYNOPSIS
.B likwid-setFrequencies
-.RB [\-hpl]
+.RB [\-hvplm]
.RB [ \-c
-.IR <cpu_list,_socket_list_or_expression> ]
+.IR <cpu_list> ]
.RB [ \-g
.IR <governor> ]
-.RB [ \-f
+.RB [ \-f,\-\-\^freq
.IR <frequency> ]
.SH DESCRIPTION
.B likwid-setFrequencies
-is a command line application to set the clock frequency of CPU cores. Since only priviledged users are allowed to change the frequency of CPU cores, the application works in combination with a daemon
-.B likwid-setFreq.
-The daemon needs the suid permission bit to be set in order to manipulate the sysfs entries. With
+is a command line application to set the clock frequency of CPU cores. Since only privileged users are allowed to change the frequency of CPU cores, the application works in combination with the daemon
+.B likwid-setFreq(1).
+The daemon needs the suid permission bit to be set in order to manipulate the sysfs entries. With
.B likwid-setFrequencies
the clock of all cores inside the cpu_list or affinity domain can be set to a specific frequency or governor at once.
+.B likwid-setFrequencies
+works only with the kernel module
+.B acpi-cpufreq.
+The recent
+.B intel_pstate
+module does not allow setting fixed frequencies. In order to deactivate
+.B intel_pstate
+add 'intel_pstate=disable' to your kernel boot commandline (commonly in grub) and load the
+.B acpi-cpufreq
+module.
.SH OPTIONS
.TP
.B \-h
@@ -28,19 +38,23 @@ prints the current frequencies for all CPU cores
.B \-l
prints all configurable frequencies
.TP
-.B \-\^c " <cpu_list,_socket_list_or_expression>
-set the affinity domain where to set the frequencies. Common are N (Node), SX (Socket X), CX (Cache Group X) and MX (Memory Group X). For detailed information about affinity domains see
+.B \-m
+prints all configurable governors
+.TP
+.B \-\^c <cpu_list>
+set the affinity domain where to set the frequencies. Common are N (Node), SX (Socket X), CX (Cache Group X) and MX (Memory Group X).
+For detailed information about affinity domains see
.B likwid-pin(1)
.TP
-.B \-\^g " <governor>
+.B \-\^g <governor>
+set the governor of all CPU cores inside the affinity domain. Available governors are ondemand, performance and turbo. Default is ondemand.
.TP
-.B \-\^f " <frequency>
+.B \-\^f, \-\-\^freq <frequency>
+set a fixed frequency for all CPU cores inside the affinity domain. Implicitly sets the userspace governor for the cores.
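+.SH EXAMPLE
+.IP 1. 3
+A possible invocation that sets the performance governor on all CPU cores of the first socket (available governors can be listed with \-m, available fixed frequencies with \-l):
+.TP
+.B likwid-setFrequencies -c S0 -g performance
+.PP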
.SH AUTHOR
-Written by Thomas Roehl <thomas.roehl at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
.SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
.SH "SEE ALSO"
-likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1), likwid-topology(1),
+likwid-pin(1), likwid-perfctr(1), likwid-powermeter(1)
diff --git a/doc/likwid-topology.1 b/doc/likwid-topology.1
index 64bc8b4..04ebdc4 100644
--- a/doc/likwid-topology.1
+++ b/doc/likwid-topology.1
@@ -2,41 +2,47 @@
.SH NAME
likwid-topology \- print thread and cache topology
.SH SYNOPSIS
-.B likwid-topology
+.B likwid-topology
.RB [\-hvgcC]
+.RB [ \-V
+.IR level ]
.RB [ \-o
-.IR <filename> ]
+.IR output_file ]
.SH DESCRIPTION
.B likwid-topology
-is a command line application to print the thread and cache topology on multicore x86 processors. Used with mono spaced fonts it can
-draw the processor topology of a machine in ASCII art. Beyond topology
-.B likwid-topology
-determines the clock of a processor and prints detailed informations about the caches hierarchy and NUMA structure.
+is a command line application to print the thread and cache
+topology on multicore x86 processors. Used with monospaced fonts it can
+draw the processor topology of a machine in ASCII art. Beyond topology
+likwid-topology determines the clock of a processor and prints detailed
+information about the cache hierarchy.
.SH OPTIONS
.TP
-.B \-v
+.B \-h, \-\-\^help
+prints a help message to standard output, then exits.
+.TP
+.B \-v, \-\-\^version
prints version information to standard output, then exits.
.TP
-.B \-h
-prints a help message to standard output, then exits.
+.B \-V, \-\-\^verbose <level>
+sets the verbosity level of LIKWID's topology backend. Possible levels range from 0 to 3.
.TP
.B \-g
prints topology information in ASCII art. Best viewed with monospaced font.
.TP
-.B \-c
-prints detailed informations about cache hierarchy
+.B \-c, \-\-\^caches
+prints detailed information about cache hierarchy
.TP
-.B \-C
-measures and output the processor clock. This involves a longer runtime of
-.B likwid-topology.
+.B \-C, \-\-\^clock
+measures and outputs the processor clock. This involves a longer run time of likwid-topology.
.TP
-.B \-\^f " <filename>
-Specify output file for topology information. According to the file suffix, the information
-is converted using converter scripts installed at <PREFIX>/share/likwid
+.B \-o, \-\-\^output <file>
+write the output to a file instead of stdout.
+Likwid applies filter scripts according to the filename suffix.
+Currently, the only available filter script is csv. You can place additional filter scripts in <INSTALLEDPREFIX>/share/likwid/filter.
.SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
.SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
.SH "SEE ALSO"
-likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1), likwid-setFrequencies(1)
+likwid-perfctr(1), likwid-features(1), likwid-pin(1)
diff --git a/doc/likwid.cfg.md b/doc/likwid.cfg.md
new file mode 100644
index 0000000..2122dee
--- /dev/null
+++ b/doc/likwid.cfg.md
@@ -0,0 +1,38 @@
+/*! \page likwid.cfg <CODE>likwid.cfg</CODE>
+<H1>Information</H1>
+<CODE>likwid.cfg</CODE> is the global configuration file for LIKWID, but it is optional; the defaults are normally set at compile time. It allows setting the access mode, the path to the MSR/PCI access daemon and some other basic options.<BR>
+LIKWID searches for the configuration file at different paths like <CODE>/usr/local/etc/likwid.cfg</CODE>.<BR>
+<B>Note: It was introduced with version 4 and is not fully integrated in the LIKWID code.</B>
+
+<H1>Config file options</H1>
+The global configuration file has the following options:
+<TABLE>
+<TR>
+ <TH>Option</TH>
+ <TH>Description</TH>
+</TR>
+<TR>
+ <TD>topology_file = <path></TD>
+ <TD>Path to the topology file created with \ref likwid-genTopoCfg</TD>
+</TR>
+<TR>
+ <TD>access_mode = <daemon|direct></TD>
+ <TD>Set the access mode. The direct mode can only be used by users with root privileges. The daemon mode uses \ref likwid-accessD.</TD>
+</TR>
+<TR>
+ <TD>daemon_path = <path></TD>
+ <TD>Path to the access daemon.</TD>
+</TR>
+<TR>
+ <TD>max_threads = <arg></TD>
+ <TD>Adjust the maximally supported number of threads/CPUs. <B>Note:</B> currently not used; fixed at compile time.</TD>
+</TR>
+<TR>
+ <TD>max_nodes = <arg></TD>
+ <TD>Adjust the maximally supported number of NUMA nodes. <B>Note:</B> currently not used; fixed at compile time.</TD>
+</TR>
+</TABLE>
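+
+<H1>Example</H1>
+<P>A minimal example configuration is sketched below. The option names are the ones from the table above; the paths are only placeholders and have to be adapted to the local installation:</P>
+<CODE>
+topology_file = /etc/likwid_topo.cfg<BR>
+access_mode = daemon<BR>
+daemon_path = /usr/local/sbin/likwid-accessD<BR>
+</CODE>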
+
+
+*/
diff --git a/doc/logo.png b/doc/logo.png
new file mode 100644
index 0000000..048ed9a
Binary files /dev/null and b/doc/logo.png differ
diff --git a/doc/lua-doxygen.md b/doc/lua-doxygen.md
new file mode 100644
index 0000000..c00b992
--- /dev/null
+++ b/doc/lua-doxygen.md
@@ -0,0 +1,2592 @@
+/*! \page lua_Info Information about LIKWID's Lua API
+<H1>How to include Lua API into own Lua applications</H1>
+<CODE>
+package.path = package.path .. ';<PREFIX>/share/lua/?.lua'<BR>
+local likwid = require("likwid")<BR>
+</CODE>
+<P></P>
+Now all functions and variables can be called with<BR>
+<CODE>likwid.<I>functionname()</I></CODE><BR>
+or<BR>
+<CODE>likwid.<I>variable</I></CODE>
+
+<H1>Global variables defined by LIKWID's Lua API</H1>
+<TABLE>
+<TR>
+ <TH>Variablename</TH>
+ <TH>Description</TH>
+</TR>
+<TR>
+ <TD>\a groupfolder</TD>
+ <TD>Path to the folder containing the definitions of the performance groups</TD>
+</TR>
+<TR>
+ <TD>\a version</TD>
+ <TD>Version of LIKWID</TD>
+</TR>
+<TR>
+ <TD>\a release</TD>
+ <TD>Release number of LIKWID</TD>
+</TR>
+<TR>
+ <TD>\a pinlibpath</TD>
+ <TD>Path to the pinning library. Is added automatically to $LD_PRELOAD by \ref likwid-pin and \ref likwid-perfctr</TD>
+</TR>
+<TR>
+ <TD>\a hline</TD>
+ <TD>Horizontal line with 80 '-' characters</TD>
+</TR>
+<TR>
+ <TD>\a sline</TD>
+ <TD>Horizontal line with 80 '*' characters</TD>
+</TR>
+<TR>
+ <TD>\a dline</TD>
+ <TD>Horizontal line with 80 '=' characters</TD>
+</TR>
+</TABLE>
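+
+<P>A minimal usage sketch, assuming the likwid module has been loaded with require as shown above:</P>
+<CODE>
+print(likwid.hline)<BR>
+print("LIKWID version: " .. likwid.version .. ", release: " .. likwid.release)<BR>
+print("Performance group definitions: " .. likwid.groupfolder)<BR>
+print(likwid.hline)<BR>
+</CODE>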
+*/
+
+/*! \page lua_Config Config file module
+<H1>Data type definition for Lua config file module in the Lua API</H1>
+\anchor lua_config
+<H2>Config file read</H2>
+<P>This structure is returned by \ref getConfiguration function<BR>The config file can be created with \ref likwid-genTopoCfg executable. It searches the files /etc/likwid.cfg and <PREFIX>/etc/likwid.cfg. Other configuration file paths can be set in config.mk before building LIKWID.</P>
+<TABLE>
+<TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>\a configFile</TD>
+ <TD>Path to the config file</TD>
+</TR>
+<TR>
+ <TD>\a topologyFile</TD>
+ <TD>Path to the config file containing topology information</TD>
+</TR>
+<TR>
+ <TD>\a daemonPath</TD>
+ <TD>Path to the access daemon</TD>
+</TR>
+<TR>
+ <TD>\a daemonMode</TD>
+ <TD>Access mode for LIKWID (0 = direct access, 1 = access daemon)</TD>
+</TR>
+<TR>
+ <TD>\a maxNumThreads</TD>
+ <TD>Maximal amount of hardware threads in the system</TD>
+</TR>
+<TR>
+ <TD>\a maxNumNodes</TD>
+ <TD>Maximal amount of NUMA nodes in the system</TD>
+</TR>
+<TR>
+ <TD>\a maxHashTableSize</TD>
+ <TD>Maximal size for the internally used hash table</TD>
+</TR>
+</TABLE>
+
+<H1>Function definitions for Lua config file module in the Lua API</H1>
+\anchor getConfiguration
+<H2>getConfiguration()</H2>
+<P>Read the configuration file and return a list of config options</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>List of configuration options, see \ref lua_config</TD>
+</TR>
+</TABLE>
+
+\anchor setVerbosity
+<H2>setVerbosity(verbosity)</H2>
+<P>Define and/or change the verbosity level of LIKWID</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a verbosity</TD>
+ <TD>0 = only errors<BR>1 = infos<BR>2 = detail<BR>3 = developer<BR>Other flags are rejected.</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor putConfiguration
+<H2>putConfiguration()</H2>
+<P>Frees the C-structures that were created by \ref getConfiguration function.</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
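+
+<P>A short sketch combining the functions of this module, assuming the likwid module has been loaded with require as described on the Lua API information page:</P>
+<CODE>
+likwid.setVerbosity(0) -- only errors<BR>
+local config = likwid.getConfiguration()<BR>
+print("Access mode: " .. config["daemonMode"])<BR>
+print("Access daemon: " .. config["daemonPath"])<BR>
+print("Topology file: " .. config["topologyFile"])<BR>
+likwid.putConfiguration() -- free the underlying C structures<BR>
+</CODE>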
+
+*/
+
+/*! \page lua_Access Access client module
+<H1>Data type definition for Lua access client module in the Lua API</H1>
+<H1>Function definitions for Lua access client module in the Lua API</H1>
+\anchor setAccessMode
+<H2>setAccessMode(accessFlag)</H2>
+<P>Define and/or change the access mode to the MSR and PCI registers</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a accessFlag</TD>
+ <TD>0 = direct access<BR>1 = access daemon<BR>Other flags are rejected.</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Always 0</TD>
+</TR>
+</TABLE>
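+
+<P>A one-line sketch, assuming the likwid module has been loaded with require:</P>
+<CODE>
+likwid.setAccessMode(1) -- 1 selects the access daemon, 0 direct MSR/PCI access; the call always returns 0<BR>
+</CODE>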
+
+*/
+
+/*! \page lua_CPUTopology CPU information module
+<H1>Data type definition for CPU information module in the Lua API</H1>
+\anchor lua_cpuinfo
+<H2>Cpu Info</H2>
+<P>This structure is returned by \ref getCpuInfo function<BR>It is similar to the C struct CpuInfo</P>
+<TABLE>
+<TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>\a family</TD>
+ <TD>Family ID of CPU</TD>
+</TR>
+<TR>
+ <TD>\a model</TD>
+ <TD>Model ID of CPU</TD>
+</TR>
+<TR>
+ <TD>\a stepping</TD>
+ <TD>Revision of CPU</TD>
+</TR>
+<TR>
+ <TD>\a clock</TD>
+ <TD>Base clock frequency</TD>
+</TR>
+<TR>
+ <TD>\a turbo</TD>
+ <TD>Flag if the system supports the Turbo mode</TD>
+</TR>
+<TR>
+ <TD>\a name</TD>
+ <TD>Name of the microarchitecture</TD>
+</TR>
+<TR>
+ <TD>\a osname</TD>
+ <TD>Name of the CPU as given by manufacturer</TD>
+</TR>
+<TR>
+ <TD>\a short_name</TD>
+ <TD>Short name of microarchitecture</TD>
+</TR>
+<TR>
+ <TD>\a features</TD>
+ <TD>String with all interesting CPU feature flags as a space separated list</TD>
+</TR>
+<TR>
+ <TD>\a featureFlags</TD>
+ <TD>Bitmask with all interesting CPU feature flags<BR>Bit positions can be retrieved from the FeatureBit enum</TD>
+</TR>
+<TR>
+ <TD>\a isIntel</TD>
+ <TD>Flag to check if the system is using Intel CPUs</TD>
+</TR>
+<TR>
+ <TD>\a perf_version</TD>
+ <TD>Version of architectural performance monitoring capabilities</TD>
+</TR>
+<TR>
+ <TD>\a perf_num_ctr</TD>
+ <TD>Amount of core-local general-purpose counters</TD>
+</TR>
+<TR>
+ <TD>\a perf_num_fixed_ctr</TD>
+ <TD>Amount of core-local fixed-purpose counters</TD>
+</TR>
+<TR>
+ <TD>\a perf_width_ctr</TD>
+ <TD>Register width of core-local counters</TD>
+</TR>
+</TABLE>
+
+
+\anchor lua_cputopo
+<H2>Cpu Topology</H2>
+<P>This structure is returned by \ref getCpuTopology function<BR>The nested list structure is similar to the C struct CpuTopology.</P>
+<TABLE>
+<TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>\a numHWThreads</TD>
+ <TD>Total amount of hardware threads in the system</TD>
+</TR>
+<TR>
+ <TD>\a activeHWThreads</TD>
+ <TD>Amount of active hardware threads in the system</TD>
+</TR>
+<TR>
+ <TD>\a numSockets</TD>
+ <TD>Number of CPU sockets in the system</TD>
+</TR>
+<TR>
+ <TD>\a numCoresPerSocket</TD>
+ <TD>Number of physical cores of each socket in the system</TD>
+</TR>
+<TR>
+ <TD>\a numThreadsPerCore</TD>
+ <TD>Number of hardware threads of each core in the system</TD>
+</TR>
+<TR>
+ <TD>\a numCacheLevels</TD>
+ <TD>Amount of cache levels in the system</TD>
+</TR>
+<TR>
+ <TD>\a threadPool<BR>(List with<BR>\a numHWThreads entries)</TD>
+ <TD>
+ <TABLE>
+ <TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+ </TR>
+ <TR>
+ <TD>\a threadId</TD>
+ <TD>Thread ID</TD>
+ </TR>
+ <TR>
+ <TD>\a coreId</TD>
+ <TD>ID of physical CPU core</TD>
+ </TR>
+ <TR>
+ <TD>\a apicId</TD>
+ <TD>ID of the interrupt line for the hardware thread as defined by ACPI</TD>
+ </TR>
+ <TR>
+ <TD>\a packageId</TD>
+ <TD>ID of CPU socket for the current thread</TD>
+ </TR>
+ <TR>
+ <TD>\a inCpuSet</TD>
+ <TD>Defines whether the thread is available in current cpuset</TD>
+ </TR>
+ </TABLE>
+ </TD>
+</TR>
+<TR>
+ <TD>\a cacheLevels<BR>(List with<BR>\a numCacheLevels entries)</TD>
+ <TD>
+ <TABLE>
+ <TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+ </TR>
+ <TR>
+ <TD>\a level</TD>
+ <TD>Level of cache</TD>
+ </TR>
+ <TR>
+ <TD>\a associativity</TD>
+ <TD>Associativity in cache level</TD>
+ </TR>
+ <TR>
+ <TD>\a sets</TD>
+ <TD>Sets in cache level</TD>
+ </TR>
+ <TR>
+ <TD>\a lineSize</TD>
+ <TD>Size of a cache line in cache level</TD>
+ </TR>
+ <TR>
+ <TD>\a size</TD>
+ <TD>Size in bytes of cache level</TD>
+ </TR>
+ <TR>
+ <TD>\a threads</TD>
+ <TD>Amount of threads sharing the cache</TD>
+ </TR>
+ <TR>
+ <TD>\a inclusive</TD>
+ <TD>Inclusiveness of cache</TD>
+ </TR>
+ <TR>
+ <TD>\a type</TD>
+ <TD>
+ <TABLE>
+ <TR>
+ <TH>Typename</TH>
+ <TH>comment</TH>
+ </TR>
+ <TR>
+ <TD>DATACACHE</TD>
+ <TD>Cache manages only data</TD>
+ </TR>
+ <TR>
+ <TD>INSTRUCTIONCACHE</TD>
+ <TD>Cache manages only instructions</TD>
+ </TR>
+ <TR>
+ <TD>UNIFIEDCACHE</TD>
+ <TD>Cache manages data and instructions</TD>
+ </TR>
+ <TR>
+ <TD>ITLB</TD>
+ <TD>Translation Lookaside Buffer for instruction page addresses</TD>
+ </TR>
+ <TR>
+ <TD>DTLB</TD>
+ <TD>Translation Lookaside Buffer for data page addresses</TD>
+ </TR>
+ <TR>
+ <TD>NOCACHE</TD>
+ <TD>Type cannot be determined</TD>
+ </TR>
+ </TABLE>
+ </TD>
+ </TR>
+ </TABLE>
+ </TD>
+</TR>
+<TR>
+ <TD>\a topologyTree</TD>
+ <TD><TABLE>
+ <TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+ </TR>
+ <TR>
+ <TD>\a ID</TD>
+ <TD>ID of socket</TD>
+ </TR>
+ <TR>
+ <TD>\a Childs</TD>
+ <TD><TABLE>
+ <TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+ </TR>
+ <TR>
+ <TD>\a ID</TD>
+ <TD>ID of CPU core</TD>
+ </TR>
+ <TR>
+ <TD>\a Childs</TD>
+ <TD>List of thread IDs for the current CPU core</TD>
+ </TR>
+ </TABLE></TD>
+ </TABLE></TD>
+</TR>
+</TABLE>
+
+
+<H1>Function definitions for Lua CPU information module in the Lua API</H1>
+\anchor getCpuInfo
+<H2>getCpuInfo()</H2>
+<P>Get basic information about the CPUs in the system</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Cpu Info \ref lua_cpuinfo</TD>
+</TR>
+</TABLE>
+
+\anchor getCpuTopology
+<H2>getCpuTopology()</H2>
+<P>Get the topology information about the CPUs in the system</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Return</TD>
+ <TD>Cpu Topology \ref lua_cputopo</TD>
+</TR>
+</TABLE>
+
+<H2>putTopology()</H2>
+<P>Frees C struct CpuInfo and CpuTopology. You can still use the lua_cpuinfo and lua_cputopo data structures<BR>If you call \ref getCpuInfo or \ref getCpuTopology functions again after calling this function, the topology information will be read again.</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Return</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor cpustr_to_cpulist
+<H2>cpustr_to_cpulist(cpuexpression)</H2>
+<P>Resolve the given CPU expression string to a list of CPUs as available in the system</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a cpuexpression</TD>
+ <TD>CPU expression string. Look at \ref likwid-pin for possible formats</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Return</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a nrCPUs</TD>
+ <TD>Number of CPUs in the \a cpulist</TD>
+ </TR>
+ <TR>
+ <TD>\a cpulist</TD>
+ <TD>List containing the CPU IDs after resolution of the cpu expression</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+</TABLE>
+
+<H2>printSupportedCPUs()</H2>
+<P>Print all Intel and AMD CPU types that are supported by Likwid</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Return</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
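+
+<P>A short sketch combining the functions of this module, assuming the likwid module has been loaded with require. The CPU expression at the end is only an example and assumes a first socket with at least two CPUs:</P>
+<CODE>
+local cpuinfo = likwid.getCpuInfo()<BR>
+local cputopo = likwid.getCpuTopology()<BR>
+print(cpuinfo["osname"] .. " (" .. cpuinfo["short_name"] .. ")")<BR>
+print(cputopo["numSockets"] .. " sockets, " .. cputopo["numCoresPerSocket"] .. " cores per socket, " .. cputopo["numThreadsPerCore"] .. " threads per core")<BR>
+for i, t in pairs(cputopo["threadPool"]) do<BR>
+print("HWThread " .. t["threadId"] .. ": core " .. t["coreId"] .. ", socket " .. t["packageId"])<BR>
+end<BR>
+likwid.putTopology()<BR>
+local num, list = likwid.cpustr_to_cpulist("S0:0-1")<BR>
+print("Selected " .. num .. " CPUs")<BR>
+</CODE>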
+
+*/
+
+
+/*! \page lua_NumaInfo NUMA memory topology module
+
+<H1>Data type definition for Lua NUMA topology module in the Lua API</H1>
+\anchor lua_numainfo
+<H2>NUMA Info</H2>
+<P>This structure is returned by \ref getNumaInfo function<BR>It is similar to the C struct NumaTopology</P>
+<TABLE>
+<TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>\a numberOfNodes</TD>
+ <TD>Amount of NUMA nodes in the system</TD>
+</TR>
+<TR>
+ <TD>\a nodes</TD>
+ <TD><TABLE>
+ <TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+ </TR>
+ <TR>
+ <TD>id</TD>
+ <TD>ID of NUMA node</TD>
+ </TR>
+ <TR>
+ <TD>totalMemory</TD>
+ <TD>Total amount of memory in the NUMA domain</TD>
+ </TR>
+ <TR>
+ <TD>freeMemory</TD>
+ <TD>Free amount of memory in the NUMA domain</TD>
+ </TR>
+ <TR>
+ <TD>numberOfProcessors</TD>
+ <TD>Amount of CPUs in the NUMA domain</TD>
+ </TR>
+ <TR>
+ <TD>numberOfDistances</TD>
+ <TD>Amount of distances to local and remote NUMA nodes</TD>
+ </TR>
+ <TR>
+ <TD>processors</TD>
+ <TD>List of CPU IDs in the NUMA domain</TD>
+ </TR>
+ <TR>
+ <TD>distances</TD>
+ <TD>Two dimensional list of distances to NUMA nodes in the system</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+</TABLE>
+
+<H1>Function definitions for Lua NUMA topology module in the Lua API</H1>
+\anchor getNumaInfo
+<H2>getNumaInfo()</H2>
+<P>Get information about the NUMA domains in the system</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>NUMA Info \ref lua_numainfo</TD>
+</TR>
+</TABLE>
+
+
+<H2>putNumaInfo()</H2>
+<P>Frees C struct NumaTopology. You can still use the lua_numainfo data structure<BR>If you call \ref getNumaInfo function again after calling this function, the NUMA topology information will be read again.</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Return</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
+
+<H2>setMemInterleaved(nrThreads, threads2Cpus)</H2>
+<P>Set the 'Interleaved' memory policy to allocate data only on given CPUs</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a nrThreads</TD>
+ <TD>Amount of threads in the \a threads2Cpus list</TD>
+ </TR>
+ <TR>
+ <TD>\a threads2Cpus</TD>
+ <TD>List of thread to CPU relations</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Return</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
+
+<H2>nodestr_to_nodelist(nodeexpression)</H2>
+<P>Resolve the given node expression in NUMA affinity domain</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a nodeexpression</TD>
+ <TD>List of CPUs in NUMA node</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Return</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a nrThreads</TD>
+ <TD>Amount of threads in the \a threads2Cpus list</TD>
+ </TR>
+ <TR>
+ <TD>\a threads2Cpus</TD>
+ <TD>List of thread to CPU relations</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+</TABLE>
+
+<H2>sockstr_to_socklist(socketexpression)</H2>
+<P>Resolve the given socket expression in socket affinity domain</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a socketexpression</TD>
+ <TD>List of CPUs in socket affinity domain</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Return</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a nrThreads</TD>
+ <TD>Amount of threads in the \a threads2Cpus list</TD>
+ </TR>
+ <TR>
+ <TD>\a threads2Cpus</TD>
+ <TD>List of thread to CPU relations</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+</TABLE>
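+
+<P>A short sketch printing the memory situation of each NUMA domain, assuming the likwid module has been loaded with require:</P>
+<CODE>
+local numainfo = likwid.getNumaInfo()<BR>
+print("NUMA nodes: " .. numainfo["numberOfNodes"])<BR>
+for i, node in pairs(numainfo["nodes"]) do<BR>
+print("Node " .. node["id"] .. ": " .. node["numberOfProcessors"] .. " CPUs, free memory " .. node["freeMemory"] .. " of " .. node["totalMemory"])<BR>
+end<BR>
+likwid.putNumaInfo()<BR>
+</CODE>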
+
+*/
+
+/*! \page lua_AffinityInfo Thread affinity module
+
+<H1>Data type definition for Lua thread affinity module in the Lua API</H1>
+\anchor lua_affinityinfo
+<H2>Affinity Info</H2>
+<P>This structure is returned by \ref getAffinityInfo function<BR>It is similar to the C struct AffinityDomains</P>
+<TABLE>
+<TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>\a numberOfAffinityDomains</TD>
+ <TD>Total amount of affinity domains in the system</TD>
+</TR>
+<TR>
+ <TD>\a numberOfSocketDomains</TD>
+ <TD>Amount of affinity domains for CPU sockets in the system</TD>
+</TR>
+<TR>
+ <TD>\a numberOfNumaDomains</TD>
+ <TD>Amount of affinity domains for NUMA domains in the system</TD>
+</TR>
+<TR>
+ <TD>\a numberOfCacheDomains</TD>
+ <TD>Amount of affinity domains for LLC domains in the system</TD>
+</TR>
+<TR>
+ <TD>\a numberOfProcessorsPerSocket</TD>
+ <TD>Amount of hardware threads for each CPU socket in the system</TD>
+</TR>
+<TR>
+ <TD>\a numberOfCoresPerCache</TD>
+ <TD>Amount of physical CPU cores for each LLC in the system</TD>
+</TR>
+<TR>
+ <TD>\a numberOfProcessorsPerCache</TD>
+ <TD>Amount of hardware threads for each LLC in the system</TD>
+</TR>
+<TR>
+ <TD>\a domains</TD>
+ <TD><TABLE>
+ <TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+ </TR>
+ <TR>
+ <TD>tag</TD>
+ <TD>Tag identifying the affinity domain</TD>
+ </TR>
+ <TR>
+ <TD>numberOfCores</TD>
+ <TD>Amount of physical CPU cores in the affinity domain</TD>
+ </TR>
+ <TR>
+ <TD>numberOfProcessors</TD>
+ <TD>Amount of hardware threads in the affinity domain</TD>
+ </TR>
+ <TR>
+ <TD>processorList</TD>
+ <TD>List with hardware thread IDs that are in the affinity domain</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+</TABLE>
+<H1>Function definitions for Lua thread affinity module in the Lua API</H1>
+\anchor getAffinityInfo
+<H2>getAffinityInfo()</H2>
+<P>Get information about the affinity domains in the system</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Affinity Info \ref lua_affinityinfo</TD>
+</TR>
+</TABLE>
+<H2>putAffinityInfo()</H2>
+<P>Frees the C struct AffinityDomains. The \ref lua_affinityinfo data structure remains usable.<BR>If you call the \ref getAffinityInfo function again after calling this function, the thread affinity information is read again.</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
+\anchor pinProcess
+<H2>pinProcess(cpuID, silent)</H2>
+<P>Pins the current process to the given CPU ID</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a cpuID</TD>
+ <TD>CPU to pin the process on</TD>
+ </TR>
+ <TR>
+ <TD>\a silent</TD>
+ <TD>Verbosity of pinning method</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
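+
+<P>A short usage sketch (assuming the likwid Lua module was loaded with require("likwid"); the index-based access to \a domains and passing 1 as the second argument of \ref pinProcess to silence its output are assumptions):</P>
+<CODE>
+local likwid = require("likwid")<BR>
+local affinity = likwid.getAffinityInfo()<BR>
+-- list all affinity domains with their sizes<BR>
+for i=1,affinity["numberOfAffinityDomains"] do<BR>
+ local dom = affinity["domains"][i]<BR>
+ print(string.format("%s: %d hardware threads", dom["tag"], dom["numberOfProcessors"]))<BR>
+end<BR>
+-- pin the current process to the first CPU of the first domain<BR>
+likwid.pinProcess(affinity["domains"][1]["processorList"][1], 1)<BR>
+likwid.putAffinityInfo()<BR>
+</CODE>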
+*/
+
+
+/*! \page lua_Perfmon Performance monitoring module
+<H1>Data type definition for Lua performance monitoring module in the Lua API</H1>
+\anchor lua_counterinfo
+<H2>Event and Counter Info</H2>
+<P>This structure is returned by \ref getEventsAndCounters function</P>
+<TABLE>
+<TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>\a Counters</TD>
+ <TD><TABLE>
+ <TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+ </TR>
+ <TR>
+ <TD>Name</TD>
+ <TD>Counter name as used by LIKWID</TD>
+ </TR>
+ <TR>
+ <TD>Index</TD>
+ <TD>Index of counter definition in internal list of counters</TD>
+ </TR>
+ <TR>
+ <TD>Type</TD>
+ <TD>ID number of counter type, use TypeName to get a human-readable name</TD>
+ </TR>
+ <TR>
+ <TD>TypeName</TD>
+ <TD>Name of counter type</TD>
+ </TR>
+ <TR>
+ <TD>Options</TD>
+ <TD>String with the options available for the counter</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>\a Events</TD>
+ <TD><TABLE>
+ <TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+ </TR>
+ <TR>
+ <TD>Name</TD>
+ <TD>Event name as used by LIKWID</TD>
+ </TR>
+ <TR>
+ <TD>ID</TD>
+ <TD>Event ID as defined by CPU vendor</TD>
+ </TR>
+ <TR>
+ <TD>Umask</TD>
+ <TD>Umask further restricting the event defined by ID</TD>
+ </TR>
+ <TR>
+ <TD>Limit</TD>
+ <TD>String containing the name(s) of registers the event can be programmed on</TD>
+ </TR>
+ <TR>
+ <TD>Options</TD>
+ <TD>String with the options available for the event</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor lua_groupdata
+<H2>Info about a performance group</H2>
+<P>This structure is returned by \ref get_groupdata function</P>
+<TABLE>
+<TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>EventString</TD>
+ <TD>Event set used for the performance group. Well formatted for \ref addEventSet function</TD>
+</TR>
+<TR>
+ <TD>GroupString</TD>
+ <TD>Name of the performance group</TD>
+</TR>
+<TR>
+ <TD>LongDescription</TD>
+ <TD>Description of the group. The 'LONG' section in the performance group file</TD>
+</TR>
+<TR>
+ <TD>\a Events</TD>
+ <TD><TABLE>
+ <TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+ </TR>
+ <TR>
+ <TD>Event ID</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a Event</TD>
+ <TD>Name of event</TD>
+ </TR>
+ <TR>
+ <TD>\a Counter</TD>
+ <TD>LIKWID's name of the counter register</TD>
+ </TR>
+ </TABLE></TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>\a Metrics</TD>
+ <TD><TABLE>
+ <TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+ </TR>
+ <TR>
+ <TD>Metric ID</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a description</TD>
+ <TD>Descriptive information of the metric</TD>
+ </TR>
+ <TR>
+ <TD>\a formula</TD>
+ <TD>Formula for calculating the metrics value</TD>
+ </TR>
+ </TABLE></TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+</TABLE>
+
+
+\anchor lua_pcidevinfo
+<H2>Info about online PCI devices used for performance monitoring</H2>
+<P>This structure is returned by \ref getOnlineDevices function</P>
+<TABLE>
+<TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>\a Name (used by LIKWID)</TD>
+ <TD><TABLE>
+ <TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+ </TR>
+ <TR>
+ <TD>Name</TD>
+ <TD>Name of PCI device</TD>
+ </TR>
+ <TR>
+ <TD>Path</TD>
+ <TD>Path to PCI device</TD>
+ </TR>
+ <TR>
+ <TD>Type</TD>
+ <TD>Human-readable name of the PCI device type</TD>
+ </TR>
+ <TR>
+ <TD>TypeDescription</TD>
+ <TD>Description about the PCI device</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+</TABLE>
+
+<H1>Function definitions for Lua performance monitoring module in the Lua API</H1>
+\anchor init
+<H2>init(nrThreads, thread2Cpus)</H2>
+<P>Initializes the perfmon module of LIKWID, e.g. opens the MSR files and checks the PCI devices<BR>If in access daemon mode, a single daemon instance is started that performs the accesses for all given CPUs</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a nrThreads</TD>
+ <TD>Number of CPUs that should be measured</TD>
+ </TR>
+ <TR>
+ <TD>\a thread2Cpus</TD>
+ <TD>List with length \a nrThreads containing the relation between thread number and measured CPU</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor addEventSet
+<H2>addEventSet(eventSet)</H2>
+<P>Creates the internal management structures for the given event set. Checks the registers and, if needed, the PCI device access<BR>The \ref init function has to be called beforehand</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a eventSet</TD>
+ <TD>String composed of all events in the event set. Format is Event1:Counter1(:Option11:Options12:...),Event2:Counter2...</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>The group ID of the added event set</TD>
+</TR>
+</TABLE>
+
+
+\anchor setupCounters
+<H2>setupCounters(groupID)</H2>
+<P>Setup the config registers to measure the events defined by group</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a groupID</TD>
+ <TD>ID of group returned by \ref addEventSet function.</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor startCounters
+<H2>startCounters()</H2>
+<P>Starts the perfmon group previously set up with \ref setupCounters function.</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor stopCounters
+<H2>stopCounters()</H2>
+<P>Stops the perfmon group and reads the counters into the internal result section. Use the \ref getResult or \ref getResults functions to get the results.</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor readCounters
+<H2>readCounters()</H2>
+<P>Reads the perfmon group into the internal result section. Use the \ref getResult or \ref getResults functions to get the results.<BR>The counters will be stopped shortly and started after reading to exclude the LIKWID code from measurements.</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor switchGroup
+<H2>switchGroup(newgroup)</H2>
+<P>Switches the currently active group in the perfmon module. If the given group ID does not exist, it falls back to group ID 1.</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a newgroup</TD>
+ <TD>Switch active group to \a newgroup</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor finalize
+<H2>finalize()</H2>
+<P>Destroy internal structures and clean all used registers</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Always 0</TD>
+</TR>
+</TABLE>
+
+\anchor getResult
+<H2>getResult(groupID, eventID, threadID)</H2>
+<P>Get result for a group, event, thread combination. All options must be given</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a groupID</TD>
+ <TD>Return result from group defined by \a groupID</TD>
+ </TR>
+ <TR>
+ <TD>\a eventID</TD>
+ <TD>Return result for event with \a eventID. Position in string given to \ref addEventSet function</TD>
+ </TR>
+ <TR>
+ <TD>\a threadID</TD>
+ <TD>Return result for thread with \a threadID as defined by the \a thread2Cpus input parameter for \ref init function</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Result</TD>
+</TR>
+</TABLE>
+
+\anchor getResults
+<H2>getResults()</H2>
+<P>Get all results for all group, event, thread combinations</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Three-dimensional list with results. First dim. is groups, second dim. is events and third dim. are the threads</TD>
+</TR>
+</TABLE>
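+
+<P>The typical call sequence of the perfmon functions above is illustrated by the following sketch (compare the shipped examples/Lua-likwidAPI.lua). The CPU list and the event set string are only examples and have to match the measured system:</P>
+<CODE>
+local likwid = require("likwid")<BR>
+local cpus = {0, 1}<BR>
+if likwid.init(#cpus, cpus) ~= 0 then os.exit(1) end<BR>
+local gid = likwid.addEventSet("INSTR_RETIRED_ANY:FIXC0")<BR>
+likwid.setupCounters(gid)<BR>
+likwid.startCounters()<BR>
+-- run the code to be measured here<BR>
+likwid.stopCounters()<BR>
+for t=1,#cpus do<BR>
+ print(string.format("CPU %d: %f", cpus[t], likwid.getResult(gid, 1, t)))<BR>
+end<BR>
+likwid.finalize()<BR>
+</CODE>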
+
+\anchor getMarkerResults
+<H2>getMarkerResults(filename, group_list, num_cpus)</H2>
+<P>Get the results for an output file written by \ref MarkerAPI</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a filename</TD>
+ <TD>Filename written by \ref MarkerAPI</TD>
+ </TR>
+ <TR>
+ <TD>\a group_list</TD>
+ <TD>List of defined groups</TD>
+ </TR>
+ <TR>
+ <TD>\a num_cpus</TD>
+ <TD>Amount of defined CPUs. It is used only to check whether the \ref MarkerAPI run is valid. If LIKWID_MARKER_THREADINIT was not called properly, this check fails</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Four-dimensional list with results. First dim. is groups, second dim. is regions, third dim. is events and fourth dim. are the threads</TD>
+</TR>
+</TABLE>
+
+\anchor getEventsAndCounters
+<H2>getEventsAndCounters()</H2>
+<P>Get a list containing all event and counter definitions</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Event and counter info like \ref lua_counterinfo</TD>
+</TR>
+</TABLE>
+
+\anchor getOnlineDevices
+<H2>getOnlineDevices()</H2>
+<P>Get a list containing all online PCI devices</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>PCI device info like \ref lua_pcidevinfo</TD>
+</TR>
+</TABLE>
+
+\anchor getNumberOfGroups
+<H2>getNumberOfGroups()</H2>
+<P>Returns the number of event sets (groups) added to the perfmon module</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Amount of configured groups</TD>
+</TR>
+</TABLE>
+
+\anchor getIdOfActiveGroup
+<H2>getIdOfActiveGroup()</H2>
+<P>Returns the ID of the currently active group</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>ID of active group</TD>
+</TR>
+</TABLE>
+
+\anchor getRuntimeOfGroup
+<H2>getRuntimeOfGroup(groupID)</H2>
+<P>Returns the measurement time of the given groupID</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a groupID</TD>
+ <TD>Return the measurement time for group defined by \a groupID</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Measurement time of group</TD>
+</TR>
+</TABLE>
+
+\anchor getNumberOfEvents
+<H2>getNumberOfEvents(groupID)</H2>
+<P>Returns the amount of events for the given groupID</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a groupID</TD>
+ <TD>Return the amount of events for group defined by \a groupID</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Amount of events in group</TD>
+</TR>
+</TABLE>
+
+\anchor getNumberOfThreads
+<H2>getNumberOfThreads()</H2>
+<P>Returns the number of threads as given to \ref init function</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Amount of measurement threads</TD>
+</TR>
+</TABLE>
+
+\anchor get_groups
+<H2>get_groups()</H2>
+<P>Returns a list of all performance groups in \a groupfolder</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a numberOfGroups</TD>
+ <TD>Amount of groups in \a groupfolder for given \a architecture</TD>
+ </TR>
+ <TR>
+ <TD>\a groups</TD>
+ <TD>List with the names of all performance groups in \a groupfolder for given \a architecture</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor get_groupdata
+<H2>get_groupdata(group)</H2>
+<P>Read in the performance group \a group</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a group</TD>
+ <TD>Get group data for \a group </TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a groupdata</TD>
+ <TD>Structure with all group information found for the performance group \a group, see \ref lua_groupdata</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+</TABLE>
+
+*/
+
+/*! \page lua_PowerInfo Power and Energy monitoring module
+<H1>Data type definition for Lua power and energy monitoring module in the Lua API</H1>
+\anchor lua_powerinfo
+<H2>Power Information</H2>
+<P>This structure is returned by \ref getPowerInfo function<BR>The nested list structure is similar to the C struct PowerInfo.</P>
+<TABLE>
+<TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+</TR>
+<TR>
+ <TD>\a hasRAPL</TD>
+ <TD>If set, the system supports power readings through the RAPL interface</TD>
+</TR>
+<TR>
+ <TD>\a baseFrequency</TD>
+ <TD>Nominal clock frequency of the system</TD>
+</TR>
+<TR>
+ <TD>\a minFrequency</TD>
+ <TD>Minimal supported clock frequency of the system</TD>
+</TR>
+<TR>
+ <TD>\a powerUnit</TD>
+ <TD>Multiplier for power readings</TD>
+</TR>
+<TR>
+ <TD>\a timeUnit</TD>
+ <TD>Multiplier for time readings from RAPL</TD>
+</TR>
+<TR>
+ <TD>\a turbo</TD>
+ <TD>
+ <TABLE>
+ <TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+ </TR>
+ <TR>
+ <TD>\a numSteps</TD>
+ <TD>Amount of turbo mode steps</TD>
+ </TR>
+ <TR>
+ <TD>\a steps</TD>
+ <TD>List containing the turbo mode steps</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>\a domains</TD>
+ <TD>
+ <TABLE>
+ <TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+ </TR>
+ <TR>
+ <TD>\a RAPL domain</TD>
+ <TD>
+ <TABLE>
+ <TR>
+ <TH>Membername</TH>
+ <TH>Comment</TH>
+ </TR>
+ <TR>
+ <TD>ID</TD>
+ <TD>Type of domain (PKG, PP0, PP1, DRAM)</TD>
+ </TR>
+ <TR>
+ <TD>energyUnit</TD>
+ <TD>Multiplier for energy readings for RAPL domain</TD>
+ </TR>
+ <TR>
+ <TD>supportStatus</TD>
+ <TD>RAPL domain has a status register to read energy values</TD>
+ </TR>
+ <TR>
+ <TD>supportPerf</TD>
+ <TD>RAPL domain has a perf register</TD>
+ </TR>
+ <TR>
+ <TD>supportPolicy</TD>
+ <TD>RAPL domain has a policy register to define a global energy policy</TD>
+ </TR>
+ <TR>
+ <TD>supportLimit</TD>
+ <TD>RAPL domain has a policy register to define a limit for the energy consumption</TD>
+ </TR>
+ <TR>
+ <TD>supportInfo</TD>
+ <TD>RAPL domain has an info register providing TDP, minimal/maximal power and the maximal time window</TD>
+ </TR>
+ <TR>
+ <TD>tdp</TD>
+ <TD>Thermal Design Power<BR>Only if supportInfo is set</TD>
+ </TR>
+ <TR>
+ <TD>minPower</TD>
+ <TD>Minimal power consumption for the RAPL domain<BR>Only if supportInfo is set</TD>
+ </TR>
+ <TR>
+ <TD>maxPower</TD>
+ <TD>Maximal power consumption for the RAPL domain<BR>Only if supportInfo is set</TD>
+ </TR>
+ <TR>
+ <TD>maxTimeWindow</TD>
+ <TD>Maximal duration between updates of the RAPL status registers<BR>Only if supportInfo is set</TD>
+ </TR>
+ </TABLE>
+ </TD>
+ </TR>
+ </TABLE>
+ </TD>
+</TR>
+</TABLE>
+<H1>Function definitions for Lua power and energy monitoring module in the Lua API</H1>
+\anchor getPowerInfo
+<H2>getPowerInfo()</H2>
+<P>Get information about the RAPL interface in the system</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Power Info \ref lua_powerinfo</TD>
+</TR>
+</TABLE>
+\anchor putPowerInfo
+<H2>putPowerInfo()</H2>
+<P>Frees the C struct PowerInfo. The \ref lua_powerinfo data structure remains usable.<BR>If you call the \ref getPowerInfo function again after calling this function, the power information struct is filled again.</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor startPower
+<H2>startPower(cpuID, domainID)</H2>
+<P>Start measuring given RAPL domain on given CPU</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a cpuID</TD>
+ <TD>Start the power measurement on CPU \a cpuID</TD>
+ </TR>
+ <TR>
+ <TD>\a domainID</TD>
+ <TD>Start the power measurement for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Power value at start</TD>
+</TR>
+</TABLE>
+
+\anchor stopPower
+<H2>stopPower(cpuID, domainID)</H2>
+<P>Stop measuring given RAPL domain on given CPU</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a cpuID</TD>
+ <TD>Stop the power measurement on CPU \a cpuID</TD>
+ </TR>
+ <TR>
+ <TD>\a domainID</TD>
+ <TD>Stop the power measurement for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Power value at stop</TD>
+</TR>
+</TABLE>
+
+
+\anchor printEnergy
+<H2>printEnergy(before, after, domainID)</H2>
+<P>Calculate and print the energy consumed between the measurements started with \ref startPower and stopped with \ref stopPower for the given RAPL domain</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a before</TD>
+ <TD>Result from \ref startPower function</TD>
+ </TR>
+ <TR>
+ <TD>\a after</TD>
+ <TD>Result from \ref stopPower function</TD>
+ </TR>
+ <TR>
+ <TD>\a domainID</TD>
+ <TD>Print the power result for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Power value at stop</TD>
+</TR>
+</TABLE>
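+
+<P>A minimal measurement sketch for a RAPL-capable system (assuming the likwid Lua module was loaded with require("likwid"); CPU 0 and the PKG domain are just examples):</P>
+<CODE>
+local likwid = require("likwid")<BR>
+local power = likwid.getPowerInfo()<BR>
+local before = likwid.startPower(0, 0) -- CPU 0, domain 0 = PKG<BR>
+-- run the code to be measured here<BR>
+local after = likwid.stopPower(0, 0)<BR>
+likwid.printEnergy(before, after, 0)<BR>
+likwid.putPowerInfo()<BR>
+</CODE>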
+
+\anchor limitGet
+<H2>limitGet(cpuID, domainID) (EXPERIMENTAL)</H2>
+<P>Get the current limit in the limit register of domain. The limit is defined as maximal power consumption in a time window</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a cpuID</TD>
+ <TD>Get limit for CPU \a cpuID</TD>
+ </TR>
+ <TR>
+ <TD>\a domainID</TD>
+ <TD>Get limit for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a power</TD>
+ <TD>Power limit value</TD>
+ </TR>
+ <TR>
+ <TD>\a time</TD>
+ <TD>Duration of time window</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+</TABLE>
+
+
+\anchor limitSet
+<H2>limitSet(cpuID, domainID, power, time, clamp) (EXPERIMENTAL)</H2>
+<P>Set the limit register of the given RAPL domain. The limit is defined as maximal power consumption in a time window</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a cpuID</TD>
+ <TD>Set limit for CPU \a cpuID</TD>
+ </TR>
+ <TR>
+ <TD>\a domainID</TD>
+ <TD>Set limit for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+ </TR>
+ <TR>
+ <TD>\a power</TD>
+ <TD>Set power value to \a power</TD>
+ </TR>
+ <TR>
+ <TD>\a time</TD>
+ <TD>Set time window value to \a time</TD>
+ </TR>
+ <TR>
+ <TD>\a clamp</TD>
+ <TD>Whether the limit should be clamped, i.e. whether the consumption may temporarily exceed the power limit as long as the limit is met on average over the time window</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor limitState
+<H2>limitState(cpuID, domainID) (EXPERIMENTAL)</H2>
+<P>Get the state of the limit</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a cpuID</TD>
+ <TD>Get the state on CPU \a cpuID</TD>
+ </TR>
+ <TR>
+ <TD>\a domainID</TD>
+ <TD>Get the state for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>State, 0 for off, 1 for on</TD>
+</TR>
+</TABLE>
+*/
+
+/*! \page lua_ThermalInfo Thermal monitoring module
+<H1>Data type definition for Lua thermal monitoring module in the Lua API</H1>
+<H1>Function definitions for Lua thermal monitoring module in the Lua API</H1>
+\anchor initTemp
+<H2>initTemp(cpuID)</H2>
+<P>Initialize the thermal measurements on given CPU</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a cpuID</TD>
+ <TD>Initialize thermal readings on CPU \a cpuID</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor readTemp
+<H2>readTemp(cpuID)</H2>
+<P>Measure the temperature on given CPU</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a cpuID</TD>
+ <TD>Read the temperature on CPU \a cpuID</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Temperature</TD>
+</TR>
+</TABLE>
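+
+<P>A minimal usage sketch (CPU 0 is just an example):</P>
+<CODE>
+local likwid = require("likwid")<BR>
+likwid.initTemp(0)<BR>
+print("Temperature on CPU 0: " .. tostring(likwid.readTemp(0)))<BR>
+</CODE>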
+*/
+
+/*! \page lua_Timer Time measurement module
+<H1>Data type definition for Lua time measurement module in the Lua API</H1>
+<H1>Function definitions for Lua time measurement module in the Lua API</H1>
+\anchor getCpuClock
+<H2>getCpuClock()</H2>
+<P>Returns the nominal clock speed</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Clock speed in Hz</TD>
+</TR>
+</TABLE>
+
+\anchor startClock
+<H2>startClock()</H2>
+<P>Start the TSC clock</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Current timestamp</TD>
+</TR>
+</TABLE>
+
+\anchor stopClock
+<H2>stopClock()</H2>
+<P>Stop the TSC clock</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Current timestamp</TD>
+</TR>
+</TABLE>
+
+\anchor getClockCycles
+<H2>getClockCycles(start, stop)</H2>
+<P>Return the amount of cycles between start and stop timestamps</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a start</TD>
+ <TD>Start timestamp</TD>
+ </TR>
+ <TR>
+ <TD>\a stop</TD>
+ <TD>Stop timestamp</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Amount of cycles between start and stop</TD>
+</TR>
+</TABLE>
+
+\anchor getClock
+<H2>getClock(start, stop)</H2>
+<P>Return the time in seconds between start and stop timestamps</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a start</TD>
+ <TD>Start timestamp</TD>
+ </TR>
+ <TR>
+ <TD>\a stop</TD>
+ <TD>Stop timestamp</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Time in seconds between start and stop</TD>
+</TR>
+</TABLE>
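+
+<P>A minimal timing sketch using the functions above:</P>
+<CODE>
+local likwid = require("likwid")<BR>
+local start = likwid.startClock()<BR>
+-- run the code to be timed here<BR>
+local stop = likwid.stopClock()<BR>
+print(string.format("%d cycles, %f s", likwid.getClockCycles(start, stop), likwid.getClock(start, stop)))<BR>
+</CODE>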
+
+\anchor sleep
+<H2>sleep(usecs)</H2>
+<P>Sleep for specified amount of microseconds</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a usecs</TD>
+ <TD>Sleep for \a usecs microseconds</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>Remaining time to sleep. >0 if sleep is interrupted</TD>
+</TR>
+</TABLE>
+
+
+*/
+
+/*! \page lua_MemSweep Memory sweeping module
+<H1>Data type definition for Lua memory sweeping module in the Lua API</H1>
+<H1>Function definitions for Lua memory sweeping module in the Lua API</H1>
+\anchor memSweep
+<H2>memSweep(nrThreads, Cpus)</H2>
+<P>Sweep the memory and LLC for given threads</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a nrThreads</TD>
+ <TD>Amount of threads in the \a threads2Cpus list</TD>
+ </TR>
+ <TR>
+ <TD>\a Cpus</TD>
+ <TD>List with thread to CPU relations</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
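+
+<P>A minimal usage sketch (the CPU list is just an example):</P>
+<CODE>
+local likwid = require("likwid")<BR>
+local cpus = {0, 1, 2, 3}<BR>
+likwid.memSweep(#cpus, cpus)<BR>
+</CODE>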
+
+\anchor memSweepDomain
+<H2>memSweepDomain(domainID)</H2>
+<P>Sweep the memory and LLC for a given NUMA domain</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a domainID</TD>
+ <TD>Sweep the memory and LLC at the NUMA domain specified by \a domainID</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
+*/
+
+/*! \page lua_Misc Miscellaneous functions module
+<H1>Data type definition for Lua miscellaneous functions module in the Lua API</H1>
+<H1>Function definitions for Lua miscellaneous functions module in the Lua API</H1>
+\anchor startProgram
+<H2>startProgram(Exec)</H2>
+<P>Start an executable in a new thread</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a Exec</TD>
+ <TD>String containing the executable and its options</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>PID of newly created thread</TD>
+</TR>
+</TABLE>
+
+\anchor checkProgram
+<H2>checkProgram()</H2>
+<P>Check if the executable is running</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>True/False</TD>
+</TR>
+</TABLE>
+
+\anchor killProgram
+<H2>killProgram(PID)</H2>
+<P>Kill the executable with SIGTERM</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a PID</TD>
+ <TD>PID to send the SIGTERM signal</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
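+
+<P>A sketch of the process control functions above (the executable string is hypothetical):</P>
+<CODE>
+local likwid = require("likwid")<BR>
+local pid = likwid.startProgram("./a.out")<BR>
+while likwid.checkProgram() do<BR>
+ likwid.sleep(1000000) -- poll once per second, sleep takes microseconds<BR>
+end<BR>
+-- likwid.killProgram(pid) -- would terminate the program early with SIGTERM<BR>
+</CODE>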
+
+
+\anchor setenv
+<H2>setenv(Name, Value)</H2>
+<P>Set environment variable. Lua only provides getenv()</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a Name</TD>
+ <TD>Name of environment variable</TD>
+ </TR>
+ <TR>
+ <TD>\a Value</TD>
+ <TD>Value for the environment variable</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor getpid
+<H2>getpid()</H2>
+<P>Get the PID of the current process</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>PID number</TD>
+</TR>
+</TABLE>
+
+\anchor access
+<H2>access(Filepath, perm)</H2>
+<P>Check whether the given filepath exists and has the requested permission</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a Filepath</TD>
+ <TD>Name of Filepath to check</TD>
+ </TR>
+ <TR>
+ <TD>\a perm</TD>
+ <TD>Check for specified attribute<BR>r: read, w: write, x: executable, e: existence</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>0 if the check was successful, otherwise -1</TD>
+</TR>
+</TABLE>
+
+\anchor msr_available
+<H2>msr_available()</H2>
+<P>Check whether the msr files are available. Basically checks whether the msr kernel module is loaded properly</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>True/False</TD>
+</TR>
+</TABLE>
+
+\anchor gethostname
+<H2>gethostname()</H2>
+<P>Returns the hostname of the system in short format</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a Hostname</TD>
+ <TD>Hostname in short format</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor getjid
+<H2>getjid()</H2>
+<P>Returns the job ID if running in a batch environment. Basically reads the <CODE>PBS_JOBID</CODE> environment variable</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a Job ID</TD>
+ <TD>Job ID or 'X' if not in batch environment</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor getMPIrank
+<H2>getMPIrank()</H2>
+<P>Returns the MPI rank of the current process. Basically reads the <CODE>PMI_RANK</CODE> and <CODE>OMPI_COMM_WORLD_RANK</CODE> environment variables</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD>None</TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a MPI Rank</TD>
+ <TD>MPI rank or 'X' if not in MPI environment</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+</TABLE>
+*/
+
+
+/*! \page lua_InputOutput Input and output functions module
+<H1>Data type definition for Lua output functions module in the Lua API</H1>
+<H1>Function definitions for Lua output functions module in the Lua API</H1>
+\anchor getopt
+<H2>getopt(commandline, optionlist)</H2>
+<P>Read the commandline parameters and match them against the given option list. The version LIKWID uses was originally taken from the web and extended to handle short options ('-o') as well as long options ("--option"). It returns an iterator over the commandline options.<BR>Basic usage:<BR></P>
+<CODE>
+for opt,arg in likwid.getopt(arg, {"n:","h"}) do<BR>
+ if (type(arg) == "string") then<BR>
+ local s,e = arg:find("-")<BR>
+ if s == 1 then<BR>
+ print(string.format("ERROR: Argmument %s to option -%s starts with invalid character -.", arg, opt))<BR>
+ print("ERROR: Did you forget an argument to an option?")<BR>
+ os.exit(1)<BR>
+ end<BR>
+ end<BR>
+ --parse options<BR>
+end<BR>
+</CODE><BR>
+The option 'n' takes an argument, indicated by the ':'. For options without ':' like 'h', the returned argument is simply true when the option is found. The type check on the argument is recommended to catch the case where an option that expects an argument is given without one.
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a commandline</TD>
+ <TD>Normally, Lua saves the commandline parameters in variable 'arg'</TD>
+ </TR>
+ <TR>
+ <TD>\a optionlist</TD>
+ <TD>List of options that should be recognized. Options with ':' as last character need an argument<BR>Example {"h","v","cpu:"}</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a option</TD>
+ <TD>Option string found on the commandline without leading '-'</TD>
+ </TR>
+ <TR>
+ <TD>\a argument</TD>
+ <TD>Argument to the \a option. If \a option does not require an argument, true or false is returned in \a argument</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor parse_time
+<H2>parse_time(timestr)</H2>
+<P>Parses strings describing a time interval, like 2s, 100ms or 250us</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a timestr</TD>
+ <TD>String describing a time interval</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a duration</TD>
+ <TD>Time string \a timestr resolved to usecs</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+</TABLE>
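+
+<P>Example: a string like "100ms" resolves to 100000 microseconds and can be passed directly to \ref sleep:</P>
+<CODE>
+local duration = likwid.parse_time("100ms") -- 100000 usecs<BR>
+likwid.sleep(duration)<BR>
+</CODE>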
+
+\anchor printtable
+<H2>printtable(table)</H2>
+<P>Prints the given two-dimensional table as a fancy ASCII table. For CSV output use \ref printcsv</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a table</TD>
+ <TD>Two dimensional list with table entries. First dim. are columns and second dim. the lines</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor printcsv
+<H2>printcsv(table)</H2>
+<P>Prints the given two dimensional table in CSV format. For ASCII table output see \ref printtable</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a table</TD>
+ <TD>Two dimensional list with table entries. First dim. are columns and second dim. the lines</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
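+
+<P>A small sketch feeding the same column-wise table to both output functions, following the signatures documented above (the table content is just an example):</P>
+<CODE>
+local tab = {}<BR>
+tab[1] = {"Event", "INSTR_RETIRED_ANY"} -- first column, one entry per line<BR>
+tab[2] = {"Counter", "FIXC0"} -- second column<BR>
+likwid.printtable(tab)<BR>
+likwid.printcsv(tab)<BR>
+</CODE>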
+
+\anchor stringsplit
+<H2>stringsplit(str, sSeparator[, nMax[, bRegexp]])</H2>
+<P>Splits the given string at separating character</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a str</TD>
+ <TD>String to split</TD>
+ </TR>
+ <TR>
+ <TD>\a sSeparator</TD>
+ <TD>String with separating character</TD>
+ </TR>
+ <TR>
+ <TD>\a nMax</TD>
+ <TD>Split string maximally \a nMax times (optional)</TD>
+ </TR>
+ <TR>
+ <TD>\a bRegexp</TD>
+ <TD>Lua RegEx string for separation (optional)</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>List of the parts of \a str split at \a sSeparator or \a bRegexp</TD>
+</TR>
+</TABLE>
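+
+<P>Example:</P>
+<CODE>
+local parts = likwid.stringsplit("0,4,8,12", ",")<BR>
+for _, p in pairs(parts) do print(p) end -- prints 0, 4, 8 and 12 on separate lines<BR>
+</CODE>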
+
+\anchor printOutput
+<H2>printOutput(groups, results, groupData, cpulist)</H2>
+<P>Prints results</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a groups</TD>
+ <TD>List of groups for printing</TD>
+ </TR>
+ <TR>
+ <TD>\a results</TD>
+ <TD>List of results as returned by \ref getResults function</TD>
+ </TR>
+ <TR>
+ <TD>\a groupData</TD>
+ <TD>List of group data structures</TD>
+ </TR>
+ <TR>
+ <TD>\a cpulist</TD>
+ <TD>List of thread ID to CPU ID relations</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor print_markerOutput
+<H2>print_markerOutput(groups, results, groupData, cpulist)</H2>
+<P>Prints the results of a Marker API run. This differs from \ref printOutput because the measurement regions have to be resolved</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a groups</TD>
+ <TD>List of groups for printing</TD>
+ </TR>
+ <TR>
+ <TD>\a results</TD>
+ <TD>List of results as returned by \ref getMarkerResults function</TD>
+ </TR>
+ <TR>
+ <TD>\a groupData</TD>
+ <TD>List of group data structures</TD>
+ </TR>
+ <TR>
+ <TD>\a cpulist</TD>
+ <TD>List of thread ID to CPU ID relations</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
+
+
+\anchor addSimpleAsciiBox
+<H2>addSimpleAsciiBox(container, lineIdx, colIdx, label)</H2>
+<P>Add a simple ASCII box with given label to box container. This function is only used by \ref likwid-topology</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a container</TD>
+ <TD>Box container containing all boxes</TD>
+ </TR>
+ <TR>
+ <TD>\a lineIdx</TD>
+ <TD>Add box at line index \a lineIdx</TD>
+ </TR>
+ <TR>
+ <TD>\a colIdx</TD>
+ <TD>Add box at column index \a colIdx</TD>
+ </TR>
+ <TR>
+ <TD>\a label</TD>
+ <TD>Content of the box</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor addJoinedAsciiBox
+<H2>addJoinedAsciiBox(container, lineIdx, startColIdx, endColIdx, label)</H2>
+<P>Add a joined ASCII box with given label to box container. Joined boxes can span the space of multiple simple boxes. This function is only used by \ref likwid-topology</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a container</TD>
+ <TD>Box container containing all boxes</TD>
+ </TR>
+ <TR>
+ <TD>\a lineIdx</TD>
+ <TD>Add box at line index \a lineIdx</TD>
+ </TR>
+ <TR>
+ <TD>\a startColIdx</TD>
+ <TD>Start joined box at column index \a startColIdx</TD>
+ </TR>
+ <TR>
+ <TD>\a endColIdx</TD>
+ <TD>End joined box at column index \a endColIdx</TD>
+ </TR>
+ <TR>
+ <TD>\a label</TD>
+ <TD>Content of the box</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor printAsciiBox
+<H2>printAsciiBox(container)</H2>
+<P>Print the box container previously filled with \ref addSimpleAsciiBox and \ref addJoinedAsciiBox. This function is only used by \ref likwid-topology</P>
+<TABLE>
+<TR>
+ <TH>Direction</TH>
+ <TH>Data type(s)</TH>
+</TR>
+<TR>
+ <TD>Input Parameter</TD>
+ <TD><TABLE>
+ <TR>
+ <TD>\a container</TD>
+ <TD>Box container containing all boxes</TD>
+ </TR>
+ </TABLE></TD>
+</TR>
+<TR>
+ <TD>Returns</TD>
+ <TD>None</TD>
+</TR>
+</TABLE>
+*/
diff --git a/examples/C-internalMarkerAPI.c b/examples/C-internalMarkerAPI.c
new file mode 100644
index 0000000..b5a0c4f
--- /dev/null
+++ b/examples/C-internalMarkerAPI.c
@@ -0,0 +1,152 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+#include <omp.h>
+
+#include <likwid.h>
+
+
+void dummy()
+{
+ ;;
+}
+
+int main(int argc, char* argv[])
+{
+ int i, k;
+ char group[] = "L3";
+ int gid = 0;
+ char cpulist[] = "0,1,2";
+ int cpus[3] = {0,1,2};
+ char filepath[] = "/tmp/test-marker.out";
+ char accessmode[] = "1";
+ double *A, *B;
+ size_t asize = 1024*1024;
+
+
+ setenv("LIKWID_EVENTS", group, 1);
+ setenv("LIKWID_THREADS", cpulist, 1);
+ setenv("LIKWID_FILEPATH", filepath, 1);
+ setenv("LIKWID_MODE", accessmode, 1);
+ /* If the NMI watchdog is enabled or the application does not call
+ * perfmon_finalize(), e.g. because of some error, LIKWID will fail with
+ * a message "Counter in use". By settings LIKWID_FORCE you can overwrite
+ * the registers.
+ */
+ //setenv("LIKWID_FORCE", "1", 1);
+
+ A = malloc(asize * sizeof(double));
+ if (A==NULL)
+ return 1;
+ B = malloc(asize * sizeof(double));
+ if (B==NULL)
+ {
+ free(A);
+ return 1;
+ }
+ for (i=0; i<asize;i++)
+ B[i] = ((double)i)+1.5;
+
+ /* This is only for demonstration. If your application already pins its
+ * threads, you don't need this
+ */
+#pragma omp parallel
+{
+ likwid_pinThread(cpus[omp_get_thread_num()]);
+}
+
+ /* Calls perfmon_init() and perfmon_addEventSet */
+ LIKWID_MARKER_INIT;
+ /* Setup and start manually. We use group ID 0, we can switch later */
+ perfmon_setupCounters(0);
+ perfmon_startCounters();
+
+ printf("Getting results during the measurements with LIKWID_MARKER_GET\n");
+#pragma omp parallel private(k,i)
+{
+ int nr_events = 20;
+ double time = 0;
+ int count = 0;
+ double *events = malloc(nr_events * sizeof(double));
+ memset(events, 0, nr_events * sizeof(double));
+ LIKWID_MARKER_START("Total");
+ for (k=0; k<10; k++)
+ {
+
+ LIKWID_MARKER_START("Calc1");
+#pragma omp for
+ for (i=0; i< asize; i++)
+ A[i] = B[i];
+ if (A[i] < 0) dummy();
+ LIKWID_MARKER_STOP("Calc1");
+ }
+ LIKWID_MARKER_GET("Calc1", &nr_events, events, &time, &count);
+ printf("Calc1 Thread %d got %d events, runtime %f s, call count %d\n", omp_get_thread_num(), nr_events, time, count);
+ nr_events = 20;
+ memset(events, 0, nr_events * sizeof(double));
+ for (k=0; k<10; k++)
+ {
+ LIKWID_MARKER_START("Calc2");
+#pragma omp for
+ for (i=0; i< asize; i++)
+ A[i] = A[i] + B[i];
+ if (A[i] < 0) dummy();
+ LIKWID_MARKER_STOP("Calc2");
+ }
+ LIKWID_MARKER_STOP("Total");
+ LIKWID_MARKER_GET("Calc2", &nr_events, events, &time, &count);
+ printf("Calc2 Thread %d got %d events, runtime %f s, call count %d\n", omp_get_thread_num(), nr_events, time, count);
+ nr_events = 20;
+ memset(events, 0, nr_events * sizeof(double));
+ LIKWID_MARKER_GET("Total", &nr_events, events, &time, &count);
+ printf("Total Thread %d got %d events, runtime %f s, call count %d\n", omp_get_thread_num(), nr_events, time, count);
+ free(events);
+}
+
+
+
+ perfmon_stopCounters();
+ LIKWID_MARKER_CLOSE;
+
+
+
+ perfmon_readMarkerFile(filepath);
+ printf("\nMarker API measured %d regions\n", perfmon_getNumberOfRegions());
+ for (i=0;i<perfmon_getNumberOfRegions();i++)
+ {
+ gid = perfmon_getGroupOfRegion(i);
+ printf("Region %s with %d events and %d metrics\n",perfmon_getTagOfRegion(i),
+ perfmon_getEventsOfRegion(i),
+ perfmon_getMetricsOfRegion(i));
+ }
+ printf("\nExample metrics output for thread 0\n");
+
+
+ for (i=0;i<perfmon_getNumberOfRegions();i++)
+ {
+ printf("Region %s\n", perfmon_getTagOfRegion(i));
+ for (k=0;k<perfmon_getEventsOfRegion(i);k++)
+ printf("Event %s:%s: %f\n", perfmon_getEventName(gid, k),
+ perfmon_getCounterName(gid, k),
+ perfmon_getResultOfRegionThread(i, k, 0));
+ for (k=0;k<perfmon_getNumberOfMetrics(gid);k++)
+ printf("Metric %s: %f\n", perfmon_getMetricName(gid, k),
+ perfmon_getMetricOfRegionThread(i, k, 0));
+ printf("\n");
+ }
+ remove(filepath);
+
+ /* Reinitialize access to HPM registers, LIKWID_MARKER_CLOSE closed the connection */
+ HPMinit();
+ for (i=0;i<3; i++)
+ HPMaddThread(cpus[i]);
+ /* Finalize perfmon sets all used counters to zero and deletes marker results, so no
+ perfmon_destroyMarkerResults() required */
+ perfmon_finalize();
+ HPMfinalize();
+ free(A);
+ free(B);
+ return 0;
+
+}
diff --git a/examples/C-likwidAPI.c b/examples/C-likwidAPI.c
new file mode 100644
index 0000000..aa6ed4e
--- /dev/null
+++ b/examples/C-likwidAPI.c
@@ -0,0 +1,149 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: C-likwidAPI.c
+ *
+ * Description: Example how to use the LIKWID API in C/C++ applications
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+
+#include <likwid.h>
+
+
+int main(int argc, char* argv[])
+{
+ int i, j;
+ int err;
+ int* cpus;
+ int gid;
+ double result = 0.0;
+ char estr[] = "L2_LINES_IN_ALL:PMC0,L2_TRANS_L2_WB:PMC1";
+ //perfmon_setVerbosity(3);
+ // Load the topology module and print some values.
+ err = topology_init();
+ if (err < 0)
+ {
+ printf("Failed to initialize LIKWID's topology module\n");
+ return 1;
+ }
+ // CpuInfo_t contains global information like name, CPU family, ...
+ CpuInfo_t info = get_cpuInfo();
+ // CpuTopology_t contains information about the topology of the CPUs.
+ CpuTopology_t topo = get_cpuTopology();
+ // Create affinity domains. Commonly only needed when reading Uncore counters
+ affinity_init();
+
+ printf("Likwid example on a %s with %d CPUs\n", info->name, topo->numHWThreads);
+
+ cpus = (int*)malloc(topo->numHWThreads * sizeof(int));
+ if (!cpus)
+ return 1;
+
+ for (i=0;i<topo->numHWThreads;i++)
+ {
+ cpus[i] = topo->threadPool[i].apicId;
+ }
+
+ // Must be called before perfmon_init() but only if you want to use another
+ // access mode than the pre-configured one. For direct access (0) you have to
+ // be root.
+ //accessClient_setaccessmode(0);
+
+ // Initialize the perfmon module.
+ err = perfmon_init(topo->numHWThreads, cpus);
+ if (err < 0)
+ {
+ printf("Failed to initialize LIKWID's performance monitoring module\n");
+ topology_finalize();
+ return 1;
+ }
+
+ // Add eventset string to the perfmon module.
+ gid = perfmon_addEventSet(estr);
+ if (gid < 0)
+ {
+ printf("Failed to add event string %s to LIKWID's performance monitoring module\n", estr);
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+ }
+
+ // Setup the eventset identified by group ID (gid).
+ err = perfmon_setupCounters(gid);
+ if (err < 0)
+ {
+ printf("Failed to setup group %d in LIKWID's performance monitoring module\n", gid);
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+ }
+ // Start all counters in the previously set up event set.
+ err = perfmon_startCounters();
+ if (err < 0)
+ {
+ printf("Failed to start counters for group %d for thread %d\n",gid, (-1*err)-1);
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+ }
+ // Perform something
+ sleep(10);
+ // Stop all counters in the previously started event set.
+ err = perfmon_stopCounters();
+ if (err < 0)
+ {
+ printf("Failed to stop counters for group %d for thread %d\n",gid, (-1*err)-1);
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+ }
+
+
+ // Print the result of every thread/CPU for all events in estr.
+ char* ptr = strtok(estr,",");
+ j = 0;
+ while (ptr != NULL)
+ {
+ for (i = 0;i < topo->numHWThreads; i++)
+ {
+ result = perfmon_getResult(gid, j, i);
+ printf("Measurement result for event set %s at CPU %d: %f\n", ptr, cpus[i], result);
+ }
+ ptr = strtok(NULL,",");
+ j++;
+ }
+
+
+ free(cpus);
+ // Uninitialize the perfmon module.
+ perfmon_finalize();
+ affinity_finalize();
+ // Uninitialize the topology module.
+ topology_finalize();
+ return 0;
+}
diff --git a/examples/C-markerAPI.c b/examples/C-markerAPI.c
new file mode 100644
index 0000000..84f97a4
--- /dev/null
+++ b/examples/C-markerAPI.c
@@ -0,0 +1,87 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: C-markerAPI.c
+ *
+ * Description: Example how to use the C/C++ Marker API
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <omp.h>
+#include <likwid.h>
+
+#define SLEEPTIME 2
+
+int main(int argc, char* argv[])
+{
+ int i;
+ int nevents = 10;
+ double events[10];
+ double time;
+ int count;
+ // Init Marker API in serial region once in the beginning
+ LIKWID_MARKER_INIT;
+ #pragma omp parallel
+ {
+ // Each thread must add itself to the Marker API, therefore must be
+ // in parallel region
+ LIKWID_MARKER_THREADINIT;
+ // Optional. Register region name
+ LIKWID_MARKER_REGISTER("example");
+ }
+
+
+ #pragma omp parallel
+ {
+ printf("Thread %d sleeps now for %d seconds\n", omp_get_thread_num(), SLEEPTIME);
+ // Start measurements inside a parallel region
+ LIKWID_MARKER_START("example");
+ // Insert your code here.
+ // Often contains an OpenMP for pragma. Regions can be nested.
+ sleep(SLEEPTIME);
+ // Stop measurements inside a parallel region
+ LIKWID_MARKER_STOP("example");
+ printf("Thread %d wakes up again\n", omp_get_thread_num());
+ // If multiple groups given, you can switch to the next group
+ LIKWID_MARKER_SWITCH;
+ // If you need the performance data inside your application, use
+ LIKWID_MARKER_GET("example", &nevents, events, &time, &count);
+ // where events is an array of doubles with nevents entries,
+ // time is a double* and count an int*.
+ printf("Region example measures %d events, total measurement time is %f\n", nevents, time);
+ printf("The region was called %d times\n", count);
+ for (i = 0; i < nevents; i++)
+ {
+ printf("Event %d: %f\n", i, events[i]);
+ }
+ }
+
+ // Close Marker API and write results to file for further evaluation done
+ // by likwid-perfctr
+ LIKWID_MARKER_CLOSE;
+ return 0;
+}
diff --git a/examples/F-markerAPI.F90 b/examples/F-markerAPI.F90
new file mode 100644
index 0000000..5e2ff4b
--- /dev/null
+++ b/examples/F-markerAPI.F90
@@ -0,0 +1,79 @@
+! =======================================================================================
+!
+! Filename: F-markerAPI.F90
+!
+! Description: Example how to use the Fortran90 Marker API
+!
+! Version: 4.1
+! Released: 19.5.2016
+!
+! Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+! Project: likwid
+!
+! Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+!
+! This program is free software: you can redistribute it and/or modify it under
+! the terms of the GNU General Public License as published by the Free Software
+! Foundation, either version 3 of the License, or (at your option) any later
+! version.
+!
+! This program is distributed in the hope that it will be useful, but WITHOUT ANY
+! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+! PARTICULAR PURPOSE. See the GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along with
+! this program. If not, see <http://www.gnu.org/licenses/>.
+!
+! =======================================================================================
+
+#define SLEEPTIME 2
+
+program FmarkerAPI
+ use likwid
+ include "omp_lib.h"
+ INTEGER :: nr_events
+ DOUBLE PRECISION, DIMENSION(10) :: events
+ DOUBLE PRECISION :: time
+ INTEGER :: c
+ nr_events = 10
+ ! Init Marker API in serial region once in the beginning.
+ call likwid_markerInit()
+
+!$OMP PARALLEL
+ ! Each thread must add itself to the Marker API, therefore must be
+ ! in parallel region.
+ call likwid_markerthreadInit()
+ ! Optional. Register region name and initialize hash table entries.
+ call likwid_markerRegisterRegion("example")
+!$OMP END PARALLEL
+
+!$OMP PARALLEL
+ print '(a,i0,a,i0,a)', "Thread ", omp_get_thread_num()," sleeps now for ", SLEEPTIME," seconds"
+ ! Start measurements inside a parallel region.
+ call likwid_markerStartRegion("example")
+ ! Insert your code here
+ ! Often contains an OpenMP for pragma. Regions can be nested.
+ call Sleep(SLEEPTIME)
+ ! Stop measurements inside a parallel region.
+ call likwid_markerStopRegion("example")
+ print '(a,i0,a)', "Thread ", omp_get_thread_num()," wakes up again"
+ ! If multiple groups given, you can switch to the next group.
+ call likwid_markerNextGroup();
+ ! If you need the performance data inside your application, use
+ call likwid_markerGetRegion("example", nr_events, events, time, c)
+ ! Events is an array of DOUBLE PRECISION with nr_events (INTEGER) entries,
+ ! time is a DOUBLE PRECISION and count an INTEGER.
+ ! After returning the events array contains maximally nr_events results.
+ print '(a,i0,a,f9.3)', "Region example measures ", nr_events, " events, total measurement time is ", time
+ print '(a,i0,a)', "The region was called ", c, " times"
+ do i=1,nr_events
+ print '(a,i0,a,e13.7)', "Event ",i,": ",events(i)
+ end do
+
+!$OMP END PARALLEL
+
+! Close Marker API and write results to file for further evaluation done
+! by likwid-perfctr.
+call likwid_markerClose()
+
+end program FmarkerAPI
diff --git a/examples/Lua-likwidAPI.lua b/examples/Lua-likwidAPI.lua
new file mode 100644
index 0000000..a77cdb8
--- /dev/null
+++ b/examples/Lua-likwidAPI.lua
@@ -0,0 +1,93 @@
+#!<PREFIX>/bin/likwid-lua
+--[[
+ * =======================================================================================
+
+ *
+ * Filename: Lua-likwidAPI.lua
+ *
+ * Description: Example how to use the LIKWID API in Lua scripts
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+package.path = package.path .. ';<PREFIX>/share/lua/?.lua'
+
+local likwid = require("likwid")
+
+EVENTSET = "INSTR_RETIRED_ANY:FIXC0"
+
+cpuinfo = likwid.getCpuInfo()
+cputopo = likwid.getCpuTopology()
+
+print(string.format("Likwid example on a %s with %d CPUs", cpuinfo.name, cputopo.numHWThreads))
+
+local cpus = {}
+for i, cpu in pairs(cputopo.threadPool) do
+ table.insert(cpus, cpu.apicId)
+end
+
+if likwid.init(#cpus, cpus) ~= 0 then
+ print("Failed to initialize LIKWID's performance monitoring module")
+ likwid.putTopology()
+ os.exit(1)
+end
+
+local gid = likwid.addEventSet(EVENTSET)
+if gid <= 0 then
+ print(string.format("Failed to add events %s to LIKWID's performance monitoring module", EVENTSET))
+ likwid.finalize()
+ likwid.putTopology()
+ os.exit(1)
+end
+
+
+if likwid.setupCounters(gid) < 0 then
+ printf(string.format("Failed to setup group %d in LIKWID's performance monitoring module\n", gid))
+ likwid.finalize()
+ likwid.putTopology()
+ os.exit(1)
+end
+if likwid.startCounters() < 0 then
+ printf(string.format("Failed to start group %d in LIKWID's performance monitoring module\n", gid))
+ likwid.finalize()
+ likwid.putTopology()
+ os.exit(1)
+end
+-- Application code
+likwid.sleep(2)
+if likwid.stopCounters() < 0 then
+ printf(string.format("Failed to stop group %d in LIKWID's performance monitoring module\n", gid))
+ likwid.finalize()
+ likwid.putTopology()
+ os.exit(1)
+end
+
+
+for i,cpu in pairs(cpus) do
+ result = likwid.getResult(gid, 1, i)
+ print(string.format("Measurement result for event set %s at CPU %d: %f", EVENTSET, cpu, result))
+end
+
+
+likwid.putTopology()
+likwid.finalize()
diff --git a/examples/Makefile b/examples/Makefile
new file mode 100644
index 0000000..cc21c3c
--- /dev/null
+++ b/examples/Makefile
@@ -0,0 +1,64 @@
+
+include ../config.mk
+include ../make/include_$(COMPILER).mk
+
+LIKWID_INCLUDE ?= -I$(PREFIX)/include
+LIKWID_LIB ?= -L$(PREFIX)/lib -llikwid
+
+all: C-markerAPI C-likwidAPI F-markerAPI Lua-likwidAPI C-markerAPI-run C-likwidAPI-run F-markerAPI-run Lua-likwidAPI-run
+
+help:
+ @echo "Help message for examples included in LIKWID"
+ @echo
+	@echo "This folder contains examples of how to use the LIKWID API"
+ @echo "Possible examples are:"
+ @echo "- Marker API in C applications: C-markerAPI"
+ @echo "- Marker API in Fortran applications: F-markerAPI"
+ @echo "- Self Monitoring in C applications: C-likwidAPI"
+ @echo "- Using the LIKWID API in Lua scripts: Lua-likwidAPI"
+ @echo "- Monitoring a system with LIKWID: monitoring"
+ @echo
+ @echo "To build an example put the name behind make, e.g. make C-likwidAPI"
+ @echo "To run the built example append '-run' to the name and add it to make: make C-likwidAPI-run"
+
+C-markerAPI:
+ $(CC) -fopenmp -DLIKWID_PERFMON -I$(PREFIX)/include -L$(PREFIX)/lib C-markerAPI.c -o C-markerAPI -llikwid -lm
+
+C-markerAPI-run: C-markerAPI
+ $(PREFIX)/bin/likwid-perfctr -C 0 -g INSTR_RETIRED_ANY:FIXC0 -m ./C-markerAPI
+
+C-likwidAPI:
+ $(CC) -fopenmp -I$(PREFIX)/include -L$(PREFIX)/lib C-likwidAPI.c -o C-likwidAPI -llikwid -lm
+
+C-likwidAPI-run: C-likwidAPI
+ ./C-likwidAPI
+
+C-internalMarkerAPI:
+ $(CC) -g -fopenmp -DLIKWID_PERFMON -I$(PREFIX)/include -L$(PREFIX)/lib C-internalMarkerAPI.c -o C-internalMarkerAPI -llikwid -lm
+
+C-internalMarkerAPI-run: C-internalMarkerAPI
+ OMP_NUM_THREADS=3 ./C-internalMarkerAPI
+
+monitoring:
+ $(CC) -I$(PREFIX)/include -L$(PREFIX)/lib monitoring.c -o monitoring -llikwid -lm
+
+monitoring-run: monitoring
+ ./monitoring
+
+F-markerAPI:
+	$(FC) -fopenmp -DLIKWID_PERFMON -I$(PREFIX)/include -L$(PREFIX)/lib F-markerAPI.F90 -o F-markerAPI -llikwid -lm
+
+F-markerAPI-run: F-markerAPI
+ $(PREFIX)/bin/likwid-perfctr -C 0 -g INSTR_RETIRED_ANY:FIXC0 -m ./F-markerAPI
+
+Lua-likwidAPI:
+ sed -e "s+<PREFIX>+$(PREFIX)+g" Lua-likwidAPI.lua > Lua-likwidAPI
+ chmod +x Lua-likwidAPI
+
+Lua-likwidAPI-run: Lua-likwidAPI
+ ./Lua-likwidAPI
+
+clean:
+ rm -f C-markerAPI C-likwidAPI F-markerAPI Lua-likwidAPI monitoring C-internalMarkerAPI
+
+.PHONY: clean C-markerAPI C-likwidAPI F-markerAPI Lua-likwidAPI monitoring C-internalMarkerAPI
diff --git a/examples/monitoring.c b/examples/monitoring.c
new file mode 100644
index 0000000..ddddcb4
--- /dev/null
+++ b/examples/monitoring.c
@@ -0,0 +1,118 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <signal.h>
+#include <likwid.h>
+
+
+static int sleeptime = 1;
+
+static int run = 1;
+
+void INThandler(int sig)
+{
+ signal(sig, SIG_IGN);
+ run = 0;
+}
+
+
+int main (int argc, char* argv[])
+{
+ int i, c, err = 0;
+ double timer = 0.0;
+ topology_init();
+ numa_init();
+ affinity_init();
+ timer_init();
+ CpuInfo_t cpuinfo = get_cpuInfo();
+ CpuTopology_t cputopo = get_cpuTopology();
+ int numCPUs = cputopo->activeHWThreads;
+ int* cpus = malloc(numCPUs * sizeof(int));
+ if (!cpus)
+ {
+ affinity_finalize();
+ numa_finalize();
+ topology_finalize();
+ return 1;
+ }
+ c = 0;
+ for (i=0;i<cputopo->numHWThreads;i++)
+ {
+ if (cputopo->threadPool[i].inCpuSet)
+ {
+ cpus[c] = cputopo->threadPool[i].apicId;
+ c++;
+ }
+ }
+ NumaTopology_t numa = get_numaTopology();
+ AffinityDomains_t affi = get_affinityDomains();
+ timer = timer_getCpuClock();
+ perfmon_init(numCPUs, cpus);
+ int gid1 = perfmon_addEventSet("L2");
+ if (gid1 < 0)
+ {
+ printf("Failed to add performance group L2\n");
+ err = 1;
+ goto monitor_exit;
+ }
+ int gid2 = perfmon_addEventSet("L3");
+ if (gid2 < 0)
+ {
+ printf("Failed to add performance group L3\n");
+ err = 1;
+ goto monitor_exit;
+ }
+ int gid3 = perfmon_addEventSet("ENERGY");
+ if (gid3 < 0)
+ {
+ printf("Failed to add performance group ENERGY\n");
+ err = 1;
+ goto monitor_exit;
+ }
+ signal(SIGINT, INThandler);
+
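+ /* Round-robin over the three event groups: each pass programs one group,
+  * measures for 'sleeptime' seconds and prints the group's derived metrics
+  * from the last start/stop interval for every monitored CPU. */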
+ while (run)
+ {
+ perfmon_setupCounters(gid1);
+ perfmon_startCounters();
+ sleep(sleeptime);
+ perfmon_stopCounters();
+ for (c = 0; c < numCPUs; c++)
+ {
+ for (i = 0; i< perfmon_getNumberOfMetrics(gid1); i++)
+ {
+ printf("%s,cpu=%d %f\n", perfmon_getMetricName(gid1, i), cpus[c], perfmon_getLastMetric(gid1, i, c));
+ }
+ }
+ perfmon_setupCounters(gid2);
+ perfmon_startCounters();
+ sleep(sleeptime);
+ perfmon_stopCounters();
+ for (c = 0; c < numCPUs; c++)
+ {
+ for (i = 0; i< perfmon_getNumberOfMetrics(gid2); i++)
+ {
+ printf("%s,cpu=%d %f\n", perfmon_getMetricName(gid2, i), cpus[c], perfmon_getLastMetric(gid2, i, c));
+ }
+ }
+ perfmon_setupCounters(gid3);
+ perfmon_startCounters();
+ sleep(sleeptime);
+ perfmon_stopCounters();
+ for (c = 0; c < numCPUs; c++)
+ {
+ for (i = 0; i< perfmon_getNumberOfMetrics(gid3); i++)
+ {
+ printf("%s,cpu=%d %f\n", perfmon_getMetricName(gid3, i), cpus[c], perfmon_getLastMetric(gid3, i, c));
+ }
+ }
+ }
+monitor_exit:
+ free(cpus);
+ perfmon_finalize();
+ affinity_finalize();
+ numa_finalize();
+ topology_finalize();
+ return err;
+}
diff --git a/ext/hwloc/AUTHORS b/ext/hwloc/AUTHORS
new file mode 100644
index 0000000..837b27f
--- /dev/null
+++ b/ext/hwloc/AUTHORS
@@ -0,0 +1,8 @@
+Cédric Augonnet <Cedric.Augonnet at labri.fr>
+Jérôme Clet-Ortega <Jerome.Clet-Ortega at labri.fr>
+Ludovic Courtès <Ludovic.Courtes at inria.fr>
+Brice Goglin <Brice.Goglin at inria.fr>
+Nathalie Furmento <Nathalie.Furmento at labri.fr>
+Samuel Thibault <Samuel.Thibault at labri.fr>
+Jeff Squyres <jsquyres at cisco.com>
+Alexey Kardashevskiy <aik at au1.ibm.com>
diff --git a/ext/hwloc/COPYING b/ext/hwloc/COPYING
new file mode 100644
index 0000000..32128c7
--- /dev/null
+++ b/ext/hwloc/COPYING
@@ -0,0 +1,28 @@
+Copyright © 2009 CNRS
+Copyright © 2009 inria. All rights reserved.
+Copyright © 2009 Université Bordeaux 1
+Copyright © 2009 Cisco Systems, Inc. All rights reserved.
+Copyright © 2012 Blue Brain Project, EPFL. All rights reserved.
+See COPYING in top-level directory.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+3. The name of the author may not be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/ext/hwloc/Makefile b/ext/hwloc/Makefile
new file mode 100644
index 0000000..1fd564c
--- /dev/null
+++ b/ext/hwloc/Makefile
@@ -0,0 +1,73 @@
+SRC_DIRS = ./hwloc
+MAKE_DIR = ../../make
+
+#DO NOT EDIT BELOW
+
+include ../../config.mk
+include $(MAKE_DIR)/include_$(COMPILER).mk
+include $(MAKE_DIR)/config_checks.mk
+include $(MAKE_DIR)/config_defines.mk
+
+CFLAGS = -O2 -Wall -fPIC -fvisibility=hidden
+INCLUDES += -I./include
+#DEFINES =
+LIBS = -L. -lm
+LFLAGS = -fPIC -fvisibility=hidden
+Q ?= @
+DEFINES := $(filter-out -DVERSION=$(VERSION),$(DEFINES))
+ifeq ($(DEBUG),true)
+DEBUG_FLAGS = -g
+else
+DEBUG_FLAGS =
+endif
+ifeq ($(COMPILER),MIC)
+CFLAGS += -mmic
+LFLAGS += -mmic
+endif
+ifeq ($(COMPILER),GCC)
+CFLAGS += -Wno-unused-result
+LFLAGS += -Wno-unused-result
+endif
+
+#CONFIGURE BUILD SYSTEM
+BUILD_DIR = ./$(COMPILER)
+
+VPATH = $(SRC_DIRS)
+FILES = $(notdir $(foreach dir,$(SRC_DIRS),$(wildcard $(dir)/*.c)))
+OBJ = $(patsubst %.c, $(BUILD_DIR)/%.o, $(FILES))
+LIBHWLOC = $(shell basename $(TARGET_HWLOC_LIB))
+
+CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
+
+all: $(BUILD_DIR) $(OBJ) $(LIBHWLOC)
+
+$(BUILD_DIR):
+ @mkdir $(BUILD_DIR)
+
+
+$(STATIC_LIBHWLOC): $(OBJ)
+ $(Q)${AR} -cq $(LIBHWLOC) $(OBJ)
+
+$(SHARED_LIBHWLOC): $(OBJ)
+ $(Q)$(CC) $(DEBUG_FLAGS) $(LFLAGS) -Wl,-soname,$(LIBHWLOC).$(VERSION).$(RELEASE) -Wall -shared -fPIC -o $(LIBHWLOC) $(OBJ) $(LIBS) $(RPATHS)
+
+#PATTERN RULES
+$(BUILD_DIR)/%.o: %.c
+ $(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(CPPFLAGS) $< -o $@
+ $(Q)$(CC) $(DEBUG_FLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
+
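+# Pull in the compiler-generated .d dependency files so that header changes
+# trigger rebuilds; skipped when only 'clean' is requested.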
+ifeq ($(findstring $(MAKECMDGOALS),clean),)
+-include $(OBJ:.o=.d)
+endif
+
+.PHONY: clean distclean
+
+clean:
+ @rm -f $(TARGET) $(SHARED_LIBHWLOC) $(STATIC_LIBHWLOC) $(LIBHWLOC).$(VERSION).$(RELEASE) $(LIBHWLOC).$(VERSION)
+
+distclean: clean
+ @rm -f $(TARGET) $(SHARED_LIBHWLOC) $(STATIC_LIBHWLOC) $(LIBHWLOC).$(VERSION).$(RELEASE) $(LIBHWLOC).$(VERSION)
+ @rm -rf $(BUILD_DIR)
+
+
+
diff --git a/ext/hwloc/hwloc/base64.c b/ext/hwloc/hwloc/base64.c
new file mode 100644
index 0000000..7a3392f
--- /dev/null
+++ b/ext/hwloc/hwloc/base64.c
@@ -0,0 +1,306 @@
+/*
+ * Copyright © 2012 Inria. All rights reserved.
+ * See COPYING in top-level directory.
+ *
+ * Modifications after import:
+ * - removed all #if
+ * - updated prototypes
+ * - updated #include
+ */
+
+/* $OpenBSD: base64.c,v 1.5 2006/10/21 09:55:03 otto Exp $ */
+
+/*
+ * Copyright (c) 1996 by Internet Software Consortium.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS
+ * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE
+ * CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
+ * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
+ * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+/*
+ * Portions Copyright (c) 1995 by International Business Machines, Inc.
+ *
+ * International Business Machines, Inc. (hereinafter called IBM) grants
+ * permission under its copyrights to use, copy, modify, and distribute this
+ * Software with or without fee, provided that the above copyright notice and
+ * all paragraphs of this notice appear in all copies, and that the name of IBM
+ * not be used in connection with the marketing of any product incorporating
+ * the Software or modifications thereof, without specific, written prior
+ * permission.
+ *
+ * To the extent it has a right to do so, IBM grants an immunity from suit
+ * under its patents, if any, for the use, sale or manufacture of products to
+ * the extent that such products are used for performing Domain Name System
+ * dynamic updates in TCP/IP networks by means of the Software. No immunity is
+ * granted for any product per se or for any other function of any product.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", AND IBM DISCLAIMS ALL WARRANTIES,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE. IN NO EVENT SHALL IBM BE LIABLE FOR ANY SPECIAL,
+ * DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE, EVEN
+ * IF IBM IS APPRISED OF THE POSSIBILITY OF SUCH DAMAGES.
+ */
+
+/* OPENBSD ORIGINAL: lib/libc/net/base64.c */
+
+static const char Base64[] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+static const char Pad64 = '=';
+
+/* (From RFC1521 and draft-ietf-dnssec-secext-03.txt)
+ The following encoding technique is taken from RFC 1521 by Borenstein
+ and Freed. It is reproduced here in a slightly edited form for
+ convenience.
+
+ A 65-character subset of US-ASCII is used, enabling 6 bits to be
+ represented per printable character. (The extra 65th character, "=",
+ is used to signify a special processing function.)
+
+ The encoding process represents 24-bit groups of input bits as output
+ strings of 4 encoded characters. Proceeding from left to right, a
+ 24-bit input group is formed by concatenating 3 8-bit input groups.
+ These 24 bits are then treated as 4 concatenated 6-bit groups, each
+ of which is translated into a single digit in the base64 alphabet.
+
+ Each 6-bit group is used as an index into an array of 64 printable
+ characters. The character referenced by the index is placed in the
+ output string.
+
+ Table 1: The Base64 Alphabet
+
+ Value Encoding Value Encoding Value Encoding Value Encoding
+ 0 A 17 R 34 i 51 z
+ 1 B 18 S 35 j 52 0
+ 2 C 19 T 36 k 53 1
+ 3 D 20 U 37 l 54 2
+ 4 E 21 V 38 m 55 3
+ 5 F 22 W 39 n 56 4
+ 6 G 23 X 40 o 57 5
+ 7 H 24 Y 41 p 58 6
+ 8 I 25 Z 42 q 59 7
+ 9 J 26 a 43 r 60 8
+ 10 K 27 b 44 s 61 9
+ 11 L 28 c 45 t 62 +
+ 12 M 29 d 46 u 63 /
+ 13 N 30 e 47 v
+ 14 O 31 f 48 w (pad) =
+ 15 P 32 g 49 x
+ 16 Q 33 h 50 y
+
+ Special processing is performed if fewer than 24 bits are available
+ at the end of the data being encoded. A full encoding quantum is
+ always completed at the end of a quantity. When fewer than 24 input
+ bits are available in an input group, zero bits are added (on the
+ right) to form an integral number of 6-bit groups. Padding at the
+ end of the data is performed using the '=' character.
+
+ Since all base64 input is an integral number of octets, only the
+ -------------------------------------------------
+ following cases can arise:
+
+ (1) the final quantum of encoding input is an integral
+ multiple of 24 bits; here, the final unit of encoded
+ output will be an integral multiple of 4 characters
+ with no "=" padding,
+ (2) the final quantum of encoding input is exactly 8 bits;
+ here, the final unit of encoded output will be two
+ characters followed by two "=" padding characters, or
+ (3) the final quantum of encoding input is exactly 16 bits;
+ here, the final unit of encoded output will be three
+ characters followed by one "=" padding character.
+ */
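+
+/* For illustration: encoding the three octets "Man" (0x4d 0x61 0x6e) yields
+ * the four characters "TWFu"; the two octets "Ma" yield "TWE=" with one pad
+ * character, and the single octet "M" yields "TQ==". */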
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include <private/private.h>
+
+int
+hwloc_encode_to_base64(const char *src, size_t srclength, char *target, size_t targsize)
+{
+ size_t datalength = 0;
+ unsigned char input[3];
+ unsigned char output[4];
+ unsigned int i;
+
+ while (2 < srclength) {
+ input[0] = *src++;
+ input[1] = *src++;
+ input[2] = *src++;
+ srclength -= 3;
+
+ output[0] = input[0] >> 2;
+ output[1] = ((input[0] & 0x03) << 4) + (input[1] >> 4);
+ output[2] = ((input[1] & 0x0f) << 2) + (input[2] >> 6);
+ output[3] = input[2] & 0x3f;
+
+ if (datalength + 4 > targsize)
+ return (-1);
+ target[datalength++] = Base64[output[0]];
+ target[datalength++] = Base64[output[1]];
+ target[datalength++] = Base64[output[2]];
+ target[datalength++] = Base64[output[3]];
+ }
+
+ /* Now we worry about padding. */
+ if (0 != srclength) {
+ /* Get what's left. */
+ input[0] = input[1] = input[2] = '\0';
+ for (i = 0; i < srclength; i++)
+ input[i] = *src++;
+
+ output[0] = input[0] >> 2;
+ output[1] = ((input[0] & 0x03) << 4) + (input[1] >> 4);
+ output[2] = ((input[1] & 0x0f) << 2) + (input[2] >> 6);
+
+ if (datalength + 4 > targsize)
+ return (-1);
+ target[datalength++] = Base64[output[0]];
+ target[datalength++] = Base64[output[1]];
+ if (srclength == 1)
+ target[datalength++] = Pad64;
+ else
+ target[datalength++] = Base64[output[2]];
+ target[datalength++] = Pad64;
+ }
+ if (datalength >= targsize)
+ return (-1);
+ target[datalength] = '\0'; /* Returned value doesn't count \0. */
+ return (datalength);
+}
+
+/* Skips all whitespace anywhere.
+ * Converts characters, four at a time, starting at (or after) src,
+ * from base-64 digits into three 8-bit bytes in the target area.
+ * Returns the number of data bytes stored at the target, or -1 on error.
+ */
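+
+/* For illustration: decoding "TWFu" (whitespace anywhere in the input is
+ * skipped) stores the three bytes "Man" in the target; a '=' appearing as
+ * the first or second character of a 4-character group is rejected. */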
+
+int
+hwloc_decode_from_base64(char const *src, char *target, size_t targsize)
+{
+ unsigned int tarindex, state;
+ int ch;
+ char *pos;
+
+ state = 0;
+ tarindex = 0;
+
+ while ((ch = *src++) != '\0') {
+ if (isspace(ch)) /* Skip whitespace anywhere. */
+ continue;
+
+ if (ch == Pad64)
+ break;
+
+ pos = strchr(Base64, ch);
+ if (pos == 0) /* A non-base64 character. */
+ return (-1);
+
+ switch (state) {
+ case 0:
+ if (target) {
+ if (tarindex >= targsize)
+ return (-1);
+ target[tarindex] = (pos - Base64) << 2;
+ }
+ state = 1;
+ break;
+ case 1:
+ if (target) {
+ if (tarindex + 1 >= targsize)
+ return (-1);
+ target[tarindex] |= (pos - Base64) >> 4;
+ target[tarindex+1] = ((pos - Base64) & 0x0f)
+ << 4 ;
+ }
+ tarindex++;
+ state = 2;
+ break;
+ case 2:
+ if (target) {
+ if (tarindex + 1 >= targsize)
+ return (-1);
+ target[tarindex] |= (pos - Base64) >> 2;
+ target[tarindex+1] = ((pos - Base64) & 0x03)
+ << 6;
+ }
+ tarindex++;
+ state = 3;
+ break;
+ case 3:
+ if (target) {
+ if (tarindex >= targsize)
+ return (-1);
+ target[tarindex] |= (pos - Base64);
+ }
+ tarindex++;
+ state = 0;
+ break;
+ }
+ }
+
+ /*
+ * We are done decoding Base-64 chars. Let's see if we ended
+ * on a byte boundary, and/or with erroneous trailing characters.
+ */
+
+ if (ch == Pad64) { /* We got a pad char. */
+ ch = *src++; /* Skip it, get next. */
+ switch (state) {
+ case 0: /* Invalid = in first position */
+ case 1: /* Invalid = in second position */
+ return (-1);
+
+ case 2: /* Valid, means one byte of info */
+ /* Skip any number of spaces. */
+ for (; ch != '\0'; ch = *src++)
+ if (!isspace(ch))
+ break;
+ /* Make sure there is another trailing = sign. */
+ if (ch != Pad64)
+ return (-1);
+ ch = *src++; /* Skip the = */
+ /* Fall through to "single trailing =" case. */
+ /* FALLTHROUGH */
+
+ case 3: /* Valid, means two bytes of info */
+ /*
+ * We know this char is an =. Is there anything but
+ * whitespace after it?
+ */
+ for (; ch != '\0'; ch = *src++)
+ if (!isspace(ch))
+ return (-1);
+
+ /*
+ * Now make sure for cases 2 and 3 that the "extra"
+ * bits that slopped past the last full byte were
+ * zeros. If we don't check them, they become a
+ * subliminal channel.
+ */
+ if (target && target[tarindex] != 0)
+ return (-1);
+ }
+ } else {
+ /*
+ * We ended by seeing the end of the string. Make sure we
+ * have no partial bytes lying around.
+ */
+ if (state != 0)
+ return (-1);
+ }
+
+ return (tarindex);
+}
diff --git a/ext/hwloc/hwloc/bind.c b/ext/hwloc/hwloc/bind.c
new file mode 100644
index 0000000..e2b5a06
--- /dev/null
+++ b/ext/hwloc/hwloc/bind.c
@@ -0,0 +1,781 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2011 inria. All rights reserved.
+ * Copyright © 2009-2010, 2012 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <hwloc/helper.h>
+#ifdef HAVE_SYS_MMAN_H
+# include <sys/mman.h>
+#endif
+/* <malloc.h> is only needed if we don't have posix_memalign() */
+#if defined(hwloc_getpagesize) && !defined(HAVE_POSIX_MEMALIGN) && defined(HAVE_MEMALIGN) && defined(HAVE_MALLOC_H)
+#include <malloc.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <stdlib.h>
+#include <errno.h>
+
+/* TODO: HWLOC_GNU_SYS, HWLOC_IRIX_SYS,
+ *
+ * IRIX: see MP_MUSTRUN / _DSM_MUSTRUN, pthread_setrunon_np, /hw, process_cpulink, numa_create
+ *
+ * We could use glibc's sched_setaffinity generically when it is available
+ *
+ * Darwin and OpenBSD don't seem to have binding facilities.
+ */
+
+static hwloc_const_bitmap_t
+hwloc_fix_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t set)
+{
+ hwloc_const_bitmap_t topology_set = hwloc_topology_get_topology_cpuset(topology);
+ hwloc_const_bitmap_t complete_set = hwloc_topology_get_complete_cpuset(topology);
+
+ if (!topology_set) {
+ /* The topology is composed of several systems, the cpuset is ambiguous. */
+ errno = EXDEV;
+ return NULL;
+ }
+
+ if (hwloc_bitmap_iszero(set)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ if (!hwloc_bitmap_isincluded(set, complete_set)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
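+ /* Binding to a superset of the topology cpuset is treated as "bind to
+  * everything", so fall back to the complete set, which also covers
+  * currently unavailable PUs. */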
+ if (hwloc_bitmap_isincluded(topology_set, set))
+ set = complete_set;
+
+ return set;
+}
+
+int
+hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t set, int flags)
+{
+ set = hwloc_fix_cpubind(topology, set);
+ if (!set)
+ return -1;
+
+ if (flags & HWLOC_CPUBIND_PROCESS) {
+ if (topology->binding_hooks.set_thisproc_cpubind)
+ return topology->binding_hooks.set_thisproc_cpubind(topology, set, flags);
+ } else if (flags & HWLOC_CPUBIND_THREAD) {
+ if (topology->binding_hooks.set_thisthread_cpubind)
+ return topology->binding_hooks.set_thisthread_cpubind(topology, set, flags);
+ } else {
+ if (topology->binding_hooks.set_thisproc_cpubind)
+ return topology->binding_hooks.set_thisproc_cpubind(topology, set, flags);
+ else if (topology->binding_hooks.set_thisthread_cpubind)
+ return topology->binding_hooks.set_thisthread_cpubind(topology, set, flags);
+ }
+
+ errno = ENOSYS;
+ return -1;
+}
+
+int
+hwloc_get_cpubind(hwloc_topology_t topology, hwloc_bitmap_t set, int flags)
+{
+ if (flags & HWLOC_CPUBIND_PROCESS) {
+ if (topology->binding_hooks.get_thisproc_cpubind)
+ return topology->binding_hooks.get_thisproc_cpubind(topology, set, flags);
+ } else if (flags & HWLOC_CPUBIND_THREAD) {
+ if (topology->binding_hooks.get_thisthread_cpubind)
+ return topology->binding_hooks.get_thisthread_cpubind(topology, set, flags);
+ } else {
+ if (topology->binding_hooks.get_thisproc_cpubind)
+ return topology->binding_hooks.get_thisproc_cpubind(topology, set, flags);
+ else if (topology->binding_hooks.get_thisthread_cpubind)
+ return topology->binding_hooks.get_thisthread_cpubind(topology, set, flags);
+ }
+
+ errno = ENOSYS;
+ return -1;
+}
+
+int
+hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t set, int flags)
+{
+ set = hwloc_fix_cpubind(topology, set);
+ if (!set)
+ return -1;
+
+ if (topology->binding_hooks.set_proc_cpubind)
+ return topology->binding_hooks.set_proc_cpubind(topology, pid, set, flags);
+
+ errno = ENOSYS;
+ return -1;
+}
+
+int
+hwloc_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t set, int flags)
+{
+ if (topology->binding_hooks.get_proc_cpubind)
+ return topology->binding_hooks.get_proc_cpubind(topology, pid, set, flags);
+
+ errno = ENOSYS;
+ return -1;
+}
+
+#ifdef hwloc_thread_t
+int
+hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_const_bitmap_t set, int flags)
+{
+ set = hwloc_fix_cpubind(topology, set);
+ if (!set)
+ return -1;
+
+ if (topology->binding_hooks.set_thread_cpubind)
+ return topology->binding_hooks.set_thread_cpubind(topology, tid, set, flags);
+
+ errno = ENOSYS;
+ return -1;
+}
+
+int
+hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_bitmap_t set, int flags)
+{
+ if (topology->binding_hooks.get_thread_cpubind)
+ return topology->binding_hooks.get_thread_cpubind(topology, tid, set, flags);
+
+ errno = ENOSYS;
+ return -1;
+}
+#endif
+
+int
+hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t set, int flags)
+{
+ if (flags & HWLOC_CPUBIND_PROCESS) {
+ if (topology->binding_hooks.get_thisproc_last_cpu_location)
+ return topology->binding_hooks.get_thisproc_last_cpu_location(topology, set, flags);
+ } else if (flags & HWLOC_CPUBIND_THREAD) {
+ if (topology->binding_hooks.get_thisthread_last_cpu_location)
+ return topology->binding_hooks.get_thisthread_last_cpu_location(topology, set, flags);
+ } else {
+ if (topology->binding_hooks.get_thisproc_last_cpu_location)
+ return topology->binding_hooks.get_thisproc_last_cpu_location(topology, set, flags);
+ else if (topology->binding_hooks.get_thisthread_last_cpu_location)
+ return topology->binding_hooks.get_thisthread_last_cpu_location(topology, set, flags);
+ }
+
+ errno = ENOSYS;
+ return -1;
+}
+
+int
+hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t set, int flags)
+{
+ if (topology->binding_hooks.get_proc_last_cpu_location)
+ return topology->binding_hooks.get_proc_last_cpu_location(topology, pid, set, flags);
+
+ errno = ENOSYS;
+ return -1;
+}
+
+static hwloc_const_nodeset_t
+hwloc_fix_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset)
+{
+ hwloc_const_bitmap_t topology_nodeset = hwloc_topology_get_topology_nodeset(topology);
+ hwloc_const_bitmap_t complete_nodeset = hwloc_topology_get_complete_nodeset(topology);
+
+ if (!hwloc_topology_get_topology_cpuset(topology)) {
+ /* The topology is composed of several systems, the nodeset is thus
+ * ambiguous. */
+ errno = EXDEV;
+ return NULL;
+ }
+
+ if (!complete_nodeset) {
+ /* There is no NUMA node */
+ errno = ENODEV;
+ return NULL;
+ }
+
+ if (hwloc_bitmap_iszero(nodeset)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ if (!hwloc_bitmap_isincluded(nodeset, complete_nodeset)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ if (hwloc_bitmap_isincluded(topology_nodeset, nodeset))
+ return complete_nodeset;
+
+ return nodeset;
+}
+
+static int
+hwloc_fix_membind_cpuset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_const_cpuset_t cpuset)
+{
+ hwloc_const_bitmap_t topology_set = hwloc_topology_get_topology_cpuset(topology);
+ hwloc_const_bitmap_t complete_set = hwloc_topology_get_complete_cpuset(topology);
+ hwloc_const_bitmap_t complete_nodeset = hwloc_topology_get_complete_nodeset(topology);
+
+ if (!topology_set) {
+ /* The topology is composed of several systems, the cpuset is thus
+ * ambiguous. */
+ errno = EXDEV;
+ return -1;
+ }
+
+ if (!complete_nodeset) {
+ /* There is no NUMA node */
+ errno = ENODEV;
+ return -1;
+ }
+
+ if (hwloc_bitmap_iszero(cpuset)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (!hwloc_bitmap_isincluded(cpuset, complete_set)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (hwloc_bitmap_isincluded(topology_set, cpuset)) {
+ hwloc_bitmap_copy(nodeset, complete_nodeset);
+ return 0;
+ }
+
+ hwloc_cpuset_to_nodeset(topology, cpuset, nodeset);
+ return 0;
+}
+
+int
+hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+ nodeset = hwloc_fix_membind(topology, nodeset);
+ if (!nodeset)
+ return -1;
+
+ if (flags & HWLOC_MEMBIND_PROCESS) {
+ if (topology->binding_hooks.set_thisproc_membind)
+ return topology->binding_hooks.set_thisproc_membind(topology, nodeset, policy, flags);
+ } else if (flags & HWLOC_MEMBIND_THREAD) {
+ if (topology->binding_hooks.set_thisthread_membind)
+ return topology->binding_hooks.set_thisthread_membind(topology, nodeset, policy, flags);
+ } else {
+ if (topology->binding_hooks.set_thisproc_membind)
+ return topology->binding_hooks.set_thisproc_membind(topology, nodeset, policy, flags);
+ else if (topology->binding_hooks.set_thisthread_membind)
+ return topology->binding_hooks.set_thisthread_membind(topology, nodeset, policy, flags);
+ }
+
+ errno = ENOSYS;
+ return -1;
+}
+
+int
+hwloc_set_membind(hwloc_topology_t topology, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+ hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+ int ret;
+
+ if (hwloc_fix_membind_cpuset(topology, nodeset, set))
+ ret = -1;
+ else
+ ret = hwloc_set_membind_nodeset(topology, nodeset, policy, flags);
+
+ hwloc_bitmap_free(nodeset);
+ return ret;
+}
+
+int
+hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+ if (flags & HWLOC_MEMBIND_PROCESS) {
+ if (topology->binding_hooks.get_thisproc_membind)
+ return topology->binding_hooks.get_thisproc_membind(topology, nodeset, policy, flags);
+ } else if (flags & HWLOC_MEMBIND_THREAD) {
+ if (topology->binding_hooks.get_thisthread_membind)
+ return topology->binding_hooks.get_thisthread_membind(topology, nodeset, policy, flags);
+ } else {
+ if (topology->binding_hooks.get_thisproc_membind)
+ return topology->binding_hooks.get_thisproc_membind(topology, nodeset, policy, flags);
+ else if (topology->binding_hooks.get_thisthread_membind)
+ return topology->binding_hooks.get_thisthread_membind(topology, nodeset, policy, flags);
+ }
+
+ errno = ENOSYS;
+ return -1;
+}
+
+int
+hwloc_get_membind(hwloc_topology_t topology, hwloc_cpuset_t set, hwloc_membind_policy_t * policy, int flags)
+{
+ hwloc_nodeset_t nodeset;
+ int ret;
+
+ nodeset = hwloc_bitmap_alloc();
+ ret = hwloc_get_membind_nodeset(topology, nodeset, policy, flags);
+
+ if (!ret)
+ hwloc_cpuset_from_nodeset(topology, set, nodeset);
+
+ hwloc_bitmap_free(nodeset);
+ return ret;
+}
+
+int
+hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+ nodeset = hwloc_fix_membind(topology, nodeset);
+ if (!nodeset)
+ return -1;
+
+ if (topology->binding_hooks.set_proc_membind)
+ return topology->binding_hooks.set_proc_membind(topology, pid, nodeset, policy, flags);
+
+ errno = ENOSYS;
+ return -1;
+}
+
+
+int
+hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+ hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+ int ret;
+
+ if (hwloc_fix_membind_cpuset(topology, nodeset, set))
+ ret = -1;
+ else
+ ret = hwloc_set_proc_membind_nodeset(topology, pid, nodeset, policy, flags);
+
+ hwloc_bitmap_free(nodeset);
+ return ret;
+}
+
+int
+hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+ if (topology->binding_hooks.get_proc_membind)
+ return topology->binding_hooks.get_proc_membind(topology, pid, nodeset, policy, flags);
+
+ errno = ENOSYS;
+ return -1;
+}
+
+int
+hwloc_get_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, hwloc_membind_policy_t * policy, int flags)
+{
+ hwloc_nodeset_t nodeset;
+ int ret;
+
+ nodeset = hwloc_bitmap_alloc();
+ ret = hwloc_get_proc_membind_nodeset(topology, pid, nodeset, policy, flags);
+
+ if (!ret)
+ hwloc_cpuset_from_nodeset(topology, set, nodeset);
+
+ hwloc_bitmap_free(nodeset);
+ return ret;
+}
+
+int
+hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+ nodeset = hwloc_fix_membind(topology, nodeset);
+ if (!nodeset)
+ return -1;
+
+ if (topology->binding_hooks.set_area_membind)
+ return topology->binding_hooks.set_area_membind(topology, addr, len, nodeset, policy, flags);
+
+ errno = ENOSYS;
+ return -1;
+}
+
+int
+hwloc_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+ hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+ int ret;
+
+ if (hwloc_fix_membind_cpuset(topology, nodeset, set))
+ ret = -1;
+ else
+ ret = hwloc_set_area_membind_nodeset(topology, addr, len, nodeset, policy, flags);
+
+ hwloc_bitmap_free(nodeset);
+ return ret;
+}
+
+int
+hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+ if (topology->binding_hooks.get_area_membind)
+ return topology->binding_hooks.get_area_membind(topology, addr, len, nodeset, policy, flags);
+
+ errno = ENOSYS;
+ return -1;
+}
+
+int
+hwloc_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_cpuset_t set, hwloc_membind_policy_t * policy, int flags)
+{
+ hwloc_nodeset_t nodeset;
+ int ret;
+
+ nodeset = hwloc_bitmap_alloc();
+ ret = hwloc_get_area_membind_nodeset(topology, addr, len, nodeset, policy, flags);
+
+ if (!ret)
+ hwloc_cpuset_from_nodeset(topology, set, nodeset);
+
+ hwloc_bitmap_free(nodeset);
+ return ret;
+}
+
+void *
+hwloc_alloc_heap(hwloc_topology_t topology __hwloc_attribute_unused, size_t len)
+{
+ void *p;
+#if defined(hwloc_getpagesize) && defined(HAVE_POSIX_MEMALIGN)
+ errno = posix_memalign(&p, hwloc_getpagesize(), len);
+ if (errno)
+ p = NULL;
+#elif defined(hwloc_getpagesize) && defined(HAVE_MEMALIGN)
+ p = memalign(hwloc_getpagesize(), len);
+#else
+ p = malloc(len);
+#endif
+ return p;
+}
+
+#ifdef MAP_ANONYMOUS
+void *
+hwloc_alloc_mmap(hwloc_topology_t topology __hwloc_attribute_unused, size_t len)
+{
+ return mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+}
+#endif
+
+int
+hwloc_free_heap(hwloc_topology_t topology __hwloc_attribute_unused, void *addr, size_t len __hwloc_attribute_unused)
+{
+ free(addr);
+ return 0;
+}
+
+#ifdef MAP_ANONYMOUS
+int
+hwloc_free_mmap(hwloc_topology_t topology __hwloc_attribute_unused, void *addr, size_t len)
+{
+ if (!addr)
+ return 0;
+ return munmap(addr, len);
+}
+#endif
+
+void *
+hwloc_alloc(hwloc_topology_t topology, size_t len)
+{
+ if (topology->binding_hooks.alloc)
+ return topology->binding_hooks.alloc(topology, len);
+ return hwloc_alloc_heap(topology, len);
+}
+
+void *
+hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+ void *p;
+ nodeset = hwloc_fix_membind(topology, nodeset);
+ if (!nodeset)
+ goto fallback;
+ if (flags & HWLOC_MEMBIND_MIGRATE) {
+ errno = EINVAL;
+ goto fallback;
+ }
+
+ if (topology->binding_hooks.alloc_membind)
+ return topology->binding_hooks.alloc_membind(topology, len, nodeset, policy, flags);
+ else if (topology->binding_hooks.set_area_membind) {
+ p = hwloc_alloc(topology, len);
+ if (!p)
+ return NULL;
+ if (topology->binding_hooks.set_area_membind(topology, p, len, nodeset, policy, flags) && flags & HWLOC_MEMBIND_STRICT) {
+ int error = errno;
+ free(p);
+ errno = error;
+ return NULL;
+ }
+ return p;
+ } else {
+ errno = ENOSYS;
+ }
+
+fallback:
+ if (flags & HWLOC_MEMBIND_STRICT)
+ /* Report error */
+ return NULL;
+ /* Never mind, allocate anyway */
+ return hwloc_alloc(topology, len);
+}
+
+void *
+hwloc_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+ hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+ void *ret;
+
+ if (hwloc_fix_membind_cpuset(topology, nodeset, set)) {
+ if (flags & HWLOC_MEMBIND_STRICT)
+ ret = NULL;
+ else
+ ret = hwloc_alloc(topology, len);
+ } else
+ ret = hwloc_alloc_membind_nodeset(topology, len, nodeset, policy, flags);
+
+ hwloc_bitmap_free(nodeset);
+ return ret;
+}
+
+int
+hwloc_free(hwloc_topology_t topology, void *addr, size_t len)
+{
+ if (topology->binding_hooks.free_membind)
+ return topology->binding_hooks.free_membind(topology, addr, len);
+ return hwloc_free_heap(topology, addr, len);
+}
+
+/*
+ * Empty binding hooks always returning success
+ */
+
+static int dontset_return_complete_cpuset(hwloc_topology_t topology, hwloc_cpuset_t set)
+{
+ hwloc_const_cpuset_t cpuset = hwloc_topology_get_complete_cpuset(topology);
+ if (cpuset) {
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+ return 0;
+ } else
+ return -1;
+}
+
+static int dontset_thisthread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+ return 0;
+}
+static int dontget_thisthread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_bitmap_t set, int flags __hwloc_attribute_unused)
+{
+ return dontset_return_complete_cpuset(topology, set);
+}
+static int dontset_thisproc_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+ return 0;
+}
+static int dontget_thisproc_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_bitmap_t set, int flags __hwloc_attribute_unused)
+{
+ return dontset_return_complete_cpuset(topology, set);
+}
+static int dontset_proc_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_pid_t pid __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+ return 0;
+}
+static int dontget_proc_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_pid_t pid __hwloc_attribute_unused, hwloc_bitmap_t cpuset, int flags __hwloc_attribute_unused)
+{
+ return dontset_return_complete_cpuset(topology, cpuset);
+}
+#ifdef hwloc_thread_t
+static int dontset_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t tid __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+ return 0;
+}
+static int dontget_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t tid __hwloc_attribute_unused, hwloc_bitmap_t cpuset, int flags __hwloc_attribute_unused)
+{
+ return dontset_return_complete_cpuset(topology, cpuset);
+}
+#endif
+
+static int dontset_return_complete_nodeset(hwloc_topology_t topology, hwloc_nodeset_t set, hwloc_membind_policy_t *policy)
+{
+ hwloc_const_nodeset_t nodeset = hwloc_topology_get_complete_nodeset(topology);
+ if (nodeset) {
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_nodeset(topology));
+ *policy = HWLOC_MEMBIND_DEFAULT;
+ return 0;
+ } else
+ return -1;
+}
+
+static int dontset_thisproc_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+ return 0;
+}
+static int dontget_thisproc_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags __hwloc_attribute_unused)
+{
+ return dontset_return_complete_nodeset(topology, set, policy);
+}
+
+static int dontset_thisthread_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+ return 0;
+}
+static int dontget_thisthread_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags __hwloc_attribute_unused)
+{
+ return dontset_return_complete_nodeset(topology, set, policy);
+}
+
+static int dontset_proc_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_pid_t pid __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+ return 0;
+}
+static int dontget_proc_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_pid_t pid __hwloc_attribute_unused, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags __hwloc_attribute_unused)
+{
+ return dontset_return_complete_nodeset(topology, set, policy);
+}
+
+static int dontset_area_membind(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr __hwloc_attribute_unused, size_t size __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+ return 0;
+}
+static int dontget_area_membind(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr __hwloc_attribute_unused, size_t size __hwloc_attribute_unused, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags __hwloc_attribute_unused)
+{
+ return dontset_return_complete_nodeset(topology, set, policy);
+}
+
+static void * dontalloc_membind(hwloc_topology_t topology __hwloc_attribute_unused, size_t size __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+ return malloc(size);
+}
+static int dontfree_membind(hwloc_topology_t topology __hwloc_attribute_unused, void *addr __hwloc_attribute_unused, size_t size __hwloc_attribute_unused)
+{
+ free(addr);
+ return 0;
+}
+
+static void hwloc_set_dummy_hooks(struct hwloc_binding_hooks *hooks,
+ struct hwloc_topology_support *support __hwloc_attribute_unused)
+{
+ hooks->set_thisproc_cpubind = dontset_thisproc_cpubind;
+ hooks->get_thisproc_cpubind = dontget_thisproc_cpubind;
+ hooks->set_thisthread_cpubind = dontset_thisthread_cpubind;
+ hooks->get_thisthread_cpubind = dontget_thisthread_cpubind;
+ hooks->set_proc_cpubind = dontset_proc_cpubind;
+ hooks->get_proc_cpubind = dontget_proc_cpubind;
+#ifdef hwloc_thread_t
+ hooks->set_thread_cpubind = dontset_thread_cpubind;
+ hooks->get_thread_cpubind = dontget_thread_cpubind;
+#endif
+ hooks->get_thisproc_last_cpu_location = dontget_thisproc_cpubind; /* cpubind instead of last_cpu_location is ok */
+ hooks->get_thisthread_last_cpu_location = dontget_thisthread_cpubind; /* cpubind instead of last_cpu_location is ok */
+ hooks->get_proc_last_cpu_location = dontget_proc_cpubind; /* cpubind instead of last_cpu_location is ok */
+ /* TODO: get_thread_last_cpu_location */
+ hooks->set_thisproc_membind = dontset_thisproc_membind;
+ hooks->get_thisproc_membind = dontget_thisproc_membind;
+ hooks->set_thisthread_membind = dontset_thisthread_membind;
+ hooks->get_thisthread_membind = dontget_thisthread_membind;
+ hooks->set_proc_membind = dontset_proc_membind;
+ hooks->get_proc_membind = dontget_proc_membind;
+ hooks->set_area_membind = dontset_area_membind;
+ hooks->get_area_membind = dontget_area_membind;
+ hooks->alloc_membind = dontalloc_membind;
+ hooks->free_membind = dontfree_membind;
+}
+
+void
+hwloc_set_native_binding_hooks(struct hwloc_binding_hooks *hooks, struct hwloc_topology_support *support)
+{
+# ifdef HWLOC_LINUX_SYS
+ hwloc_set_linuxfs_hooks(hooks, support);
+# endif /* HWLOC_LINUX_SYS */
+
+# ifdef HWLOC_BGQ_SYS
+ hwloc_set_bgq_hooks(hooks, support);
+# endif /* HWLOC_BGQ_SYS */
+
+# ifdef HWLOC_AIX_SYS
+ hwloc_set_aix_hooks(hooks, support);
+# endif /* HWLOC_AIX_SYS */
+
+# ifdef HWLOC_OSF_SYS
+ hwloc_set_osf_hooks(hooks, support);
+# endif /* HWLOC_OSF_SYS */
+
+# ifdef HWLOC_SOLARIS_SYS
+ hwloc_set_solaris_hooks(hooks, support);
+# endif /* HWLOC_SOLARIS_SYS */
+
+# ifdef HWLOC_WIN_SYS
+ hwloc_set_windows_hooks(hooks, support);
+# endif /* HWLOC_WIN_SYS */
+
+# ifdef HWLOC_DARWIN_SYS
+ hwloc_set_darwin_hooks(hooks, support);
+# endif /* HWLOC_DARWIN_SYS */
+
+# ifdef HWLOC_FREEBSD_SYS
+ hwloc_set_freebsd_hooks(hooks, support);
+# endif /* HWLOC_FREEBSD_SYS */
+
+# ifdef HWLOC_NETBSD_SYS
+ hwloc_set_netbsd_hooks(hooks, support);
+# endif /* HWLOC_NETBSD_SYS */
+
+# ifdef HWLOC_HPUX_SYS
+ hwloc_set_hpux_hooks(hooks, support);
+# endif /* HWLOC_HPUX_SYS */
+}
+
+/* If the represented system is actually not this system, use dummy binding hooks. */
+void
+hwloc_set_binding_hooks(struct hwloc_topology *topology)
+{
+ if (topology->is_thissystem) {
+ hwloc_set_native_binding_hooks(&topology->binding_hooks, &topology->support);
+ /* every hook not set above will return ENOSYS */
+ } else {
+ /* not this system, use dummy binding hooks that do nothing (but don't return ENOSYS) */
+ hwloc_set_dummy_hooks(&topology->binding_hooks, &topology->support);
+ }
+
+ /* if not is_thissystem, set_cpubind is fake
+ * and get_cpubind returns the whole system cpuset,
+ * so don't report that set/get_cpubind as supported
+ */
+ if (topology->is_thissystem) {
+#define DO(which,kind) \
+ if (topology->binding_hooks.kind) \
+ topology->support.which##bind->kind = 1;
+ DO(cpu,set_thisproc_cpubind);
+ DO(cpu,get_thisproc_cpubind);
+ DO(cpu,set_proc_cpubind);
+ DO(cpu,get_proc_cpubind);
+ DO(cpu,set_thisthread_cpubind);
+ DO(cpu,get_thisthread_cpubind);
+#ifdef hwloc_thread_t
+ DO(cpu,set_thread_cpubind);
+ DO(cpu,get_thread_cpubind);
+#endif
+ DO(cpu,get_thisproc_last_cpu_location);
+ DO(cpu,get_proc_last_cpu_location);
+ DO(cpu,get_thisthread_last_cpu_location);
+ DO(mem,set_thisproc_membind);
+ DO(mem,get_thisproc_membind);
+ DO(mem,set_thisthread_membind);
+ DO(mem,get_thisthread_membind);
+ DO(mem,set_proc_membind);
+ DO(mem,get_proc_membind);
+ DO(mem,set_area_membind);
+ DO(mem,get_area_membind);
+ DO(mem,alloc_membind);
+ }
+}
diff --git a/ext/hwloc/hwloc/bitmap.c b/ext/hwloc/hwloc/bitmap.c
new file mode 100644
index 0000000..e2b807a
--- /dev/null
+++ b/ext/hwloc/hwloc/bitmap.c
@@ -0,0 +1,1492 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria. All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc.h>
+#include <private/misc.h>
+#include <private/private.h>
+#include <hwloc/bitmap.h>
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <assert.h>
+#include <errno.h>
+#include <ctype.h>
+
+/*
+ * possible improvements:
+ * - have a way to change the initial allocation size:
+ *   add hwloc_bitmap_set_foo() to change a global here,
+ * and make the hwloc core call based on the early number of PUs
+ * - preallocate inside the bitmap structure (so that the whole structure is a cacheline for instance)
+ * and allocate a dedicated array only later when reallocating larger
+ * - add a bitmap->ulongs_empty_first which guarantees that some first ulongs are empty,
+ * making tests much faster for big bitmaps since there's no need to look at first ulongs.
+ * no need for ulongs_empty_first to be exactly the max number of empty ulongs,
+ * clearing bits that were set earlier isn't very common.
+ */
+
+/* magic number */
+#define HWLOC_BITMAP_MAGIC 0x20091007
+
+/* actual opaque type internals */
+struct hwloc_bitmap_s {
+ unsigned ulongs_count; /* how many ulong bitmasks are valid, >= 1 */
+ unsigned ulongs_allocated; /* how many ulong bitmasks are allocated, >= ulongs_count */
+ unsigned long *ulongs;
+ int infinite; /* set to 1 if all bits beyond ulongs are set */
+#ifdef HWLOC_DEBUG
+ int magic;
+#endif
+};
+
+/* overzealous check in debug-mode, not as powerful as valgrind but still useful */
+#ifdef HWLOC_DEBUG
+#define HWLOC__BITMAP_CHECK(set) do { \
+ assert((set)->magic == HWLOC_BITMAP_MAGIC); \
+ assert((set)->ulongs_count >= 1); \
+ assert((set)->ulongs_allocated >= (set)->ulongs_count); \
+} while (0)
+#else
+#define HWLOC__BITMAP_CHECK(set)
+#endif
+
+/* extract a subset from a set using an index or a cpu */
+#define HWLOC_SUBBITMAP_INDEX(cpu) ((cpu)/(HWLOC_BITS_PER_LONG))
+#define HWLOC_SUBBITMAP_CPU_ULBIT(cpu) ((cpu)%(HWLOC_BITS_PER_LONG))
+/* Read from a bitmap ulong without knowing whether x is valid.
+ * Writers should make sure that x is valid and modify set->ulongs[x] directly.
+ */
+#define HWLOC_SUBBITMAP_READULONG(set,x) ((x) < (set)->ulongs_count ? (set)->ulongs[x] : (set)->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO)
+
+/* predefined subset values */
+#define HWLOC_SUBBITMAP_ZERO 0UL
+#define HWLOC_SUBBITMAP_FULL (~0UL)
+#define HWLOC_SUBBITMAP_ULBIT(bit) (1UL<<(bit))
+#define HWLOC_SUBBITMAP_CPU(cpu) HWLOC_SUBBITMAP_ULBIT(HWLOC_SUBBITMAP_CPU_ULBIT(cpu))
+#define HWLOC_SUBBITMAP_ULBIT_TO(bit) (HWLOC_SUBBITMAP_FULL>>(HWLOC_BITS_PER_LONG-1-(bit)))
+#define HWLOC_SUBBITMAP_ULBIT_FROM(bit) (HWLOC_SUBBITMAP_FULL<<(bit))
+#define HWLOC_SUBBITMAP_ULBIT_FROMTO(begin,end) (HWLOC_SUBBITMAP_ULBIT_TO(end) & HWLOC_SUBBITMAP_ULBIT_FROM(begin))
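+/* Example: with 64-bit unsigned longs, CPU 70 lives in ulongs[1]
+ * (HWLOC_SUBBITMAP_INDEX(70) == 70/64 == 1) at bit 6
+ * (HWLOC_SUBBITMAP_CPU_ULBIT(70) == 70%64 == 6), i.e.
+ * HWLOC_SUBBITMAP_CPU(70) == 1UL<<6. */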
+
+struct hwloc_bitmap_s * hwloc_bitmap_alloc(void)
+{
+ struct hwloc_bitmap_s * set;
+
+ set = malloc(sizeof(struct hwloc_bitmap_s));
+ if (!set)
+ return NULL;
+
+ set->ulongs_count = 1;
+ set->ulongs_allocated = 64/sizeof(unsigned long);
+ set->ulongs = malloc(64);
+ if (!set->ulongs) {
+ free(set);
+ return NULL;
+ }
+
+ set->ulongs[0] = HWLOC_SUBBITMAP_ZERO;
+ set->infinite = 0;
+#ifdef HWLOC_DEBUG
+ set->magic = HWLOC_BITMAP_MAGIC;
+#endif
+ return set;
+}
+
+struct hwloc_bitmap_s * hwloc_bitmap_alloc_full(void)
+{
+ struct hwloc_bitmap_s * set = hwloc_bitmap_alloc();
+ if (set) {
+ set->infinite = 1;
+ set->ulongs[0] = HWLOC_SUBBITMAP_FULL;
+ }
+ return set;
+}
+
+void hwloc_bitmap_free(struct hwloc_bitmap_s * set)
+{
+ if (!set)
+ return;
+
+ HWLOC__BITMAP_CHECK(set);
+#ifdef HWLOC_DEBUG
+ set->magic = 0;
+#endif
+
+ free(set->ulongs);
+ free(set);
+}
+
+/* enlarge until it contains at least needed_count ulongs.
+ */
+static void
+hwloc_bitmap_enlarge_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count)
+{
+ unsigned tmp = 1 << hwloc_flsl((unsigned long) needed_count - 1);
+ if (tmp > set->ulongs_allocated) {
+ set->ulongs = realloc(set->ulongs, tmp * sizeof(unsigned long));
+ assert(set->ulongs);
+ set->ulongs_allocated = tmp;
+ }
+}
+
+/* enlarge until it contains at least needed_count ulongs,
+ * and update new ulongs according to the infinite field.
+ */
+static void
+hwloc_bitmap_realloc_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count)
+{
+ unsigned i;
+
+ HWLOC__BITMAP_CHECK(set);
+
+ if (needed_count <= set->ulongs_count)
+ return;
+
+ /* realloc larger if needed */
+ hwloc_bitmap_enlarge_by_ulongs(set, needed_count);
+
+ /* fill the newly allocated subset depending on the infinite flag */
+ for(i=set->ulongs_count; i<needed_count; i++)
+ set->ulongs[i] = set->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+ set->ulongs_count = needed_count;
+}
+
+/* realloc until it contains at least cpu+1 bits */
+#define hwloc_bitmap_realloc_by_cpu_index(set, cpu) hwloc_bitmap_realloc_by_ulongs(set, ((cpu)/HWLOC_BITS_PER_LONG)+1)
+
+/* reset a bitmap to exactly the needed size.
+ * the caller must reinitialize all ulongs and the infinite flag later.
+ */
+static void
+hwloc_bitmap_reset_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count)
+{
+ hwloc_bitmap_enlarge_by_ulongs(set, needed_count);
+ set->ulongs_count = needed_count;
+}
+
+/* reset until it contains exactly cpu+1 bits (roundup to a ulong).
+ * the caller must reinitialize all ulongs and the infinite flag later.
+ */
+#define hwloc_bitmap_reset_by_cpu_index(set, cpu) hwloc_bitmap_reset_by_ulongs(set, ((cpu)/HWLOC_BITS_PER_LONG)+1)
+
+struct hwloc_bitmap_s * hwloc_bitmap_dup(const struct hwloc_bitmap_s * old)
+{
+ struct hwloc_bitmap_s * new;
+
+ if (!old)
+ return NULL;
+
+ HWLOC__BITMAP_CHECK(old);
+
+ new = malloc(sizeof(struct hwloc_bitmap_s));
+ if (!new)
+ return NULL;
+
+ new->ulongs = malloc(old->ulongs_allocated * sizeof(unsigned long));
+ if (!new->ulongs) {
+ free(new);
+ return NULL;
+ }
+ new->ulongs_allocated = old->ulongs_allocated;
+ new->ulongs_count = old->ulongs_count;
+ memcpy(new->ulongs, old->ulongs, new->ulongs_count * sizeof(unsigned long));
+ new->infinite = old->infinite;
+#ifdef HWLOC_DEBUG
+ new->magic = HWLOC_BITMAP_MAGIC;
+#endif
+ return new;
+}
+
+void hwloc_bitmap_copy(struct hwloc_bitmap_s * dst, const struct hwloc_bitmap_s * src)
+{
+ HWLOC__BITMAP_CHECK(dst);
+ HWLOC__BITMAP_CHECK(src);
+
+ hwloc_bitmap_reset_by_ulongs(dst, src->ulongs_count);
+
+ memcpy(dst->ulongs, src->ulongs, src->ulongs_count * sizeof(unsigned long));
+ dst->infinite = src->infinite;
+}
+
+/* Strings always use 32bit groups */
+#define HWLOC_PRIxSUBBITMAP "%08lx"
+#define HWLOC_BITMAP_SUBSTRING_SIZE 32
+#define HWLOC_BITMAP_SUBSTRING_LENGTH (HWLOC_BITMAP_SUBSTRING_SIZE/4)
+#define HWLOC_BITMAP_STRING_PER_LONG (HWLOC_BITS_PER_LONG/HWLOC_BITMAP_SUBSTRING_SIZE)
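+/* Example: a bitmap with only bits 0-3 set is rendered by
+ * hwloc_bitmap_snprintf() below as "0x0000000f"; an infinite (full) bitmap
+ * is prefixed with the marker "0xf...f". */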
+
+int hwloc_bitmap_snprintf(char * __hwloc_restrict buf, size_t buflen, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+ ssize_t size = buflen;
+ char *tmp = buf;
+ int res, ret = 0;
+ int needcomma = 0;
+ int i;
+ unsigned long accum = 0;
+ int accumed = 0;
+#if HWLOC_BITS_PER_LONG == HWLOC_BITMAP_SUBSTRING_SIZE
+ const unsigned long accum_mask = ~0UL;
+#else /* HWLOC_BITS_PER_LONG != HWLOC_BITMAP_SUBSTRING_SIZE */
+ const unsigned long accum_mask = ((1UL << HWLOC_BITMAP_SUBSTRING_SIZE) - 1) << (HWLOC_BITS_PER_LONG - HWLOC_BITMAP_SUBSTRING_SIZE);
+#endif /* HWLOC_BITS_PER_LONG != HWLOC_BITMAP_SUBSTRING_SIZE */
+
+ HWLOC__BITMAP_CHECK(set);
+
+ /* mark the end in case we do nothing later */
+ if (buflen > 0)
+ tmp[0] = '\0';
+
+ if (set->infinite) {
+ res = hwloc_snprintf(tmp, size, "0xf...f");
+ needcomma = 1;
+ if (res < 0)
+ return -1;
+ ret += res;
+ if (res >= size)
+ res = size>0 ? size - 1 : 0;
+ tmp += res;
+ size -= res;
+ }
+
+ i=set->ulongs_count-1;
+
+ if (set->infinite) {
+ /* ignore starting FULL since we have 0xf...f already */
+ while (i>=0 && set->ulongs[i] == HWLOC_SUBBITMAP_FULL)
+ i--;
+ } else {
+ /* ignore starting ZERO except the last one */
+ while (i>=0 && set->ulongs[i] == HWLOC_SUBBITMAP_ZERO)
+ i--;
+ }
+
+ while (i>=0 || accumed) {
+ /* Refill accumulator */
+ if (!accumed) {
+ accum = set->ulongs[i--];
+ accumed = HWLOC_BITS_PER_LONG;
+ }
+
+ if (accum & accum_mask) {
+ /* print the whole subset if not empty */
+ res = hwloc_snprintf(tmp, size, needcomma ? ",0x" HWLOC_PRIxSUBBITMAP : "0x" HWLOC_PRIxSUBBITMAP,
+ (accum & accum_mask) >> (HWLOC_BITS_PER_LONG - HWLOC_BITMAP_SUBSTRING_SIZE));
+ needcomma = 1;
+ } else if (i == -1 && accumed == HWLOC_BITMAP_SUBSTRING_SIZE) {
+ /* print a single 0 to mark the last subset */
+ res = hwloc_snprintf(tmp, size, needcomma ? ",0x0" : "0x0");
+ } else if (needcomma) {
+ res = hwloc_snprintf(tmp, size, ",");
+ } else {
+ res = 0;
+ }
+ if (res < 0)
+ return -1;
+ ret += res;
+
+#if HWLOC_BITS_PER_LONG == HWLOC_BITMAP_SUBSTRING_SIZE
+ accum = 0;
+ accumed = 0;
+#else
+ accum <<= HWLOC_BITMAP_SUBSTRING_SIZE;
+ accumed -= HWLOC_BITMAP_SUBSTRING_SIZE;
+#endif
+
+ if (res >= size)
+ res = size>0 ? size - 1 : 0;
+
+ tmp += res;
+ size -= res;
+ }
+
+ /* if didn't display anything, display 0x0 */
+ if (!ret) {
+ res = hwloc_snprintf(tmp, size, "0x0");
+ if (res < 0)
+ return -1;
+ ret += res;
+ }
+
+ return ret;
+}
+
+int hwloc_bitmap_asprintf(char ** strp, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+ int len;
+ char *buf;
+
+ HWLOC__BITMAP_CHECK(set);
+
+ len = hwloc_bitmap_snprintf(NULL, 0, set);
+ buf = malloc(len+1);
+ *strp = buf;
+ return hwloc_bitmap_snprintf(buf, len+1, set);
+}
+
+int hwloc_bitmap_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc_restrict string)
+{
+ const char * current = string;
+ unsigned long accum = 0;
+ int count=0;
+ int infinite = 0;
+
+ /* count how many substrings there are */
+ count++;
+ while ((current = strchr(current+1, ',')) != NULL)
+ count++;
+
+ current = string;
+ if (!strncmp("0xf...f", current, 7)) {
+ current += 7;
+ if (*current != ',') {
+ /* special case for infinite/full bitmap */
+ hwloc_bitmap_fill(set);
+ return 0;
+ }
+ current++;
+ infinite = 1;
+ count--;
+ }
+
+ hwloc_bitmap_reset_by_ulongs(set, (count + HWLOC_BITMAP_STRING_PER_LONG - 1) / HWLOC_BITMAP_STRING_PER_LONG);
+ set->infinite = 0;
+
+ while (*current != '\0') {
+ unsigned long val;
+ char *next;
+ val = strtoul(current, &next, 16);
+
+ assert(count > 0);
+ count--;
+
+ accum |= (val << ((count * HWLOC_BITMAP_SUBSTRING_SIZE) % HWLOC_BITS_PER_LONG));
+ if (!(count % HWLOC_BITMAP_STRING_PER_LONG)) {
+ set->ulongs[count / HWLOC_BITMAP_STRING_PER_LONG] = accum;
+ accum = 0;
+ }
+
+ if (*next != ',') {
+ if (*next || count > 0)
+ goto failed;
+ else
+ break;
+ }
+ current = (const char*) next+1;
+ }
+
+ set->infinite = infinite; /* set at the end, to avoid spurious realloc with filled new ulongs */
+
+ return 0;
+
+ failed:
+ /* failure to parse */
+ hwloc_bitmap_zero(set);
+ return -1;
+}
+
+int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+ int prev = -1;
+ hwloc_bitmap_t reverse;
+ ssize_t size = buflen;
+ char *tmp = buf;
+ int res, ret = 0;
+ int needcomma = 0;
+
+ HWLOC__BITMAP_CHECK(set);
+
+ reverse = hwloc_bitmap_alloc(); /* FIXME: add hwloc_bitmap_alloc_size() + hwloc_bitmap_init_allocated() to avoid malloc? */
+ hwloc_bitmap_not(reverse, set);
+
+ /* mark the end in case we do nothing later */
+ if (buflen > 0)
+ tmp[0] = '\0';
+
+ while (1) {
+ int begin, end;
+
+ begin = hwloc_bitmap_next(set, prev);
+ if (begin == -1)
+ break;
+ end = hwloc_bitmap_next(reverse, begin);
+
+ if (end == begin+1) {
+ res = hwloc_snprintf(tmp, size, needcomma ? ",%d" : "%d", begin);
+ } else if (end == -1) {
+ res = hwloc_snprintf(tmp, size, needcomma ? ",%d-" : "%d-", begin);
+ } else {
+ res = hwloc_snprintf(tmp, size, needcomma ? ",%d-%d" : "%d-%d", begin, end-1);
+ }
+ if (res < 0) {
+ hwloc_bitmap_free(reverse);
+ return -1;
+ }
+ ret += res;
+
+ if (res >= size)
+ res = size>0 ? size - 1 : 0;
+
+ tmp += res;
+ size -= res;
+ needcomma = 1;
+
+ if (end == -1)
+ break;
+ else
+ prev = end - 1;
+ }
+
+ hwloc_bitmap_free(reverse);
+
+ return ret;
+}
+
+int hwloc_bitmap_list_asprintf(char ** strp, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+ int len;
+ char *buf;
+
+ HWLOC__BITMAP_CHECK(set);
+
+ len = hwloc_bitmap_list_snprintf(NULL, 0, set);
+ buf = malloc(len+1);
+ *strp = buf;
+ return hwloc_bitmap_list_snprintf(buf, len+1, set);
+}
+
+int hwloc_bitmap_list_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc_restrict string)
+{
+ const char * current = string;
+ char *next;
+ long begin = -1, val;
+
+ hwloc_bitmap_zero(set);
+
+ while (*current != '\0') {
+
+ /* ignore empty ranges */
+ while (*current == ',')
+ current++;
+
+ val = strtoul(current, &next, 0);
+ /* make sure we got at least one digit */
+ if (next == current)
+ goto failed;
+
+ if (begin != -1) {
+ /* finishing a range */
+ hwloc_bitmap_set_range(set, begin, val);
+ begin = -1;
+
+ } else if (*next == '-') {
+ /* starting a new range */
+ if (*(next+1) == '\0') {
+ /* infinite range */
+ hwloc_bitmap_set_range(set, val, -1);
+ break;
+ } else {
+ /* normal range */
+ begin = val;
+ }
+
+ } else if (*next == ',' || *next == '\0') {
+ /* single index */
+ hwloc_bitmap_set(set, val);
+ }
+
+ if (*next == '\0')
+ break;
+ current = next+1;
+ }
+
+ return 0;
+
+ failed:
+ /* failure to parse */
+ hwloc_bitmap_zero(set);
+ return -1;
+}
+
+int hwloc_bitmap_taskset_snprintf(char * __hwloc_restrict buf, size_t buflen, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+ ssize_t size = buflen;
+ char *tmp = buf;
+ int res, ret = 0;
+ int started = 0;
+ int i;
+
+ HWLOC__BITMAP_CHECK(set);
+
+ /* mark the end in case we do nothing later */
+ if (buflen > 0)
+ tmp[0] = '\0';
+
+ if (set->infinite) {
+ res = hwloc_snprintf(tmp, size, "0xf...f");
+ started = 1;
+ if (res < 0)
+ return -1;
+ ret += res;
+ if (res >= size)
+ res = size>0 ? size - 1 : 0;
+ tmp += res;
+ size -= res;
+ }
+
+ i=set->ulongs_count-1;
+
+ if (set->infinite) {
+ /* ignore starting FULL since we have 0xf...f already */
+ while (i>=0 && set->ulongs[i] == HWLOC_SUBBITMAP_FULL)
+ i--;
+ } else {
+ /* ignore starting ZERO except the last one */
+ while (i>=1 && set->ulongs[i] == HWLOC_SUBBITMAP_ZERO)
+ i--;
+ }
+
+ while (i>=0) {
+ unsigned long val = set->ulongs[i--];
+ if (started) {
+ /* print the whole subset */
+#if HWLOC_BITS_PER_LONG == 64
+ res = hwloc_snprintf(tmp, size, "%016lx", val);
+#else
+ res = hwloc_snprintf(tmp, size, "%08lx", val);
+#endif
+ } else if (val || i == -1) {
+ res = hwloc_snprintf(tmp, size, "0x%lx", val);
+ started = 1;
+ } else {
+ res = 0;
+ }
+ if (res < 0)
+ return -1;
+ ret += res;
+ if (res >= size)
+ res = size>0 ? size - 1 : 0;
+ tmp += res;
+ size -= res;
+ }
+
+ /* if we didn't display anything, display 0x0 */
+ if (!ret) {
+ res = hwloc_snprintf(tmp, size, "0x0");
+ if (res < 0)
+ return -1;
+ ret += res;
+ }
+
+ return ret;
+}
+
+int hwloc_bitmap_taskset_asprintf(char ** strp, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+ int len;
+ char *buf;
+
+ HWLOC__BITMAP_CHECK(set);
+
+ len = hwloc_bitmap_taskset_snprintf(NULL, 0, set);
+ buf = malloc(len+1);
+ *strp = buf;
+ return hwloc_bitmap_taskset_snprintf(buf, len+1, set);
+}
+
+int hwloc_bitmap_taskset_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc_restrict string)
+{
+ const char * current = string;
+ int chars;
+ int count;
+ int infinite = 0;
+
+ current = string;
+ if (!strncmp("0xf...f", current, 7)) {
+ /* infinite bitmap */
+ infinite = 1;
+ current += 7;
+ if (*current == '\0') {
+ /* special case for infinite/full bitmap */
+ hwloc_bitmap_fill(set);
+ return 0;
+ }
+ } else {
+ /* finite bitmap */
+ if (!strncmp("0x", current, 2))
+ current += 2;
+ if (*current == '\0') {
+ /* special case for empty bitmap */
+ hwloc_bitmap_zero(set);
+ return 0;
+ }
+ }
+ /* we know there are other characters now */
+
+ chars = strlen(current);
+ count = (chars * 4 + HWLOC_BITS_PER_LONG - 1) / HWLOC_BITS_PER_LONG;
+
+ hwloc_bitmap_reset_by_ulongs(set, count);
+ set->infinite = 0;
+
+ while (*current != '\0') {
+ int tmpchars;
+ char ustr[17];
+ unsigned long val;
+ char *next;
+
+ tmpchars = chars % (HWLOC_BITS_PER_LONG/4);
+ if (!tmpchars)
+ tmpchars = (HWLOC_BITS_PER_LONG/4);
+
+ memcpy(ustr, current, tmpchars);
+ ustr[tmpchars] = '\0';
+ val = strtoul(ustr, &next, 16);
+ if (*next != '\0')
+ goto failed;
+
+ set->ulongs[count-1] = val;
+
+ current += tmpchars;
+ chars -= tmpchars;
+ count--;
+ }
+
+ set->infinite = infinite; /* set at the end, to avoid spurious realloc with filled new ulongs */
+
+ return 0;
+
+ failed:
+ /* failure to parse */
+ hwloc_bitmap_zero(set);
+ return -1;
+}
+
+static void hwloc_bitmap__zero(struct hwloc_bitmap_s *set)
+{
+ unsigned i;
+ for(i=0; i<set->ulongs_count; i++)
+ set->ulongs[i] = HWLOC_SUBBITMAP_ZERO;
+ set->infinite = 0;
+}
+
+void hwloc_bitmap_zero(struct hwloc_bitmap_s * set)
+{
+ HWLOC__BITMAP_CHECK(set);
+
+ hwloc_bitmap_reset_by_ulongs(set, 1);
+ hwloc_bitmap__zero(set);
+}
+
+static void hwloc_bitmap__fill(struct hwloc_bitmap_s * set)
+{
+ unsigned i;
+ for(i=0; i<set->ulongs_count; i++)
+ set->ulongs[i] = HWLOC_SUBBITMAP_FULL;
+ set->infinite = 1;
+}
+
+void hwloc_bitmap_fill(struct hwloc_bitmap_s * set)
+{
+ HWLOC__BITMAP_CHECK(set);
+
+ hwloc_bitmap_reset_by_ulongs(set, 1);
+ hwloc_bitmap__fill(set);
+}
+
+void hwloc_bitmap_from_ulong(struct hwloc_bitmap_s *set, unsigned long mask)
+{
+ HWLOC__BITMAP_CHECK(set);
+
+ hwloc_bitmap_reset_by_ulongs(set, 1);
+ set->ulongs[0] = mask; /* there's always at least one ulong allocated */
+ set->infinite = 0;
+}
+
+void hwloc_bitmap_from_ith_ulong(struct hwloc_bitmap_s *set, unsigned i, unsigned long mask)
+{
+ unsigned j;
+
+ HWLOC__BITMAP_CHECK(set);
+
+ hwloc_bitmap_reset_by_ulongs(set, i+1);
+ set->ulongs[i] = mask;
+ for(j=0; j<i; j++)
+ set->ulongs[j] = HWLOC_SUBBITMAP_ZERO;
+ set->infinite = 0;
+}
+
+unsigned long hwloc_bitmap_to_ulong(const struct hwloc_bitmap_s *set)
+{
+ HWLOC__BITMAP_CHECK(set);
+
+ return set->ulongs[0]; /* there's always at least one ulong allocated */
+}
+
+unsigned long hwloc_bitmap_to_ith_ulong(const struct hwloc_bitmap_s *set, unsigned i)
+{
+ HWLOC__BITMAP_CHECK(set);
+
+ return HWLOC_SUBBITMAP_READULONG(set, i);
+}
+
+void hwloc_bitmap_only(struct hwloc_bitmap_s * set, unsigned cpu)
+{
+ unsigned index_ = HWLOC_SUBBITMAP_INDEX(cpu);
+
+ HWLOC__BITMAP_CHECK(set);
+
+ hwloc_bitmap_reset_by_cpu_index(set, cpu);
+ hwloc_bitmap__zero(set);
+ set->ulongs[index_] |= HWLOC_SUBBITMAP_CPU(cpu);
+}
+
+void hwloc_bitmap_allbut(struct hwloc_bitmap_s * set, unsigned cpu)
+{
+ unsigned index_ = HWLOC_SUBBITMAP_INDEX(cpu);
+
+ HWLOC__BITMAP_CHECK(set);
+
+ hwloc_bitmap_reset_by_cpu_index(set, cpu);
+ hwloc_bitmap__fill(set);
+ set->ulongs[index_] &= ~HWLOC_SUBBITMAP_CPU(cpu);
+}
+
+void hwloc_bitmap_set(struct hwloc_bitmap_s * set, unsigned cpu)
+{
+ unsigned index_ = HWLOC_SUBBITMAP_INDEX(cpu);
+
+ HWLOC__BITMAP_CHECK(set);
+
+ /* nothing to do if setting inside the infinite part of the bitmap */
+ if (set->infinite && cpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+ return;
+
+ hwloc_bitmap_realloc_by_cpu_index(set, cpu);
+ set->ulongs[index_] |= HWLOC_SUBBITMAP_CPU(cpu);
+}
+
+void hwloc_bitmap_set_range(struct hwloc_bitmap_s * set, unsigned begincpu, int _endcpu)
+{
+ unsigned i;
+ unsigned beginset,endset;
+ unsigned endcpu = (unsigned) _endcpu;
+
+ HWLOC__BITMAP_CHECK(set);
+
+ if (_endcpu == -1) {
+ set->infinite = 1;
+ /* keep endcpu == -1 since this unsigned is actually larger than anything else */
+ }
+
+ if (set->infinite) {
+ /* truncate the range according to the infinite part of the bitmap */
+ if (endcpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+ endcpu = set->ulongs_count * HWLOC_BITS_PER_LONG - 1;
+ if (begincpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+ return;
+ }
+ if (endcpu < begincpu)
+ return;
+ hwloc_bitmap_realloc_by_cpu_index(set, endcpu);
+
+ beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+ endset = HWLOC_SUBBITMAP_INDEX(endcpu);
+ for(i=beginset+1; i<endset; i++)
+ set->ulongs[i] = HWLOC_SUBBITMAP_FULL;
+ if (beginset == endset) {
+ set->ulongs[beginset] |= HWLOC_SUBBITMAP_ULBIT_FROMTO(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu), HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+ } else {
+ set->ulongs[beginset] |= HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+ set->ulongs[endset] |= HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+ }
+}
+
+void hwloc_bitmap_set_ith_ulong(struct hwloc_bitmap_s *set, unsigned i, unsigned long mask)
+{
+ HWLOC__BITMAP_CHECK(set);
+
+ hwloc_bitmap_realloc_by_ulongs(set, i+1);
+ set->ulongs[i] = mask;
+}
+
+void hwloc_bitmap_clr(struct hwloc_bitmap_s * set, unsigned cpu)
+{
+ unsigned index_ = HWLOC_SUBBITMAP_INDEX(cpu);
+
+ HWLOC__BITMAP_CHECK(set);
+
+ /* nothing to do if clearing inside the infinitely-unset part of the bitmap */
+ if (!set->infinite && cpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+ return;
+
+ hwloc_bitmap_realloc_by_cpu_index(set, cpu);
+ set->ulongs[index_] &= ~HWLOC_SUBBITMAP_CPU(cpu);
+}
+
+void hwloc_bitmap_clr_range(struct hwloc_bitmap_s * set, unsigned begincpu, int _endcpu)
+{
+ unsigned i;
+ unsigned beginset,endset;
+ unsigned endcpu = (unsigned) _endcpu;
+
+ HWLOC__BITMAP_CHECK(set);
+
+ if (_endcpu == -1) {
+ set->infinite = 0;
+ /* keep endcpu == -1 since this unsigned is actually larger than anything else */
+ }
+
+ if (!set->infinite) {
+ /* truncate the range according to the infinitely-unset part of the bitmap */
+ if (endcpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+ endcpu = set->ulongs_count * HWLOC_BITS_PER_LONG - 1;
+ if (begincpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+ return;
+ }
+ if (endcpu < begincpu)
+ return;
+ hwloc_bitmap_realloc_by_cpu_index(set, endcpu);
+
+ beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+ endset = HWLOC_SUBBITMAP_INDEX(endcpu);
+ for(i=beginset+1; i<endset; i++)
+ set->ulongs[i] = HWLOC_SUBBITMAP_ZERO;
+ if (beginset == endset) {
+ set->ulongs[beginset] &= ~HWLOC_SUBBITMAP_ULBIT_FROMTO(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu), HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+ } else {
+ set->ulongs[beginset] &= ~HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+ set->ulongs[endset] &= ~HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+ }
+}
+
+int hwloc_bitmap_isset(const struct hwloc_bitmap_s * set, unsigned cpu)
+{
+ unsigned index_ = HWLOC_SUBBITMAP_INDEX(cpu);
+
+ HWLOC__BITMAP_CHECK(set);
+
+ return (HWLOC_SUBBITMAP_READULONG(set, index_) & HWLOC_SUBBITMAP_CPU(cpu)) != 0;
+}
+
+int hwloc_bitmap_iszero(const struct hwloc_bitmap_s *set)
+{
+ unsigned i;
+
+ HWLOC__BITMAP_CHECK(set);
+
+ if (set->infinite)
+ return 0;
+ for(i=0; i<set->ulongs_count; i++)
+ if (set->ulongs[i] != HWLOC_SUBBITMAP_ZERO)
+ return 0;
+ return 1;
+}
+
+int hwloc_bitmap_isfull(const struct hwloc_bitmap_s *set)
+{
+ unsigned i;
+
+ HWLOC__BITMAP_CHECK(set);
+
+ if (!set->infinite)
+ return 0;
+ for(i=0; i<set->ulongs_count; i++)
+ if (set->ulongs[i] != HWLOC_SUBBITMAP_FULL)
+ return 0;
+ return 1;
+}
+
+int hwloc_bitmap_isequal (const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+ unsigned count1 = set1->ulongs_count;
+ unsigned count2 = set2->ulongs_count;
+ unsigned min_count = count1 < count2 ? count1 : count2;
+ unsigned i;
+
+ HWLOC__BITMAP_CHECK(set1);
+ HWLOC__BITMAP_CHECK(set2);
+
+ for(i=0; i<min_count; i++)
+ if (set1->ulongs[i] != set2->ulongs[i])
+ return 0;
+
+ if (count1 != count2) {
+ unsigned long w1 = set1->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+ unsigned long w2 = set2->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+ for(i=min_count; i<count1; i++) {
+ if (set1->ulongs[i] != w2)
+ return 0;
+ }
+ for(i=min_count; i<count2; i++) {
+ if (set2->ulongs[i] != w1)
+ return 0;
+ }
+ }
+
+ if (set1->infinite != set2->infinite)
+ return 0;
+
+ return 1;
+}
+
+int hwloc_bitmap_intersects (const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+ unsigned count1 = set1->ulongs_count;
+ unsigned count2 = set2->ulongs_count;
+ unsigned min_count = count1 < count2 ? count1 : count2;
+ unsigned i;
+
+ HWLOC__BITMAP_CHECK(set1);
+ HWLOC__BITMAP_CHECK(set2);
+
+ for(i=0; i<min_count; i++)
+ if (set1->ulongs[i] & set2->ulongs[i])
+ return 1;
+
+ if (count1 != count2) {
+ if (set2->infinite) {
+ for(i=min_count; i<set1->ulongs_count; i++)
+ if (set1->ulongs[i])
+ return 1;
+ }
+ if (set1->infinite) {
+ for(i=min_count; i<set2->ulongs_count; i++)
+ if (set2->ulongs[i])
+ return 1;
+ }
+ }
+
+ if (set1->infinite && set2->infinite)
+ return 1;
+
+ return 0;
+}
+
+int hwloc_bitmap_isincluded (const struct hwloc_bitmap_s *sub_set, const struct hwloc_bitmap_s *super_set)
+{
+ unsigned super_count = super_set->ulongs_count;
+ unsigned sub_count = sub_set->ulongs_count;
+ unsigned min_count = super_count < sub_count ? super_count : sub_count;
+ unsigned i;
+
+ HWLOC__BITMAP_CHECK(sub_set);
+ HWLOC__BITMAP_CHECK(super_set);
+
+ for(i=0; i<min_count; i++)
+ if (super_set->ulongs[i] != (super_set->ulongs[i] | sub_set->ulongs[i]))
+ return 0;
+
+ if (super_count != sub_count) {
+ if (!super_set->infinite)
+ for(i=min_count; i<sub_count; i++)
+ if (sub_set->ulongs[i])
+ return 0;
+ if (sub_set->infinite)
+ for(i=min_count; i<super_count; i++)
+ if (super_set->ulongs[i] != HWLOC_SUBBITMAP_FULL)
+ return 0;
+ }
+
+ if (sub_set->infinite && !super_set->infinite)
+ return 0;
+
+ return 1;
+}
+
+void hwloc_bitmap_or (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+ /* cache counts so that we can reset res even if it's also set1 or set2 */
+ unsigned count1 = set1->ulongs_count;
+ unsigned count2 = set2->ulongs_count;
+ unsigned max_count = count1 > count2 ? count1 : count2;
+ unsigned min_count = count1 + count2 - max_count;
+ unsigned i;
+
+ HWLOC__BITMAP_CHECK(res);
+ HWLOC__BITMAP_CHECK(set1);
+ HWLOC__BITMAP_CHECK(set2);
+
+ hwloc_bitmap_reset_by_ulongs(res, max_count);
+
+ for(i=0; i<min_count; i++)
+ res->ulongs[i] = set1->ulongs[i] | set2->ulongs[i];
+
+ if (count1 != count2) {
+ if (min_count < count1) {
+ if (set2->infinite) {
+ res->ulongs_count = min_count;
+ } else {
+ for(i=min_count; i<max_count; i++)
+ res->ulongs[i] = set1->ulongs[i];
+ }
+ } else {
+ if (set1->infinite) {
+ res->ulongs_count = min_count;
+ } else {
+ for(i=min_count; i<max_count; i++)
+ res->ulongs[i] = set2->ulongs[i];
+ }
+ }
+ }
+
+ res->infinite = set1->infinite || set2->infinite;
+}
+
+void hwloc_bitmap_and (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+ /* cache counts so that we can reset res even if it's also set1 or set2 */
+ unsigned count1 = set1->ulongs_count;
+ unsigned count2 = set2->ulongs_count;
+ unsigned max_count = count1 > count2 ? count1 : count2;
+ unsigned min_count = count1 + count2 - max_count;
+ unsigned i;
+
+ HWLOC__BITMAP_CHECK(res);
+ HWLOC__BITMAP_CHECK(set1);
+ HWLOC__BITMAP_CHECK(set2);
+
+ hwloc_bitmap_reset_by_ulongs(res, max_count);
+
+ for(i=0; i<min_count; i++)
+ res->ulongs[i] = set1->ulongs[i] & set2->ulongs[i];
+
+ if (count1 != count2) {
+ if (min_count < count1) {
+ if (set2->infinite) {
+ for(i=min_count; i<max_count; i++)
+ res->ulongs[i] = set1->ulongs[i];
+ } else {
+ res->ulongs_count = min_count;
+ }
+ } else {
+ if (set1->infinite) {
+ for(i=min_count; i<max_count; i++)
+ res->ulongs[i] = set2->ulongs[i];
+ } else {
+ res->ulongs_count = min_count;
+ }
+ }
+ }
+
+ res->infinite = set1->infinite && set2->infinite;
+}
+
+void hwloc_bitmap_andnot (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+ /* cache counts so that we can reset res even if it's also set1 or set2 */
+ unsigned count1 = set1->ulongs_count;
+ unsigned count2 = set2->ulongs_count;
+ unsigned max_count = count1 > count2 ? count1 : count2;
+ unsigned min_count = count1 + count2 - max_count;
+ unsigned i;
+
+ HWLOC__BITMAP_CHECK(res);
+ HWLOC__BITMAP_CHECK(set1);
+ HWLOC__BITMAP_CHECK(set2);
+
+ hwloc_bitmap_reset_by_ulongs(res, max_count);
+
+ for(i=0; i<min_count; i++)
+ res->ulongs[i] = set1->ulongs[i] & ~set2->ulongs[i];
+
+ if (count1 != count2) {
+ if (min_count < count1) {
+ if (!set2->infinite) {
+ for(i=min_count; i<max_count; i++)
+ res->ulongs[i] = set1->ulongs[i];
+ } else {
+ res->ulongs_count = min_count;
+ }
+ } else {
+ if (set1->infinite) {
+ for(i=min_count; i<max_count; i++)
+ res->ulongs[i] = ~set2->ulongs[i];
+ } else {
+ res->ulongs_count = min_count;
+ }
+ }
+ }
+
+ res->infinite = set1->infinite && !set2->infinite;
+}
+
+void hwloc_bitmap_xor (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+ /* cache counts so that we can reset res even if it's also set1 or set2 */
+ unsigned count1 = set1->ulongs_count;
+ unsigned count2 = set2->ulongs_count;
+ unsigned max_count = count1 > count2 ? count1 : count2;
+ unsigned min_count = count1 + count2 - max_count;
+ unsigned i;
+
+ HWLOC__BITMAP_CHECK(res);
+ HWLOC__BITMAP_CHECK(set1);
+ HWLOC__BITMAP_CHECK(set2);
+
+ hwloc_bitmap_reset_by_ulongs(res, max_count);
+
+ for(i=0; i<min_count; i++)
+ res->ulongs[i] = set1->ulongs[i] ^ set2->ulongs[i];
+
+ if (count1 != count2) {
+ if (min_count < count1) {
+ unsigned long w2 = set2->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+ for(i=min_count; i<max_count; i++)
+ res->ulongs[i] = set1->ulongs[i] ^ w2;
+ } else {
+ unsigned long w1 = set1->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+ for(i=min_count; i<max_count; i++)
+ res->ulongs[i] = set2->ulongs[i] ^ w1;
+ }
+ }
+
+ res->infinite = (!set1->infinite) != (!set2->infinite);
+}
+
+void hwloc_bitmap_not (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set)
+{
+ unsigned count = set->ulongs_count;
+ unsigned i;
+
+ HWLOC__BITMAP_CHECK(res);
+ HWLOC__BITMAP_CHECK(set);
+
+ hwloc_bitmap_reset_by_ulongs(res, count);
+
+ for(i=0; i<count; i++)
+ res->ulongs[i] = ~set->ulongs[i];
+
+ res->infinite = !set->infinite;
+}
+
+int hwloc_bitmap_first(const struct hwloc_bitmap_s * set)
+{
+ unsigned i;
+
+ HWLOC__BITMAP_CHECK(set);
+
+ for(i=0; i<set->ulongs_count; i++) {
+ /* subsets are unsigned longs, use ffsl */
+ unsigned long w = set->ulongs[i];
+ if (w)
+ return hwloc_ffsl(w) - 1 + HWLOC_BITS_PER_LONG*i;
+ }
+
+ if (set->infinite)
+ return set->ulongs_count * HWLOC_BITS_PER_LONG;
+
+ return -1;
+}
+
+int hwloc_bitmap_last(const struct hwloc_bitmap_s * set)
+{
+ int i;
+
+ HWLOC__BITMAP_CHECK(set);
+
+ if (set->infinite)
+ return -1;
+
+ for(i=set->ulongs_count-1; i>=0; i--) {
+ /* subsets are unsigned longs, use flsl */
+ unsigned long w = set->ulongs[i];
+ if (w)
+ return hwloc_flsl(w) - 1 + HWLOC_BITS_PER_LONG*i;
+ }
+
+ return -1;
+}
+
+int hwloc_bitmap_next(const struct hwloc_bitmap_s * set, int prev_cpu)
+{
+ unsigned i = HWLOC_SUBBITMAP_INDEX(prev_cpu + 1);
+
+ HWLOC__BITMAP_CHECK(set);
+
+ if (i >= set->ulongs_count) {
+ if (set->infinite)
+ return prev_cpu + 1;
+ else
+ return -1;
+ }
+
+ for(; i<set->ulongs_count; i++) {
+ /* subsets are unsigned longs, use ffsl */
+ unsigned long w = set->ulongs[i];
+
+ /* if the prev cpu is in the same word as the possible next one,
+ we need to mask out previous cpus */
+ if (prev_cpu >= 0 && HWLOC_SUBBITMAP_INDEX((unsigned) prev_cpu) == i)
+ w &= ~HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(prev_cpu));
+
+ if (w)
+ return hwloc_ffsl(w) - 1 + HWLOC_BITS_PER_LONG*i;
+ }
+
+ if (set->infinite)
+ return set->ulongs_count * HWLOC_BITS_PER_LONG;
+
+ return -1;
+}
+
+void hwloc_bitmap_singlify(struct hwloc_bitmap_s * set)
+{
+ unsigned i;
+ int found = 0;
+
+ HWLOC__BITMAP_CHECK(set);
+
+ for(i=0; i<set->ulongs_count; i++) {
+ if (found) {
+ set->ulongs[i] = HWLOC_SUBBITMAP_ZERO;
+ continue;
+ } else {
+ /* subsets are unsigned longs, use ffsl */
+ unsigned long w = set->ulongs[i];
+ if (w) {
+ int _ffs = hwloc_ffsl(w);
+ set->ulongs[i] = HWLOC_SUBBITMAP_CPU(_ffs-1);
+ found = 1;
+ }
+ }
+ }
+
+ if (set->infinite) {
+ if (found) {
+ set->infinite = 0;
+ } else {
+ /* set the first non allocated bit */
+ unsigned first = set->ulongs_count * HWLOC_BITS_PER_LONG;
+ set->infinite = 0; /* do not let realloc fill the newly allocated sets */
+ hwloc_bitmap_set(set, first);
+ }
+ }
+}
+
+int hwloc_bitmap_compare_first(const struct hwloc_bitmap_s * set1, const struct hwloc_bitmap_s * set2)
+{
+ unsigned count1 = set1->ulongs_count;
+ unsigned count2 = set2->ulongs_count;
+ unsigned max_count = count1 > count2 ? count1 : count2;
+ unsigned min_count = count1 + count2 - max_count;
+ unsigned i;
+
+ HWLOC__BITMAP_CHECK(set1);
+ HWLOC__BITMAP_CHECK(set2);
+
+ for(i=0; i<min_count; i++) {
+ unsigned long w1 = set1->ulongs[i];
+ unsigned long w2 = set2->ulongs[i];
+ if (w1 || w2) {
+ int _ffs1 = hwloc_ffsl(w1);
+ int _ffs2 = hwloc_ffsl(w2);
+ /* if both have a bit set, compare for real */
+ if (_ffs1 && _ffs2)
+ return _ffs1-_ffs2;
+ /* one is empty, and it is considered higher, so reverse-compare them */
+ return _ffs2-_ffs1;
+ }
+ }
+
+ if (count1 != count2) {
+ if (min_count < count2) {
+ for(i=min_count; i<count2; i++) {
+ unsigned long w2 = set2->ulongs[i];
+ if (set1->infinite)
+ return -!(w2 & 1);
+ else if (w2)
+ return 1;
+ }
+ } else {
+ for(i=min_count; i<count1; i++) {
+ unsigned long w1 = set1->ulongs[i];
+ if (set2->infinite)
+ return !(w1 & 1);
+ else if (w1)
+ return -1;
+ }
+ }
+ }
+
+ return !!set1->infinite - !!set2->infinite;
+}
+
+int hwloc_bitmap_compare(const struct hwloc_bitmap_s * set1, const struct hwloc_bitmap_s * set2)
+{
+ unsigned count1 = set1->ulongs_count;
+ unsigned count2 = set2->ulongs_count;
+ unsigned max_count = count1 > count2 ? count1 : count2;
+ unsigned min_count = count1 + count2 - max_count;
+ int i;
+
+ HWLOC__BITMAP_CHECK(set1);
+ HWLOC__BITMAP_CHECK(set2);
+
+ if ((!set1->infinite) != (!set2->infinite))
+ return !!set1->infinite - !!set2->infinite;
+
+ if (count1 != count2) {
+ if (min_count < count2) {
+ unsigned long val1 = set1->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+ for(i=max_count-1; i>=(signed) min_count; i--) {
+ unsigned long val2 = set2->ulongs[i];
+ if (val1 == val2)
+ continue;
+ return val1 < val2 ? -1 : 1;
+ }
+ } else {
+ unsigned long val2 = set2->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+ for(i=max_count-1; i>=(signed) min_count; i--) {
+ unsigned long val1 = set1->ulongs[i];
+ if (val1 == val2)
+ continue;
+ return val1 < val2 ? -1 : 1;
+ }
+ }
+ }
+
+ for(i=min_count-1; i>=0; i--) {
+ unsigned long val1 = set1->ulongs[i];
+ unsigned long val2 = set2->ulongs[i];
+ if (val1 == val2)
+ continue;
+ return val1 < val2 ? -1 : 1;
+ }
+
+ return 0;
+}
+
+int hwloc_bitmap_weight(const struct hwloc_bitmap_s * set)
+{
+ int weight = 0;
+ unsigned i;
+
+ HWLOC__BITMAP_CHECK(set);
+
+ if (set->infinite)
+ return -1;
+
+ for(i=0; i<set->ulongs_count; i++)
+ weight += hwloc_weight_long(set->ulongs[i]);
+ return weight;
+}
+
+int hwloc_bitmap_compare_inclusion(const struct hwloc_bitmap_s * set1, const struct hwloc_bitmap_s * set2)
+{
+ unsigned max_count = set1->ulongs_count > set2->ulongs_count ? set1->ulongs_count : set2->ulongs_count;
+ int result = HWLOC_BITMAP_EQUAL; /* means empty sets return equal */
+ int empty1 = 1;
+ int empty2 = 1;
+ unsigned i;
+
+ HWLOC__BITMAP_CHECK(set1);
+ HWLOC__BITMAP_CHECK(set2);
+
+ for(i=0; i<max_count; i++) {
+ unsigned long val1 = HWLOC_SUBBITMAP_READULONG(set1, (unsigned) i);
+ unsigned long val2 = HWLOC_SUBBITMAP_READULONG(set2, (unsigned) i);
+
+ if (!val1) {
+ if (!val2)
+ /* both empty, no change */
+ continue;
+
+ /* val1 empty, val2 not */
+ if (result == HWLOC_BITMAP_CONTAINS) {
+ if (!empty2)
+ return HWLOC_BITMAP_INTERSECTS;
+ result = HWLOC_BITMAP_DIFFERENT;
+ } else if (result == HWLOC_BITMAP_EQUAL) {
+ result = HWLOC_BITMAP_INCLUDED;
+ }
+ /* no change otherwise */
+
+ } else if (!val2) {
+ /* val2 empty, val1 not */
+ if (result == HWLOC_BITMAP_INCLUDED) {
+ if (!empty1)
+ return HWLOC_BITMAP_INTERSECTS;
+ result = HWLOC_BITMAP_DIFFERENT;
+ } else if (result == HWLOC_BITMAP_EQUAL) {
+ result = HWLOC_BITMAP_CONTAINS;
+ }
+ /* no change otherwise */
+
+ } else if (val1 == val2) {
+ /* equal and not empty */
+ if (result == HWLOC_BITMAP_DIFFERENT)
+ return HWLOC_BITMAP_INTERSECTS;
+ /* equal/contains/included unchanged */
+
+ } else if ((val1 & val2) == val1) {
+ /* included and not empty */
+ if (result == HWLOC_BITMAP_CONTAINS || result == HWLOC_BITMAP_DIFFERENT)
+ return HWLOC_BITMAP_INTERSECTS;
+ /* equal/included unchanged */
+ result = HWLOC_BITMAP_INCLUDED;
+
+ } else if ((val1 & val2) == val2) {
+ /* contains and not empty */
+ if (result == HWLOC_BITMAP_INCLUDED || result == HWLOC_BITMAP_DIFFERENT)
+ return HWLOC_BITMAP_INTERSECTS;
+ /* equal/contains unchanged */
+ result = HWLOC_BITMAP_CONTAINS;
+
+ } else if ((val1 & val2) != 0) {
+ /* intersects and not empty */
+ return HWLOC_BITMAP_INTERSECTS;
+
+ } else {
+ /* different and not empty */
+
+ /* equal/included/contains with non-empty sets means intersects */
+ if (result == HWLOC_BITMAP_EQUAL && !empty1 /* implies !empty2 */)
+ return HWLOC_BITMAP_INTERSECTS;
+ if (result == HWLOC_BITMAP_INCLUDED && !empty1)
+ return HWLOC_BITMAP_INTERSECTS;
+ if (result == HWLOC_BITMAP_CONTAINS && !empty2)
+ return HWLOC_BITMAP_INTERSECTS;
+ /* otherwise means different */
+ result = HWLOC_BITMAP_DIFFERENT;
+ }
+
+ empty1 &= !val1;
+ empty2 &= !val2;
+ }
+
+ if (!set1->infinite) {
+ if (set2->infinite) {
+ /* set2 infinite only */
+ if (result == HWLOC_BITMAP_CONTAINS) {
+ if (!empty2)
+ return HWLOC_BITMAP_INTERSECTS;
+ result = HWLOC_BITMAP_DIFFERENT;
+ } else if (result == HWLOC_BITMAP_EQUAL) {
+ result = HWLOC_BITMAP_INCLUDED;
+ }
+ /* no change otherwise */
+ }
+ } else if (!set2->infinite) {
+ /* set1 infinite only */
+ if (result == HWLOC_BITMAP_INCLUDED) {
+ if (!empty1)
+ return HWLOC_BITMAP_INTERSECTS;
+ result = HWLOC_BITMAP_DIFFERENT;
+ } else if (result == HWLOC_BITMAP_EQUAL) {
+ result = HWLOC_BITMAP_CONTAINS;
+ }
+ /* no change otherwise */
+ } else {
+ /* both infinite */
+ if (result == HWLOC_BITMAP_DIFFERENT)
+ return HWLOC_BITMAP_INTERSECTS;
+ /* equal/contains/included unchanged */
+ }
+
+ return result;
+}
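
The string routines above come in three families (hex with comma-separated 32-bit substrings, taskset-style single hex word, and comma/range lists), each with a matching parser. Below is a minimal usage sketch through the public hwloc API, assuming the hwloc.h shipped in this import is on the include path; the output strings shown in the comments are illustrative expectations, not guaranteed verbatim:

    #include <hwloc.h>
    #include <stdio.h>

    int main(void)
    {
        char buf[128];
        hwloc_bitmap_t set = hwloc_bitmap_alloc();

        hwloc_bitmap_set_range(set, 0, 3);   /* bits 0-3 */
        hwloc_bitmap_set(set, 8);            /* plus bit 8 */

        hwloc_bitmap_snprintf(buf, sizeof(buf), set);         /* e.g. "0x0000010f" */
        printf("hex:     %s\n", buf);
        hwloc_bitmap_list_snprintf(buf, sizeof(buf), set);    /* e.g. "0-3,8" */
        printf("list:    %s\n", buf);
        hwloc_bitmap_taskset_snprintf(buf, sizeof(buf), set); /* e.g. "0x10f" */
        printf("taskset: %s\n", buf);

        /* each format round-trips through its matching parser */
        hwloc_bitmap_taskset_sscanf(set, buf);

        hwloc_bitmap_free(set);
        return 0;
    }
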
diff --git a/ext/hwloc/hwloc/components.c b/ext/hwloc/hwloc/components.c
new file mode 100644
index 0000000..7aa3b9d
--- /dev/null
+++ b/ext/hwloc/hwloc/components.c
@@ -0,0 +1,792 @@
+/*
+ * Copyright © 2009-2015 Inria. All rights reserved.
+ * Copyright © 2012 Université Bordeaux 1
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/xml.h>
+
+#define HWLOC_COMPONENT_STOP_NAME "stop"
+#define HWLOC_COMPONENT_EXCLUDE_CHAR '-'
+#define HWLOC_COMPONENT_SEPS ","
+
+/* list of all registered discovery components, sorted by priority, higher priority first.
+ * noos is last because its priority is 0.
+ * others' priority is 10.
+ */
+static struct hwloc_disc_component * hwloc_disc_components = NULL;
+
+static unsigned hwloc_components_users = 0; /* first one initializes, last one destroys */
+
+static int hwloc_components_verbose = 0;
+#ifdef HWLOC_HAVE_PLUGINS
+static int hwloc_plugins_verbose = 0;
+#endif
+
+/* hwloc_components_mutex serializes:
+ * - loading/unloading plugins, and modifications of the hwloc_plugins list
+ * - calls to ltdl, including in hwloc_check_plugin_namespace()
+ * - registration of components with hwloc_disc_component_register()
+ * and hwloc_xml_callbacks_register()
+ */
+#ifdef HWLOC_WIN_SYS
+/* Basic mutex on top of InterlockedCompareExchange() on windows,
+ * Far from perfect, but easy to maintain, and more than enough given that this code will never be needed for real. */
+#include <windows.h>
+static LONG hwloc_components_mutex = 0;
+#define HWLOC_COMPONENTS_LOCK() do { \
+ while (InterlockedCompareExchange(&hwloc_components_mutex, 1, 0) != 0) \
+ SwitchToThread(); \
+} while (0)
+#define HWLOC_COMPONENTS_UNLOCK() do { \
+ assert(hwloc_components_mutex == 1); \
+ hwloc_components_mutex = 0; \
+} while (0)
+
+#elif defined HWLOC_HAVE_PTHREAD_MUTEX
+/* pthread mutex if available (except on windows) */
+#include <pthread.h>
+static pthread_mutex_t hwloc_components_mutex = PTHREAD_MUTEX_INITIALIZER;
+#define HWLOC_COMPONENTS_LOCK() pthread_mutex_lock(&hwloc_components_mutex)
+#define HWLOC_COMPONENTS_UNLOCK() pthread_mutex_unlock(&hwloc_components_mutex)
+
+#else /* HWLOC_WIN_SYS || HWLOC_HAVE_PTHREAD_MUTEX */
+#error No mutex implementation available
+#endif
+
+
+#ifdef HWLOC_HAVE_PLUGINS
+
+#include <ltdl.h>
+
+/* linked list of descriptors for dynamically loaded plugins */
+static struct hwloc__plugin_desc {
+ char *name;
+ struct hwloc_component *component;
+ char *filename;
+ lt_dlhandle handle;
+ struct hwloc__plugin_desc *next;
+} *hwloc_plugins = NULL;
+
+static int
+hwloc__dlforeach_cb(const char *filename, void *_data __hwloc_attribute_unused)
+{
+ const char *basename;
+ lt_dlhandle handle;
+ char *componentsymbolname = NULL;
+ struct hwloc_component *component;
+ struct hwloc__plugin_desc *desc, **prevdesc;
+
+ if (hwloc_plugins_verbose)
+ fprintf(stderr, "Plugin dlforeach found `%s'\n", filename);
+
+ basename = strrchr(filename, '/');
+ if (!basename)
+ basename = filename;
+ else
+ basename++;
+
+ /* dlopen and get the component structure */
+ handle = lt_dlopenext(filename);
+ if (!handle) {
+ if (hwloc_plugins_verbose)
+ fprintf(stderr, "Failed to load plugin: %s\n", lt_dlerror());
+ goto out;
+ }
+ componentsymbolname = malloc(strlen(basename)+10+1);
+ sprintf(componentsymbolname, "%s_component", basename);
+ component = lt_dlsym(handle, componentsymbolname);
+ if (!component) {
+ if (hwloc_plugins_verbose)
+ fprintf(stderr, "Failed to find component symbol `%s'\n",
+ componentsymbolname);
+ goto out_with_handle;
+ }
+ if (component->abi != HWLOC_COMPONENT_ABI) {
+ if (hwloc_plugins_verbose)
+ fprintf(stderr, "Plugin symbol ABI %u instead of %u\n",
+ component->abi, HWLOC_COMPONENT_ABI);
+ goto out_with_handle;
+ }
+ if (hwloc_plugins_verbose)
+ fprintf(stderr, "Plugin contains expected symbol `%s'\n",
+ componentsymbolname);
+ free(componentsymbolname);
+ componentsymbolname = NULL;
+
+ if (HWLOC_COMPONENT_TYPE_DISC == component->type) {
+ if (strncmp(basename, "hwloc_", 6)) {
+ if (hwloc_plugins_verbose)
+ fprintf(stderr, "Plugin name `%s' doesn't match its type DISCOVERY\n", basename);
+ goto out_with_handle;
+ }
+ } else if (HWLOC_COMPONENT_TYPE_XML == component->type) {
+ if (strncmp(basename, "hwloc_xml_", 10)) {
+ if (hwloc_plugins_verbose)
+ fprintf(stderr, "Plugin name `%s' doesn't match its type XML\n", basename);
+ goto out_with_handle;
+ }
+ } else {
+ if (hwloc_plugins_verbose)
+ fprintf(stderr, "Plugin name `%s' has invalid type %u\n",
+ basename, (unsigned) component->type);
+ goto out_with_handle;
+ }
+
+ /* allocate a plugin_desc and queue it */
+ desc = malloc(sizeof(*desc));
+ if (!desc)
+ goto out_with_handle;
+ desc->name = strdup(basename);
+ desc->filename = strdup(filename);
+ desc->component = component;
+ desc->handle = handle;
+ desc->next = NULL;
+ if (hwloc_plugins_verbose)
+ fprintf(stderr, "Plugin descriptor `%s' ready\n", basename);
+
+ /* append to the list */
+ prevdesc = &hwloc_plugins;
+ while (*prevdesc)
+ prevdesc = &((*prevdesc)->next);
+ *prevdesc = desc;
+ if (hwloc_plugins_verbose)
+ fprintf(stderr, "Plugin descriptor `%s' queued\n", basename);
+ return 0;
+
+ out_with_handle:
+ lt_dlclose(handle);
+ free(componentsymbolname); /* NULL if already freed */
+ out:
+ return 0;
+}
+
+static void
+hwloc_plugins_exit(void)
+{
+ struct hwloc__plugin_desc *desc, *next;
+
+ if (hwloc_plugins_verbose)
+ fprintf(stderr, "Closing all plugins\n");
+
+ desc = hwloc_plugins;
+ while (desc) {
+ next = desc->next;
+ lt_dlclose(desc->handle);
+ free(desc->name);
+ free(desc->filename);
+ free(desc);
+ desc = next;
+ }
+ hwloc_plugins = NULL;
+
+ lt_dlexit();
+}
+
+static int
+hwloc_plugins_init(void)
+{
+ const char *verboseenv;
+ char *path = HWLOC_PLUGINS_PATH;
+ const char *env;
+ int err;
+
+ verboseenv = getenv("HWLOC_PLUGINS_VERBOSE");
+ hwloc_plugins_verbose = verboseenv ? atoi(verboseenv) : 0;
+
+ err = lt_dlinit();
+ if (err)
+ goto out;
+
+ env = getenv("HWLOC_PLUGINS_PATH");
+ if (env)
+ path = env;
+
+ hwloc_plugins = NULL;
+
+ if (hwloc_plugins_verbose)
+ fprintf(stderr, "Starting plugin dlforeach in %s\n", path);
+ err = lt_dlforeachfile(path, hwloc__dlforeach_cb, NULL);
+ if (err)
+ goto out_with_init;
+
+ return 0;
+
+ out_with_init:
+ hwloc_plugins_exit();
+ out:
+ return -1;
+}
+
+#endif /* HWLOC_HAVE_PLUGINS */
+
+static const char *
+hwloc_disc_component_type_string(hwloc_disc_component_type_t type)
+{
+ switch (type) {
+ case HWLOC_DISC_COMPONENT_TYPE_CPU: return "cpu";
+ case HWLOC_DISC_COMPONENT_TYPE_GLOBAL: return "global";
+ case HWLOC_DISC_COMPONENT_TYPE_MISC: return "misc";
+ default: return "**unknown**";
+ }
+}
+
+static int
+hwloc_disc_component_register(struct hwloc_disc_component *component,
+ const char *filename)
+{
+ struct hwloc_disc_component **prev;
+
+ /* check that the component name is valid */
+ if (!strcmp(component->name, HWLOC_COMPONENT_STOP_NAME)) {
+ if (hwloc_components_verbose)
+ fprintf(stderr, "Cannot register discovery component with reserved name `" HWLOC_COMPONENT_STOP_NAME "'\n");
+ return -1;
+ }
+ if (strchr(component->name, HWLOC_COMPONENT_EXCLUDE_CHAR)
+ || strcspn(component->name, HWLOC_COMPONENT_SEPS) != strlen(component->name)) {
+ if (hwloc_components_verbose)
+ fprintf(stderr, "Cannot register discovery component with name `%s' containing reserved characters `%c" HWLOC_COMPONENT_SEPS "'\n",
+ component->name, HWLOC_COMPONENT_EXCLUDE_CHAR);
+ return -1;
+ }
+ /* check that the component type is valid */
+ switch ((unsigned) component->type) {
+ case HWLOC_DISC_COMPONENT_TYPE_CPU:
+ case HWLOC_DISC_COMPONENT_TYPE_GLOBAL:
+ case HWLOC_DISC_COMPONENT_TYPE_MISC:
+ break;
+ default:
+ fprintf(stderr, "Cannot register discovery component `%s' with unknown type %u\n",
+ component->name, (unsigned) component->type);
+ return -1;
+ }
+
+ prev = &hwloc_disc_components;
+ while (NULL != *prev) {
+ if (!strcmp((*prev)->name, component->name)) {
+ /* if two components have the same name, only keep the highest priority one */
+ if ((*prev)->priority < component->priority) {
+ /* drop the existing component */
+ if (hwloc_components_verbose)
+ fprintf(stderr, "Dropping previously registered discovery component `%s', priority %u lower than new one %u\n",
+ (*prev)->name, (*prev)->priority, component->priority);
+ *prev = (*prev)->next;
+ } else {
+ /* drop the new one */
+ if (hwloc_components_verbose)
+ fprintf(stderr, "Ignoring new discovery component `%s', priority %u lower than previously registered one %u\n",
+ component->name, component->priority, (*prev)->priority);
+ return -1;
+ }
+ }
+ prev = &((*prev)->next);
+ }
+ if (hwloc_components_verbose)
+ fprintf(stderr, "Registered %s discovery component `%s' with priority %u (%s%s)\n",
+ hwloc_disc_component_type_string(component->type), component->name, component->priority,
+ filename ? "from plugin " : "statically build", filename ? filename : "");
+
+ prev = &hwloc_disc_components;
+ while (NULL != *prev) {
+ if ((*prev)->priority < component->priority)
+ break;
+ prev = &((*prev)->next);
+ }
+ component->next = *prev;
+ *prev = component;
+ return 0;
+}
+
+#include <static-components.h>
+
+static void (**hwloc_component_finalize_cbs)(unsigned long);
+static unsigned hwloc_component_finalize_cb_count;
+
+void
+hwloc_components_init(struct hwloc_topology *topology __hwloc_attribute_unused)
+{
+#ifdef HWLOC_HAVE_PLUGINS
+ struct hwloc__plugin_desc *desc;
+#endif
+ const char *verboseenv;
+ unsigned i;
+
+ HWLOC_COMPONENTS_LOCK();
+ assert((unsigned) -1 != hwloc_components_users);
+ if (0 != hwloc_components_users++) {
+ HWLOC_COMPONENTS_UNLOCK();
+ goto ok;
+ }
+
+ verboseenv = getenv("HWLOC_COMPONENTS_VERBOSE");
+ hwloc_components_verbose = verboseenv ? atoi(verboseenv) : 0;
+
+#ifdef HWLOC_HAVE_PLUGINS
+ hwloc_plugins_init();
+#endif
+
+ hwloc_component_finalize_cbs = NULL;
+ hwloc_component_finalize_cb_count = 0;
+ /* count the max number of finalize callbacks */
+ for(i=0; NULL != hwloc_static_components[i]; i++)
+ hwloc_component_finalize_cb_count++;
+#ifdef HWLOC_HAVE_PLUGINS
+ for(desc = hwloc_plugins; NULL != desc; desc = desc->next)
+ hwloc_component_finalize_cb_count++;
+#endif
+ if (hwloc_component_finalize_cb_count) {
+ hwloc_component_finalize_cbs = calloc(hwloc_component_finalize_cb_count,
+ sizeof(*hwloc_component_finalize_cbs));
+ assert(hwloc_component_finalize_cbs);
+ /* forget that max number and recompute the real one below */
+ hwloc_component_finalize_cb_count = 0;
+ }
+
+ /* hwloc_static_components is created by configure in static-components.h */
+ for(i=0; NULL != hwloc_static_components[i]; i++) {
+ if (hwloc_static_components[i]->flags) {
+ fprintf(stderr, "Ignoring static component with invalid flags %lx\n",
+ hwloc_static_components[i]->flags);
+ continue;
+ }
+
+ /* initialize the component */
+ if (hwloc_static_components[i]->init && hwloc_static_components[i]->init(0) < 0) {
+ if (hwloc_components_verbose)
+ fprintf(stderr, "Ignoring static component, failed to initialize\n");
+ continue;
+ }
+ /* queue ->finalize() callback if any */
+ if (hwloc_static_components[i]->finalize)
+ hwloc_component_finalize_cbs[hwloc_component_finalize_cb_count++] = hwloc_static_components[i]->finalize;
+
+ /* register for real now */
+ if (HWLOC_COMPONENT_TYPE_DISC == hwloc_static_components[i]->type)
+ hwloc_disc_component_register(hwloc_static_components[i]->data, NULL);
+ /*else if (HWLOC_COMPONENT_TYPE_XML == hwloc_static_components[i]->type)
+ hwloc_xml_callbacks_register(hwloc_static_components[i]->data);*/
+ else
+ assert(0);
+ }
+
+ /* dynamic plugins */
+#ifdef HWLOC_HAVE_PLUGINS
+ for(desc = hwloc_plugins; NULL != desc; desc = desc->next) {
+ if (desc->component->flags) {
+ fprintf(stderr, "Ignoring plugin `%s' component with invalid flags %lx\n",
+ desc->name, desc->component->flags);
+ continue;
+ }
+
+ /* initialize the component */
+ if (desc->component->init && desc->component->init(0) < 0) {
+ if (hwloc_components_verbose)
+ fprintf(stderr, "Ignoring plugin `%s', failed to initialize\n", desc->name);
+ continue;
+ }
+ /* queue ->finalize() callback if any */
+ if (desc->component->finalize)
+ hwloc_component_finalize_cbs[hwloc_component_finalize_cb_count++] = desc->component->finalize;
+
+ /* register for real now */
+ if (HWLOC_COMPONENT_TYPE_DISC == desc->component->type)
+ hwloc_disc_component_register(desc->component->data, desc->filename);
+ /*else if (HWLOC_COMPONENT_TYPE_XML == desc->component->type)
+ hwloc_xml_callbacks_register(desc->component->data);*/
+ else
+ assert(0);
+ }
+#endif
+
+ HWLOC_COMPONENTS_UNLOCK();
+
+ ok:
+ topology->backends = NULL;
+}
+
+static struct hwloc_disc_component *
+hwloc_disc_component_find(int type /* hwloc_disc_component_type_t or -1 if any */,
+ const char *name /* name or NULL if any */)
+{
+ struct hwloc_disc_component *comp = hwloc_disc_components;
+ while (NULL != comp) {
+ if ((-1 == type || type == (int) comp->type)
+ && (NULL == name || !strcmp(name, comp->name)))
+ return comp;
+ comp = comp->next;
+ }
+ return NULL;
+}
+
+/* used by set_xml(), set_synthetic(), ... environment variables, ... to force the first backend */
+int
+hwloc_disc_component_force_enable(struct hwloc_topology *topology,
+ int envvar_forced,
+ int type, const char *name,
+ const void *data1, const void *data2, const void *data3)
+{
+ struct hwloc_disc_component *comp;
+ struct hwloc_backend *backend;
+
+ if (topology->is_loaded) {
+ errno = EBUSY;
+ return -1;
+ }
+
+ comp = hwloc_disc_component_find(type, name);
+ if (!comp) {
+ errno = ENOSYS;
+ return -1;
+ }
+
+ backend = comp->instantiate(comp, data1, data2, data3);
+ if (backend) {
+ backend->envvar_forced = envvar_forced;
+ if (topology->backends)
+ hwloc_backends_disable_all(topology);
+ return hwloc_backend_enable(topology, backend);
+ } else
+ return -1;
+}
+
+static int
+hwloc_disc_component_try_enable(struct hwloc_topology *topology,
+ struct hwloc_disc_component *comp,
+ const char *comparg,
+ unsigned *excludes,
+ int envvar_forced,
+ int verbose_errors)
+{
+ struct hwloc_backend *backend;
+ int err;
+
+ if ((*excludes) & comp->type) {
+ if (hwloc_components_verbose || verbose_errors)
+ fprintf(stderr, "Excluding %s discovery component `%s', conflicts with excludes 0x%x\n",
+ hwloc_disc_component_type_string(comp->type), comp->name, *excludes);
+ return -1;
+ }
+
+ backend = comp->instantiate(comp, comparg, NULL, NULL);
+ if (!backend) {
+ if (hwloc_components_verbose || verbose_errors)
+ fprintf(stderr, "Failed to instantiate discovery component `%s'\n", comp->name);
+ return -1;
+ }
+
+ backend->envvar_forced = envvar_forced;
+ err = hwloc_backend_enable(topology, backend);
+ if (err < 0)
+ return -1;
+
+ *excludes |= comp->excludes;
+
+ return 0;
+}
+
+void
+hwloc_disc_components_enable_others(struct hwloc_topology *topology)
+{
+ struct hwloc_disc_component *comp;
+ struct hwloc_backend *backend;
+ unsigned excludes = 0;
+ int tryall = 1;
+ const char *_env;
+ char *env; /* we'll modify the env value, so duplicate it */
+
+ _env = getenv("HWLOC_COMPONENTS");
+ env = _env ? strdup(_env) : NULL;
+
+ /* compute current excludes */
+ backend = topology->backends;
+ while (backend) {
+ excludes |= backend->component->excludes;
+ backend = backend->next;
+ }
+
+ /* enable explicitly listed components */
+ if (env) {
+ char *curenv = env;
+ size_t s;
+
+ if (topology->backends) {
+ hwloc_backends_disable_all(topology);
+ excludes = 0;
+ }
+
+ while (*curenv) {
+ s = strcspn(curenv, HWLOC_COMPONENT_SEPS);
+ if (s) {
+ char c;
+
+ if (curenv[0] == HWLOC_COMPONENT_EXCLUDE_CHAR)
+ goto nextname;
+
+ if (!strncmp(curenv, HWLOC_COMPONENT_STOP_NAME, s)) {
+ tryall = 0;
+ break;
+ }
+
+ /* save the last char and replace with \0 */
+ c = curenv[s];
+ curenv[s] = '\0';
+
+ comp = hwloc_disc_component_find(-1, curenv);
+ if (comp) {
+ hwloc_disc_component_try_enable(topology, comp, NULL, &excludes, 1 /* envvar forced */, 1 /* envvar forced need warnings */);
+ } else {
+ fprintf(stderr, "Cannot find discovery component `%s'\n", curenv);
+ }
+
+ /* restore chars (the second loop below needs env to be unmodified) */
+ curenv[s] = c;
+ }
+
+nextname:
+ curenv += s;
+ if (*curenv)
+ /* Skip comma */
+ curenv++;
+ }
+ }
+
+ /* env is still the same, the above loop didn't modify it */
+
+ /* now enable remaining components (except the explicitly '-'-listed ones) */
+ if (tryall) {
+ comp = hwloc_disc_components;
+ while (NULL != comp) {
+ /* check if this component was explicitly excluded in env */
+ if (env) {
+ char *curenv = env;
+ while (*curenv) {
+ size_t s = strcspn(curenv, HWLOC_COMPONENT_SEPS);
+ if (curenv[0] == HWLOC_COMPONENT_EXCLUDE_CHAR && !strncmp(curenv+1, comp->name, s-1)) {
+ if (hwloc_components_verbose)
+ fprintf(stderr, "Excluding %s discovery component `%s' because of HWLOC_COMPONENTS environment variable\n",
+ hwloc_disc_component_type_string(comp->type), comp->name);
+ goto nextcomp;
+ }
+ curenv += s;
+ if (*curenv)
+ /* Skip comma */
+ curenv++;
+ }
+ }
+ hwloc_disc_component_try_enable(topology, comp, NULL, &excludes, 0 /* defaults, not envvar forced */, 0 /* defaults don't need warnings on conflicts */);
+nextcomp:
+ comp = comp->next;
+ }
+ }
+
+ if (hwloc_components_verbose) {
+ /* print a summary */
+ int first = 1;
+ backend = topology->backends;
+ fprintf(stderr, "Final list of enabled discovery components: ");
+ while (backend != NULL) {
+ fprintf(stderr, "%s%s", first ? "" : ",", backend->component->name);
+ backend = backend->next;
+ first = 0;
+ }
+ fprintf(stderr, "\n");
+ }
+
+ if (env)
+ free(env);
+}
+
+void
+hwloc_components_destroy_all(struct hwloc_topology *topology __hwloc_attribute_unused)
+{
+ unsigned i;
+
+ HWLOC_COMPONENTS_LOCK();
+ assert(0 != hwloc_components_users);
+ if (0 != --hwloc_components_users) {
+ HWLOC_COMPONENTS_UNLOCK();
+ return;
+ }
+
+ for(i=0; i<hwloc_component_finalize_cb_count; i++)
+ hwloc_component_finalize_cbs[hwloc_component_finalize_cb_count-i-1](0);
+ free(hwloc_component_finalize_cbs);
+ hwloc_component_finalize_cbs = NULL;
+ hwloc_component_finalize_cb_count = 0;
+
+ /* no need to unlink/free the list of components, they'll be unloaded below */
+
+ hwloc_disc_components = NULL;
+// hwloc_xml_callbacks_reset();
+
+#ifdef HWLOC_HAVE_PLUGINS
+ hwloc_plugins_exit();
+#endif
+
+ HWLOC_COMPONENTS_UNLOCK();
+}
+
+struct hwloc_backend *
+hwloc_backend_alloc(struct hwloc_disc_component *component)
+{
+ struct hwloc_backend * backend = malloc(sizeof(*backend));
+ if (!backend) {
+ errno = ENOMEM;
+ return NULL;
+ }
+ backend->component = component;
+ backend->flags = 0;
+ backend->discover = NULL;
+ backend->get_obj_cpuset = NULL;
+ backend->notify_new_object = NULL;
+ backend->disable = NULL;
+ backend->is_thissystem = -1;
+ backend->next = NULL;
+ backend->envvar_forced = 0;
+ return backend;
+}
+
+static void
+hwloc_backend_disable(struct hwloc_backend *backend)
+{
+ if (backend->disable)
+ backend->disable(backend);
+ free(backend);
+}
+
+int
+hwloc_backend_enable(struct hwloc_topology *topology, struct hwloc_backend *backend)
+{
+ struct hwloc_backend **pprev;
+
+ /* check backend flags */
+ if (backend->flags & (~(HWLOC_BACKEND_FLAG_NEED_LEVELS))) {
+ fprintf(stderr, "Cannot enable %s discovery component `%s' with unknown flags %lx\n",
+ hwloc_disc_component_type_string(backend->component->type), backend->component->name, backend->flags);
+ return -1;
+ }
+
+ /* make sure we didn't already enable this backend, we don't want duplicates */
+ pprev = &topology->backends;
+ while (NULL != *pprev) {
+ if ((*pprev)->component == backend->component) {
+ if (hwloc_components_verbose)
+ fprintf(stderr, "Cannot enable %s discovery component `%s' twice\n",
+ hwloc_disc_component_type_string(backend->component->type), backend->component->name);
+ hwloc_backend_disable(backend);
+ errno = EBUSY;
+ return -1;
+ }
+ pprev = &((*pprev)->next);
+ }
+
+ if (hwloc_components_verbose)
+ fprintf(stderr, "Enabling %s discovery component `%s'\n",
+ hwloc_disc_component_type_string(backend->component->type), backend->component->name);
+
+ /* enqueue at the end */
+ pprev = &topology->backends;
+ while (NULL != *pprev)
+ pprev = &((*pprev)->next);
+ backend->next = *pprev;
+ *pprev = backend;
+
+ backend->topology = topology;
+
+ return 0;
+}
+
+void
+hwloc_backends_is_thissystem(struct hwloc_topology *topology)
+{
+ struct hwloc_backend *backend;
+ const char *local_env;
+
+ /* Apply is_thissystem topology flag before we enforce envvar backends.
+ * If the application changed the backend with set_foo(),
+ * it may use set_flags() to update the is_thissystem flag here.
+ * If it changes the backend with environment variables below,
+ * it may use HWLOC_THISSYSTEM envvar below as well.
+ */
+
+ topology->is_thissystem = 1;
+
+ /* apply thissystem from normally-given backends (envvar_forced=0, either set_foo() or defaults) */
+ backend = topology->backends;
+ while (backend != NULL) {
+ if (backend->envvar_forced == 0 && backend->is_thissystem != -1) {
+ assert(backend->is_thissystem == 0);
+ topology->is_thissystem = 0;
+ }
+ backend = backend->next;
+ }
+
+ /* override set_foo() with flags */
+ if (topology->flags & HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM)
+ topology->is_thissystem = 1;
+
+ /* now apply envvar-forced backend (envvar_forced=1) */
+ backend = topology->backends;
+ while (backend != NULL) {
+ if (backend->envvar_forced == 1 && backend->is_thissystem != -1) {
+ assert(backend->is_thissystem == 0);
+ topology->is_thissystem = 0;
+ }
+ backend = backend->next;
+ }
+
+ /* override with envvar-given flag */
+ local_env = getenv("HWLOC_THISSYSTEM");
+ if (local_env)
+ topology->is_thissystem = atoi(local_env);
+}
+
+int
+hwloc_backends_get_obj_cpuset(struct hwloc_backend *caller, struct hwloc_obj *obj, hwloc_bitmap_t cpuset)
+{
+ struct hwloc_topology *topology = caller->topology;
+ struct hwloc_backend *backend = topology->backends;
+ /* use the first backend's get_obj_cpuset callback */
+ while (backend != NULL) {
+ if (backend->get_obj_cpuset)
+ return backend->get_obj_cpuset(backend, caller, obj, cpuset);
+ backend = backend->next;
+ }
+ return -1;
+}
+
+int
+hwloc_backends_notify_new_object(struct hwloc_backend *caller, struct hwloc_obj *obj)
+{
+ struct hwloc_backend *backend;
+ int res = 0;
+
+ backend = caller->topology->backends;
+ while (NULL != backend) {
+ if (backend != caller && backend->notify_new_object)
+ res += backend->notify_new_object(backend, caller, obj);
+ backend = backend->next;
+ }
+
+ return res;
+}
+
+void
+hwloc_backends_disable_all(struct hwloc_topology *topology)
+{
+ struct hwloc_backend *backend;
+
+ while (NULL != (backend = topology->backends)) {
+ struct hwloc_backend *next = backend->next;
+ if (hwloc_components_verbose)
+ fprintf(stderr, "Disabling %s discovery component `%s'\n",
+ hwloc_disc_component_type_string(backend->component->type), backend->component->name);
+ hwloc_backend_disable(backend);
+ topology->backends = next;
+ }
+ topology->backends = NULL;
+}
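
The component selection logic above honors the HWLOC_COMPONENTS environment variable (comma-separated names, a '-' prefix to exclude a component, "stop" to cut the list short) and HWLOC_COMPONENTS_VERBOSE for the registration and enable messages. A short sketch of how an application might exercise it before loading a topology; the setenv() calls and the "x86" component name are illustrative examples, since the names actually registered depend on how this copy was built:

    #include <hwloc.h>
    #include <stdlib.h>

    int main(void)
    {
        hwloc_topology_t topology;

        /* exclude one discovery component and print the final component list;
         * component names are build-dependent, so treat "x86" as an example */
        setenv("HWLOC_COMPONENTS", "-x86", 1);
        setenv("HWLOC_COMPONENTS_VERBOSE", "1", 1);

        hwloc_topology_init(&topology);
        hwloc_topology_load(topology);
        hwloc_topology_destroy(topology);
        return 0;
    }
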
diff --git a/ext/hwloc/hwloc/diff.c b/ext/hwloc/hwloc/diff.c
new file mode 100644
index 0000000..ee401d2
--- /dev/null
+++ b/ext/hwloc/hwloc/diff.c
@@ -0,0 +1,426 @@
+/*
+ * Copyright © 2013-2015 Inria. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <private/private.h>
+#include <private/misc.h>
+
+int hwloc_topology_diff_destroy(hwloc_topology_t topology __hwloc_attribute_unused,
+ hwloc_topology_diff_t diff)
+{
+ hwloc_topology_diff_t next;
+ while (diff) {
+ next = diff->generic.next;
+ switch (diff->generic.type) {
+ default:
+ break;
+ case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR:
+ switch (diff->obj_attr.diff.generic.type) {
+ default:
+ break;
+ case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME:
+ case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO:
+ free(diff->obj_attr.diff.string.name);
+ free(diff->obj_attr.diff.string.oldvalue);
+ free(diff->obj_attr.diff.string.newvalue);
+ break;
+ }
+ break;
+ }
+ free(diff);
+ diff = next;
+ }
+ return 0;
+}
+
+/************************
+ * Computing diffs
+ */
+
+static void hwloc_append_diff(hwloc_topology_diff_t newdiff,
+ hwloc_topology_diff_t *firstdiffp,
+ hwloc_topology_diff_t *lastdiffp)
+{
+ if (*firstdiffp)
+ (*lastdiffp)->generic.next = newdiff;
+ else
+ *firstdiffp = newdiff;
+ *lastdiffp = newdiff;
+ newdiff->generic.next = NULL;
+}
+
+static int hwloc_append_diff_too_complex(hwloc_obj_t obj1,
+ hwloc_topology_diff_t *firstdiffp,
+ hwloc_topology_diff_t *lastdiffp)
+{
+ hwloc_topology_diff_t newdiff;
+ newdiff = malloc(sizeof(*newdiff));
+ if (!newdiff)
+ return -1;
+
+ newdiff->too_complex.type = HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX;
+ newdiff->too_complex.obj_depth = obj1->depth;
+ newdiff->too_complex.obj_index = obj1->logical_index;
+ hwloc_append_diff(newdiff, firstdiffp, lastdiffp);
+ return 0;
+}
+
+static int hwloc_append_diff_obj_attr_string(hwloc_obj_t obj,
+ hwloc_topology_diff_obj_attr_type_t type,
+ const char *name,
+ const char *oldvalue,
+ const char *newvalue,
+ hwloc_topology_diff_t *firstdiffp,
+ hwloc_topology_diff_t *lastdiffp)
+{
+ hwloc_topology_diff_t newdiff;
+ newdiff = malloc(sizeof(*newdiff));
+ if (!newdiff)
+ return -1;
+
+ newdiff->obj_attr.type = HWLOC_TOPOLOGY_DIFF_OBJ_ATTR;
+ newdiff->obj_attr.obj_depth = obj->depth;
+ newdiff->obj_attr.obj_index = obj->logical_index;
+ newdiff->obj_attr.diff.string.type = type;
+ newdiff->obj_attr.diff.string.name = name ? strdup(name) : NULL;
+ newdiff->obj_attr.diff.string.oldvalue = oldvalue ? strdup(oldvalue) : NULL;
+ newdiff->obj_attr.diff.string.newvalue = newvalue ? strdup(newvalue) : NULL;
+ hwloc_append_diff(newdiff, firstdiffp, lastdiffp);
+ return 0;
+}
+
+static int hwloc_append_diff_obj_attr_uint64(hwloc_obj_t obj,
+ hwloc_topology_diff_obj_attr_type_t type,
+ hwloc_uint64_t idx,
+ hwloc_uint64_t oldvalue,
+ hwloc_uint64_t newvalue,
+ hwloc_topology_diff_t *firstdiffp,
+ hwloc_topology_diff_t *lastdiffp)
+{
+ hwloc_topology_diff_t newdiff;
+ newdiff = malloc(sizeof(*newdiff));
+ if (!newdiff)
+ return -1;
+
+ newdiff->obj_attr.type = HWLOC_TOPOLOGY_DIFF_OBJ_ATTR;
+ newdiff->obj_attr.obj_depth = obj->depth;
+ newdiff->obj_attr.obj_index = obj->logical_index;
+ newdiff->obj_attr.diff.uint64.type = type;
+ newdiff->obj_attr.diff.uint64.index = idx;
+ newdiff->obj_attr.diff.uint64.oldvalue = oldvalue;
+ newdiff->obj_attr.diff.uint64.newvalue = newvalue;
+ hwloc_append_diff(newdiff, firstdiffp, lastdiffp);
+ return 0;
+}
+
+static int
+hwloc_diff_trees(hwloc_topology_t topo1, hwloc_obj_t obj1,
+ hwloc_topology_t topo2, hwloc_obj_t obj2,
+ unsigned flags,
+ hwloc_topology_diff_t *firstdiffp, hwloc_topology_diff_t *lastdiffp)
+{
+ unsigned i;
+ int err;
+ hwloc_obj_t child1, child2;
+
+ if (obj1->depth != obj2->depth)
+ goto out_too_complex;
+ if (obj1->type != obj2->type)
+ goto out_too_complex;
+
+ if (obj1->os_index != obj2->os_index)
+ /* we could allow different os_index for non-PU non-NUMAnode objects
+ * but it's likely useless anyway */
+ goto out_too_complex;
+
+#define _SETS_DIFFERENT(_set1, _set2) \
+ ( ( !(_set1) != !(_set2) ) \
+ || ( (_set1) && !hwloc_bitmap_isequal(_set1, _set2) ) )
+#define SETS_DIFFERENT(_set, _obj1, _obj2) _SETS_DIFFERENT((_obj1)->_set, (_obj2)->_set)
+ if (SETS_DIFFERENT(cpuset, obj1, obj2)
+ || SETS_DIFFERENT(complete_cpuset, obj1, obj2)
+ || SETS_DIFFERENT(allowed_cpuset, obj1, obj2)
+ || SETS_DIFFERENT(nodeset, obj1, obj2)
+ || SETS_DIFFERENT(complete_nodeset, obj1, obj2)
+ || SETS_DIFFERENT(allowed_nodeset, obj1, obj2))
+ goto out_too_complex;
+
+ /* no need to check logical_index, sibling_rank, symmetric_subtree,
+ * the parents did it */
+
+ if ((!obj1->name) != (!obj2->name)
+ || (obj1->name && strcmp(obj1->name, obj2->name))) {
+ err = hwloc_append_diff_obj_attr_string(obj1,
+ HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME,
+ NULL,
+ obj1->name,
+ obj2->name,
+ firstdiffp, lastdiffp);
+ if (err < 0)
+ return err;
+ }
+
+ /* memory */
+ if (obj1->memory.local_memory != obj2->memory.local_memory) {
+ err = hwloc_append_diff_obj_attr_uint64(obj1,
+ HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE,
+ 0,
+ obj1->memory.local_memory,
+ obj2->memory.local_memory,
+ firstdiffp, lastdiffp);
+ if (err < 0)
+ return err;
+ }
+ /* ignore memory page_types */
+
+ /* type-specific attrs */
+ switch (obj1->type) {
+ default:
+ break;
+ case HWLOC_OBJ_CACHE:
+ if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->cache)))
+ goto out_too_complex;
+ break;
+ case HWLOC_OBJ_GROUP:
+ if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->group)))
+ goto out_too_complex;
+ break;
+ case HWLOC_OBJ_PCI_DEVICE:
+ if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->pcidev)))
+ goto out_too_complex;
+ break;
+ case HWLOC_OBJ_BRIDGE:
+ if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->bridge)))
+ goto out_too_complex;
+ break;
+ case HWLOC_OBJ_OS_DEVICE:
+ if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->osdev)))
+ goto out_too_complex;
+ break;
+ }
+
+ /* distances */
+ if (obj1->distances_count != obj2->distances_count)
+ goto out_too_complex;
+ for(i=0; i<obj1->distances_count; i++) {
+ struct hwloc_distances_s *d1 = obj1->distances[i], *d2 = obj2->distances[i];
+ if (d1->relative_depth != d2->relative_depth
+ || d1->nbobjs != d2->nbobjs
+ || d1->latency_max != d2->latency_max
+ || d1->latency_base != d2->latency_base
+ || memcmp(d1->latency, d2->latency, d1->nbobjs * d1->nbobjs * sizeof(*d1->latency)))
+ goto out_too_complex;
+ }
+
+ /* infos */
+ if (obj1->infos_count != obj2->infos_count)
+ goto out_too_complex;
+ for(i=0; i<obj1->infos_count; i++) {
+ if (strcmp(obj1->infos[i].name, obj2->infos[i].name))
+ goto out_too_complex;
+ if (strcmp(obj1->infos[i].value, obj2->infos[i].value)) {
+ err = hwloc_append_diff_obj_attr_string(obj1,
+ HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO,
+ obj1->infos[i].name,
+ obj1->infos[i].value,
+ obj2->infos[i].value,
+ firstdiffp, lastdiffp);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ /* ignore userdata */
+
+ /* children */
+ for(child1 = obj1->first_child, child2 = obj2->first_child;
+ child1 != NULL && child2 != NULL;
+ child1 = child1->next_sibling, child2 = child2->next_sibling) {
+ err = hwloc_diff_trees(topo1, child1,
+ topo2, child2,
+ flags,
+ firstdiffp, lastdiffp);
+ if (err < 0)
+ return err;
+ }
+ if (child1 || child2)
+ goto out_too_complex;
+
+ /* I/O children */
+ for(child1 = obj1->io_first_child, child2 = obj2->io_first_child;
+ child1 != NULL && child2 != NULL;
+ child1 = child1->next_sibling, child2 = child2->next_sibling) {
+ err = hwloc_diff_trees(topo1, child1,
+ topo2, child2,
+ flags,
+ firstdiffp, lastdiffp);
+ if (err < 0)
+ return err;
+ }
+ if (child1 || child2)
+ goto out_too_complex;
+
+ /* misc children */
+ for(child1 = obj1->misc_first_child, child2 = obj2->misc_first_child;
+ child1 != NULL && child2 != NULL;
+ child1 = child1->next_sibling, child2 = child2->next_sibling) {
+ err = hwloc_diff_trees(topo1, child1,
+ topo2, child2,
+ flags,
+ firstdiffp, lastdiffp);
+ if (err < 0)
+ return err;
+ }
+ if (child1 || child2)
+ goto out_too_complex;
+
+ return 0;
+
+out_too_complex:
+ hwloc_append_diff_too_complex(obj1, firstdiffp, lastdiffp);
+ return 0;
+}
+
+int hwloc_topology_diff_build(hwloc_topology_t topo1,
+ hwloc_topology_t topo2,
+ unsigned long flags,
+ hwloc_topology_diff_t *diffp)
+{
+ hwloc_topology_diff_t lastdiff, tmpdiff;
+ int err;
+
+ if (flags != 0) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ *diffp = NULL;
+ err = hwloc_diff_trees(topo1, hwloc_get_root_obj(topo1),
+ topo2, hwloc_get_root_obj(topo2),
+ flags,
+ diffp, &lastdiff);
+
+ if (!err) {
+ tmpdiff = *diffp;
+ while (tmpdiff) {
+ if (tmpdiff->generic.type == HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX) {
+ err = 1;
+ break;
+ }
+ tmpdiff = tmpdiff->generic.next;
+ }
+ }
+
+ return err;
+}
+
+/********************
+ * Applying diffs
+ */
+
+static int
+hwloc_apply_diff_one(hwloc_topology_t topology,
+ hwloc_topology_diff_t diff,
+ unsigned long flags)
+{
+ int reverse = !!(flags & HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE);
+
+ switch (diff->generic.type) {
+ case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR: {
+ struct hwloc_topology_diff_obj_attr_s *obj_attr = &diff->obj_attr;
+ hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, obj_attr->obj_depth, obj_attr->obj_index);
+ if (!obj)
+ return -1;
+
+ switch (obj_attr->diff.generic.type) {
+ case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE: {
+ hwloc_obj_t tmpobj;
+ hwloc_uint64_t oldvalue = reverse ? obj_attr->diff.uint64.newvalue : obj_attr->diff.uint64.oldvalue;
+ hwloc_uint64_t newvalue = reverse ? obj_attr->diff.uint64.oldvalue : obj_attr->diff.uint64.newvalue;
+ hwloc_uint64_t valuediff = newvalue - oldvalue;
+ if (obj->memory.local_memory != oldvalue)
+ return -1;
+ obj->memory.local_memory = newvalue;
+ tmpobj = obj;
+ while (tmpobj) {
+ tmpobj->memory.total_memory += valuediff;
+ tmpobj = tmpobj->parent;
+ }
+ break;
+ }
+ case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME: {
+ const char *oldvalue = reverse ? obj_attr->diff.string.newvalue : obj_attr->diff.string.oldvalue;
+ const char *newvalue = reverse ? obj_attr->diff.string.oldvalue : obj_attr->diff.string.newvalue;
+ if (!obj->name || strcmp(obj->name, oldvalue))
+ return -1;
+ free(obj->name);
+ obj->name = strdup(newvalue);
+ break;
+ }
+ case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO: {
+ const char *name = obj_attr->diff.string.name;
+ const char *oldvalue = reverse ? obj_attr->diff.string.newvalue : obj_attr->diff.string.oldvalue;
+ const char *newvalue = reverse ? obj_attr->diff.string.oldvalue : obj_attr->diff.string.newvalue;
+ unsigned i;
+ int found = 0;
+ for(i=0; i<obj->infos_count; i++) {
+ if (!strcmp(obj->infos[i].name, name)
+ && !strcmp(obj->infos[i].value, oldvalue)) {
+ free(obj->infos[i].value);
+ obj->infos[i].value = strdup(newvalue);
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ return -1;
+ break;
+ }
+ default:
+ return -1;
+ }
+
+ break;
+ }
+ default:
+ return -1;
+ }
+
+ return 0;
+}
+
+int hwloc_topology_diff_apply(hwloc_topology_t topology,
+ hwloc_topology_diff_t diff,
+ unsigned long flags)
+{
+ hwloc_topology_diff_t tmpdiff, tmpdiff2;
+ int err, nr;
+
+ if (flags & ~HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ tmpdiff = diff;
+ nr = 0;
+ while (tmpdiff) {
+ nr++;
+ err = hwloc_apply_diff_one(topology, tmpdiff, flags);
+ if (err < 0)
+ goto cancel;
+ tmpdiff = tmpdiff->generic.next;
+ }
+ return 0;
+
+cancel:
+ tmpdiff2 = tmpdiff;
+ tmpdiff = diff;
+ while (tmpdiff != tmpdiff2) {
+ hwloc_apply_diff_one(topology, tmpdiff, flags ^ HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE);
+ tmpdiff = tmpdiff->generic.next;
+ }
+ errno = EINVAL;
+ return -nr; /* return minus the index (starting at 1) of the first element that couldn't be applied */
+}
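
For readers unfamiliar with this part of hwloc, a minimal sketch of how the two entry points above are typically driven from application code. The topology handles and the hwloc_topology_diff_destroy() call are assumed from the rest of the hwloc 1.x diff API; the error handling is illustrative only.

    /* Sketch: compute the diff between two loaded topologies and apply it.
     * topo1/topo2 are assumed to come from hwloc_topology_init()+load(). */
    hwloc_topology_diff_t diff = NULL;
    int err = hwloc_topology_diff_build(topo1, topo2, 0, &diff);
    if (err < 0) {
        /* invalid flags or internal error */
    } else if (err > 0) {
        /* the diff contains a TOO_COMPLEX element and cannot be applied */
    } else if (diff) {
        /* make topo1 identical to topo2; a negative return is minus the index
         * of the first element that failed (earlier ones are rolled back) */
        err = hwloc_topology_diff_apply(topo1, diff, 0);
    }
    hwloc_topology_diff_destroy(topo1, diff);
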
diff --git a/ext/hwloc/hwloc/distances.c b/ext/hwloc/hwloc/distances.c
new file mode 100644
index 0000000..51382b1
--- /dev/null
+++ b/ext/hwloc/hwloc/distances.c
@@ -0,0 +1,995 @@
+/*
+ * Copyright © 2010-2015 Inria. All rights reserved.
+ * Copyright © 2011-2012 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+
+#include <float.h>
+#include <math.h>
+
+/**************************
+ * Main Init/Clear/Destroy
+ */
+
+/* called during topology init */
+void hwloc_distances_init(struct hwloc_topology *topology)
+{
+ topology->first_osdist = topology->last_osdist = NULL;
+}
+
+/* called during topology destroy */
+void hwloc_distances_destroy(struct hwloc_topology * topology)
+{
+ struct hwloc_os_distances_s *osdist, *next = topology->first_osdist;
+ while ((osdist = next) != NULL) {
+ next = osdist->next;
+ /* remove final distance matrices AND physically-ordered ones */
+ free(osdist->indexes);
+ free(osdist->objs);
+ free(osdist->distances);
+ free(osdist);
+ }
+ topology->first_osdist = topology->last_osdist = NULL;
+}
+
+/******************************************************
+ * Inserting distances in the topology
+ * from a backend, from the environment or by the user
+ */
+
+/* insert a distance matrix in the topology.
+ * the caller gives us those pointers, we take care of freeing them later and so on.
+ */
+void hwloc_distances_set(hwloc_topology_t __hwloc_restrict topology, hwloc_obj_type_t type,
+ unsigned nbobjs, unsigned *indexes, hwloc_obj_t *objs, float *distances,
+ int force)
+{
+ struct hwloc_os_distances_s *osdist, *next = topology->first_osdist;
+ /* look for existing distances for the same type */
+ while ((osdist = next) != NULL) {
+ next = osdist->next;
+ if (osdist->type == type) {
+ if (osdist->forced && !force) {
+ /* there is a forced distance element, ignore the new non-forced one */
+ free(indexes);
+ free(objs);
+ free(distances);
+ return;
+ } else if (force) {
+ /* we're forcing a new distance, remove the old ones */
+ free(osdist->indexes);
+ free(osdist->objs);
+ free(osdist->distances);
+ /* remove current object */
+ if (osdist->prev)
+ osdist->prev->next = next;
+ else
+ topology->first_osdist = next;
+ if (next)
+ next->prev = osdist->prev;
+ else
+ topology->last_osdist = osdist->prev;
+ /* free current object */
+ free(osdist);
+ }
+ }
+ }
+
+ if (!nbobjs)
+ /* we're just clearing, return now */
+ return;
+
+ /* create the new element */
+ osdist = malloc(sizeof(struct hwloc_os_distances_s));
+ osdist->nbobjs = nbobjs;
+ osdist->indexes = indexes;
+ osdist->objs = objs;
+ osdist->distances = distances;
+ osdist->forced = force;
+ osdist->type = type;
+ /* insert it */
+ osdist->next = NULL;
+ osdist->prev = topology->last_osdist;
+ if (topology->last_osdist)
+ topology->last_osdist->next = osdist;
+ else
+ topology->first_osdist = osdist;
+ topology->last_osdist = osdist;
+}
+
+/* make sure a user-given distance matrix is sane */
+static int hwloc_distances__check_matrix(hwloc_topology_t __hwloc_restrict topology __hwloc_attribute_unused, hwloc_obj_type_t type __hwloc_attribute_unused,
+ unsigned nbobjs, unsigned *indexes, hwloc_obj_t *objs __hwloc_attribute_unused, float *distances __hwloc_attribute_unused)
+{
+ unsigned i,j;
+ /* make sure we don't have the same index twice */
+ for(i=0; i<nbobjs; i++)
+ for(j=i+1; j<nbobjs; j++)
+ if (indexes[i] == indexes[j]) {
+ errno = EINVAL;
+ return -1;
+ }
+ return 0;
+}
+
+static void hwloc_distances__set_from_string(struct hwloc_topology *topology,
+ hwloc_obj_type_t type, const char *string)
+{
+ /* the string format is: "index[0],...,index[N-1]:distance[0],...,distance[N*N-1]"
+ * or "index[0],...,index[N-1]:X*Y" or "index[0],...,index[N-1]:X*Y*Z"
+ */
+ const char *tmp = string, *next;
+ unsigned *indexes;
+ float *distances;
+ unsigned nbobjs = 0, i, j, x, y, z;
+
+ if (!strcmp(string, "none")) {
+ hwloc_distances_set(topology, type, 0, NULL, NULL, NULL, 1 /* force */);
+ return;
+ }
+
+ if (sscanf(string, "%u-%u:", &i, &j) == 2) {
+ /* range i-j */
+ nbobjs = j-i+1;
+ indexes = calloc(nbobjs, sizeof(unsigned));
+ distances = calloc(nbobjs*nbobjs, sizeof(float));
+ /* make sure the user didn't give a veeeeery large range */
+ if (!indexes || !distances) {
+ free(indexes);
+ free(distances);
+ return;
+ }
+ for(j=0; j<nbobjs; j++)
+ indexes[j] = j+i;
+ tmp = strchr(string, ':') + 1;
+
+ } else {
+ /* explicit list of indexes, count them */
+ while (1) {
+ size_t size = strspn(tmp, "0123456789");
+ if (tmp[size] != ',') {
+ /* last element */
+ tmp += size;
+ nbobjs++;
+ break;
+ }
+ /* another index */
+ tmp += size+1;
+ nbobjs++;
+ }
+
+ if (*tmp != ':') {
+ fprintf(stderr, "Ignoring %s distances from environment variable, missing colon\n",
+ hwloc_obj_type_string(type));
+ return;
+ }
+
+ indexes = calloc(nbobjs, sizeof(unsigned));
+ distances = calloc(nbobjs*nbobjs, sizeof(float));
+ tmp = string;
+
+ /* parse indexes */
+ for(i=0; i<nbobjs; i++) {
+ indexes[i] = strtoul(tmp, (char **) &next, 0);
+ tmp = next+1;
+ }
+ }
+
+
+ /* parse distances */
+ z=1; /* default if sscanf finds only 2 values below */
+ if (sscanf(tmp, "%u*%u*%u", &x, &y, &z) >= 2) {
+ /* generate the matrix to create x groups of y elements */
+ if (x*y*z != nbobjs) {
+ fprintf(stderr, "Ignoring %s distances from environment variable, invalid grouping (%u*%u*%u=%u instead of %u)\n",
+ hwloc_obj_type_string(type), x, y, z, x*y*z, nbobjs);
+ free(indexes);
+ free(distances);
+ return;
+ }
+ for(i=0; i<nbobjs; i++)
+ for(j=0; j<nbobjs; j++)
+ if (i==j)
+ distances[i*nbobjs+j] = 1;
+ else if (i/z == j/z)
+ distances[i*nbobjs+j] = 2;
+ else if (i/z/y == j/z/y)
+ distances[i*nbobjs+j] = 4;
+ else
+ distances[i*nbobjs+j] = 8;
+
+ } else {
+ /* parse a comma separated list of distances */
+ for(i=0; i<nbobjs*nbobjs; i++) {
+ distances[i] = (float) atof(tmp);
+ next = strchr(tmp, ',');
+ if (next) {
+ tmp = next+1;
+ } else if (i!=nbobjs*nbobjs-1) {
+ fprintf(stderr, "Ignoring %s distances from environment variable, not enough values (%u out of %u)\n",
+ hwloc_obj_type_string(type), i+1, nbobjs*nbobjs);
+ free(indexes);
+ free(distances);
+ return;
+ }
+ }
+ }
+
+ if (hwloc_distances__check_matrix(topology, type, nbobjs, indexes, NULL, distances) < 0) {
+ fprintf(stderr, "Ignoring invalid %s distances from environment variable\n", hwloc_obj_type_string(type));
+ free(indexes);
+ free(distances);
+ return;
+ }
+
+ hwloc_distances_set(topology, type, nbobjs, indexes, NULL, distances, 1 /* force */);
+}
+
+/* take distances from the environment, store them as is in the topology.
+ * we'll convert them into objects later once the tree is filled
+ */
+void hwloc_distances_set_from_env(struct hwloc_topology *topology)
+{
+ hwloc_obj_type_t type;
+ for(type = HWLOC_OBJ_SYSTEM; type < HWLOC_OBJ_TYPE_MAX; type++) {
+ const char *env;
+ char envname[64];
+ snprintf(envname, sizeof(envname), "HWLOC_%s_DISTANCES", hwloc_obj_type_string(type));
+ env = getenv(envname);
+ if (env) {
+ hwloc_localeswitch_declare;
+ hwloc_localeswitch_init();
+ hwloc_distances__set_from_string(topology, type, env);
+ hwloc_localeswitch_fini();
+ }
+ }
+}
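
As a concrete illustration of the string format parsed above, a few values that could be passed through the HWLOC_<type>_DISTANCES environment variables assembled by hwloc_distances_set_from_env(). The NUMANode type string follows hwloc_obj_type_string(); the numeric values are made up; setenv() is the POSIX call from <stdlib.h>.

    /* explicit 2x2 matrix for NUMA nodes 0 and 1 */
    setenv("HWLOC_NUMANode_DISTANCES", "0,1:1.0,2.0,2.0,1.0", 1);
    /* range syntax with X*Y grouping: nodes 0-7 arranged as 4 groups of 2 */
    setenv("HWLOC_NUMANode_DISTANCES", "0-7:4*2", 1);
    /* clear any NUMANode distance information */
    setenv("HWLOC_NUMANode_DISTANCES", "none", 1);
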
+
+/* The actual set() function exported to the user
+ *
+ * take the given distances, store them as is in the topology.
+ * we'll convert them into objects later once the tree is filled.
+ */
+int hwloc_topology_set_distance_matrix(hwloc_topology_t __hwloc_restrict topology, hwloc_obj_type_t type,
+ unsigned nbobjs, unsigned *indexes, float *distances)
+{
+ unsigned *_indexes;
+ float *_distances;
+
+ if (!nbobjs && !indexes && !distances) {
+ hwloc_distances_set(topology, type, 0, NULL, NULL, NULL, 1 /* force */);
+ return 0;
+ }
+
+ if (!nbobjs || !indexes || !distances)
+ return -1;
+
+ if (hwloc_distances__check_matrix(topology, type, nbobjs, indexes, NULL, distances) < 0)
+ return -1;
+
+ /* copy the input arrays and give them to the topology */
+ _indexes = malloc(nbobjs*sizeof(unsigned));
+ memcpy(_indexes, indexes, nbobjs*sizeof(unsigned));
+ _distances = malloc(nbobjs*nbobjs*sizeof(float));
+ memcpy(_distances, distances, nbobjs*nbobjs*sizeof(float));
+ hwloc_distances_set(topology, type, nbobjs, _indexes, NULL, _distances, 1 /* force */);
+
+ return 0;
+}
+
+/************************
+ * Restricting distances
+ */
+
+/* called when some objects have been removed because empty/ignored/cgroup/restrict,
+ * we must rebuild the list of objects from indexes (in hwloc_distances_finalize_os())
+ */
+void hwloc_distances_restrict_os(struct hwloc_topology *topology)
+{
+ struct hwloc_os_distances_s * osdist;
+ for(osdist = topology->first_osdist; osdist; osdist = osdist->next) {
+ /* remove the objs array, we'll rebuild it from the indexes
+ * depending on remaining objects */
+ free(osdist->objs);
+ osdist->objs = NULL;
+ }
+}
+
+
+/* cleanup everything we created from distances so that we may rebuild them
+ * at the end of restrict()
+ */
+void hwloc_distances_restrict(struct hwloc_topology *topology, unsigned long flags)
+{
+ if (flags & HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES) {
+ /* some objects may have been removed, clear objects arrays so that finalize_os rebuilds them properly */
+ hwloc_distances_restrict_os(topology);
+ } else {
+ /* if not adapting distances, drop everything */
+ hwloc_distances_destroy(topology);
+ }
+}
+
+/**************************************************************
+ * Convert user/env given array of indexes into actual objects
+ */
+
+static hwloc_obj_t hwloc_find_obj_by_type_and_os_index(hwloc_obj_t root, hwloc_obj_type_t type, unsigned os_index)
+{
+ hwloc_obj_t child;
+ if (root->type == type && root->os_index == os_index)
+ return root;
+ child = root->first_child;
+ while (child) {
+ hwloc_obj_t found = hwloc_find_obj_by_type_and_os_index(child, type, os_index);
+ if (found)
+ return found;
+ child = child->next_sibling;
+ }
+ return NULL;
+}
+
+/* convert distance indexes that were previously stored in the topology
+ * into actual objects if not done already.
+ * it's already done when distances come from backends (this function should not be called then).
+ * it's not done when distances come from the user.
+ *
+ * returns -1 if the matrix was invalid
+ */
+static int
+hwloc_distances__finalize_os(struct hwloc_topology *topology, struct hwloc_os_distances_s *osdist)
+{
+ unsigned nbobjs = osdist->nbobjs;
+ unsigned *indexes = osdist->indexes;
+ float *distances = osdist->distances;
+ unsigned i, j;
+ hwloc_obj_type_t type = osdist->type;
+ hwloc_obj_t *objs = calloc(nbobjs, sizeof(hwloc_obj_t));
+
+ assert(!osdist->objs);
+
+ /* traverse the topology and look for the relevant objects */
+ for(i=0; i<nbobjs; i++) {
+ hwloc_obj_t obj = hwloc_find_obj_by_type_and_os_index(topology->levels[0][0], type, indexes[i]);
+ if (!obj) {
+
+ /* shift the matrix */
+#define OLDPOS(i,j) (distances+(i)*nbobjs+(j))
+#define NEWPOS(i,j) (distances+(i)*(nbobjs-1)+(j))
+ if (i>0) {
+ /** no need to move beginning of 0th line */
+ for(j=0; j<i-1; j++)
+ /** move end of jth line + beginning of (j+1)th line */
+ memmove(NEWPOS(j,i), OLDPOS(j,i+1), (nbobjs-1)*sizeof(*distances));
+ /** move end of (i-1)th line */
+ memmove(NEWPOS(i-1,i), OLDPOS(i-1,i+1), (nbobjs-i-1)*sizeof(*distances));
+ }
+ if (i<nbobjs-1) {
+ /** move beginning of (i+1)th line */
+ memmove(NEWPOS(i,0), OLDPOS(i+1,0), i*sizeof(*distances));
+ /** move end of jth line + beginning of (j+1)th line */
+ for(j=i; j<nbobjs-2; j++)
+ memmove(NEWPOS(j,i), OLDPOS(j+1,i+1), (nbobjs-1)*sizeof(*distances));
+ /** move end of (nbobjs-2)th line */
+ memmove(NEWPOS(nbobjs-2,i), OLDPOS(nbobjs-1,i+1), (nbobjs-i-1)*sizeof(*distances));
+ }
+
+ /* shift the indexes array */
+ memmove(indexes+i, indexes+i+1, (nbobjs-i-1)*sizeof(*indexes));
+
+ /* update counters */
+ nbobjs--;
+ i--;
+ continue;
+ }
+ objs[i] = obj;
+ }
+
+ osdist->nbobjs = nbobjs;
+ if (!nbobjs) {
+ /* the whole matrix was invalid, let the caller remove these distances */
+ free(objs);
+ return -1;
+ }
+
+ /* setup the objs array */
+ osdist->objs = objs;
+ return 0;
+}
+
+
+void hwloc_distances_finalize_os(struct hwloc_topology *topology)
+{
+ struct hwloc_os_distances_s *osdist, *next = topology->first_osdist;
+ while ((osdist = next) != NULL) {
+ int err;
+ next = osdist->next;
+
+ /* remove final distance matrices AND physically-ordered ones */
+
+ if (osdist->objs)
+ /* nothing to do, switch to the next element */
+ continue;
+
+ err = hwloc_distances__finalize_os(topology, osdist);
+ if (!err)
+ /* convert ok, switch to the next element */
+ continue;
+
+ /* remove this element */
+ free(osdist->indexes);
+ free(osdist->distances);
+ /* remove current object */
+ if (osdist->prev)
+ osdist->prev->next = next;
+ else
+ topology->first_osdist = next;
+ if (next)
+ next->prev = osdist->prev;
+ else
+ topology->last_osdist = osdist->prev;
+ /* free current object */
+ free(osdist);
+ }
+}
+
+/***********************************************************
+ * Convert internal distances given by the backend/env/user
+ * into exported logical distances attached to objects
+ */
+
+static void
+hwloc_distances__finalize_logical(struct hwloc_topology *topology,
+ unsigned nbobjs,
+ hwloc_obj_t *objs, float *osmatrix)
+{
+ unsigned i, j, li, lj, minl;
+ float min = FLT_MAX, max = FLT_MIN;
+ hwloc_obj_t root;
+ float *matrix;
+ hwloc_cpuset_t cpuset, complete_cpuset;
+ hwloc_nodeset_t nodeset, complete_nodeset;
+ unsigned relative_depth;
+ int idx;
+
+ /* find the root */
+ cpuset = hwloc_bitmap_alloc();
+ complete_cpuset = hwloc_bitmap_alloc();
+ nodeset = hwloc_bitmap_alloc();
+ complete_nodeset = hwloc_bitmap_alloc();
+ for(i=0; i<nbobjs; i++) {
+ hwloc_bitmap_or(cpuset, cpuset, objs[i]->cpuset);
+ hwloc_bitmap_or(complete_cpuset, complete_cpuset, objs[i]->complete_cpuset);
+ hwloc_bitmap_or(nodeset, nodeset, objs[i]->nodeset);
+ hwloc_bitmap_or(complete_nodeset, complete_nodeset, objs[i]->complete_nodeset);
+ }
+ /* find the object covering cpuset, we'll take care of the nodeset later */
+ root = hwloc_get_obj_covering_cpuset(topology, cpuset);
+ /* walk up to find a parent that also covers the nodeset and complete sets */
+ while (root &&
+ (!hwloc_bitmap_isincluded(nodeset, root->nodeset)
+ || !hwloc_bitmap_isincluded(complete_nodeset, root->complete_nodeset)
+ || !hwloc_bitmap_isincluded(complete_cpuset, root->complete_cpuset)))
+ root = root->parent;
+ if (!root) {
+ /* should not happen, ignore the distance matrix and report an error. */
+ if (!hwloc_hide_errors()) {
+ char *a, *b;
+ hwloc_bitmap_asprintf(&a, cpuset);
+ hwloc_bitmap_asprintf(&b, nodeset);
+ fprintf(stderr, "****************************************************************************\n");
+ fprintf(stderr, "* hwloc %s has encountered an error when adding a distance matrix to the topology.\n", HWLOC_VERSION);
+ fprintf(stderr, "*\n");
+ fprintf(stderr, "* hwloc_distances__finalize_logical() could not find any object covering\n");
+ fprintf(stderr, "* cpuset %s and nodeset %s\n", a, b);
+ fprintf(stderr, "*\n");
+ fprintf(stderr, "* Please report this error message to the hwloc user's mailing list,\n");
+#ifdef HWLOC_LINUX_SYS
+ fprintf(stderr, "* along with the output from the hwloc-gather-topology script.\n");
+#else
+ fprintf(stderr, "* along with any relevant topology information from your platform.\n");
+#endif
+ fprintf(stderr, "****************************************************************************\n");
+ free(a);
+ free(b);
+ }
+ hwloc_bitmap_free(cpuset);
+ hwloc_bitmap_free(complete_cpuset);
+ hwloc_bitmap_free(nodeset);
+ hwloc_bitmap_free(complete_nodeset);
+ return;
+ }
+ /* ideally, root has the exact cpuset and nodeset.
+ * but ignored object types or other things that remove objects may shrink the object array */
+ assert(hwloc_bitmap_isincluded(cpuset, root->cpuset));
+ assert(hwloc_bitmap_isincluded(complete_cpuset, root->complete_cpuset));
+ assert(hwloc_bitmap_isincluded(nodeset, root->nodeset));
+ assert(hwloc_bitmap_isincluded(complete_nodeset, root->complete_nodeset));
+ hwloc_bitmap_free(cpuset);
+ hwloc_bitmap_free(complete_cpuset);
+ hwloc_bitmap_free(nodeset);
+ hwloc_bitmap_free(complete_nodeset);
+ if (root->depth >= objs[0]->depth) {
+ /* strange topology led us to find invalid relative depth, ignore */
+ return;
+ }
+ relative_depth = objs[0]->depth - root->depth; /* this assumes that we have distances between objects of the same level */
+
+ if (nbobjs != hwloc_get_nbobjs_inside_cpuset_by_depth(topology, root->cpuset, root->depth + relative_depth))
+ /* the root does not cover the right number of objects, maybe we failed to insert a root (bad intersect or so). */
+ return;
+
+ /* get the logical index offset, it's the min of all logical indexes */
+ minl = UINT_MAX;
+ for(i=0; i<nbobjs; i++)
+ if (minl > objs[i]->logical_index)
+ minl = objs[i]->logical_index;
+
+ /* compute/check min/max values */
+ for(i=0; i<nbobjs; i++)
+ for(j=0; j<nbobjs; j++) {
+ float val = osmatrix[i*nbobjs+j];
+ if (val < min)
+ min = val;
+ if (val > max)
+ max = val;
+ }
+ if (!min) {
+ /* Linux up to 2.6.36 reports ACPI SLIT distances, which should be memory latencies.
+ * Except on SGI IP27 (SGI Origin 200/2000 with MIPS processors) where the distances
+ * are the number of hops between routers.
+ */
+ hwloc_debug("%s", "minimal distance is 0, matrix does not seem to contain latencies, ignoring\n");
+ return;
+ }
+
+ /* store the normalized latency matrix in the root object */
+ idx = root->distances_count++;
+ root->distances = realloc(root->distances, root->distances_count * sizeof(struct hwloc_distances_s *));
+ root->distances[idx] = malloc(sizeof(struct hwloc_distances_s));
+ root->distances[idx]->relative_depth = relative_depth;
+ root->distances[idx]->nbobjs = nbobjs;
+ root->distances[idx]->latency = matrix = malloc(nbobjs*nbobjs*sizeof(float));
+ root->distances[idx]->latency_base = (float) min;
+#define NORMALIZE_LATENCY(d) ((d)/(min))
+ root->distances[idx]->latency_max = NORMALIZE_LATENCY(max);
+ for(i=0; i<nbobjs; i++) {
+ li = objs[i]->logical_index - minl;
+ matrix[li*nbobjs+li] = NORMALIZE_LATENCY(osmatrix[i*nbobjs+i]);
+ for(j=i+1; j<nbobjs; j++) {
+ lj = objs[j]->logical_index - minl;
+ matrix[li*nbobjs+lj] = NORMALIZE_LATENCY(osmatrix[i*nbobjs+j]);
+ matrix[lj*nbobjs+li] = NORMALIZE_LATENCY(osmatrix[j*nbobjs+i]);
+ }
+ }
+}
+
+/* convert internal distances into logically-ordered distances
+ * that can be exposed in the API
+ */
+void
+hwloc_distances_finalize_logical(struct hwloc_topology *topology)
+{
+ unsigned nbobjs;
+ int depth;
+ struct hwloc_os_distances_s * osdist;
+ for(osdist = topology->first_osdist; osdist; osdist = osdist->next) {
+
+ nbobjs = osdist->nbobjs;
+ if (!nbobjs)
+ continue;
+
+ depth = hwloc_get_type_depth(topology, osdist->type);
+ if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+ continue;
+
+ if (osdist->objs) {
+ assert(osdist->distances);
+ hwloc_distances__finalize_logical(topology, nbobjs,
+ osdist->objs,
+ osdist->distances);
+ }
+ }
+}
+
+/***************************************************
+ * Destroying logical distances attached to objects
+ */
+
+/* destroy an object distances structure */
+void
+hwloc_clear_object_distances_one(struct hwloc_distances_s * distances)
+{
+ free(distances->latency);
+ free(distances);
+}
+
+void
+hwloc_clear_object_distances(hwloc_obj_t obj)
+{
+ unsigned i;
+ for (i=0; i<obj->distances_count; i++)
+ hwloc_clear_object_distances_one(obj->distances[i]);
+ free(obj->distances);
+ obj->distances = NULL;
+ obj->distances_count = 0;
+}
+
+/******************************************
+ * Grouping objects according to distances
+ */
+
+static void hwloc_report_user_distance_error(const char *msg, int line)
+{
+ static int reported = 0;
+
+ if (!reported && !hwloc_hide_errors()) {
+ fprintf(stderr, "****************************************************************************\n");
+ fprintf(stderr, "* hwloc %s has encountered what looks like an error from user-given distances.\n", HWLOC_VERSION);
+ fprintf(stderr, "*\n");
+ fprintf(stderr, "* %s\n", msg);
+ fprintf(stderr, "* Error occurred in topology.c line %d\n", line);
+ fprintf(stderr, "*\n");
+ fprintf(stderr, "* Please make sure that distances given through the interface or environment\n");
+ fprintf(stderr, "* variables do not contradict any other topology information.\n");
+ fprintf(stderr, "****************************************************************************\n");
+ reported = 1;
+ }
+}
+
+static int hwloc_compare_distances(float a, float b, float accuracy)
+{
+ if (accuracy != 0.0 && fabsf(a-b) < a * accuracy)
+ return 0;
+ return a < b ? -1 : a == b ? 0 : 1;
+}
+
+/*
+ * Place objects in groups if they are in a transitive graph of minimal distances.
+ * Return how many groups were created, or 0 if some incomplete distance graphs were found.
+ */
+static unsigned
+hwloc__find_groups_by_min_distance(unsigned nbobjs,
+ float *_distances,
+ float accuracy,
+ unsigned *groupids,
+ int verbose)
+{
+ float min_distance = FLT_MAX;
+ unsigned groupid = 1;
+ unsigned i,j,k;
+ unsigned skipped = 0;
+
+#define DISTANCE(i, j) _distances[(i) * nbobjs + (j)]
+
+ memset(groupids, 0, nbobjs*sizeof(*groupids));
+
+ /* find the minimal distance */
+ for(i=0; i<nbobjs; i++)
+ for(j=0; j<nbobjs; j++) /* check the entire matrix, it may not be perfectly symmetric depending on the accuracy */
+ if (i != j && DISTANCE(i, j) < min_distance) /* no accuracy here, we want the real minimal */
+ min_distance = DISTANCE(i, j);
+ hwloc_debug("found minimal distance %f between objects\n", min_distance);
+
+ if (min_distance == FLT_MAX)
+ return 0;
+
+ /* build groups of objects connected with this distance */
+ for(i=0; i<nbobjs; i++) {
+ unsigned size;
+ int firstfound;
+
+ /* if already grouped, skip */
+ if (groupids[i])
+ continue;
+
+ /* start a new group */
+ groupids[i] = groupid;
+ size = 1;
+ firstfound = i;
+
+ while (firstfound != -1) {
+ /* we added new objects to the group, the first one was firstfound.
+ * rescan all connections from these new objects (starting at first found) to any other objects,
+ * so as to find new objects minimally-connected by transitivity.
+ */
+ int newfirstfound = -1;
+ for(j=firstfound; j<nbobjs; j++)
+ if (groupids[j] == groupid)
+ for(k=0; k<nbobjs; k++)
+ if (!groupids[k] && !hwloc_compare_distances(DISTANCE(j, k), min_distance, accuracy)) {
+ groupids[k] = groupid;
+ size++;
+ if (newfirstfound == -1)
+ newfirstfound = k;
+ if (i == j)
+ hwloc_debug("object %u is minimally connected to %u\n", k, i);
+ else
+ hwloc_debug("object %u is minimally connected to %u through %u\n", k, i, j);
+ }
+ firstfound = newfirstfound;
+ }
+
+ if (size == 1) {
+ /* cancel this useless group, ignore this object and try from the next one */
+ groupids[i] = 0;
+ skipped++;
+ continue;
+ }
+
+ /* validate this group */
+ groupid++;
+ if (verbose)
+ fprintf(stderr, "Found transitive graph with %u objects with minimal distance %f accuracy %f\n",
+ size, min_distance, accuracy);
+ }
+
+ if (groupid == 2 && !skipped)
+ /* we created a single group containing all objects, ignore it */
+ return 0;
+
+ /* return the last id, since it's also the number of used group ids */
+ return groupid-1;
+}
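
A worked illustration of the grouping pass above, with hypothetical values: for nbobjs=4 and the row-major matrix {1,2,4,4, 2,1,4,4, 4,4,1,2, 4,4,2,1}, the minimal off-diagonal distance is 2, the transitive scan yields the groups {0,1} and {2,3}, and the function returns 2 group ids (it would return 0 if a single group had covered all four objects).
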
+
+/* check that the matrix is ok */
+static int
+hwloc__check_grouping_matrix(unsigned nbobjs, float *_distances, float accuracy, int verbose)
+{
+ unsigned i,j;
+ for(i=0; i<nbobjs; i++) {
+ for(j=i+1; j<nbobjs; j++) {
+ /* should be symmetric */
+ if (hwloc_compare_distances(DISTANCE(i, j), DISTANCE(j, i), accuracy)) {
+ if (verbose)
+ fprintf(stderr, "Distance matrix asymmetric ([%u,%u]=%f != [%u,%u]=%f), aborting\n",
+ i, j, DISTANCE(i, j), j, i, DISTANCE(j, i));
+ return -1;
+ }
+ /* diagonal is smaller than everything else */
+ if (hwloc_compare_distances(DISTANCE(i, j), DISTANCE(i, i), accuracy) <= 0) {
+ if (verbose)
+ fprintf(stderr, "Distance to self not strictly minimal ([%u,%u]=%f <= [%u,%u]=%f), aborting\n",
+ i, j, DISTANCE(i, j), i, i, DISTANCE(i, i));
+ return -1;
+ }
+ }
+ }
+ return 0;
+}
+
+/*
+ * Look at object physical distances to group them.
+ */
+static void
+hwloc__groups_by_distances(struct hwloc_topology *topology,
+ unsigned nbobjs,
+ struct hwloc_obj **objs,
+ float *_distances,
+ unsigned nbaccuracies, float *accuracies,
+ int fromuser,
+ int needcheck,
+ int verbose)
+{
+ unsigned *groupids = NULL;
+ unsigned nbgroups = 0;
+ unsigned i,j;
+
+ if (nbobjs <= 2) {
+ return;
+ }
+
+ groupids = malloc(sizeof(unsigned) * nbobjs);
+ if (NULL == groupids) {
+ return;
+ }
+
+ for(i=0; i<nbaccuracies; i++) {
+ if (verbose)
+ fprintf(stderr, "Trying to group %u %s objects according to physical distances with accuracy %f\n",
+ nbobjs, hwloc_obj_type_string(objs[0]->type), accuracies[i]);
+ if (needcheck && hwloc__check_grouping_matrix(nbobjs, _distances, accuracies[i], verbose) < 0)
+ continue;
+ nbgroups = hwloc__find_groups_by_min_distance(nbobjs, _distances, accuracies[i], groupids, verbose);
+ if (nbgroups)
+ break;
+ }
+ if (!nbgroups)
+ goto outter_free;
+
+ /* For convenience, put these declarations inside a block. It's a
+ crying shame we can't use C99 syntax here, and have to do a bunch
+ of mallocs. :-( */
+ {
+ hwloc_obj_t *groupobjs = NULL;
+ unsigned *groupsizes = NULL;
+ float *groupdistances = NULL;
+ unsigned failed = 0;
+
+ groupobjs = malloc(sizeof(hwloc_obj_t) * nbgroups);
+ groupsizes = malloc(sizeof(unsigned) * nbgroups);
+ groupdistances = malloc(sizeof(float) * nbgroups * nbgroups);
+ if (NULL == groupobjs || NULL == groupsizes || NULL == groupdistances) {
+ goto inner_free;
+ }
+ /* create new Group objects and record their size */
+ memset(&(groupsizes[0]), 0, sizeof(groupsizes[0]) * nbgroups);
+ for(i=0; i<nbgroups; i++) {
+ /* create the Group object */
+ hwloc_obj_t group_obj, res_obj;
+ group_obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
+ group_obj->cpuset = hwloc_bitmap_alloc();
+ group_obj->attr->group.depth = topology->next_group_depth;
+ for (j=0; j<nbobjs; j++)
+ if (groupids[j] == i+1) {
+ /* assemble the group sets */
+ hwloc_obj_add_other_obj_sets(group_obj, objs[j]);
+ groupsizes[i]++;
+ }
+ hwloc_debug_1arg_bitmap("adding Group object with %u objects and cpuset %s\n",
+ groupsizes[i], group_obj->cpuset);
+ res_obj = hwloc__insert_object_by_cpuset(topology, group_obj,
+ fromuser ? hwloc_report_user_distance_error : hwloc_report_os_error);
+ /* res_obj may be NULL on failure to insert. */
+ if (!res_obj)
+ failed++;
+ /* or it may be different from groupobjs if we got groups from XML import before grouping */
+ groupobjs[i] = res_obj;
+ }
+
+ if (failed)
+ /* don't try to group above if we got a NULL group here, just keep this incomplete level */
+ goto inner_free;
+
+ /* factorize distances */
+ memset(&(groupdistances[0]), 0, sizeof(groupdistances[0]) * nbgroups * nbgroups);
+#undef DISTANCE
+#define DISTANCE(i, j) _distances[(i) * nbobjs + (j)]
+#define GROUP_DISTANCE(i, j) groupdistances[(i) * nbgroups + (j)]
+ for(i=0; i<nbobjs; i++)
+ if (groupids[i])
+ for(j=0; j<nbobjs; j++)
+ if (groupids[j])
+ GROUP_DISTANCE(groupids[i]-1, groupids[j]-1) += DISTANCE(i, j);
+ for(i=0; i<nbgroups; i++)
+ for(j=0; j<nbgroups; j++) {
+ unsigned groupsize = groupsizes[i]*groupsizes[j];
+ float groupsizef = (float) groupsize;
+ GROUP_DISTANCE(i, j) /= groupsizef;
+ }
+#ifdef HWLOC_DEBUG
+ hwloc_debug("%s", "generated new distance matrix between groups:\n");
+ hwloc_debug("%s", " index");
+ for(j=0; j<nbgroups; j++)
+ hwloc_debug(" % 5d", (int) j); /* print index because os_index is -1 for Groups */
+ hwloc_debug("%s", "\n");
+ for(i=0; i<nbgroups; i++) {
+ hwloc_debug(" % 5d", (int) i);
+ for(j=0; j<nbgroups; j++)
+ hwloc_debug(" %2.3f", GROUP_DISTANCE(i, j));
+ hwloc_debug("%s", "\n");
+ }
+#endif
+
+ topology->next_group_depth++;
+ hwloc__groups_by_distances(topology, nbgroups, groupobjs, (float*) groupdistances, nbaccuracies, accuracies, fromuser, 0 /* no need to check generated matrix */, verbose);
+
+ inner_free:
+ /* Safely free everything */
+ if (NULL != groupobjs) {
+ free(groupobjs);
+ }
+ if (NULL != groupsizes) {
+ free(groupsizes);
+ }
+ if (NULL != groupdistances) {
+ free(groupdistances);
+ }
+ }
+
+ outter_free:
+ if (NULL != groupids) {
+ free(groupids);
+ }
+}
+
+void
+hwloc_group_by_distances(struct hwloc_topology *topology)
+{
+ unsigned nbobjs;
+ struct hwloc_os_distances_s * osdist;
+ const char *env;
+ float accuracies[5] = { 0.0f, 0.01f, 0.02f, 0.05f, 0.1f };
+ unsigned nbaccuracies = 5;
+ hwloc_obj_t group_obj;
+ int verbose = 0;
+ unsigned i;
+ hwloc_localeswitch_declare;
+#ifdef HWLOC_DEBUG
+ unsigned j;
+#endif
+
+ env = getenv("HWLOC_GROUPING");
+ if (env && !atoi(env))
+ return;
+ /* backward compat with v1.2 */
+ if (getenv("HWLOC_IGNORE_DISTANCES"))
+ return;
+
+ hwloc_localeswitch_init();
+ env = getenv("HWLOC_GROUPING_ACCURACY");
+ if (!env) {
+ /* only use 0.0 */
+ nbaccuracies = 1;
+ } else if (strcmp(env, "try")) {
+ /* use the given value */
+ nbaccuracies = 1;
+ accuracies[0] = (float) atof(env);
+ } /* otherwise try all values */
+ hwloc_localeswitch_fini();
+
+#ifdef HWLOC_DEBUG
+ verbose = 1;
+#else
+ env = getenv("HWLOC_GROUPING_VERBOSE");
+ if (env)
+ verbose = atoi(env);
+#endif
+
+ for(osdist = topology->first_osdist; osdist; osdist = osdist->next) {
+
+ nbobjs = osdist->nbobjs;
+ if (!nbobjs)
+ continue;
+
+ if (osdist->objs) {
+ /* if we have objs, we must have distances as well,
+ * thanks to hwloc_convert_distances_indexes_into_objects()
+ */
+ assert(osdist->distances);
+
+#ifdef HWLOC_DEBUG
+ hwloc_debug("%s", "trying to group objects using distance matrix:\n");
+ hwloc_debug("%s", " index");
+ for(j=0; j<nbobjs; j++)
+ hwloc_debug(" % 5d", (int) osdist->objs[j]->os_index);
+ hwloc_debug("%s", "\n");
+ for(i=0; i<nbobjs; i++) {
+ hwloc_debug(" % 5d", (int) osdist->objs[i]->os_index);
+ for(j=0; j<nbobjs; j++)
+ hwloc_debug(" %2.3f", osdist->distances[i*nbobjs + j]);
+ hwloc_debug("%s", "\n");
+ }
+#endif
+
+ hwloc__groups_by_distances(topology, nbobjs,
+ osdist->objs,
+ osdist->distances,
+ nbaccuracies, accuracies,
+ osdist->indexes != NULL,
+ 1 /* check the first matrix */,
+ verbose);
+
+ /* add a final group object covering everybody so that the distance matrix can be stored somewhere.
+ * this group will be merged into a regular object if the matrix isn't strangely incomplete
+ */
+ group_obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
+ group_obj->attr->group.depth = (unsigned) -1;
+ group_obj->cpuset = hwloc_bitmap_alloc();
+ for(i=0; i<nbobjs; i++) {
+ /* assemble the group sets */
+ hwloc_obj_add_other_obj_sets(group_obj, osdist->objs[i]);
+ }
+ hwloc_debug_1arg_bitmap("adding Group object (as root of distance matrix with %u objects) with cpuset %s\n",
+ nbobjs, group_obj->cpuset);
+ hwloc__insert_object_by_cpuset(topology, group_obj,
+ osdist->indexes != NULL ? hwloc_report_user_distance_error : hwloc_report_os_error);
+ }
+ }
+}
diff --git a/ext/hwloc/hwloc/dolib.c b/ext/hwloc/hwloc/dolib.c
new file mode 100644
index 0000000..d5eff58
--- /dev/null
+++ b/ext/hwloc/hwloc/dolib.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009 inria. All rights reserved.
+ * Copyright © 2009, 2012 Université Bordeaux
+ * See COPYING in top-level directory.
+ */
+
+/* Wrapper to avoid msys' tendency to turn / into \ and : into ; */
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(int argc, char *argv[]) {
+ char *prog, *arch, *def, *version, *lib;
+ char s[1024];
+ char name[16];
+ int current, age, revision;
+
+ if (argc != 6) {
+ fprintf(stderr,"bad number of arguments");
+ exit(EXIT_FAILURE);
+ }
+
+ prog = argv[1];
+ arch = argv[2];
+ def = argv[3];
+ version = argv[4];
+ lib = argv[5];
+
+ if (sscanf(version, "%d:%d:%d", &current, &revision, &age) != 3)
+ exit(EXIT_FAILURE);
+
+ snprintf(name, sizeof(name), "libhwloc-%d", current - age);
+ printf("using soname %s\n", name);
+
+ snprintf(s, sizeof(s), "\"%s\" /machine:%s /def:%s /name:%s /out:%s",
+ prog, arch, def, name, lib);
+ if (system(s)) {
+ fprintf(stderr, "%s failed\n", s);
+ exit(EXIT_FAILURE);
+ }
+
+ exit(EXIT_SUCCESS);
+}
diff --git a/ext/hwloc/hwloc/misc.c b/ext/hwloc/hwloc/misc.c
new file mode 100644
index 0000000..3da6687
--- /dev/null
+++ b/ext/hwloc/hwloc/misc.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria. All rights reserved.
+ * Copyright © 2009-2010 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <private/private.h>
+#include <private/misc.h>
+
+#include <stdarg.h>
+#ifdef HAVE_SYS_UTSNAME_H
+#include <sys/utsname.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+
+#ifdef HAVE_PROGRAM_INVOCATION_NAME
+#include <errno.h>
+extern char *program_invocation_name;
+#endif
+#ifdef HAVE___PROGNAME
+extern char *__progname;
+#endif
+
+int hwloc_snprintf(char *str, size_t size, const char *format, ...)
+{
+ int ret;
+ va_list ap;
+ static char bin;
+ size_t fakesize;
+ char *fakestr;
+
+ /* Some systems crash on str == NULL */
+ if (!size) {
+ str = &bin;
+ size = 1;
+ }
+
+ va_start(ap, format);
+ ret = vsnprintf(str, size, format, ap);
+ va_end(ap);
+
+ if (ret >= 0 && (size_t) ret != size-1)
+ return ret;
+
+ /* vsnprintf returned size-1 or -1. That could be a system which reports the
+ * written data and not the actually required room. Try increasing buffer
+ * size to get the latter. */
+
+ fakesize = size;
+ fakestr = NULL;
+ do {
+ fakesize *= 2;
+ free(fakestr);
+ fakestr = malloc(fakesize);
+ if (NULL == fakestr)
+ return -1;
+ va_start(ap, format);
+ errno = 0;
+ ret = vsnprintf(fakestr, fakesize, format, ap);
+ va_end(ap);
+ } while ((size_t) ret == fakesize-1 || (ret < 0 && (!errno || errno == ERANGE)));
+
+ if (ret >= 0 && size) {
+ if (size > (size_t) ret+1)
+ size = ret+1;
+ memcpy(str, fakestr, size-1);
+ str[size-1] = 0;
+ }
+ free(fakestr);
+
+ return ret;
+}
+
+int hwloc_namecoloncmp(const char *haystack, const char *needle, size_t n)
+{
+ size_t i = 0;
+ while (*haystack && *haystack != ':') {
+ int ha = *haystack++;
+ int low_h = tolower(ha);
+ int ne = *needle++;
+ int low_n = tolower(ne);
+ if (low_h != low_n)
+ return 1;
+ i++;
+ }
+ return i < n;
+}
+
+void hwloc_add_uname_info(struct hwloc_topology *topology __hwloc_attribute_unused,
+ void *cached_uname __hwloc_attribute_unused)
+{
+#ifdef HAVE_UNAME
+ struct utsname _utsname, *utsname;
+
+ if (hwloc_obj_get_info_by_name(topology->levels[0][0], "OSName"))
+ /* don't annotate twice */
+ return;
+
+ if (cached_uname)
+ utsname = (struct utsname *) cached_uname;
+ else {
+ utsname = &_utsname;
+ if (uname(utsname) < 0)
+ return;
+ }
+
+ if (*utsname->sysname)
+ hwloc_obj_add_info(topology->levels[0][0], "OSName", utsname->sysname);
+ if (*utsname->release)
+ hwloc_obj_add_info(topology->levels[0][0], "OSRelease", utsname->release);
+ if (*utsname->version)
+ hwloc_obj_add_info(topology->levels[0][0], "OSVersion", utsname->version);
+ if (*utsname->nodename)
+ hwloc_obj_add_info(topology->levels[0][0], "HostName", utsname->nodename);
+ if (*utsname->machine)
+ hwloc_obj_add_info(topology->levels[0][0], "Architecture", utsname->machine);
+#endif /* HAVE_UNAME */
+}
+
+char *
+hwloc_progname(struct hwloc_topology *topology __hwloc_attribute_unused)
+{
+#if HAVE_DECL_GETMODULEFILENAME
+ char name[256], *basename;
+ unsigned res = GetModuleFileName(NULL, name, sizeof(name));
+ if (res == sizeof(name) || !res)
+ return NULL;
+ basename = strrchr(name, '\\');
+ if (!basename)
+ basename = name;
+ else
+ basename++;
+ return strdup(basename);
+#else /* !HAVE_GETMODULEFILENAME */
+ const char *name, *basename;
+#if HAVE_DECL_GETPROGNAME
+ name = getprogname(); /* FreeBSD, NetBSD, some Solaris */
+#elif HAVE_DECL_GETEXECNAME
+ name = getexecname(); /* Solaris */
+#elif defined HAVE_PROGRAM_INVOCATION_NAME
+ name = program_invocation_name; /* Glibc. BGQ CNK. */
+ /* could use program_invocation_short_name directly, but we have the code to remove the path below anyway */
+#elif defined HAVE___PROGNAME
+ name = __progname; /* fallback for most unix, used for OpenBSD */
+#else
+ /* TODO: _NSGetExecutablePath(path, &size) on Darwin */
+ /* TODO: AIX, HPUX, OSF */
+ name = NULL;
+#endif
+ if (!name)
+ return NULL;
+ basename = strrchr(name, '/');
+ if (!basename)
+ basename = name;
+ else
+ basename++;
+ return strdup(basename);
+#endif /* !HAVE_GETMODULEFILENAME */
+}
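
A short usage sketch for the wrapper above (buffer and format string are made up): hwloc_snprintf() is called exactly like snprintf(), but also returns the full required length on platforms whose vsnprintf() only reports the truncated size.

    char buf[8];
    /* returns 12 (length of "likwid-hwloc") even though buf keeps only 7 chars + NUL */
    int needed = hwloc_snprintf(buf, sizeof(buf), "%s-%s", "likwid", "hwloc");
    if (needed >= (int) sizeof(buf)) {
        /* the caller may allocate needed+1 bytes and retry */
    }
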
diff --git a/ext/hwloc/hwloc/pci-common.c b/ext/hwloc/hwloc/pci-common.c
new file mode 100644
index 0000000..1000ca1
--- /dev/null
+++ b/ext/hwloc/hwloc/pci-common.c
@@ -0,0 +1,482 @@
+/*
+ * Copyright © 2009-2015 Inria. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <hwloc/plugins.h>
+#include <private/debug.h>
+
+#ifdef HWLOC_DEBUG
+static void
+hwloc_pci_traverse_print_cb(void * cbdata __hwloc_attribute_unused,
+ struct hwloc_obj *pcidev)
+{
+ char busid[14];
+ hwloc_obj_t parent;
+
+ /* indent */
+ parent = pcidev->parent;
+ while (parent) {
+ hwloc_debug("%s", " ");
+ parent = parent->parent;
+ }
+
+ snprintf(busid, sizeof(busid), "%04x:%02x:%02x.%01x",
+ pcidev->attr->pcidev.domain, pcidev->attr->pcidev.bus, pcidev->attr->pcidev.dev, pcidev->attr->pcidev.func);
+
+ if (pcidev->type == HWLOC_OBJ_BRIDGE) {
+ if (pcidev->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
+ hwloc_debug("HostBridge");
+ else
+ hwloc_debug("Bridge [%04x:%04x]", busid,
+ pcidev->attr->pcidev.vendor_id, pcidev->attr->pcidev.device_id);
+ hwloc_debug(" to %04x:[%02x:%02x]\n",
+ pcidev->attr->bridge.downstream.pci.domain, pcidev->attr->bridge.downstream.pci.secondary_bus, pcidev->attr->bridge.downstream.pci.subordinate_bus);
+ } else
+ hwloc_debug("%s Device [%04x:%04x (%04x:%04x) rev=%02x class=%04x]\n", busid,
+ pcidev->attr->pcidev.vendor_id, pcidev->attr->pcidev.device_id,
+ pcidev->attr->pcidev.subvendor_id, pcidev->attr->pcidev.subdevice_id,
+ pcidev->attr->pcidev.revision, pcidev->attr->pcidev.class_id);
+}
+#endif /* HWLOC_DEBUG */
+
+static void
+hwloc_pci_traverse_lookuposdevices_cb(void * cbdata,
+ struct hwloc_obj *pcidev)
+{
+ struct hwloc_backend *backend = cbdata;
+
+ if (pcidev->type == HWLOC_OBJ_BRIDGE)
+ return;
+
+ hwloc_backends_notify_new_object(backend, pcidev);
+}
+
+static void
+hwloc_pci__traverse(void * cbdata, struct hwloc_obj *root,
+ void (*cb)(void * cbdata, struct hwloc_obj *))
+{
+ struct hwloc_obj *child = root->io_first_child;
+ while (child) {
+ cb(cbdata, child);
+ if (child->type == HWLOC_OBJ_BRIDGE)
+ hwloc_pci__traverse(cbdata, child, cb);
+ child = child->next_sibling;
+ }
+}
+
+static void
+hwloc_pci_traverse(void * cbdata, struct hwloc_obj *root,
+ void (*cb)(void * cbdata, struct hwloc_obj *))
+{
+ hwloc_pci__traverse(cbdata, root, cb);
+}
+
+enum hwloc_pci_busid_comparison_e {
+ HWLOC_PCI_BUSID_LOWER,
+ HWLOC_PCI_BUSID_HIGHER,
+ HWLOC_PCI_BUSID_INCLUDED,
+ HWLOC_PCI_BUSID_SUPERSET
+};
+
+static enum hwloc_pci_busid_comparison_e
+hwloc_pci_compare_busids(struct hwloc_obj *a, struct hwloc_obj *b)
+{
+ if (a->type == HWLOC_OBJ_BRIDGE)
+ assert(a->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI);
+ if (b->type == HWLOC_OBJ_BRIDGE)
+ assert(b->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI);
+
+ if (a->attr->pcidev.domain < b->attr->pcidev.domain)
+ return HWLOC_PCI_BUSID_LOWER;
+ if (a->attr->pcidev.domain > b->attr->pcidev.domain)
+ return HWLOC_PCI_BUSID_HIGHER;
+
+ if (a->type == HWLOC_OBJ_BRIDGE
+ && b->attr->pcidev.bus >= a->attr->bridge.downstream.pci.secondary_bus
+ && b->attr->pcidev.bus <= a->attr->bridge.downstream.pci.subordinate_bus)
+ return HWLOC_PCI_BUSID_SUPERSET;
+ if (b->type == HWLOC_OBJ_BRIDGE
+ && a->attr->pcidev.bus >= b->attr->bridge.downstream.pci.secondary_bus
+ && a->attr->pcidev.bus <= b->attr->bridge.downstream.pci.subordinate_bus)
+ return HWLOC_PCI_BUSID_INCLUDED;
+
+ if (a->attr->pcidev.bus < b->attr->pcidev.bus)
+ return HWLOC_PCI_BUSID_LOWER;
+ if (a->attr->pcidev.bus > b->attr->pcidev.bus)
+ return HWLOC_PCI_BUSID_HIGHER;
+
+ if (a->attr->pcidev.dev < b->attr->pcidev.dev)
+ return HWLOC_PCI_BUSID_LOWER;
+ if (a->attr->pcidev.dev > b->attr->pcidev.dev)
+ return HWLOC_PCI_BUSID_HIGHER;
+
+ if (a->attr->pcidev.func < b->attr->pcidev.func)
+ return HWLOC_PCI_BUSID_LOWER;
+ if (a->attr->pcidev.func > b->attr->pcidev.func)
+ return HWLOC_PCI_BUSID_HIGHER;
+
+ /* Should never reach here. Abort on both debug builds and
+ non-debug builds */
+ assert(0);
+ fprintf(stderr, "Bad assertion in hwloc %s:%d (aborting)\n", __FILE__, __LINE__);
+ exit(1);
+}
+
+static void
+hwloc_pci_add_object(struct hwloc_obj *root, struct hwloc_obj *new)
+{
+ struct hwloc_obj **curp, **childp;
+
+ curp = &root->io_first_child;
+ while (*curp) {
+ enum hwloc_pci_busid_comparison_e comp = hwloc_pci_compare_busids(new, *curp);
+ switch (comp) {
+ case HWLOC_PCI_BUSID_HIGHER:
+ /* go further */
+ curp = &(*curp)->next_sibling;
+ continue;
+ case HWLOC_PCI_BUSID_INCLUDED:
+ /* insert new below current bridge */
+ hwloc_pci_add_object(*curp, new);
+ return;
+ case HWLOC_PCI_BUSID_LOWER:
+ case HWLOC_PCI_BUSID_SUPERSET: {
+ /* insert new before current */
+ new->next_sibling = *curp;
+ *curp = new;
+ new->parent = root;
+ if (new->type == HWLOC_OBJ_BRIDGE) {
+ /* look at remaining siblings and move some below new */
+ childp = &new->io_first_child;
+ curp = &new->next_sibling;
+ while (*curp) {
+ if (hwloc_pci_compare_busids(new, *curp) == HWLOC_PCI_BUSID_LOWER) {
+ /* this sibling remains under root, after new */
+ curp = &(*curp)->next_sibling;
+ /* even if the list is sorted by busid, we can't break because the current bridge creates a bus that may be higher. some object may have to go there */
+ } else {
+ /* this sibling goes under new */
+ *childp = *curp;
+ *curp = (*curp)->next_sibling;
+ (*childp)->parent = new;
+ (*childp)->next_sibling = NULL;
+ childp = &(*childp)->next_sibling;
+ }
+ }
+ }
+ return;
+ }
+ }
+ }
+ /* add to the end of the list if higher than everybody */
+ new->parent = root;
+ new->next_sibling = NULL;
+ *curp = new;
+}
+
+static struct hwloc_obj *
+hwloc_pci_fixup_hostbridge_parent(struct hwloc_topology *topology __hwloc_attribute_unused,
+ struct hwloc_obj *hostbridge,
+ struct hwloc_obj *parent)
+{
+ /* Xeon E5v3 in cluster-on-die mode only has PCI on the first NUMA node of each package,
+ * but many dual-processor hosts report the second PCI hierarchy on the 2nd NUMA node of the first package.
+ */
+ if (parent->depth >= 2
+ && parent->type == HWLOC_OBJ_NUMANODE
+ && parent->sibling_rank == 1 && parent->parent->arity == 2
+ && parent->parent->type == HWLOC_OBJ_PACKAGE
+ && parent->parent->sibling_rank == 0 && parent->parent->parent->arity == 2) {
+ const char *cpumodel = hwloc_obj_get_info_by_name(parent->parent, "CPUModel");
+ if (cpumodel && strstr(cpumodel, "Xeon")) {
+ if (!hwloc_hide_errors()) {
+ fprintf(stderr, "****************************************************************************\n");
+ fprintf(stderr, "* hwloc %s has encountered an incorrect PCI locality information.\n", HWLOC_VERSION);
+ fprintf(stderr, "* PCI bus %04x:%02x is supposedly close to 2nd NUMA node of 1st package,\n",
+ hostbridge->io_first_child->attr->pcidev.domain, hostbridge->io_first_child->attr->pcidev.bus);
+ fprintf(stderr, "* however hwloc believes this is impossible on this architecture.\n");
+ fprintf(stderr, "* Therefore the PCI bus will be moved to 1st NUMA node of 2nd package.\n");
+ fprintf(stderr, "*\n");
+ fprintf(stderr, "* If you feel this fixup is wrong, disable it by setting in your environment\n");
+ fprintf(stderr, "* HWLOC_PCI_%04x_%02x_LOCALCPUS= (empty value), and report the problem\n",
+ hostbridge->io_first_child->attr->pcidev.domain, hostbridge->io_first_child->attr->pcidev.bus);
+ fprintf(stderr, "* to the hwloc's user mailing list together with the XML output of lstopo.\n");
+ fprintf(stderr, "*\n");
+ fprintf(stderr, "* You may silence this message by setting HWLOC_HIDE_ERRORS=1 in your environment.\n");
+ fprintf(stderr, "****************************************************************************\n");
+ }
+ return parent->parent->next_sibling->first_child;
+ }
+ }
+
+ return parent;
+}
+
+static struct hwloc_obj *
+hwloc_pci_find_hostbridge_parent(struct hwloc_topology *topology, struct hwloc_backend *backend,
+ struct hwloc_obj *hostbridge)
+{
+ hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
+ hwloc_obj_t group_obj, parent;
+ const char *env;
+ int err;
+
+ /* override the cpuset with the environment if given */
+ int forced = 0;
+ char envname[256];
+ snprintf(envname, sizeof(envname), "HWLOC_PCI_%04x_%02x_LOCALCPUS",
+ hostbridge->io_first_child->attr->pcidev.domain, hostbridge->io_first_child->attr->pcidev.bus);
+ env = getenv(envname);
+ if (env)
+ /* if env exists but is empty, don't let quirks change what the OS reports */
+ forced = 1;
+ if (env && *env) {
+ /* force the hostbridge cpuset */
+ hwloc_debug("Overriding localcpus using %s in the environment\n", envname);
+ hwloc_bitmap_sscanf(cpuset, env);
+ } else {
+ /* get the hostbridge cpuset by asking the OS backend.
+ * it's not a PCI device, so we use its first child locality info.
+ */
+ err = hwloc_backends_get_obj_cpuset(backend, hostbridge->io_first_child, cpuset);
+ if (err < 0)
+ /* if we got nothing, assume the hostbridge is attached to the top of hierarchy */
+ hwloc_bitmap_copy(cpuset, hwloc_topology_get_topology_cpuset(topology));
+ }
+
+ hwloc_debug_bitmap("Attaching hostbridge to cpuset %s\n", cpuset);
+
+ /* restrict to the existing complete cpuset to avoid errors later */
+ hwloc_bitmap_and(cpuset, cpuset, hwloc_topology_get_complete_cpuset(topology));
+
+ /* if the remaining cpuset is empty, take the root */
+ if (hwloc_bitmap_iszero(cpuset))
+ hwloc_bitmap_copy(cpuset, hwloc_topology_get_complete_cpuset(topology));
+
+ group_obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
+ if (group_obj) {
+ group_obj->complete_cpuset = hwloc_bitmap_dup(cpuset);
+ hwloc_bitmap_and(cpuset, cpuset, hwloc_topology_get_topology_cpuset(topology));
+ group_obj->cpuset = hwloc_bitmap_dup(cpuset);
+ group_obj->attr->group.depth = (unsigned) -1;
+ parent = hwloc__insert_object_by_cpuset(topology, group_obj, hwloc_report_os_error);
+ if (parent == group_obj) {
+ /* group inserted without being merged, setup its sets */
+ hwloc_obj_add_children_sets(group_obj);
+ } else if (!parent) {
+ /* Failed to insert the parent, maybe a conflicting cpuset, attach to the root object instead */
+ parent = hwloc_get_root_obj(topology);
+ } else {
+ /* Got merged. This object has the right cpuset, but it could be a cache or so,
+ * go up as long as the (complete)cpuset is the same.
+ */
+ while (parent->parent) {
+ if (parent->complete_cpuset && parent->parent->complete_cpuset) {
+ if (!hwloc_bitmap_isequal(parent->complete_cpuset, parent->parent->complete_cpuset))
+ break;
+ } else {
+ if (!hwloc_bitmap_isequal(parent->cpuset, parent->parent->cpuset))
+ break;
+ }
+ parent = parent->parent;
+ }
+
+ if (!forced)
+ parent = hwloc_pci_fixup_hostbridge_parent(topology, hostbridge, parent);
+ }
+ } else {
+ /* Failed to create the Group, attach to the root object instead */
+ parent = hwloc_get_root_obj(topology);
+ }
+
+ hwloc_bitmap_free(cpuset);
+
+ return parent;
+}
+
+int
+hwloc_insert_pci_device_list(struct hwloc_backend *backend,
+ struct hwloc_obj *first_obj)
+{
+ struct hwloc_topology *topology = backend->topology;
+ struct hwloc_obj fakeparent;
+ struct hwloc_obj *obj;
+ unsigned current_hostbridge;
+
+ if (!first_obj)
+ /* found nothing, exit */
+ return 0;
+
+ /* first, organise objects as a tree under a fake parent object */
+ fakeparent.parent = NULL;
+ fakeparent.io_first_child = NULL;
+ while (first_obj) {
+ obj = first_obj;
+ first_obj = obj->next_sibling;
+ hwloc_pci_add_object(&fakeparent, obj);
+ }
+
+#ifdef HWLOC_DEBUG
+ hwloc_debug("%s", "\nPCI hierarchy under fake parent:\n");
+ hwloc_pci_traverse(NULL, &fakeparent, hwloc_pci_traverse_print_cb);
+ hwloc_debug("%s", "\n");
+#endif
+
+ /* walk the hierarchy, and lookup OS devices */
+ hwloc_pci_traverse(backend, &fakeparent, hwloc_pci_traverse_lookuposdevices_cb);
+
+ /*
+ * fakeparent lists all objects connected to any upstream bus in the machine.
+ * We now create one real hostbridge object per upstream bus.
+ * It's not actually a PCI device so we have to create it.
+ */
+ current_hostbridge = 0;
+ while (fakeparent.io_first_child) {
+ /* start a new host bridge */
+ struct hwloc_obj *hostbridge = hwloc_alloc_setup_object(HWLOC_OBJ_BRIDGE, current_hostbridge++);
+ struct hwloc_obj **dstnextp = &hostbridge->io_first_child;
+ struct hwloc_obj **srcnextp = &fakeparent.io_first_child;
+ struct hwloc_obj *child = *srcnextp;
+ struct hwloc_obj *parent;
+ unsigned short current_domain = child->attr->pcidev.domain;
+ unsigned char current_bus = child->attr->pcidev.bus;
+ unsigned char current_subordinate = current_bus;
+
+ hwloc_debug("Starting new PCI hostbridge %04x:%02x\n", current_domain, current_bus);
+
+ next_child:
+ /* remove next child from fakeparent */
+ *srcnextp = child->next_sibling;
+ /* append it to hostbridge */
+ *dstnextp = child;
+ child->parent = hostbridge;
+ child->next_sibling = NULL;
+ dstnextp = &child->next_sibling;
+
+ /* compute hostbridge secondary/subordinate buses */
+ if (child->type == HWLOC_OBJ_BRIDGE
+ && child->attr->bridge.downstream.pci.subordinate_bus > current_subordinate)
+ current_subordinate = child->attr->bridge.downstream.pci.subordinate_bus;
+
+ /* use next child if it has the same domain/bus */
+ child = *srcnextp;
+ if (child
+ && child->attr->pcidev.domain == current_domain
+ && child->attr->pcidev.bus == current_bus)
+ goto next_child;
+
+ /* finish setting up this hostbridge */
+ hostbridge->attr->bridge.upstream_type = HWLOC_OBJ_BRIDGE_HOST;
+ hostbridge->attr->bridge.downstream_type = HWLOC_OBJ_BRIDGE_PCI;
+ hostbridge->attr->bridge.downstream.pci.domain = current_domain;
+ hostbridge->attr->bridge.downstream.pci.secondary_bus = current_bus;
+ hostbridge->attr->bridge.downstream.pci.subordinate_bus = current_subordinate;
+ hwloc_debug("New PCI hostbridge %04x:[%02x-%02x]\n",
+ current_domain, current_bus, current_subordinate);
+
+ /* attach the hostbridge where it belongs */
+ parent = hwloc_pci_find_hostbridge_parent(topology, backend, hostbridge);
+ hwloc_insert_object_by_parent(topology, parent, hostbridge);
+ }
+
+ return 1;
+}
+
+#define HWLOC_PCI_STATUS 0x06
+#define HWLOC_PCI_STATUS_CAP_LIST 0x10
+#define HWLOC_PCI_CAPABILITY_LIST 0x34
+#define HWLOC_PCI_CAP_LIST_ID 0
+#define HWLOC_PCI_CAP_LIST_NEXT 1
+
+unsigned
+hwloc_pci_find_cap(const unsigned char *config, unsigned cap)
+{
+ unsigned char seen[256] = { 0 };
+ unsigned char ptr; /* unsigned char to make sure we stay within the 256-byte config space */
+
+ if (!(config[HWLOC_PCI_STATUS] & HWLOC_PCI_STATUS_CAP_LIST))
+ return 0;
+
+ for (ptr = config[HWLOC_PCI_CAPABILITY_LIST] & ~3;
+ ptr; /* exit if next is 0 */
+ ptr = config[ptr + HWLOC_PCI_CAP_LIST_NEXT] & ~3) {
+ unsigned char id;
+
+ /* Looped around! */
+ if (seen[ptr])
+ break;
+ seen[ptr] = 1;
+
+ id = config[ptr + HWLOC_PCI_CAP_LIST_ID];
+ if (id == cap)
+ return ptr;
+ if (id == 0xff) /* exit if id is 0 or 0xff */
+ break;
+ }
+ return 0;
+}
+
+#define HWLOC_PCI_EXP_LNKSTA 0x12
+#define HWLOC_PCI_EXP_LNKSTA_SPEED 0x000f
+#define HWLOC_PCI_EXP_LNKSTA_WIDTH 0x03f0
+
+int
+hwloc_pci_find_linkspeed(const unsigned char *config,
+ unsigned offset, float *linkspeed)
+{
+ unsigned linksta, speed, width;
+ float lanespeed;
+
+ memcpy(&linksta, &config[offset + HWLOC_PCI_EXP_LNKSTA], 4);
+ speed = linksta & HWLOC_PCI_EXP_LNKSTA_SPEED; /* PCIe generation */
+ width = (linksta & HWLOC_PCI_EXP_LNKSTA_WIDTH) >> 4; /* how many lanes */
+ /* PCIe Gen1 = 2.5GT/s signal-rate per lane with 8/10 encoding = 0.25GB/s data-rate per lane
+ * PCIe Gen2 = 5 GT/s signal-rate per lane with 8/10 encoding = 0.5 GB/s data-rate per lane
+ * PCIe Gen3 = 8 GT/s signal-rate per lane with 128/130 encoding = 1 GB/s data-rate per lane
+ */
+ lanespeed = speed <= 2 ? 2.5 * speed * 0.8 : 8.0 * 128/130; /* Gbit/s per lane */
+ *linkspeed = lanespeed * width / 8; /* GB/s */
+ return 0;
+}
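
For orientation, a hedged sketch of how the two helpers above are typically combined (cfg is assumed to be a 256-byte copy of a device's configuration space; 0x10 is the standard PCI Express capability ID):

    float gbs;
    unsigned cap = hwloc_pci_find_cap(cfg, 0x10 /* PCI Express capability */);
    if (cap && hwloc_pci_find_linkspeed(cfg, cap, &gbs) == 0)
        printf("PCIe link: %.2f GB/s\n", gbs);
    /* e.g. a Gen3 x16 link: 8 GT/s * 128/130 ~= 7.88 Gbit/s per lane,
     * times 16 lanes, divided by 8 bits ~= 15.75 GB/s */
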
+
+#define HWLOC_PCI_HEADER_TYPE 0x0e
+#define HWLOC_PCI_HEADER_TYPE_BRIDGE 1
+#define HWLOC_PCI_CLASS_BRIDGE_PCI 0x0604
+#define HWLOC_PCI_PRIMARY_BUS 0x18
+#define HWLOC_PCI_SECONDARY_BUS 0x19
+#define HWLOC_PCI_SUBORDINATE_BUS 0x1a
+
+int
+hwloc_pci_prepare_bridge(hwloc_obj_t obj,
+ const unsigned char *config)
+{
+ unsigned char headertype;
+ unsigned isbridge;
+ struct hwloc_pcidev_attr_s *pattr = &obj->attr->pcidev;
+ struct hwloc_bridge_attr_s *battr;
+
+ headertype = config[HWLOC_PCI_HEADER_TYPE] & 0x7f;
+ isbridge = (pattr->class_id == HWLOC_PCI_CLASS_BRIDGE_PCI
+ && headertype == HWLOC_PCI_HEADER_TYPE_BRIDGE);
+
+ if (!isbridge)
+ return 0;
+
+ battr = &obj->attr->bridge;
+
+ if (config[HWLOC_PCI_PRIMARY_BUS] != pattr->bus)
+ hwloc_debug(" %04x:%02x:%02x.%01x bridge with (ignored) invalid PCI_PRIMARY_BUS %02x\n",
+ pattr->domain, pattr->bus, pattr->dev, pattr->func, config[HWLOC_PCI_PRIMARY_BUS]);
+
+ obj->type = HWLOC_OBJ_BRIDGE;
+ battr->upstream_type = HWLOC_OBJ_BRIDGE_PCI;
+ battr->downstream_type = HWLOC_OBJ_BRIDGE_PCI;
+ battr->downstream.pci.domain = pattr->domain;
+ battr->downstream.pci.secondary_bus = config[HWLOC_PCI_SECONDARY_BUS];
+ battr->downstream.pci.subordinate_bus = config[HWLOC_PCI_SUBORDINATE_BUS];
+
+ return 0;
+}
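
As a usage illustration only (not part of the patch), the two helpers above compose as follows: hwloc_pci_find_cap() locates a capability in the 256-byte config-space buffer, and its offset is handed to hwloc_pci_find_linkspeed(). The value 0x10 is the standard PCI Express capability ID; the source of the buffer is an assumption.

/* `config' is assumed to hold the first 256 bytes of a device's PCI config
 * space (e.g. read from /sys/bus/pci/devices/<bdf>/config); requires <stdio.h>
 * and the declarations above. */
static int print_pcie_linkspeed(const unsigned char *config)
{
    unsigned offset = hwloc_pci_find_cap(config, 0x10 /* PCI Express capability ID */);
    float gbs;

    if (!offset)
        return -1; /* no PCIe capability: not a PCIe device */
    if (hwloc_pci_find_linkspeed(config, offset, &gbs) < 0)
        return -1;
    printf("PCIe link speed: %.2f GB/s\n", gbs);
    return 0;
}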
diff --git a/ext/hwloc/hwloc/topology-bgq.cb b/ext/hwloc/hwloc/topology-bgq.cb
new file mode 100644
index 0000000..3998f31
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-bgq.cb
@@ -0,0 +1,246 @@
+/*
+ * Copyright © 2013-2015 Inria. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <sys/utsname.h>
+#include <spi/include/kernel/location.h>
+#include <spi/include/kernel/process.h>
+
+static int
+hwloc_look_bgq(struct hwloc_backend *backend)
+{
+ struct hwloc_topology *topology = backend->topology;
+ unsigned i;
+ const char *env;
+
+ if (!topology->levels[0][0]->cpuset) {
+ /* Nobody created objects yet, setup everything */
+ hwloc_bitmap_t set;
+ hwloc_obj_t obj;
+
+#define HWLOC_BGQ_CORES 17 /* spare core ignored for now */
+
+ hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+ /* mark the 17th core (OS-reserved) as disallowed */
+ hwloc_bitmap_clr_range(topology->levels[0][0]->allowed_cpuset, (HWLOC_BGQ_CORES-1)*4, HWLOC_BGQ_CORES*4-1);
+
+ env = getenv("BG_THREADMODEL");
+ if (!env || atoi(env) != 2) {
+ /* process cannot use cores/threads outside of its Kernel_ThreadMask() */
+ uint64_t bgmask = Kernel_ThreadMask(Kernel_MyTcoord());
+ /* the mask is reversed, manually reverse it */
+ for(i=0; i<64; i++)
+ if (((bgmask >> i) & 1) == 0)
+ hwloc_bitmap_clr(topology->levels[0][0]->allowed_cpuset, 63-i);
+ }
+
+ /* a single memory bank */
+ obj = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, 0);
+ set = hwloc_bitmap_alloc();
+ hwloc_bitmap_set_range(set, 0, HWLOC_BGQ_CORES*4-1);
+ obj->cpuset = set;
+ set = hwloc_bitmap_alloc();
+ hwloc_bitmap_set(set, 0);
+ obj->nodeset = set;
+ obj->memory.local_memory = 16ULL*1024*1024*1024ULL;
+ hwloc_insert_object_by_cpuset(topology, obj);
+
+ /* package */
+ obj = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, 0);
+ set = hwloc_bitmap_alloc();
+ hwloc_bitmap_set_range(set, 0, HWLOC_BGQ_CORES*4-1);
+ obj->cpuset = set;
+ hwloc_obj_add_info(obj, "CPUModel", "IBM PowerPC A2");
+ hwloc_insert_object_by_cpuset(topology, obj);
+
+ /* shared L2 */
+ obj = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+ obj->cpuset = hwloc_bitmap_dup(set);
+ obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+ obj->attr->cache.depth = 2;
+ obj->attr->cache.size = 32*1024*1024;
+ obj->attr->cache.linesize = 128;
+ obj->attr->cache.associativity = 16;
+ hwloc_insert_object_by_cpuset(topology, obj);
+
+ /* Cores */
+ for(i=0; i<HWLOC_BGQ_CORES; i++) {
+ /* Core */
+ obj = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, i);
+ set = hwloc_bitmap_alloc();
+ hwloc_bitmap_set_range(set, i*4, i*4+3);
+ obj->cpuset = set;
+ hwloc_insert_object_by_cpuset(topology, obj);
+ /* L1d */
+ obj = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+ obj->cpuset = hwloc_bitmap_dup(set);
+ obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
+ obj->attr->cache.depth = 1;
+ obj->attr->cache.size = 16*1024;
+ obj->attr->cache.linesize = 64;
+ obj->attr->cache.associativity = 8;
+ hwloc_insert_object_by_cpuset(topology, obj);
+ /* L1i */
+ obj = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+ obj->cpuset = hwloc_bitmap_dup(set);
+ obj->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
+ obj->attr->cache.depth = 1;
+ obj->attr->cache.size = 16*1024;
+ obj->attr->cache.linesize = 64;
+ obj->attr->cache.associativity = 4;
+ hwloc_insert_object_by_cpuset(topology, obj);
+ /* there's also an L1p "prefetch cache" of 4kB with 128B lines */
+ }
+
+ /* PUs */
+ hwloc_setup_pu_level(topology, HWLOC_BGQ_CORES*4);
+ }
+
+ /* Add BGQ specific information */
+
+ hwloc_obj_add_info(topology->levels[0][0], "Backend", "BGQ");
+ if (topology->is_thissystem)
+ hwloc_add_uname_info(topology, NULL);
+ return 1;
+}
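
To make the fixed Blue Gene/Q layout built above easier to follow, a small stand-alone sketch (not part of the patch): 17 cores with 4 hardware threads each, PU p living on core p/4, and the Kernel_ThreadMask() bits arriving reversed so that bit 63 describes PU 0, matching the manual reversal loop above.

#include <stdint.h>

/* 1 if PU `pu' (pu < 64) is allowed by `bgmask'; bit 63 of the mask
 * corresponds to PU 0, as in the reversal loop above. */
static int bgq_pu_allowed(uint64_t bgmask, unsigned pu)
{
    return (bgmask >> (63 - pu)) & 1;
}

/* Cores own PUs [4*i, 4*i+3], cf. hwloc_bitmap_set_range(set, i*4, i*4+3). */
static unsigned bgq_core_of_pu(unsigned pu)
{
    return pu / 4;
}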
+
+static int
+hwloc_bgq_get_thread_cpubind(hwloc_topology_t topology, pthread_t thread, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+ unsigned pu;
+ cpu_set_t bg_set;
+ int err;
+
+ if (topology->pid) {
+ errno = ENOSYS;
+ return -1;
+ }
+ err = pthread_getaffinity_np(thread, sizeof(bg_set), &bg_set);
+ if (err) {
+ errno = err;
+ return -1;
+ }
+ for(pu=0; pu<64; pu++)
+ if (CPU_ISSET(pu, &bg_set)) {
+ /* the binding cannot contain multiple PUs */
+ hwloc_bitmap_only(hwloc_set, pu);
+ break;
+ }
+ return 0;
+}
+
+static int
+hwloc_bgq_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+ if (topology->pid) {
+ errno = ENOSYS;
+ return -1;
+ }
+ hwloc_bitmap_only(hwloc_set, Kernel_ProcessorID());
+ return 0;
+}
+
+static int
+hwloc_bgq_set_thread_cpubind(hwloc_topology_t topology, pthread_t thread, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+ unsigned pu;
+ cpu_set_t bg_set;
+ int err;
+
+ if (topology->pid) {
+ errno = ENOSYS;
+ return -1;
+ }
+ /* the binding cannot contain multiple PUs.
+ * keep the first PU only, and error out if STRICT.
+ */
+ if (hwloc_bitmap_weight(hwloc_set) != 1) {
+ if ((flags & HWLOC_CPUBIND_STRICT)) {
+ errno = ENOSYS;
+ return -1;
+ }
+ }
+ pu = hwloc_bitmap_first(hwloc_set);
+ CPU_ZERO(&bg_set);
+ CPU_SET(pu, &bg_set);
+ err = pthread_setaffinity_np(thread, sizeof(bg_set), &bg_set);
+ if (err) {
+ errno = err;
+ return -1;
+ }
+ return 0;
+}
+
+static int
+hwloc_bgq_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+ return hwloc_bgq_set_thread_cpubind(topology, pthread_self(), hwloc_set, flags);
+}
+
+void
+hwloc_set_bgq_hooks(struct hwloc_binding_hooks *hooks __hwloc_attribute_unused,
+ struct hwloc_topology_support *support __hwloc_attribute_unused)
+{
+ hooks->set_thisthread_cpubind = hwloc_bgq_set_thisthread_cpubind;
+ hooks->set_thread_cpubind = hwloc_bgq_set_thread_cpubind;
+ hooks->get_thisthread_cpubind = hwloc_bgq_get_thisthread_cpubind;
+ hooks->get_thread_cpubind = hwloc_bgq_get_thread_cpubind;
+ /* threads cannot be bound to more than one PU, so get_last_cpu_location == get_cpubind */
+ hooks->get_thisthread_last_cpu_location = hwloc_bgq_get_thisthread_cpubind;
+ /* hooks->get_thread_last_cpu_location = hwloc_bgq_get_thread_cpubind; */
+}
+
+static struct hwloc_backend *
+hwloc_bgq_component_instantiate(struct hwloc_disc_component *component,
+ const void *_data1 __hwloc_attribute_unused,
+ const void *_data2 __hwloc_attribute_unused,
+ const void *_data3 __hwloc_attribute_unused)
+{
+ struct utsname utsname;
+ struct hwloc_backend *backend;
+ const char *env;
+ int err;
+
+ env = getenv("HWLOC_FORCE_BGQ");
+ if (!env || !atoi(env)) {
+ err = uname(&utsname);
+ if (err || strcmp(utsname.sysname, "CNK") || strcmp(utsname.machine, "BGQ")) {
+ fprintf(stderr, "*** Found unexpected uname sysname `%s' machine `%s'\n", utsname.sysname, utsname.machine);
+ fprintf(stderr, "*** The BGQ backend is only enabled on compute nodes by default (sysname=CNK machine=BGQ)\n");
+ fprintf(stderr, "*** Set HWLOC_FORCE_BGQ=1 in the environment to enforce the BGQ backend anyway.\n");
+ return NULL;
+ }
+ }
+
+ backend = hwloc_backend_alloc(component);
+ if (!backend)
+ return NULL;
+ backend->discover = hwloc_look_bgq;
+ return backend;
+}
+
+static struct hwloc_disc_component hwloc_bgq_disc_component = {
+ HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+ "bgq",
+ ~0,
+ hwloc_bgq_component_instantiate,
+ 50,
+ NULL
+};
+
+const struct hwloc_component hwloc_bgq_component = {
+ HWLOC_COMPONENT_ABI,
+ NULL, NULL,
+ HWLOC_COMPONENT_TYPE_DISC,
+ 0,
+ &hwloc_bgq_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology-darwin.cb b/ext/hwloc/hwloc/topology-darwin.cb
new file mode 100644
index 0000000..1062a1d
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-darwin.cb
@@ -0,0 +1,307 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria. All rights reserved.
+ * Copyright © 2009-2013 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* Detect topology changes: register for power-management notifications and check
+ * whether, e.g., hw.activecpu changed */
+
+/* Apparently, Darwin people do not _want_ to provide binding functions. */
+
+#include <private/autogen/config.h>
+
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+
+static int
+hwloc_look_darwin(struct hwloc_backend *backend)
+{
+ struct hwloc_topology *topology = backend->topology;
+ int64_t _nprocs;
+ unsigned nprocs;
+ int64_t _npackages;
+ unsigned i, j, cpu;
+ struct hwloc_obj *obj;
+ size_t size;
+ int64_t l1dcachesize, l1icachesize;
+ int64_t cacheways[2];
+ int64_t l2cachesize;
+ int64_t cachelinesize;
+ int64_t memsize;
+ char cpumodel[64];
+
+ if (topology->levels[0][0]->cpuset)
+ /* somebody discovered things */
+ return 0;
+
+ hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+
+ if (hwloc_get_sysctlbyname("hw.ncpu", &_nprocs) || _nprocs <= 0)
+ return -1;
+ nprocs = _nprocs;
+ topology->support.discovery->pu = 1;
+
+ hwloc_debug("%u procs\n", nprocs);
+
+ size = sizeof(cpumodel);
+ if (sysctlbyname("machdep.cpu.brand_string", cpumodel, &size, NULL, 0))
+ cpumodel[0] = '\0';
+
+ if (!hwloc_get_sysctlbyname("hw.packages", &_npackages) && _npackages > 0) {
+ unsigned npackages = _npackages;
+ int64_t _cores_per_package;
+ int64_t _logical_per_package;
+ unsigned logical_per_package;
+
+ hwloc_debug("%u packages\n", npackages);
+
+ if (!hwloc_get_sysctlbyname("machdep.cpu.logical_per_package", &_logical_per_package) && _logical_per_package > 0)
+ logical_per_package = _logical_per_package;
+ else
+ /* Assume the trivial mapping. */
+ logical_per_package = nprocs / npackages;
+
+ hwloc_debug("%u threads per package\n", logical_per_package);
+
+
+ if (nprocs == npackages * logical_per_package)
+ for (i = 0; i < npackages; i++) {
+ obj = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, i);
+ obj->cpuset = hwloc_bitmap_alloc();
+ for (cpu = i*logical_per_package; cpu < (i+1)*logical_per_package; cpu++)
+ hwloc_bitmap_set(obj->cpuset, cpu);
+
+ hwloc_debug_1arg_bitmap("package %u has cpuset %s\n",
+ i, obj->cpuset);
+
+ if (cpumodel[0] != '\0')
+ hwloc_obj_add_info(obj, "CPUModel", cpumodel);
+ hwloc_insert_object_by_cpuset(topology, obj);
+ }
+ else
+ if (cpumodel[0] != '\0')
+ hwloc_obj_add_info(topology->levels[0][0], "CPUModel", cpumodel);
+
+ if (!hwloc_get_sysctlbyname("machdep.cpu.cores_per_package", &_cores_per_package) && _cores_per_package > 0) {
+ unsigned cores_per_package = _cores_per_package;
+ hwloc_debug("%u cores per package\n", cores_per_package);
+
+ if (!(logical_per_package % cores_per_package))
+ for (i = 0; i < npackages * cores_per_package; i++) {
+ obj = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, i);
+ obj->cpuset = hwloc_bitmap_alloc();
+ for (cpu = i*(logical_per_package/cores_per_package);
+ cpu < (i+1)*(logical_per_package/cores_per_package);
+ cpu++)
+ hwloc_bitmap_set(obj->cpuset, cpu);
+
+ hwloc_debug_1arg_bitmap("core %u has cpuset %s\n",
+ i, obj->cpuset);
+ hwloc_insert_object_by_cpuset(topology, obj);
+ }
+ }
+ } else
+ if (cpumodel[0] != '\0')
+ hwloc_obj_add_info(topology->levels[0][0], "CPUModel", cpumodel);
+
+ if (hwloc_get_sysctlbyname("hw.l1dcachesize", &l1dcachesize))
+ l1dcachesize = 0;
+
+ if (hwloc_get_sysctlbyname("hw.l1icachesize", &l1icachesize))
+ l1icachesize = 0;
+
+ if (hwloc_get_sysctlbyname("hw.l2cachesize", &l2cachesize))
+ l2cachesize = 0;
+
+ if (hwloc_get_sysctlbyname("machdep.cpu.cache.L1_associativity", &cacheways[0]))
+ cacheways[0] = 0;
+ else if (cacheways[0] == 0xff)
+ cacheways[0] = -1;
+
+ if (hwloc_get_sysctlbyname("machdep.cpu.cache.L2_associativity", &cacheways[1]))
+ cacheways[1] = 0;
+ else if (cacheways[1] == 0xff)
+ cacheways[1] = -1;
+
+ if (hwloc_get_sysctlbyname("hw.cachelinesize", &cachelinesize))
+ cachelinesize = 0;
+
+ if (hwloc_get_sysctlbyname("hw.memsize", &memsize))
+ memsize = 0;
+
+ if (!sysctlbyname("hw.cacheconfig", NULL, &size, NULL, 0)) {
+ unsigned n = size / sizeof(uint32_t);
+ uint64_t *cacheconfig = NULL;
+ uint64_t *cachesize = NULL;
+ uint32_t *cacheconfig32 = NULL;
+
+ cacheconfig = malloc(sizeof(uint64_t) * n);
+ if (NULL == cacheconfig) {
+ goto out;
+ }
+ cachesize = malloc(sizeof(uint64_t) * n);
+ if (NULL == cachesize) {
+ goto out;
+ }
+ cacheconfig32 = malloc(sizeof(uint32_t) * n);
+ if (NULL == cacheconfig32) {
+ goto out;
+ }
+
+ if ((!sysctlbyname("hw.cacheconfig", cacheconfig, &size, NULL, 0))) {
+ /* Yeech. Darwin seemingly has changed from 32bit to 64bit integers for
+ * cacheconfig, with apparently no way for detection. Assume the machine
+ * won't have more than 4 billion cpus */
+ if (cacheconfig[0] > 0xFFFFFFFFUL) {
+ memcpy(cacheconfig32, cacheconfig, size);
+ for (i = 0 ; i < size / sizeof(uint32_t); i++)
+ cacheconfig[i] = cacheconfig32[i];
+ }
+
+ memset(cachesize, 0, sizeof(uint64_t) * n);
+ size = sizeof(uint64_t) * n;
+ if (sysctlbyname("hw.cachesize", cachesize, &size, NULL, 0)) {
+ if (n > 0)
+ cachesize[0] = memsize;
+ if (n > 1)
+ cachesize[1] = l1dcachesize;
+ if (n > 2)
+ cachesize[2] = l2cachesize;
+ }
+
+ hwloc_debug("%s", "caches");
+ for (i = 0; i < n && cacheconfig[i]; i++)
+ hwloc_debug(" %"PRIu64"(%"PRIu64"kB)", cacheconfig[i], cachesize[i] / 1024);
+
+ /* Now we know how many caches there are */
+ n = i;
+ hwloc_debug("\n%u cache levels\n", n - 1);
+
+ /* For each cache level (0 is memory) */
+ for (i = 0; i < n; i++) {
+ /* cacheconfig tells us how many cpus share it, let's iterate on each cache */
+ for (j = 0; j < (nprocs / cacheconfig[i]); j++) {
+ obj = hwloc_alloc_setup_object(i?HWLOC_OBJ_CACHE:HWLOC_OBJ_NUMANODE, j);
+ if (!i) {
+ obj->nodeset = hwloc_bitmap_alloc();
+ hwloc_bitmap_set(obj->nodeset, j);
+ }
+ obj->cpuset = hwloc_bitmap_alloc();
+ for (cpu = j*cacheconfig[i];
+ cpu < ((j+1)*cacheconfig[i]);
+ cpu++)
+ hwloc_bitmap_set(obj->cpuset, cpu);
+
+ if (i == 1 && l1icachesize) {
+ /* FIXME assuming that L1i and L1d are shared the same way. Darwin
+ * does not yet provide a way to know. */
+ hwloc_obj_t l1i = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, j);
+ l1i->cpuset = hwloc_bitmap_dup(obj->cpuset);
+ hwloc_debug_1arg_bitmap("L1icache %u has cpuset %s\n",
+ j, l1i->cpuset);
+ l1i->attr->cache.depth = i;
+ l1i->attr->cache.size = l1icachesize;
+ l1i->attr->cache.linesize = cachelinesize;
+ l1i->attr->cache.associativity = 0;
+ l1i->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
+
+ hwloc_insert_object_by_cpuset(topology, l1i);
+ }
+ if (i) {
+ hwloc_debug_2args_bitmap("L%ucache %u has cpuset %s\n",
+ i, j, obj->cpuset);
+ obj->attr->cache.depth = i;
+ obj->attr->cache.size = cachesize[i];
+ obj->attr->cache.linesize = cachelinesize;
+ if (i <= sizeof(cacheways) / sizeof(cacheways[0]))
+ obj->attr->cache.associativity = cacheways[i-1];
+ else
+ obj->attr->cache.associativity = 0;
+ if (i == 1 && l1icachesize)
+ obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
+ else
+ obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+ } else {
+ hwloc_debug_1arg_bitmap("node %u has cpuset %s\n",
+ j, obj->cpuset);
+ obj->memory.local_memory = cachesize[i];
+ obj->memory.page_types_len = 2;
+ obj->memory.page_types = malloc(2*sizeof(*obj->memory.page_types));
+ memset(obj->memory.page_types, 0, 2*sizeof(*obj->memory.page_types));
+ obj->memory.page_types[0].size = hwloc_getpagesize();
+#ifdef HAVE__SC_LARGE_PAGESIZE
+ obj->memory.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
+#endif
+ }
+
+ hwloc_insert_object_by_cpuset(topology, obj);
+ }
+ }
+ }
+ out:
+ if (NULL != cacheconfig) {
+ free(cacheconfig);
+ }
+ if (NULL != cachesize) {
+ free(cachesize);
+ }
+ if (NULL != cacheconfig32) {
+ free(cacheconfig32);
+ }
+ }
+
+
+ /* add PU objects */
+ hwloc_setup_pu_level(topology, nprocs);
+
+ hwloc_obj_add_info(topology->levels[0][0], "Backend", "Darwin");
+ if (topology->is_thissystem)
+ hwloc_add_uname_info(topology, NULL);
+ return 1;
+}
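
The discovery above is driven entirely by sysctl; as a stand-alone illustration (not part of the patch, Darwin-only), the same pattern with plain sysctlbyname() for two of the keys used above.

#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

int main(void)
{
    int ncpu = 0;
    size_t len = sizeof(ncpu);
    char model[64] = "";
    size_t mlen = sizeof(model);

    if (sysctlbyname("hw.ncpu", &ncpu, &len, NULL, 0) == 0)
        printf("hw.ncpu = %d\n", ncpu);
    if (sysctlbyname("machdep.cpu.brand_string", model, &mlen, NULL, 0) == 0)
        printf("CPUModel = %s\n", model);
    return 0;
}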
+
+void
+hwloc_set_darwin_hooks(struct hwloc_binding_hooks *hooks __hwloc_attribute_unused,
+ struct hwloc_topology_support *support __hwloc_attribute_unused)
+{
+}
+
+static struct hwloc_backend *
+hwloc_darwin_component_instantiate(struct hwloc_disc_component *component,
+ const void *_data1 __hwloc_attribute_unused,
+ const void *_data2 __hwloc_attribute_unused,
+ const void *_data3 __hwloc_attribute_unused)
+{
+ struct hwloc_backend *backend;
+ backend = hwloc_backend_alloc(component);
+ if (!backend)
+ return NULL;
+ backend->discover = hwloc_look_darwin;
+ return backend;
+}
+
+static struct hwloc_disc_component hwloc_darwin_disc_component = {
+ HWLOC_DISC_COMPONENT_TYPE_CPU,
+ "darwin",
+ HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+ hwloc_darwin_component_instantiate,
+ 50,
+ NULL
+};
+
+const struct hwloc_component hwloc_darwin_component = {
+ HWLOC_COMPONENT_ABI,
+ NULL, NULL,
+ HWLOC_COMPONENT_TYPE_DISC,
+ 0,
+ &hwloc_darwin_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology-fake.c b/ext/hwloc/hwloc/topology-fake.c
new file mode 100644
index 0000000..e3e22a0
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-fake.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2012-2014 Inria. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+
+#include <stdlib.h>
+
+static struct hwloc_backend *
+hwloc_fake_component_instantiate(struct hwloc_disc_component *component __hwloc_attribute_unused,
+ const void *_data1 __hwloc_attribute_unused,
+ const void *_data2 __hwloc_attribute_unused,
+ const void *_data3 __hwloc_attribute_unused)
+{
+ if (getenv("HWLOC_DEBUG_FAKE_COMPONENT"))
+ printf("fake component instantiated\n");
+ return NULL;
+}
+
+static struct hwloc_disc_component hwloc_fake_disc_component = {
+ HWLOC_DISC_COMPONENT_TYPE_MISC, /* so that it's always enabled when using the OS discovery */
+ "fake",
+ 0, /* nothing to exclude */
+ hwloc_fake_component_instantiate,
+ 100, /* make sure it's loaded before anything conflicting excludes it */
+ NULL
+};
+
+static int
+hwloc_fake_component_init(unsigned long flags)
+{
+ if (flags)
+ return -1;
+ if (hwloc_plugin_check_namespace("fake", "hwloc_backend_alloc") < 0)
+ return -1;
+ if (getenv("HWLOC_DEBUG_FAKE_COMPONENT"))
+ printf("fake component initialized\n");
+ return 0;
+}
+
+static void
+hwloc_fake_component_finalize(unsigned long flags)
+{
+ if (flags)
+ return;
+ if (getenv("HWLOC_DEBUG_FAKE_COMPONENT"))
+ printf("fake component finalized\n");
+}
+
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_fake_component; /* never linked statically in the core */
+
+const struct hwloc_component hwloc_fake_component = {
+ HWLOC_COMPONENT_ABI,
+ hwloc_fake_component_init, hwloc_fake_component_finalize,
+ HWLOC_COMPONENT_TYPE_DISC,
+ 0,
+ &hwloc_fake_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology-freebsd.cb b/ext/hwloc/hwloc/topology-freebsd.cb
new file mode 100644
index 0000000..d8d4c54
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-freebsd.cb
@@ -0,0 +1,255 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria. All rights reserved.
+ * Copyright © 2009-2010, 2012 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+
+#include <sys/types.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <sys/param.h>
+#include <pthread.h>
+#ifdef HAVE_PTHREAD_NP_H
+#include <pthread_np.h>
+#endif
+#ifdef HAVE_SYS_CPUSET_H
+#include <sys/cpuset.h>
+#endif
+#ifdef HAVE_SYS_SYSCTL_H
+#include <sys/sysctl.h>
+#endif
+
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+
+#if defined(HAVE_SYS_CPUSET_H) && defined(HAVE_CPUSET_SETAFFINITY)
+static void
+hwloc_freebsd_bsd2hwloc(hwloc_bitmap_t hwloc_cpuset, const cpuset_t *cset)
+{
+ unsigned cpu;
+ hwloc_bitmap_zero(hwloc_cpuset);
+ for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
+ if (CPU_ISSET(cpu, cset))
+ hwloc_bitmap_set(hwloc_cpuset, cpu);
+}
+
+static void
+hwloc_freebsd_hwloc2bsd(hwloc_const_bitmap_t hwloc_cpuset, cpuset_t *cset)
+{
+ unsigned cpu;
+ CPU_ZERO(cset);
+ for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
+ if (hwloc_bitmap_isset(hwloc_cpuset, cpu))
+ CPU_SET(cpu, cset);
+}
+
+static int
+hwloc_freebsd_set_sth_affinity(hwloc_topology_t topology __hwloc_attribute_unused, cpulevel_t level, cpuwhich_t which, id_t id, hwloc_const_bitmap_t hwloc_cpuset, int flags __hwloc_attribute_unused)
+{
+ cpuset_t cset;
+
+ hwloc_freebsd_hwloc2bsd(hwloc_cpuset, &cset);
+
+ if (cpuset_setaffinity(level, which, id, sizeof(cset), &cset))
+ return -1;
+
+ return 0;
+}
+
+static int
+hwloc_freebsd_get_sth_affinity(hwloc_topology_t topology __hwloc_attribute_unused, cpulevel_t level, cpuwhich_t which, id_t id, hwloc_bitmap_t hwloc_cpuset, int flags __hwloc_attribute_unused)
+{
+ cpuset_t cset;
+
+ if (cpuset_getaffinity(level, which, id, sizeof(cset), &cset))
+ return -1;
+
+ hwloc_freebsd_bsd2hwloc(hwloc_cpuset, &cset);
+ return 0;
+}
+
+static int
+hwloc_freebsd_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_cpuset, int flags)
+{
+ return hwloc_freebsd_set_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, hwloc_cpuset, flags);
+}
+
+static int
+hwloc_freebsd_get_thisproc_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_cpuset, int flags)
+{
+ return hwloc_freebsd_get_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, hwloc_cpuset, flags);
+}
+
+static int
+hwloc_freebsd_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_cpuset, int flags)
+{
+ return hwloc_freebsd_set_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, hwloc_cpuset, flags);
+}
+
+static int
+hwloc_freebsd_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_cpuset, int flags)
+{
+ return hwloc_freebsd_get_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, hwloc_cpuset, flags);
+}
+
+static int
+hwloc_freebsd_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t hwloc_cpuset, int flags)
+{
+ return hwloc_freebsd_set_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_PID, pid, hwloc_cpuset, flags);
+}
+
+static int
+hwloc_freebsd_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t hwloc_cpuset, int flags)
+{
+ return hwloc_freebsd_get_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_PID, pid, hwloc_cpuset, flags);
+}
+
+#ifdef hwloc_thread_t
+
+#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
+#pragma weak pthread_setaffinity_np
+static int
+hwloc_freebsd_set_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t tid, hwloc_const_bitmap_t hwloc_cpuset, int flags __hwloc_attribute_unused)
+{
+ int err;
+ cpuset_t cset;
+
+ if (!pthread_setaffinity_np) {
+ errno = ENOSYS;
+ return -1;
+ }
+
+ hwloc_freebsd_hwloc2bsd(hwloc_cpuset, &cset);
+
+ err = pthread_setaffinity_np(tid, sizeof(cset), &cset);
+
+ if (err) {
+ errno = err;
+ return -1;
+ }
+
+ return 0;
+}
+#endif
+
+#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
+#pragma weak pthread_getaffinity_np
+static int
+hwloc_freebsd_get_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t tid, hwloc_bitmap_t hwloc_cpuset, int flags __hwloc_attribute_unused)
+{
+ int err;
+ cpuset_t cset;
+
+ if (!pthread_getaffinity_np) {
+ errno = ENOSYS;
+ return -1;
+ }
+
+ err = pthread_getaffinity_np(tid, sizeof(cset), &cset);
+
+ if (err) {
+ errno = err;
+ return -1;
+ }
+
+ hwloc_freebsd_bsd2hwloc(hwloc_cpuset, &cset);
+ return 0;
+}
+#endif
+#endif
+#endif
+
+#if (defined HAVE_SYSCTL) && (defined HAVE_SYS_SYSCTL_H)
+static void
+hwloc_freebsd_node_meminfo_info(struct hwloc_topology *topology)
+{
+ int mib[2] = { CTL_HW, HW_PHYSMEM };
+ unsigned long physmem;
+ size_t len = sizeof(physmem);
+ sysctl(mib, 2, &physmem, &len, NULL, 0);
+ topology->levels[0][0]->memory.local_memory = physmem;
+ /* we don't know anything about NUMA nodes in this backend.
+ * let another backend or the core move that memory to the right NUMA node */
+}
+#endif
+
+static int
+hwloc_look_freebsd(struct hwloc_backend *backend)
+{
+ struct hwloc_topology *topology = backend->topology;
+ unsigned nbprocs = hwloc_fallback_nbprocessors(topology);
+
+ if (!topology->levels[0][0]->cpuset) {
+ /* Nobody (even the x86 backend) created objects yet, setup basic objects */
+ hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+ hwloc_setup_pu_level(topology, nbprocs);
+ }
+
+ /* Add FreeBSD specific information */
+#if (defined HAVE_SYSCTL) && (defined HAVE_SYS_SYSCTL_H)
+ hwloc_freebsd_node_meminfo_info(topology);
+#endif
+ hwloc_obj_add_info(topology->levels[0][0], "Backend", "FreeBSD");
+ if (topology->is_thissystem)
+ hwloc_add_uname_info(topology, NULL);
+ return 1;
+}
+
+void
+hwloc_set_freebsd_hooks(struct hwloc_binding_hooks *hooks __hwloc_attribute_unused,
+ struct hwloc_topology_support *support __hwloc_attribute_unused)
+{
+#if defined(HAVE_SYS_CPUSET_H) && defined(HAVE_CPUSET_SETAFFINITY)
+ hooks->set_thisproc_cpubind = hwloc_freebsd_set_thisproc_cpubind;
+ hooks->get_thisproc_cpubind = hwloc_freebsd_get_thisproc_cpubind;
+ hooks->set_thisthread_cpubind = hwloc_freebsd_set_thisthread_cpubind;
+ hooks->get_thisthread_cpubind = hwloc_freebsd_get_thisthread_cpubind;
+ hooks->set_proc_cpubind = hwloc_freebsd_set_proc_cpubind;
+ hooks->get_proc_cpubind = hwloc_freebsd_get_proc_cpubind;
+#ifdef hwloc_thread_t
+#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
+ hooks->set_thread_cpubind = hwloc_freebsd_set_thread_cpubind;
+#endif
+#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
+ hooks->get_thread_cpubind = hwloc_freebsd_get_thread_cpubind;
+#endif
+#endif
+#endif
+ /* TODO: get_last_cpu_location: find out ki_lastcpu */
+}
+
+static struct hwloc_backend *
+hwloc_freebsd_component_instantiate(struct hwloc_disc_component *component,
+ const void *_data1 __hwloc_attribute_unused,
+ const void *_data2 __hwloc_attribute_unused,
+ const void *_data3 __hwloc_attribute_unused)
+{
+ struct hwloc_backend *backend;
+ backend = hwloc_backend_alloc(component);
+ if (!backend)
+ return NULL;
+ backend->discover = hwloc_look_freebsd;
+ return backend;
+}
+
+static struct hwloc_disc_component hwloc_freebsd_disc_component = {
+ HWLOC_DISC_COMPONENT_TYPE_CPU,
+ "freebsd",
+ HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+ hwloc_freebsd_component_instantiate,
+ 50,
+ NULL
+};
+
+const struct hwloc_component hwloc_freebsd_component = {
+ HWLOC_COMPONENT_ABI,
+ NULL, NULL,
+ HWLOC_COMPONENT_TYPE_DISC,
+ 0,
+ &hwloc_freebsd_disc_component
+};
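
For reference, a minimal FreeBSD-only sketch (not part of the patch) of the raw cpuset_setaffinity() call that the hooks above wrap; CPU_LEVEL_WHICH / CPU_WHICH_TID with id -1 address the calling thread.

#include <sys/param.h>
#include <sys/cpuset.h>

/* Pin the calling thread to CPU 0; returns 0 on success, -1 with errno set. */
static int pin_this_thread_to_cpu0(void)
{
    cpuset_t cset;

    CPU_ZERO(&cset);
    CPU_SET(0, &cset);
    return cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(cset), &cset);
}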
diff --git a/ext/hwloc/hwloc/topology-linux.c b/ext/hwloc/hwloc/topology-linux.c
new file mode 100644
index 0000000..82423ff
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-linux.c
@@ -0,0 +1,5133 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria. All rights reserved.
+ * Copyright © 2009-2013 Université Bordeaux
+ * Copyright © 2009-2014 Cisco Systems, Inc. All rights reserved.
+ * Copyright © 2015 Intel, Inc. All rights reserved.
+ * Copyright © 2010 IBM
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <hwloc/linux.h>
+#include <private/misc.h>
+#include <private/private.h>
+#include <private/misc.h>
+#include <private/debug.h>
+
+#include <limits.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <assert.h>
+#ifdef HAVE_DIRENT_H
+#include <dirent.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_LIBUDEV_H
+#include <libudev.h>
+#endif
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sched.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#if defined HWLOC_HAVE_SET_MEMPOLICY || defined HWLOC_HAVE_MBIND
+#define migratepages migrate_pages /* workaround broken migratepages prototype in numaif.h before libnuma 2.0.2 */
+#include <numaif.h>
+#endif
+
+struct hwloc_linux_backend_data_s {
+ int root_fd; /* The file descriptor for the file system root, used when browsing, e.g., Linux' sysfs and procfs. */
+ int is_real_fsroot; /* Boolean saying whether root_fd points to the real filesystem root of the system */
+#ifdef HAVE_LIBUDEV_H
+ struct udev *udev; /* Global udev context */
+#endif
+
+ struct utsname utsname; /* fields contain \0 when unknown */
+
+ int deprecated_classlinks_model; /* -2 if never tried, -1 if unknown, 0 if new (device contains class/name), 1 if old (device contains class:name) */
+ int mic_need_directlookup; /* -1 if not tried yet, 0 if not needed, 1 if needed */
+ unsigned mic_directlookup_id_max; /* -1 if not tried yet, 0 if none to lookup, maxid+1 otherwise */
+};
+
+
+
+/***************************
+ * Misc Abstraction layers *
+ ***************************/
+
+#if !(defined HWLOC_HAVE_SCHED_SETAFFINITY) && (defined HWLOC_HAVE__SYSCALL3)
+/* libc doesn't support sched_setaffinity, so build the system call
+ * ourselves: */
+# include <linux/unistd.h>
+# ifndef __NR_sched_setaffinity
+# ifdef __i386__
+# define __NR_sched_setaffinity 241
+# elif defined(__x86_64__)
+# define __NR_sched_setaffinity 203
+# elif defined(__ia64__)
+# define __NR_sched_setaffinity 1231
+# elif defined(__hppa__)
+# define __NR_sched_setaffinity 211
+# elif defined(__alpha__)
+# define __NR_sched_setaffinity 395
+# elif defined(__s390__)
+# define __NR_sched_setaffinity 239
+# elif defined(__sparc__)
+# define __NR_sched_setaffinity 261
+# elif defined(__m68k__)
+# define __NR_sched_setaffinity 311
+# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
+# define __NR_sched_setaffinity 222
+# elif defined(__arm__)
+# define __NR_sched_setaffinity 241
+# elif defined(__cris__)
+# define __NR_sched_setaffinity 241
+/*# elif defined(__mips__)
+ # define __NR_sched_setaffinity TODO (32/64/nabi) */
+# else
+# warning "don't know the syscall number for sched_setaffinity on this architecture, will not support binding"
+# define sched_setaffinity(pid, lg, mask) (errno = ENOSYS, -1)
+# endif
+# endif
+# ifndef sched_setaffinity
+ _syscall3(int, sched_setaffinity, pid_t, pid, unsigned int, lg, const void *, mask)
+# endif
+# ifndef __NR_sched_getaffinity
+# ifdef __i386__
+# define __NR_sched_getaffinity 242
+# elif defined(__x86_64__)
+# define __NR_sched_getaffinity 204
+# elif defined(__ia64__)
+# define __NR_sched_getaffinity 1232
+# elif defined(__hppa__)
+# define __NR_sched_getaffinity 212
+# elif defined(__alpha__)
+# define __NR_sched_getaffinity 396
+# elif defined(__s390__)
+# define __NR_sched_getaffinity 240
+# elif defined(__sparc__)
+# define __NR_sched_getaffinity 260
+# elif defined(__m68k__)
+# define __NR_sched_getaffinity 312
+# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
+# define __NR_sched_getaffinity 223
+# elif defined(__arm__)
+# define __NR_sched_getaffinity 242
+# elif defined(__cris__)
+# define __NR_sched_getaffinity 242
+/*# elif defined(__mips__)
+ # define __NR_sched_getaffinity TODO (32/64/nabi) */
+# else
+# warning "don't know the syscall number for sched_getaffinity on this architecture, will not support getting binding"
+# define sched_getaffinity(pid, lg, mask) (errno = ENOSYS, -1)
+# endif
+# endif
+# ifndef sched_getaffinity
+ _syscall3(int, sched_getaffinity, pid_t, pid, unsigned int, lg, void *, mask)
+# endif
+#endif
+
+/* Added for ntohl() */
+#include <arpa/inet.h>
+
+#ifdef HAVE_OPENAT
+/* Use our own filesystem functions if we have openat */
+
+static const char *
+hwloc_checkat(const char *path, int fsroot_fd)
+{
+ const char *relative_path;
+ if (fsroot_fd < 0) {
+ errno = EBADF;
+ return NULL;
+ }
+
+ /* Skip leading slashes. */
+ for (relative_path = path; *relative_path == '/'; relative_path++);
+
+ return relative_path;
+}
+
+static int
+hwloc_openat(const char *path, int fsroot_fd)
+{
+ const char *relative_path;
+
+ relative_path = hwloc_checkat(path, fsroot_fd);
+ if (!relative_path)
+ return -1;
+
+ return openat (fsroot_fd, relative_path, O_RDONLY);
+}
+
+static FILE *
+hwloc_fopenat(const char *path, const char *mode, int fsroot_fd)
+{
+ int fd;
+
+ if (strcmp(mode, "r")) {
+ errno = ENOTSUP;
+ return NULL;
+ }
+
+ fd = hwloc_openat (path, fsroot_fd);
+ if (fd == -1)
+ return NULL;
+
+ return fdopen(fd, mode);
+}
+
+static int
+hwloc_accessat(const char *path, int mode, int fsroot_fd)
+{
+ const char *relative_path;
+
+ relative_path = hwloc_checkat(path, fsroot_fd);
+ if (!relative_path)
+ return -1;
+
+ return faccessat(fsroot_fd, relative_path, mode, 0);
+}
+
+static int
+hwloc_fstatat(const char *path, struct stat *st, int flags, int fsroot_fd)
+{
+ const char *relative_path;
+
+ relative_path = hwloc_checkat(path, fsroot_fd);
+ if (!relative_path)
+ return -1;
+
+ return fstatat(fsroot_fd, relative_path, st, flags);
+}
+
+static DIR*
+hwloc_opendirat(const char *path, int fsroot_fd)
+{
+ int dir_fd;
+ const char *relative_path;
+
+ relative_path = hwloc_checkat(path, fsroot_fd);
+ if (!relative_path)
+ return NULL;
+
+ dir_fd = openat(fsroot_fd, relative_path, O_RDONLY | O_DIRECTORY);
+ if (dir_fd < 0)
+ return NULL;
+
+ return fdopendir(dir_fd);
+}
+
+#endif /* HAVE_OPENAT */
+
+/* Static inline versions of open/fopen so that we can use openat if we have
+   it, but still preserve compiler parameter checking */
+static __hwloc_inline int
+hwloc_open(const char *p, int d __hwloc_attribute_unused)
+{
+#ifdef HAVE_OPENAT
+ return hwloc_openat(p, d);
+#else
+ return open(p, O_RDONLY);
+#endif
+}
+
+static __hwloc_inline FILE *
+hwloc_fopen(const char *p, const char *m, int d __hwloc_attribute_unused)
+{
+#ifdef HAVE_OPENAT
+ return hwloc_fopenat(p, m, d);
+#else
+ return fopen(p, m);
+#endif
+}
+
+/* Static inline version of access so that we can use openat if we have
+ it, but still preserve compiler parameter checking */
+static __hwloc_inline int
+hwloc_access(const char *p, int m, int d __hwloc_attribute_unused)
+{
+#ifdef HAVE_OPENAT
+ return hwloc_accessat(p, m, d);
+#else
+ return access(p, m);
+#endif
+}
+
+static __hwloc_inline int
+hwloc_stat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
+{
+#ifdef HAVE_OPENAT
+ return hwloc_fstatat(p, st, 0, d);
+#else
+ return stat(p, st);
+#endif
+}
+
+static __hwloc_inline int
+hwloc_lstat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
+{
+#ifdef HAVE_OPENAT
+ return hwloc_fstatat(p, st, AT_SYMLINK_NOFOLLOW, d);
+#else
+ return lstat(p, st);
+#endif
+}
+
+/* Static inline version of opendir so that we can use openat if we have
+ it, but still preserve compiler parameter checking */
+static __hwloc_inline DIR *
+hwloc_opendir(const char *p, int d __hwloc_attribute_unused)
+{
+#ifdef HAVE_OPENAT
+ return hwloc_opendirat(p, d);
+#else
+ return opendir(p);
+#endif
+}
+
+
+/*****************************
+ ******* CpuBind Hooks *******
+ *****************************/
+
+int
+hwloc_linux_set_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_const_bitmap_t hwloc_set __hwloc_attribute_unused)
+{
+ /* The resulting binding is always strict */
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+ cpu_set_t *plinux_set;
+ unsigned cpu;
+ int last;
+ size_t setsize;
+ int err;
+
+ last = hwloc_bitmap_last(hwloc_set);
+ if (last == -1) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ setsize = CPU_ALLOC_SIZE(last+1);
+ plinux_set = CPU_ALLOC(last+1);
+
+ CPU_ZERO_S(setsize, plinux_set);
+ hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+ CPU_SET_S(cpu, setsize, plinux_set);
+ hwloc_bitmap_foreach_end();
+
+ err = sched_setaffinity(tid, setsize, plinux_set);
+
+ CPU_FREE(plinux_set);
+ return err;
+#elif defined(HWLOC_HAVE_CPU_SET)
+ cpu_set_t linux_set;
+ unsigned cpu;
+
+ CPU_ZERO(&linux_set);
+ hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+ CPU_SET(cpu, &linux_set);
+ hwloc_bitmap_foreach_end();
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+ return sched_setaffinity(tid, &linux_set);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+ return sched_setaffinity(tid, sizeof(linux_set), &linux_set);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+#elif defined(HWLOC_HAVE__SYSCALL3)
+ unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+ return sched_setaffinity(tid, (void*) &mask);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+ return sched_setaffinity(tid, sizeof(mask), (void*) &mask);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+#else /* !_SYSCALL3 */
+ errno = ENOSYS;
+ return -1;
+#endif /* !_SYSCALL3 */
+}
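
The first branch above relies on glibc's dynamically sized CPU sets; here is a stand-alone sketch (not part of the patch) of that API, which keeps working past the 1024-CPU limit of a fixed cpu_set_t.

#define _GNU_SOURCE
#include <sched.h>

/* Bind the calling thread (pid/tid 0) to a single CPU using CPU_ALLOC/CPU_SET_S. */
static int bind_self_to_cpu(unsigned cpu)
{
    cpu_set_t *set = CPU_ALLOC(cpu + 1);
    size_t setsize = CPU_ALLOC_SIZE(cpu + 1);
    int err;

    if (!set)
        return -1;
    CPU_ZERO_S(setsize, set);
    CPU_SET_S(cpu, setsize, set);
    err = sched_setaffinity(0, setsize, set); /* 0 = calling thread */
    CPU_FREE(set);
    return err;
}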
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+static int
+hwloc_linux_parse_cpuset_file(FILE *file, hwloc_bitmap_t set)
+{
+ unsigned long start, stop;
+
+ /* reset to zero first */
+ hwloc_bitmap_zero(set);
+
+ while (fscanf(file, "%lu", &start) == 1)
+ {
+ int c = fgetc(file);
+
+ stop = start;
+
+ if (c == '-') {
+ /* Range */
+ if (fscanf(file, "%lu", &stop) != 1) {
+ /* Expected a number here */
+ errno = EINVAL;
+ return -1;
+ }
+ c = fgetc(file);
+ }
+
+ if (c == EOF || c == '\n') {
+ hwloc_bitmap_set_range(set, start, stop);
+ break;
+ }
+
+ if (c != ',') {
+ /* Expected EOF, EOL, or a comma */
+ errno = EINVAL;
+ return -1;
+ }
+
+ hwloc_bitmap_set_range(set, start, stop);
+ }
+
+ return 0;
+}
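
The parser above reads the Linux "cpulist" format found in files such as /sys/devices/system/cpu/possible: comma-separated single CPUs and ranges, e.g. "0-3,8,10-11". A hypothetical usage sketch (not part of the patch; fmemopen() is POSIX.1-2008 and only used here to feed it a literal string):

static void parse_cpulist_example(void)
{
    char buf[] = "0-3,8,10-11\n";
    FILE *f = fmemopen(buf, sizeof(buf) - 1, "r");
    hwloc_bitmap_t set = hwloc_bitmap_alloc();

    if (f && hwloc_linux_parse_cpuset_file(f, set) == 0)
        /* set now contains CPUs 0,1,2,3,8,10,11 */
        hwloc_debug_bitmap("parsed cpulist: %s\n", set);
    if (f)
        fclose(f);
    hwloc_bitmap_free(set);
}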
+
+/*
+ * On some kernels, sched_getaffinity requires the output size to be larger
+ * than the kernel cpu_set size (defined by CONFIG_NR_CPUS).
+ * Try sched_getaffinity on ourselves until we find a nr_cpus value that makes
+ * the kernel happy.
+ */
+static int
+hwloc_linux_find_kernel_nr_cpus(hwloc_topology_t topology)
+{
+ static int _nr_cpus = -1;
+ int nr_cpus = _nr_cpus;
+ FILE *possible;
+
+ if (nr_cpus != -1)
+ /* already computed */
+ return nr_cpus;
+
+ if (topology->levels[0][0]->complete_cpuset)
+ /* start with a nr_cpus that may contain the whole topology */
+ nr_cpus = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset) + 1;
+ if (nr_cpus <= 0)
+ /* start from scratch, the topology isn't ready yet (complete_cpuset is missing (-1) or empty (0))*/
+ nr_cpus = 1;
+
+ possible = fopen("/sys/devices/system/cpu/possible", "r");
+ if (possible) {
+ hwloc_bitmap_t possible_bitmap = hwloc_bitmap_alloc();
+ if (hwloc_linux_parse_cpuset_file(possible, possible_bitmap) == 0) {
+ int max_possible = hwloc_bitmap_last(possible_bitmap);
+
+ hwloc_debug_bitmap("possible CPUs are %s\n", possible_bitmap);
+
+ if (nr_cpus < max_possible + 1)
+ nr_cpus = max_possible + 1;
+ }
+ fclose(possible);
+ hwloc_bitmap_free(possible_bitmap);
+ }
+
+ while (1) {
+ cpu_set_t *set = CPU_ALLOC(nr_cpus);
+ size_t setsize = CPU_ALLOC_SIZE(nr_cpus);
+ int err = sched_getaffinity(0, setsize, set); /* always works, unless setsize is too small */
+ CPU_FREE(set);
+ nr_cpus = setsize * 8; /* that's the value that was actually tested */
+ if (!err)
+ /* found it */
+ return _nr_cpus = nr_cpus;
+ nr_cpus *= 2;
+ }
+}
+#endif
+
+int
+hwloc_linux_get_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_bitmap_t hwloc_set __hwloc_attribute_unused)
+{
+ int err __hwloc_attribute_unused;
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+ cpu_set_t *plinux_set;
+ unsigned cpu;
+ int last;
+ size_t setsize;
+ int kernel_nr_cpus;
+
+ /* find the kernel nr_cpus so as to use a large enough cpu_set size */
+ kernel_nr_cpus = hwloc_linux_find_kernel_nr_cpus(topology);
+ setsize = CPU_ALLOC_SIZE(kernel_nr_cpus);
+ plinux_set = CPU_ALLOC(kernel_nr_cpus);
+
+ err = sched_getaffinity(tid, setsize, plinux_set);
+
+ if (err < 0) {
+ CPU_FREE(plinux_set);
+ return -1;
+ }
+
+ last = -1;
+ if (topology->levels[0][0]->complete_cpuset)
+ last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
+ if (last == -1)
+ /* fall back to the maximum the kernel supports; the topology isn't ready yet (complete_cpuset is missing or empty) */
+ last = kernel_nr_cpus-1;
+
+ hwloc_bitmap_zero(hwloc_set);
+ for(cpu=0; cpu<=(unsigned) last; cpu++)
+ if (CPU_ISSET_S(cpu, setsize, plinux_set))
+ hwloc_bitmap_set(hwloc_set, cpu);
+
+ CPU_FREE(plinux_set);
+#elif defined(HWLOC_HAVE_CPU_SET)
+ cpu_set_t linux_set;
+ unsigned cpu;
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+ err = sched_getaffinity(tid, &linux_set);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+ err = sched_getaffinity(tid, sizeof(linux_set), &linux_set);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+ if (err < 0)
+ return -1;
+
+ hwloc_bitmap_zero(hwloc_set);
+ for(cpu=0; cpu<CPU_SETSIZE; cpu++)
+ if (CPU_ISSET(cpu, &linux_set))
+ hwloc_bitmap_set(hwloc_set, cpu);
+#elif defined(HWLOC_HAVE__SYSCALL3)
+ unsigned long mask;
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+ err = sched_getaffinity(tid, (void*) &mask);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+ err = sched_getaffinity(tid, sizeof(mask), (void*) &mask);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+ if (err < 0)
+ return -1;
+
+ hwloc_bitmap_from_ulong(hwloc_set, mask);
+#else /* !_SYSCALL3 */
+ errno = ENOSYS;
+ return -1;
+#endif /* !_SYSCALL3 */
+
+ return 0;
+}
+
+/* Get the array of tids of a process from the task directory in /proc */
+static int
+hwloc_linux_get_proc_tids(DIR *taskdir, unsigned *nr_tidsp, pid_t ** tidsp)
+{
+ struct dirent *dirent;
+ unsigned nr_tids = 0;
+ unsigned max_tids = 32;
+ pid_t *tids;
+ struct stat sb;
+
+ /* take the number of links as a good estimate for the number of tids */
+ if (fstat(dirfd(taskdir), &sb) == 0)
+ max_tids = sb.st_nlink;
+
+ tids = malloc(max_tids*sizeof(pid_t));
+ if (!tids) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ rewinddir(taskdir);
+
+ while ((dirent = readdir(taskdir)) != NULL) {
+ if (nr_tids == max_tids) {
+ pid_t *newtids;
+ max_tids += 8;
+ newtids = realloc(tids, max_tids*sizeof(pid_t));
+ if (!newtids) {
+ free(tids);
+ errno = ENOMEM;
+ return -1;
+ }
+ tids = newtids;
+ }
+ if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
+ continue;
+ tids[nr_tids++] = atoi(dirent->d_name);
+ }
+
+ *nr_tidsp = nr_tids;
+ *tidsp = tids;
+ return 0;
+}
+
+/* Per-tid callbacks */
+typedef int (*hwloc_linux_foreach_proc_tid_cb_t)(hwloc_topology_t topology, pid_t tid, void *data, int idx);
+
+static int
+hwloc_linux_foreach_proc_tid(hwloc_topology_t topology,
+ pid_t pid, hwloc_linux_foreach_proc_tid_cb_t cb,
+ void *data)
+{
+ char taskdir_path[128];
+ DIR *taskdir;
+ pid_t *tids, *newtids;
+ unsigned i, nr, newnr, failed = 0, failed_errno = 0;
+ unsigned retrynr = 0;
+ int err;
+
+ if (pid)
+ snprintf(taskdir_path, sizeof(taskdir_path), "/proc/%u/task", (unsigned) pid);
+ else
+ snprintf(taskdir_path, sizeof(taskdir_path), "/proc/self/task");
+
+ taskdir = opendir(taskdir_path);
+ if (!taskdir) {
+ if (errno == ENOENT)
+ errno = EINVAL;
+ err = -1;
+ goto out;
+ }
+
+ /* read the current list of threads */
+ err = hwloc_linux_get_proc_tids(taskdir, &nr, &tids);
+ if (err < 0)
+ goto out_with_dir;
+
+ retry:
+ /* apply the callback to all threads */
+ failed=0;
+ for(i=0; i<nr; i++) {
+ err = cb(topology, tids[i], data, i);
+ if (err < 0) {
+ failed++;
+ failed_errno = errno;
+ }
+ }
+
+ /* re-read the list of threads */
+ err = hwloc_linux_get_proc_tids(taskdir, &newnr, &newtids);
+ if (err < 0)
+ goto out_with_tids;
+ /* retry if the list changed in the meantime, or we failed for *some* threads only.
+ * if we're really unlucky, all threads changed but we got the same set of tids. no way to support this.
+ */
+ if (newnr != nr || memcmp(newtids, tids, nr*sizeof(pid_t)) || (failed && failed != nr)) {
+ free(tids);
+ tids = newtids;
+ nr = newnr;
+ if (++retrynr > 10) {
+ /* we tried 10 times, it didn't work, the application is probably creating/destroying many threads, stop trying */
+ errno = EAGAIN;
+ err = -1;
+ goto out_with_tids;
+ }
+ goto retry;
+ } else {
+ free(newtids);
+ }
+
+ /* if all threads failed, return the last errno. */
+ if (failed) {
+ err = -1;
+ errno = failed_errno;
+ goto out_with_tids;
+ }
+
+ err = 0;
+ out_with_tids:
+ free(tids);
+ out_with_dir:
+ closedir(taskdir);
+ out:
+ return err;
+}
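
A sketch of a per-tid callback for the iterator above (not part of the patch): it is invoked once per thread of the target pid with the thread's index in idx, and a negative return marks that thread as failed, which may trigger the retry logic.

static int print_tid_cb(hwloc_topology_t topology __hwloc_attribute_unused,
                        pid_t tid, void *data __hwloc_attribute_unused, int idx)
{
    printf("thread #%d has tid %ld\n", idx, (long) tid);
    return 0;
}

/* e.g.: hwloc_linux_foreach_proc_tid(topology, getpid(), print_tid_cb, NULL); */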
+
+/* Per-tid proc_set_cpubind callback and caller.
+ * Callback data is a hwloc_bitmap_t. */
+static int
+hwloc_linux_foreach_proc_tid_set_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *data, int idx __hwloc_attribute_unused)
+{
+ return hwloc_linux_set_tid_cpubind(topology, tid, (hwloc_bitmap_t) data);
+}
+
+static int
+hwloc_linux_set_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+ return hwloc_linux_foreach_proc_tid(topology, pid,
+ hwloc_linux_foreach_proc_tid_set_cpubind_cb,
+ (void*) hwloc_set);
+}
+
+/* Per-tid proc_get_cpubind callback data, callback function and caller */
+struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s {
+ hwloc_bitmap_t cpuset;
+ hwloc_bitmap_t tidset;
+ int flags;
+};
+
+static int
+hwloc_linux_foreach_proc_tid_get_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
+{
+ struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s *data = _data;
+ hwloc_bitmap_t cpuset = data->cpuset;
+ hwloc_bitmap_t tidset = data->tidset;
+ int flags = data->flags;
+
+ if (hwloc_linux_get_tid_cpubind(topology, tid, tidset))
+ return -1;
+
+ /* reset the cpuset on first iteration */
+ if (!idx)
+ hwloc_bitmap_zero(cpuset);
+
+ if (flags & HWLOC_CPUBIND_STRICT) {
+ /* if STRICT, we want all threads to have the same binding */
+ if (!idx) {
+ /* this is the first thread, copy its binding */
+ hwloc_bitmap_copy(cpuset, tidset);
+ } else if (!hwloc_bitmap_isequal(cpuset, tidset)) {
+ /* this is not the first thread, and its binding is different */
+ errno = EXDEV;
+ return -1;
+ }
+ } else {
+ /* if not STRICT, just OR all thread bindings */
+ hwloc_bitmap_or(cpuset, cpuset, tidset);
+ }
+ return 0;
+}
+
+static int
+hwloc_linux_get_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
+{
+ struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s data;
+ hwloc_bitmap_t tidset = hwloc_bitmap_alloc();
+ int ret;
+
+ data.cpuset = hwloc_set;
+ data.tidset = tidset;
+ data.flags = flags;
+ ret = hwloc_linux_foreach_proc_tid(topology, pid,
+ hwloc_linux_foreach_proc_tid_get_cpubind_cb,
+ (void*) &data);
+ hwloc_bitmap_free(tidset);
+ return ret;
+}
+
+static int
+hwloc_linux_set_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+ if (pid == 0)
+ pid = topology->pid;
+ if (flags & HWLOC_CPUBIND_THREAD)
+ return hwloc_linux_set_tid_cpubind(topology, pid, hwloc_set);
+ else
+ return hwloc_linux_set_pid_cpubind(topology, pid, hwloc_set, flags);
+}
+
+static int
+hwloc_linux_get_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
+{
+ if (pid == 0)
+ pid = topology->pid;
+ if (flags & HWLOC_CPUBIND_THREAD)
+ return hwloc_linux_get_tid_cpubind(topology, pid, hwloc_set);
+ else
+ return hwloc_linux_get_pid_cpubind(topology, pid, hwloc_set, flags);
+}
+
+static int
+hwloc_linux_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+ return hwloc_linux_set_pid_cpubind(topology, topology->pid, hwloc_set, flags);
+}
+
+static int
+hwloc_linux_get_thisproc_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
+{
+ return hwloc_linux_get_pid_cpubind(topology, topology->pid, hwloc_set, flags);
+}
+
+static int
+hwloc_linux_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+ if (topology->pid) {
+ errno = ENOSYS;
+ return -1;
+ }
+ return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);
+}
+
+static int
+hwloc_linux_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+ if (topology->pid) {
+ errno = ENOSYS;
+ return -1;
+ }
+ return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);
+}
+
+#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
+#pragma weak pthread_setaffinity_np
+#pragma weak pthread_self
+
+static int
+hwloc_linux_set_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+ int err;
+
+ if (topology->pid) {
+ errno = ENOSYS;
+ return -1;
+ }
+
+ if (!pthread_self) {
+ /* ?! Application uses set_thread_cpubind, but doesn't link against libpthread ?! */
+ errno = ENOSYS;
+ return -1;
+ }
+ if (tid == pthread_self())
+ return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);
+
+ if (!pthread_setaffinity_np) {
+ errno = ENOSYS;
+ return -1;
+ }
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+ /* Use a separate block so that we can define specific variable
+ types here */
+ {
+ cpu_set_t *plinux_set;
+ unsigned cpu;
+ int last;
+ size_t setsize;
+
+ last = hwloc_bitmap_last(hwloc_set);
+ if (last == -1) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ setsize = CPU_ALLOC_SIZE(last+1);
+ plinux_set = CPU_ALLOC(last+1);
+
+ CPU_ZERO_S(setsize, plinux_set);
+ hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+ CPU_SET_S(cpu, setsize, plinux_set);
+ hwloc_bitmap_foreach_end();
+
+ err = pthread_setaffinity_np(tid, setsize, plinux_set);
+
+ CPU_FREE(plinux_set);
+ }
+#elif defined(HWLOC_HAVE_CPU_SET)
+ /* Use a separate block so that we can define specific variable
+ types here */
+ {
+ cpu_set_t linux_set;
+ unsigned cpu;
+
+ CPU_ZERO(&linux_set);
+ hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+ CPU_SET(cpu, &linux_set);
+ hwloc_bitmap_foreach_end();
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+ err = pthread_setaffinity_np(tid, &linux_set);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+ err = pthread_setaffinity_np(tid, sizeof(linux_set), &linux_set);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+ }
+#else /* CPU_SET */
+ /* Use a separate block so that we can define specific variable
+ types here */
+ {
+ unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+ err = pthread_setaffinity_np(tid, (void*) &mask);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+ err = pthread_setaffinity_np(tid, sizeof(mask), (void*) &mask);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+ }
+#endif /* CPU_SET */
+
+ if (err) {
+ errno = err;
+ return -1;
+ }
+ return 0;
+}
+#endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
+
+#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
+#pragma weak pthread_getaffinity_np
+#pragma weak pthread_self
+
+static int
+hwloc_linux_get_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+ int err;
+
+ if (topology->pid) {
+ errno = ENOSYS;
+ return -1;
+ }
+
+ if (!pthread_self) {
+ /* ?! Application uses set_thread_cpubind, but doesn't link against libpthread ?! */
+ errno = ENOSYS;
+ return -1;
+ }
+ if (tid == pthread_self())
+ return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);
+
+ if (!pthread_getaffinity_np) {
+ errno = ENOSYS;
+ return -1;
+ }
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+ /* Use a separate block so that we can define specific variable
+ types here */
+ {
+ cpu_set_t *plinux_set;
+ unsigned cpu;
+ int last;
+ size_t setsize;
+
+ last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
+ assert (last != -1);
+
+ setsize = CPU_ALLOC_SIZE(last+1);
+ plinux_set = CPU_ALLOC(last+1);
+
+ err = pthread_getaffinity_np(tid, setsize, plinux_set);
+ if (err) {
+ CPU_FREE(plinux_set);
+ errno = err;
+ return -1;
+ }
+
+ hwloc_bitmap_zero(hwloc_set);
+ for(cpu=0; cpu<=(unsigned) last; cpu++)
+ if (CPU_ISSET_S(cpu, setsize, plinux_set))
+ hwloc_bitmap_set(hwloc_set, cpu);
+
+ CPU_FREE(plinux_set);
+ }
+#elif defined(HWLOC_HAVE_CPU_SET)
+ /* Use a separate block so that we can define specific variable
+ types here */
+ {
+ cpu_set_t linux_set;
+ unsigned cpu;
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+ err = pthread_getaffinity_np(tid, &linux_set);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+ err = pthread_getaffinity_np(tid, sizeof(linux_set), &linux_set);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+ if (err) {
+ errno = err;
+ return -1;
+ }
+
+ hwloc_bitmap_zero(hwloc_set);
+ for(cpu=0; cpu<CPU_SETSIZE; cpu++)
+ if (CPU_ISSET(cpu, &linux_set))
+ hwloc_bitmap_set(hwloc_set, cpu);
+ }
+#else /* CPU_SET */
+ /* Use a separate block so that we can define specific variable
+ types here */
+ {
+ unsigned long mask;
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+ err = pthread_getaffinity_np(tid, (void*) &mask);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+ err = pthread_getaffinity_np(tid, sizeof(mask), (void*) &mask);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+ if (err) {
+ errno = err;
+ return -1;
+ }
+
+ hwloc_bitmap_from_ulong(hwloc_set, mask);
+ }
+#endif /* CPU_SET */
+
+ return 0;
+}
+#endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
+
+int
+hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid, hwloc_bitmap_t set)
+{
+ /* read /proc/pid/stat.
+ * its second field contains the command name between parentheses,
+ * and the command itself may contain parentheses,
+ * so read the whole line and find the last closing parenthesis to find the third field.
+ */
+ char buf[1024] = "";
+ char name[64];
+ char *tmp;
+ FILE *file;
+ int i;
+
+ if (!tid) {
+#ifdef SYS_gettid
+ tid = syscall(SYS_gettid);
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
+ }
+
+ snprintf(name, sizeof(name), "/proc/%lu/stat", (unsigned long) tid);
+ file = fopen(name, "r");
+ if (!file) {
+ errno = ENOSYS;
+ return -1;
+ }
+ tmp = fgets(buf, sizeof(buf), file);
+ fclose(file);
+ if (!tmp) {
+ errno = ENOSYS;
+ return -1;
+ }
+
+ tmp = strrchr(buf, ')');
+ if (!tmp) {
+ errno = ENOSYS;
+ return -1;
+ }
+ /* skip ') ' to find the actual third argument */
+ tmp += 2;
+
+ /* skip 36 fields (state through exit_signal) */
+ for(i=0; i<36; i++) {
+ tmp = strchr(tmp, ' ');
+ if (!tmp) {
+ errno = ENOSYS;
+ return -1;
+ }
+ /* skip the ' ' itself */
+ tmp++;
+ }
+
+ /* read the last cpu from the "processor" field (39th field of /proc/<pid>/stat) */
+ if (sscanf(tmp, "%d ", &i) != 1) {
+ errno = ENOSYS;
+ return -1;
+ }
+
+ hwloc_bitmap_only(set, i);
+ return 0;
+}
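
The parsing above ultimately extracts the "processor" entry, the 39th field of /proc/<pid>/stat (the 37th space-separated token after the parenthesised command name, which is why the code scans for the last ')'). A hypothetical usage sketch (not part of the patch):

/* Where did thread `tid' of this process last run? */
static int print_last_cpu(hwloc_topology_t topology, pid_t tid)
{
    hwloc_bitmap_t set = hwloc_bitmap_alloc();
    int err = hwloc_linux_get_tid_last_cpu_location(topology, tid, set);

    if (!err)
        printf("tid %ld last ran on CPU %d\n", (long) tid, hwloc_bitmap_first(set));
    hwloc_bitmap_free(set);
    return err;
}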
+
+/* Per-tid proc_get_last_cpu_location callback data, callback function and caller */
+struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s {
+ hwloc_bitmap_t cpuset;
+ hwloc_bitmap_t tidset;
+};
+
+static int
+hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
+{
+ struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s *data = _data;
+ hwloc_bitmap_t cpuset = data->cpuset;
+ hwloc_bitmap_t tidset = data->tidset;
+
+ if (hwloc_linux_get_tid_last_cpu_location(topology, tid, tidset))
+ return -1;
+
+ /* reset the cpuset on first iteration */
+ if (!idx)
+ hwloc_bitmap_zero(cpuset);
+
+ hwloc_bitmap_or(cpuset, cpuset, tidset);
+ return 0;
+}
+
+static int
+hwloc_linux_get_pid_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+ struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s data;
+ hwloc_bitmap_t tidset = hwloc_bitmap_alloc();
+ int ret;
+
+ data.cpuset = hwloc_set;
+ data.tidset = tidset;
+ ret = hwloc_linux_foreach_proc_tid(topology, pid,
+ hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb,
+ &data);
+ hwloc_bitmap_free(tidset);
+ return ret;
+}
+
+static int
+hwloc_linux_get_proc_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
+{
+ if (pid == 0)
+ pid = topology->pid;
+ if (flags & HWLOC_CPUBIND_THREAD)
+ return hwloc_linux_get_tid_last_cpu_location(topology, pid, hwloc_set);
+ else
+ return hwloc_linux_get_pid_last_cpu_location(topology, pid, hwloc_set, flags);
+}
+
+static int
+hwloc_linux_get_thisproc_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
+{
+ return hwloc_linux_get_pid_last_cpu_location(topology, topology->pid, hwloc_set, flags);
+}
+
+static int
+hwloc_linux_get_thisthread_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+ if (topology->pid) {
+ errno = ENOSYS;
+ return -1;
+ }
+ return hwloc_linux_get_tid_last_cpu_location(topology, 0, hwloc_set);
+}
+
+
+
+/***************************
+ ****** Membind hooks ******
+ ***************************/
+
+#if defined HWLOC_HAVE_SET_MEMPOLICY || defined HWLOC_HAVE_MBIND
+static int
+hwloc_linux_membind_policy_from_hwloc(int *linuxpolicy, hwloc_membind_policy_t policy, int flags)
+{
+ switch (policy) {
+ case HWLOC_MEMBIND_DEFAULT:
+ case HWLOC_MEMBIND_FIRSTTOUCH:
+ *linuxpolicy = MPOL_DEFAULT;
+ break;
+ case HWLOC_MEMBIND_BIND:
+ if (flags & HWLOC_MEMBIND_STRICT)
+ *linuxpolicy = MPOL_BIND;
+ else
+ *linuxpolicy = MPOL_PREFERRED;
+ break;
+ case HWLOC_MEMBIND_INTERLEAVE:
+ *linuxpolicy = MPOL_INTERLEAVE;
+ break;
+ /* TODO: next-touch when (if?) patch applied upstream */
+ default:
+ errno = ENOSYS;
+ return -1;
+ }
+ return 0;
+}
+
+static int
+hwloc_linux_membind_mask_from_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
+ hwloc_const_nodeset_t nodeset,
+ unsigned *max_os_index_p, unsigned long **linuxmaskp)
+{
+ unsigned max_os_index = 0; /* highest os_index + 1 */
+ unsigned long *linuxmask;
+ unsigned i;
+ hwloc_nodeset_t linux_nodeset = NULL;
+
+ if (hwloc_bitmap_isfull(nodeset)) {
+ linux_nodeset = hwloc_bitmap_alloc();
+ hwloc_bitmap_only(linux_nodeset, 0);
+ nodeset = linux_nodeset;
+ }
+
+ max_os_index = hwloc_bitmap_last(nodeset);
+ if (max_os_index == (unsigned) -1)
+ max_os_index = 0;
+ /* add 1 to convert the last os_index into a max_os_index,
+ * and round up to the nearest multiple of BITS_PER_LONG */
+ max_os_index = (max_os_index + 1 + HWLOC_BITS_PER_LONG - 1) & ~(HWLOC_BITS_PER_LONG - 1);
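+ /* e.g. with 64-bit longs, a last os_index of 5 becomes max_os_index 64 (one ulong) */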
+
+ linuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(long));
+ if (!linuxmask) {
+ hwloc_bitmap_free(linux_nodeset);
+ errno = ENOMEM;
+ return -1;
+ }
+
+ for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
+ linuxmask[i] = hwloc_bitmap_to_ith_ulong(nodeset, i);
+
+ if (linux_nodeset)
+ hwloc_bitmap_free(linux_nodeset);
+
+ *max_os_index_p = max_os_index;
+ *linuxmaskp = linuxmask;
+ return 0;
+}
+
+static void
+hwloc_linux_membind_mask_to_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
+ hwloc_nodeset_t nodeset,
+ unsigned max_os_index, const unsigned long *linuxmask)
+{
+ unsigned i;
+
+#ifdef HWLOC_DEBUG
+ /* max_os_index comes from hwloc_linux_find_kernel_max_numnodes() so it's a multiple of HWLOC_BITS_PER_LONG */
+ assert(!(max_os_index%HWLOC_BITS_PER_LONG));
+#endif
+
+ hwloc_bitmap_zero(nodeset);
+ for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
+ hwloc_bitmap_set_ith_ulong(nodeset, i, linuxmask[i]);
+}
+#endif /* HWLOC_HAVE_SET_MEMPOLICY || HWLOC_HAVE_MBIND */
+
+#ifdef HWLOC_HAVE_MBIND
+static int
+hwloc_linux_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+ unsigned max_os_index; /* highest os_index + 1 */
+ unsigned long *linuxmask;
+ size_t remainder;
+ int linuxpolicy;
+ unsigned linuxflags = 0;
+ int err;
+
+ remainder = (uintptr_t) addr & (sysconf(_SC_PAGESIZE)-1);
+ addr = (char*) addr - remainder;
+ len += remainder;
+
+ err = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
+ if (err < 0)
+ return err;
+
+ if (linuxpolicy == MPOL_DEFAULT)
+ /* Some Linux kernels don't like being passed a set */
+ return mbind((void *) addr, len, linuxpolicy, NULL, 0, 0);
+
+ err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
+ if (err < 0)
+ goto out;
+
+ if (flags & HWLOC_MEMBIND_MIGRATE) {
+#ifdef MPOL_MF_MOVE
+ linuxflags = MPOL_MF_MOVE;
+ if (flags & HWLOC_MEMBIND_STRICT)
+ linuxflags |= MPOL_MF_STRICT;
+#else
+ if (flags & HWLOC_MEMBIND_STRICT) {
+ errno = ENOSYS;
+ goto out_with_mask;
+ }
+#endif
+ }
+
+ err = mbind((void *) addr, len, linuxpolicy, linuxmask, max_os_index+1, linuxflags);
+ if (err < 0)
+ goto out_with_mask;
+
+ free(linuxmask);
+ return 0;
+
+ out_with_mask:
+ free(linuxmask);
+ out:
+ return -1;
+}
+
+static void *
+hwloc_linux_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+ void *buffer;
+ int err;
+
+ buffer = hwloc_alloc_mmap(topology, len);
+ if (buffer == MAP_FAILED)
+ return NULL;
+
+ err = hwloc_linux_set_area_membind(topology, buffer, len, nodeset, policy, flags);
+ if (err < 0 && (flags & HWLOC_MEMBIND_STRICT)) { /* STRICT is a flag, not part of the policy value */
+ munmap(buffer, len);
+ return NULL;
+ }
+
+ return buffer;
+}
+#endif /* HWLOC_HAVE_MBIND */
+
+#ifdef HWLOC_HAVE_SET_MEMPOLICY
+static int
+hwloc_linux_set_thisthread_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+ unsigned max_os_index; /* highest os_index + 1 */
+ unsigned long *linuxmask;
+ int linuxpolicy;
+ int err;
+
+ err = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
+ if (err < 0)
+ return err;
+
+ if (linuxpolicy == MPOL_DEFAULT)
+ /* Some Linux kernels don't like being passed a set */
+ return set_mempolicy(linuxpolicy, NULL, 0);
+
+ err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
+ if (err < 0)
+ goto out;
+
+ if (flags & HWLOC_MEMBIND_MIGRATE) {
+#ifdef HWLOC_HAVE_MIGRATE_PAGES
+ unsigned long *fullmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
+ if (fullmask) {
+ memset(fullmask, 0xf, max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
+ err = migrate_pages(0, max_os_index+1, fullmask, linuxmask);
+ free(fullmask);
+ } else
+ err = -1;
+ if (err < 0 && (flags & HWLOC_MEMBIND_STRICT))
+ goto out_with_mask;
+#else
+ errno = ENOSYS;
+ goto out_with_mask;
+#endif
+ }
+
+ err = set_mempolicy(linuxpolicy, linuxmask, max_os_index+1);
+ if (err < 0)
+ goto out_with_mask;
+
+ free(linuxmask);
+ return 0;
+
+ out_with_mask:
+ free(linuxmask);
+ out:
+ return -1;
+}
+
+/*
+ * On some kernels, get_mempolicy requires the output size to be larger
+ * than the kernel MAX_NUMNODES (defined by CONFIG_NODES_SHIFT).
+ * Try get_mempolicy on ourself until we find a max_os_index value that
+ * makes the kernel happy.
+ */
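+/* For instance, a kernel built with CONFIG_NODES_SHIFT=10 has MAX_NUMNODES=1024,
+ * so the doubling loop below has to grow the mask until it covers that many bits. */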
+static int
+hwloc_linux_find_kernel_max_numnodes(hwloc_topology_t topology __hwloc_attribute_unused)
+{
+ static int max_numnodes = -1;
+ int linuxpolicy;
+
+ if (max_numnodes != -1)
+ /* already computed */
+ return max_numnodes;
+
+ /* start with a single ulong, it's the minimal and it's enough for most machines */
+ max_numnodes = HWLOC_BITS_PER_LONG;
+ while (1) {
+ unsigned long *mask = malloc(max_numnodes / HWLOC_BITS_PER_LONG * sizeof(long));
+ int err = get_mempolicy(&linuxpolicy, mask, max_numnodes, 0, 0);
+ free(mask);
+ if (!err || errno != EINVAL)
+ /* found it */
+ return max_numnodes;
+ max_numnodes *= 2;
+ }
+}
+
+static int
+hwloc_linux_membind_policy_to_hwloc(int linuxpolicy, hwloc_membind_policy_t *policy)
+{
+ switch (linuxpolicy) {
+ case MPOL_DEFAULT:
+ *policy = HWLOC_MEMBIND_FIRSTTOUCH;
+ return 0;
+ case MPOL_PREFERRED:
+ case MPOL_BIND:
+ *policy = HWLOC_MEMBIND_BIND;
+ return 0;
+ case MPOL_INTERLEAVE:
+ *policy = HWLOC_MEMBIND_INTERLEAVE;
+ return 0;
+ default:
+ errno = EINVAL;
+ return -1;
+ }
+}
+
+static int
+hwloc_linux_get_thisthread_membind(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
+{
+ unsigned max_os_index;
+ unsigned long *linuxmask;
+ int linuxpolicy;
+ int err;
+
+ max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
+
+ linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
+ if (!linuxmask) {
+ errno = ENOMEM;
+ goto out;
+ }
+
+ err = get_mempolicy(&linuxpolicy, linuxmask, max_os_index, 0, 0);
+ if (err < 0)
+ goto out_with_mask;
+
+ if (linuxpolicy == MPOL_DEFAULT) {
+ hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
+ } else {
+ hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, linuxmask);
+ }
+
+ err = hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy);
+ if (err < 0)
+ goto out_with_mask;
+
+ free(linuxmask);
+ return 0;
+
+ out_with_mask:
+ free(linuxmask);
+ out:
+ return -1;
+}
+
+static int
+hwloc_linux_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
+{
+ unsigned max_os_index;
+ unsigned long *linuxmask, *globallinuxmask;
+ int linuxpolicy, globallinuxpolicy = 0;
+ int mixed = 0;
+ int full = 0;
+ int first = 1;
+ int pagesize = hwloc_getpagesize();
+ char *tmpaddr;
+ int err;
+ unsigned i;
+
+ max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
+
+ linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
+ if (!linuxmask) {
+ errno = ENOMEM;
+ goto out;
+ }
+ globallinuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(long));
+ if (!globallinuxmask) {
+ errno = ENOMEM;
+ goto out_with_masks;
+ }
+
+ for(tmpaddr = (char *)((unsigned long)addr & ~(pagesize-1));
+ tmpaddr < (char *)addr + len;
+ tmpaddr += pagesize) {
+ err = get_mempolicy(&linuxpolicy, linuxmask, max_os_index, tmpaddr, MPOL_F_ADDR);
+ if (err < 0)
+ goto out_with_masks;
+
+ /* use the first found policy. if we find a different one later, set mixed to 1 */
+ if (first)
+ globallinuxpolicy = linuxpolicy;
+ else if (globallinuxpolicy != linuxpolicy)
+ mixed = 1;
+
+ /* aggregate masks, and set full to 1 if we ever find DEFAULT */
+ if (full || linuxpolicy == MPOL_DEFAULT) {
+ full = 1;
+ } else {
+ for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
+ globallinuxmask[i] |= linuxmask[i];
+ }
+
+ first = 0;
+ }
+
+ if (mixed) {
+ *policy = HWLOC_MEMBIND_MIXED;
+ } else {
+ err = hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy);
+ if (err < 0)
+ goto out_with_masks;
+ }
+
+ if (full) {
+ hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
+ } else {
+ hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, globallinuxmask);
+ }
+
+ free(globallinuxmask);
+ free(linuxmask);
+ return 0;
+
+ out_with_masks:
+ free(globallinuxmask);
+ free(linuxmask);
+ out:
+ return -1;
+}
+
+#endif /* HWLOC_HAVE_SET_MEMPOLICY */
+
+void
+hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks *hooks,
+ struct hwloc_topology_support *support __hwloc_attribute_unused)
+{
+ hooks->set_thisthread_cpubind = hwloc_linux_set_thisthread_cpubind;
+ hooks->get_thisthread_cpubind = hwloc_linux_get_thisthread_cpubind;
+ hooks->set_thisproc_cpubind = hwloc_linux_set_thisproc_cpubind;
+ hooks->get_thisproc_cpubind = hwloc_linux_get_thisproc_cpubind;
+ hooks->set_proc_cpubind = hwloc_linux_set_proc_cpubind;
+ hooks->get_proc_cpubind = hwloc_linux_get_proc_cpubind;
+#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
+ hooks->set_thread_cpubind = hwloc_linux_set_thread_cpubind;
+#endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
+#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
+ hooks->get_thread_cpubind = hwloc_linux_get_thread_cpubind;
+#endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
+ hooks->get_thisthread_last_cpu_location = hwloc_linux_get_thisthread_last_cpu_location;
+ hooks->get_thisproc_last_cpu_location = hwloc_linux_get_thisproc_last_cpu_location;
+ hooks->get_proc_last_cpu_location = hwloc_linux_get_proc_last_cpu_location;
+#ifdef HWLOC_HAVE_SET_MEMPOLICY
+ hooks->set_thisthread_membind = hwloc_linux_set_thisthread_membind;
+ hooks->get_thisthread_membind = hwloc_linux_get_thisthread_membind;
+ hooks->get_area_membind = hwloc_linux_get_area_membind;
+#endif /* HWLOC_HAVE_SET_MEMPOLICY */
+#ifdef HWLOC_HAVE_MBIND
+ hooks->set_area_membind = hwloc_linux_set_area_membind;
+ hooks->alloc_membind = hwloc_linux_alloc_membind;
+ hooks->alloc = hwloc_alloc_mmap;
+ hooks->free_membind = hwloc_free_mmap;
+ support->membind->firsttouch_membind = 1;
+ support->membind->bind_membind = 1;
+ support->membind->interleave_membind = 1;
+#endif /* HWLOC_HAVE_MBIND */
+#if (defined HWLOC_HAVE_MIGRATE_PAGES) || ((defined HWLOC_HAVE_MBIND) && (defined MPOL_MF_MOVE))
+ support->membind->migrate_membind = 1;
+#endif
+}
+
+
+
+/*******************************************
+ *** Misc Helpers for Topology Discovery ***
+ *******************************************/
+
+/* cpuinfo array */
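+/* Naming convention: P* are physical/OS indexes as read from the kernel,
+ * L* are hwloc logical indexes filled in later. */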
+struct hwloc_linux_cpuinfo_proc {
+ /* set during hwloc_linux_parse_cpuinfo */
+ unsigned long Pproc;
+ /* set during hwloc_linux_parse_cpuinfo, or -1 if unknown */
+ long Pcore, Ppkg;
+ /* set later, or -1 if unknown */
+ long Lcore, Lpkg;
+
+ /* custom info, set during hwloc_linux_parse_cpuinfo */
+ struct hwloc_obj_info_s *infos;
+ unsigned infos_count;
+};
+
+static int
+hwloc_parse_sysfs_unsigned(const char *mappath, unsigned *value, int fsroot_fd)
+{
+ char string[11];
+ FILE * fd;
+
+ fd = hwloc_fopen(mappath, "r", fsroot_fd);
+ if (!fd) {
+ *value = -1;
+ return -1;
+ }
+
+ if (!fgets(string, 11, fd)) {
+ *value = -1;
+ fclose(fd);
+ return -1;
+ }
+ *value = strtoul(string, NULL, 10);
+
+ fclose(fd);
+
+ return 0;
+}
+
+
+/* kernel cpumaps are composed of an array of 32bits cpumasks */
+#define KERNEL_CPU_MASK_BITS 32
+#define KERNEL_CPU_MAP_LEN (KERNEL_CPU_MASK_BITS/4+2)
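+/* e.g. a sysfs cpumap file reads "00000000,0000ffff": comma-separated 32-bit
+ * hexadecimal words, most significant word first. */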
+
+int
+hwloc_linux_parse_cpumap_file(FILE *file, hwloc_bitmap_t set)
+{
+ unsigned long *maps;
+ unsigned long map;
+ int nr_maps = 0;
+ static int nr_maps_allocated = 8; /* only compute the power-of-two above the kernel cpumask size once */
+ int i;
+
+ maps = malloc(nr_maps_allocated * sizeof(*maps));
+
+ /* reset to zero first */
+ hwloc_bitmap_zero(set);
+
+ /* parse the whole mask */
+ while (fscanf(file, "%lx,", &map) == 1) /* read one kernel cpu mask and the ending comma */
+ {
+ if (nr_maps == nr_maps_allocated) {
+ nr_maps_allocated *= 2;
+ maps = realloc(maps, nr_maps_allocated * sizeof(*maps));
+ }
+
+ if (!map && !nr_maps)
+ /* ignore the first map if it's empty */
+ continue;
+
+ memmove(&maps[1], &maps[0], nr_maps*sizeof(*maps));
+ maps[0] = map;
+ nr_maps++;
+ }
+
+ /* convert into a set */
+#if KERNEL_CPU_MASK_BITS == HWLOC_BITS_PER_LONG
+ for(i=0; i<nr_maps; i++)
+ hwloc_bitmap_set_ith_ulong(set, i, maps[i]);
+#else
+ for(i=0; i<(nr_maps+1)/2; i++) {
+ unsigned long mask;
+ mask = maps[2*i];
+ if (2*i+1<nr_maps)
+ mask |= maps[2*i+1] << KERNEL_CPU_MASK_BITS;
+ hwloc_bitmap_set_ith_ulong(set, i, mask);
+ }
+#endif
+
+ free(maps);
+
+ return 0;
+}
+
+static hwloc_bitmap_t
+hwloc_parse_cpumap(const char *mappath, int fsroot_fd)
+{
+ hwloc_bitmap_t set;
+ FILE * file;
+
+ file = hwloc_fopen(mappath, "r", fsroot_fd);
+ if (!file)
+ return NULL;
+
+ set = hwloc_bitmap_alloc();
+ hwloc_linux_parse_cpumap_file(file, set);
+
+ fclose(file);
+ return set;
+}
+
+static char *
+hwloc_strdup_mntpath(const char *escapedpath, size_t length)
+{
+ char *path = malloc(length+1);
+ const char *src = escapedpath, *tmp;
+ char *dst = path;
+
+ while ((tmp = strchr(src, '\\')) != NULL) {
+ strncpy(dst, src, tmp-src);
+ dst += tmp-src;
+ if (!strncmp(tmp+1, "040", 3))
+ *dst = ' ';
+ else if (!strncmp(tmp+1, "011", 3))
+ *dst = '\t'; /* \011 is a tab */
+ else if (!strncmp(tmp+1, "012", 3))
+ *dst = '\n';
+ else
+ *dst = '\\';
+ dst++;
+ src = tmp+4;
+ }
+
+ strcpy(dst, src);
+
+ return path;
+}
+
+static void
+hwloc_find_linux_cpuset_mntpnt(char **cgroup_mntpnt, char **cpuset_mntpnt, int fsroot_fd)
+{
+#define PROC_MOUNT_LINE_LEN 512
+ char line[PROC_MOUNT_LINE_LEN];
+ FILE *fd;
+
+ *cgroup_mntpnt = NULL;
+ *cpuset_mntpnt = NULL;
+
+ /* ideally we should use setmntent, getmntent, hasmntopt and endmntent,
+ * but they do not support fsroot_fd.
+ */
+
+ fd = hwloc_fopen("/proc/mounts", "r", fsroot_fd);
+ if (!fd)
+ return;
+
+ while (fgets(line, sizeof(line), fd)) {
+ char *path;
+ char *type;
+ char *tmp;
+
+ /* remove the ending " 0 0\n" that the kernel always adds */
+ tmp = line + strlen(line) - 5;
+ if (tmp < line || strcmp(tmp, " 0 0\n"))
+ fprintf(stderr, "Unexpected end of /proc/mounts line `%s'\n", line);
+ else
+ *tmp = '\0';
+
+ /* path is after first field and a space */
+ tmp = strchr(line, ' ');
+ if (!tmp)
+ continue;
+ path = tmp+1;
+
+ /* type is after path, which may not contain spaces since the kernel escaped them to \040
+ * (see the manpage of getmntent) */
+ tmp = strchr(path, ' ');
+ if (!tmp)
+ continue;
+ type = tmp+1;
+ /* mark the end of path to ease upcoming strdup */
+ *tmp = '\0';
+
+ if (!strncmp(type, "cpuset ", 7)) {
+ /* found a cpuset mntpnt */
+ hwloc_debug("Found cpuset mount point on %s\n", path);
+ *cpuset_mntpnt = hwloc_strdup_mntpath(path, type-path);
+ break;
+
+ } else if (!strncmp(type, "cgroup ", 7)) {
+ /* found a cgroup mntpnt */
+ char *opt, *opts;
+ int cpuset_opt = 0;
+ int noprefix_opt = 0;
+
+ /* find options */
+ tmp = strchr(type, ' ');
+ if (!tmp)
+ continue;
+ opts = tmp+1;
+
+ /* look at options */
+ while ((opt = strsep(&opts, ",")) != NULL) {
+ if (!strcmp(opt, "cpuset"))
+ cpuset_opt = 1;
+ else if (!strcmp(opt, "noprefix"))
+ noprefix_opt = 1;
+ }
+ if (!cpuset_opt)
+ continue;
+
+ if (noprefix_opt) {
+ hwloc_debug("Found cgroup emulating a cpuset mount point on %s\n", path);
+ *cpuset_mntpnt = hwloc_strdup_mntpath(path, type-path);
+ } else {
+ hwloc_debug("Found cgroup/cpuset mount point on %s\n", path);
+ *cgroup_mntpnt = hwloc_strdup_mntpath(path, type-path);
+ }
+ break;
+ }
+ }
+
+ fclose(fd);
+}
+
+/*
+ * Linux cpusets may be managed directly or through cgroup.
+ * If cgroup is used, tasks get a /proc/pid/cgroup which may contain a
+ * single line %d:cpuset:<name>. If plain cpusets are used, they get
+ * /proc/pid/cpuset containing <name>.
+ */
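+/* e.g. /proc/self/cgroup may contain a line such as "4:cpuset:/mygroup",
+ * while /proc/self/cpuset would contain just "/mygroup". */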
+static char *
+hwloc_read_linux_cpuset_name(int fsroot_fd, hwloc_pid_t pid)
+{
+#define CPUSET_NAME_LEN 128
+ char cpuset_name[CPUSET_NAME_LEN];
+ FILE *fd;
+ char *tmp;
+
+ /* check whether a cgroup-cpuset is enabled */
+ if (!pid)
+ fd = hwloc_fopen("/proc/self/cgroup", "r", fsroot_fd);
+ else {
+ char path[] = "/proc/XXXXXXXXXX/cgroup";
+ snprintf(path, sizeof(path), "/proc/%d/cgroup", pid);
+ fd = hwloc_fopen(path, "r", fsroot_fd);
+ }
+ if (fd) {
+ /* find a cpuset line */
+#define CGROUP_LINE_LEN 256
+ char line[CGROUP_LINE_LEN];
+ while (fgets(line, sizeof(line), fd)) {
+ char *end, *colon = strchr(line, ':');
+ if (!colon)
+ continue;
+ if (strncmp(colon, ":cpuset:", 8))
+ continue;
+
+ /* found a cgroup-cpuset line, return the name */
+ fclose(fd);
+ end = strchr(colon, '\n');
+ if (end)
+ *end = '\0';
+ hwloc_debug("Found cgroup-cpuset %s\n", colon+8);
+ return strdup(colon+8);
+ }
+ fclose(fd);
+ }
+
+ /* check whether a cpuset is enabled */
+ if (!pid)
+ fd = hwloc_fopen("/proc/self/cpuset", "r", fsroot_fd);
+ else {
+ char path[] = "/proc/XXXXXXXXXX/cpuset";
+ snprintf(path, sizeof(path), "/proc/%d/cpuset", pid);
+ fd = hwloc_fopen(path, "r", fsroot_fd);
+ }
+ if (!fd) {
+ /* found nothing */
+ hwloc_debug("%s", "No cgroup or cpuset found\n");
+ return NULL;
+ }
+
+ /* found a cpuset, return the name */
+ tmp = fgets(cpuset_name, sizeof(cpuset_name), fd);
+ fclose(fd);
+ if (!tmp)
+ return NULL;
+ tmp = strchr(cpuset_name, '\n');
+ if (tmp)
+ *tmp = '\0';
+ hwloc_debug("Found cpuset %s\n", cpuset_name);
+ return strdup(cpuset_name);
+}
+
+/*
+ * Then, the cpuset description is available from either the cgroup or
+ * the cpuset filesystem (usually mounted in / or /dev) where there
+ * are cgroup<name>/cpuset.{cpus,mems} or cpuset<name>/{cpus,mems} files.
+ */
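+/* e.g. <cgroup mntpnt>/mygroup/cpuset.cpus or <cpuset mntpnt>/mygroup/cpus,
+ * each containing a range list such as "0-15,32-47". */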
+static char *
+hwloc_read_linux_cpuset_mask(const char *cgroup_mntpnt, const char *cpuset_mntpnt, const char *cpuset_name, const char *attr_name, int fsroot_fd)
+{
+#define CPUSET_FILENAME_LEN 256
+ char cpuset_filename[CPUSET_FILENAME_LEN];
+ FILE *fd;
+ char *info = NULL, *tmp;
+ ssize_t ssize;
+ size_t size;
+
+ if (cgroup_mntpnt) {
+ /* try to read the cpuset from cgroup */
+ snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/cpuset.%s", cgroup_mntpnt, cpuset_name, attr_name);
+ hwloc_debug("Trying to read cgroup file <%s>\n", cpuset_filename);
+ fd = hwloc_fopen(cpuset_filename, "r", fsroot_fd);
+ if (fd)
+ goto gotfile;
+ } else if (cpuset_mntpnt) {
+ /* try to read the cpuset directly */
+ snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/%s", cpuset_mntpnt, cpuset_name, attr_name);
+ hwloc_debug("Trying to read cpuset file <%s>\n", cpuset_filename);
+ fd = hwloc_fopen(cpuset_filename, "r", fsroot_fd);
+ if (fd)
+ goto gotfile;
+ }
+
+ /* found no cpuset description, ignore it */
+ hwloc_debug("Couldn't find cpuset <%s> description, ignoring\n", cpuset_name);
+ goto out;
+
+gotfile:
+ ssize = getline(&info, &size, fd);
+ fclose(fd);
+ if (ssize < 0)
+ goto out;
+ if (!info)
+ goto out;
+
+ tmp = strchr(info, '\n');
+ if (tmp)
+ *tmp = '\0';
+
+out:
+ return info;
+}
+
+static void
+hwloc_admin_disable_set_from_cpuset(struct hwloc_linux_backend_data_s *data,
+ const char *cgroup_mntpnt, const char *cpuset_mntpnt, const char *cpuset_name,
+ const char *attr_name,
+ hwloc_bitmap_t admin_enabled_cpus_set)
+{
+ char *cpuset_mask;
+ char *current, *comma, *tmp;
+ int prevlast, nextfirst, nextlast; /* beginning/end of enabled-segments */
+ hwloc_bitmap_t tmpset;
+
+ cpuset_mask = hwloc_read_linux_cpuset_mask(cgroup_mntpnt, cpuset_mntpnt, cpuset_name,
+ attr_name, data->root_fd);
+ if (!cpuset_mask)
+ return;
+
+ hwloc_debug("found cpuset %s: %s\n", attr_name, cpuset_mask);
+
+ current = cpuset_mask;
+ prevlast = -1;
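+ /* e.g. a mask of "0-3,8-11" clears [4:7] here and everything above 11 after the loop */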
+
+ while (1) {
+ /* save a pointer to the next comma and erase it to simplify things */
+ comma = strchr(current, ',');
+ if (comma)
+ *comma = '\0';
+
+ /* find current enabled-segment bounds */
+ nextfirst = strtoul(current, &tmp, 0);
+ if (*tmp == '-')
+ nextlast = strtoul(tmp+1, NULL, 0);
+ else
+ nextlast = nextfirst;
+ if (prevlast+1 <= nextfirst-1) {
+ hwloc_debug("%s [%d:%d] excluded by cpuset\n", attr_name, prevlast+1, nextfirst-1);
+ hwloc_bitmap_clr_range(admin_enabled_cpus_set, prevlast+1, nextfirst-1);
+ }
+
+ /* switch to next enabled-segment */
+ prevlast = nextlast;
+ if (!comma)
+ break;
+ current = comma+1;
+ }
+
+ hwloc_debug("%s [%d:%d] excluded by cpuset\n", attr_name, prevlast+1, nextfirst-1);
+ /* no easy way to clear until the infinity */
+ tmpset = hwloc_bitmap_alloc();
+ hwloc_bitmap_set_range(tmpset, 0, prevlast);
+ hwloc_bitmap_and(admin_enabled_cpus_set, admin_enabled_cpus_set, tmpset);
+ hwloc_bitmap_free(tmpset);
+
+ free(cpuset_mask);
+}
+
+static void
+hwloc_parse_meminfo_info(struct hwloc_linux_backend_data_s *data,
+ const char *path,
+ int prefixlength,
+ uint64_t *local_memory,
+ uint64_t *meminfo_hugepages_count,
+ uint64_t *meminfo_hugepages_size,
+ int onlytotal)
+{
+ char string[64];
+ FILE *fd;
+
+ fd = hwloc_fopen(path, "r", data->root_fd);
+ if (!fd)
+ return;
+
+ while (fgets(string, sizeof(string), fd) && *string != '\0')
+ {
+ unsigned long long number;
+ if (strlen(string) < (size_t) prefixlength)
+ continue;
+ if (sscanf(string+prefixlength, "MemTotal: %llu kB", (unsigned long long *) &number) == 1) {
+ *local_memory = number << 10;
+ if (onlytotal)
+ break;
+ }
+ else if (!onlytotal) {
+ if (sscanf(string+prefixlength, "Hugepagesize: %llu", (unsigned long long *) &number) == 1)
+ *meminfo_hugepages_size = number << 10;
+ else if (sscanf(string+prefixlength, "HugePages_Free: %llu", (unsigned long long *) &number) == 1)
+ /* these are free hugepages, not the total amount of huge pages */
+ *meminfo_hugepages_count = number;
+ }
+ }
+
+ fclose(fd);
+}
+
+#define SYSFS_NUMA_NODE_PATH_LEN 128
+
+static void
+hwloc_parse_hugepages_info(struct hwloc_linux_backend_data_s *data,
+ const char *dirpath,
+ struct hwloc_obj_memory_s *memory,
+ uint64_t *remaining_local_memory)
+{
+ DIR *dir;
+ struct dirent *dirent;
+ unsigned long index_ = 1;
+ FILE *hpfd;
+ char line[64];
+ char path[SYSFS_NUMA_NODE_PATH_LEN];
+
+ dir = hwloc_opendir(dirpath, data->root_fd);
+ if (dir) {
+ while ((dirent = readdir(dir)) != NULL) {
+ if (strncmp(dirent->d_name, "hugepages-", 10))
+ continue;
+ memory->page_types[index_].size = strtoul(dirent->d_name+10, NULL, 0) * 1024ULL;
+ sprintf(path, "%s/%s/nr_hugepages", dirpath, dirent->d_name);
+ hpfd = hwloc_fopen(path, "r", data->root_fd);
+ if (hpfd) {
+ if (fgets(line, sizeof(line), hpfd)) {
+ /* this is the actual total number of huge pages */
+ memory->page_types[index_].count = strtoull(line, NULL, 0);
+ *remaining_local_memory -= memory->page_types[index_].count * memory->page_types[index_].size;
+ index_++;
+ }
+ fclose(hpfd);
+ }
+ }
+ closedir(dir);
+ memory->page_types_len = index_;
+ }
+}
+
+static void
+hwloc_get_procfs_meminfo_info(struct hwloc_topology *topology,
+ struct hwloc_linux_backend_data_s *data,
+ struct hwloc_obj_memory_s *memory)
+{
+ uint64_t meminfo_hugepages_count, meminfo_hugepages_size = 0;
+ struct stat st;
+ int has_sysfs_hugepages = 0;
+ const char *pagesize_env = getenv("HWLOC_DEBUG_PAGESIZE");
+ int types = 2;
+ int err;
+
+ err = hwloc_stat("/sys/kernel/mm/hugepages", &st, data->root_fd);
+ if (!err) {
+ types = 1 + st.st_nlink-2;
+ has_sysfs_hugepages = 1;
+ }
+
+ if (topology->is_thissystem || pagesize_env) {
+ /* we cannot report any page_type info unless we have the page size.
+ * we'll take it either from the system if local, or from the debug env variable
+ */
+ memory->page_types_len = types;
+ memory->page_types = calloc(types, sizeof(*memory->page_types));
+ }
+
+ if (topology->is_thissystem) {
+ /* Get the page and hugepage sizes from sysconf */
+#ifdef HAVE__SC_LARGE_PAGESIZE
+ memory->page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
+#endif
+ memory->page_types[0].size = hwloc_getpagesize(); /* might be overwritten later by /proc/meminfo or sysfs */
+ }
+
+ hwloc_parse_meminfo_info(data, "/proc/meminfo", 0 /* no prefix */,
+ &memory->local_memory,
+ &meminfo_hugepages_count, &meminfo_hugepages_size,
+ memory->page_types == NULL);
+
+ if (memory->page_types) {
+ uint64_t remaining_local_memory = memory->local_memory;
+ if (has_sysfs_hugepages) {
+ /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
+ hwloc_parse_hugepages_info(data, "/sys/kernel/mm/hugepages", memory, &remaining_local_memory);
+ } else {
+ /* use what we found in meminfo */
+ if (meminfo_hugepages_size) {
+ memory->page_types[1].size = meminfo_hugepages_size;
+ memory->page_types[1].count = meminfo_hugepages_count;
+ remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
+ } else {
+ memory->page_types_len = 1;
+ }
+ }
+
+ if (pagesize_env) {
+ /* We cannot get the pagesize if not thissystem, use the env-given one to exercise the code during make check */
+ memory->page_types[0].size = strtoull(pagesize_env, NULL, 10);
+ /* If failed, use 4kB */
+ if (!memory->page_types[0].size)
+ memory->page_types[0].size = 4096;
+ }
+ assert(memory->page_types[0].size); /* from sysconf if local or from the env */
+ /* memory->page_types[1].size from sysconf if local, or from /proc/meminfo, or from sysfs,
+ * may be 0 if no hugepage support in the kernel */
+
+ memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
+ }
+}
+
+static void
+hwloc_sysfs_node_meminfo_info(struct hwloc_topology *topology,
+ struct hwloc_linux_backend_data_s *data,
+ const char *syspath, int node,
+ struct hwloc_obj_memory_s *memory)
+{
+ char path[SYSFS_NUMA_NODE_PATH_LEN];
+ char meminfopath[SYSFS_NUMA_NODE_PATH_LEN];
+ uint64_t meminfo_hugepages_count = 0;
+ uint64_t meminfo_hugepages_size = 0;
+ struct stat st;
+ int has_sysfs_hugepages = 0;
+ int types = 2;
+ int err;
+
+ sprintf(path, "%s/node%d/hugepages", syspath, node);
+ err = hwloc_stat(path, &st, data->root_fd);
+ if (!err) {
+ types = 1 + st.st_nlink-2;
+ has_sysfs_hugepages = 1;
+ }
+
+ if (topology->is_thissystem) {
+ memory->page_types_len = types;
+ memory->page_types = malloc(types*sizeof(*memory->page_types));
+ memset(memory->page_types, 0, types*sizeof(*memory->page_types));
+ }
+
+ sprintf(meminfopath, "%s/node%d/meminfo", syspath, node);
+ hwloc_parse_meminfo_info(data, meminfopath,
+ snprintf(NULL, 0, "Node %d ", node),
+ &memory->local_memory,
+ &meminfo_hugepages_count, NULL /* no hugepage size in node-specific meminfo */,
+ memory->page_types == NULL);
+
+ if (memory->page_types) {
+ uint64_t remaining_local_memory = memory->local_memory;
+ if (has_sysfs_hugepages) {
+ /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
+ hwloc_parse_hugepages_info(data, path, memory, &remaining_local_memory);
+ } else {
+ /* get hugepage size from machine-specific meminfo since there is no size in node-specific meminfo,
+ * hwloc_get_procfs_meminfo_info must have been called earlier */
+ meminfo_hugepages_size = topology->levels[0][0]->memory.page_types[1].size;
+ /* use what we found in meminfo */
+ if (meminfo_hugepages_size) {
+ memory->page_types[1].count = meminfo_hugepages_count;
+ memory->page_types[1].size = meminfo_hugepages_size;
+ remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
+ } else {
+ memory->page_types_len = 1;
+ }
+ }
+ /* update what's remaining as normal pages */
+ memory->page_types[0].size = hwloc_getpagesize();
+ memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
+ }
+}
+
+static void
+hwloc_parse_node_distance(const char *distancepath, unsigned nbnodes, float *distances, int fsroot_fd)
+{
+ char string[4096]; /* enough for hundreds of nodes */
+ char *tmp, *next;
+ FILE * fd;
+
+ fd = hwloc_fopen(distancepath, "r", fsroot_fd);
+ if (!fd)
+ return;
+
+ if (!fgets(string, sizeof(string), fd)) {
+ fclose(fd);
+ return;
+ }
+
+ tmp = string;
+ while (tmp) {
+ unsigned distance = strtoul(tmp, &next, 0);
+ if (next == tmp)
+ break;
+ *distances = (float) distance;
+ distances++;
+ nbnodes--;
+ if (!nbnodes)
+ break;
+ tmp = next+1;
+ }
+
+ fclose(fd);
+}
+
+static void
+hwloc__get_dmi_id_one_info(struct hwloc_linux_backend_data_s *data,
+ hwloc_obj_t obj,
+ char *path, unsigned pathlen,
+ const char *dmi_name, const char *hwloc_name)
+{
+ char dmi_line[64];
+ char *tmp;
+ FILE *fd;
+
+ strcpy(path+pathlen, dmi_name);
+ fd = hwloc_fopen(path, "r", data->root_fd);
+ if (!fd)
+ return;
+
+ dmi_line[0] = '\0';
+ tmp = fgets(dmi_line, sizeof(dmi_line), fd);
+ fclose (fd);
+
+ if (tmp && dmi_line[0] != '\0') {
+ tmp = strchr(dmi_line, '\n');
+ if (tmp)
+ *tmp = '\0';
+ hwloc_debug("found %s '%s'\n", hwloc_name, dmi_line);
+ hwloc_obj_add_info(obj, hwloc_name, dmi_line);
+ }
+}
+
+static void
+hwloc__get_dmi_id_info(struct hwloc_linux_backend_data_s *data, hwloc_obj_t obj)
+{
+ char path[128];
+ unsigned pathlen;
+ DIR *dir;
+
+ strcpy(path, "/sys/devices/virtual/dmi/id");
+ dir = hwloc_opendir(path, data->root_fd);
+ if (dir) {
+ pathlen = 27;
+ } else {
+ strcpy(path, "/sys/class/dmi/id");
+ dir = hwloc_opendir(path, data->root_fd);
+ if (dir)
+ pathlen = 17;
+ else
+ return;
+ }
+ closedir(dir);
+
+ path[pathlen++] = '/';
+
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_name", "DMIProductName");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_version", "DMIProductVersion");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_serial", "DMIProductSerial");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_uuid", "DMIProductUUID");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_vendor", "DMIBoardVendor");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_name", "DMIBoardName");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_version", "DMIBoardVersion");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_serial", "DMIBoardSerial");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_asset_tag", "DMIBoardAssetTag");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_vendor", "DMIChassisVendor");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_type", "DMIChassisType");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_version", "DMIChassisVersion");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_serial", "DMIChassisSerial");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_asset_tag", "DMIChassisAssetTag");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_vendor", "DMIBIOSVendor");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_version", "DMIBIOSVersion");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_date", "DMIBIOSDate");
+ hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "sys_vendor", "DMISysVendor");
+}
+
+struct hwloc_firmware_dmi_mem_device_header {
+ unsigned char type;
+ unsigned char length;
+ unsigned char handle[2];
+ unsigned char phy_mem_handle[2];
+ unsigned char mem_err_handle[2];
+ unsigned char tot_width[2];
+ unsigned char dat_width[2];
+ unsigned char size[2];
+ unsigned char ff;
+ unsigned char dev_set;
+ unsigned char dev_loc_str_num;
+ unsigned char bank_loc_str_num;
+ unsigned char mem_type;
+ unsigned char type_detail[2];
+ unsigned char speed[2];
+ unsigned char manuf_str_num;
+ unsigned char serial_str_num;
+ unsigned char asset_tag_str_num;
+ unsigned char part_num_str_num;
+ /* don't include the following fields since we don't need them,
+ * some old implementations may miss them.
+ */
+};
+
+static int check_dmi_entry(const char *buffer)
+{
+ /* reject empty strings */
+ if (!*buffer)
+ return 0;
+ /* reject strings of spaces (at least Dell use this for empty memory slots) */
+ if (strspn(buffer, " ") == strlen(buffer))
+ return 0;
+ return 1;
+}
+
+static void
+hwloc__get_firmware_dmi_memory_info_one(struct hwloc_topology *topology,
+ unsigned idx, const char *path, FILE *fd,
+ struct hwloc_firmware_dmi_mem_device_header *header)
+{
+ unsigned slen;
+ char buffer[256]; /* enough for memory device strings, or at least for each of them */
+ unsigned foff; /* offset in raw file */
+ unsigned boff; /* offset in buffer read from raw file */
+ unsigned i;
+ struct hwloc_obj_info_s *infos = NULL;
+ unsigned infos_count = 0;
+ hwloc_obj_t misc;
+ int foundinfo = 0;
+
+ hwloc__add_info(&infos, &infos_count, "Type", "MemoryModule");
+
+ /* start after the header */
+ foff = header->length;
+ i = 1;
+ while (1) {
+ /* read one buffer */
+ if (fseek(fd, foff, SEEK_SET) < 0)
+ break;
+ if (!fgets(buffer, sizeof(buffer), fd))
+ break;
+ /* read string at the beginning of the buffer */
+ boff = 0;
+ while (1) {
+ /* stop on empty string */
+ if (!buffer[boff])
+ goto done;
+ /* stop if this string goes to the end of the buffer */
+ slen = strlen(buffer+boff);
+ if (boff + slen+1 == sizeof(buffer))
+ break;
+ /* string didn't get truncated, should be OK */
+ if (i == header->manuf_str_num) {
+ if (check_dmi_entry(buffer+boff)) {
+ hwloc__add_info(&infos, &infos_count, "Vendor", buffer+boff);
+ foundinfo = 1;
+ }
+ } else if (i == header->serial_str_num) {
+ if (check_dmi_entry(buffer+boff)) {
+ hwloc__add_info(&infos, &infos_count, "SerialNumber", buffer+boff);
+ foundinfo = 1;
+ }
+ } else if (i == header->asset_tag_str_num) {
+ if (check_dmi_entry(buffer+boff)) {
+ hwloc__add_info(&infos, &infos_count, "AssetTag", buffer+boff);
+ foundinfo = 1;
+ }
+ } else if (i == header->part_num_str_num) {
+ if (check_dmi_entry(buffer+boff)) {
+ hwloc__add_info(&infos, &infos_count, "PartNumber", buffer+boff);
+ foundinfo = 1;
+ }
+ } else if (i == header->dev_loc_str_num) {
+ if (check_dmi_entry(buffer+boff)) {
+ hwloc__add_info(&infos, &infos_count, "DeviceLocation", buffer+boff);
+ /* only a location, not an actual info about the device */
+ }
+ } else if (i == header->bank_loc_str_num) {
+ if (check_dmi_entry(buffer+boff)) {
+ hwloc__add_info(&infos, &infos_count, "BankLocation", buffer+boff);
+ /* only a location, not an actual info about the device */
+ }
+ } else {
+ goto done;
+ }
+ /* next string in buffer */
+ boff += slen+1;
+ i++;
+ }
+ /* couldn't read a single full string from that buffer, we're screwed */
+ if (!boff) {
+ fprintf(stderr, "hwloc could read a DMI firmware entry #%u in %s\n",
+ i, path);
+ break;
+ }
+ /* reread buffer after previous string */
+ foff += boff;
+ }
+
+done:
+ if (!foundinfo) {
+ /* found no actual info about the device. if there's only location info, the slot may be empty */
+ goto out_with_infos;
+ }
+
+ misc = hwloc_alloc_setup_object(HWLOC_OBJ_MISC, idx);
+ if (!misc)
+ goto out_with_infos;
+
+ hwloc__move_infos(&misc->infos, &misc->infos_count, &infos, &infos_count);
+ /* FIXME: find a way to identify the corresponding NUMA node and attach these objects there.
+ * but it means we need to parse DeviceLocation=DIMM_B4 but these vary significantly
+ * with the vendor, and it's hard to be 100% sure 'B' is second socket.
+ * Examples at http://sourceforge.net/p/edac-utils/code/HEAD/tree/trunk/src/etc/labels.db
+ * or https://github.com/grondo/edac-utils/blob/master/src/etc/labels.db
+ */
+ hwloc_insert_object_by_parent(topology, hwloc_get_root_obj(topology), misc);
+ return;
+
+ out_with_infos:
+ hwloc__free_infos(infos, infos_count);
+}
+
+static void
+hwloc__get_firmware_dmi_memory_info(struct hwloc_topology *topology,
+ struct hwloc_linux_backend_data_s *data)
+{
+ char path[128];
+ unsigned i;
+
+ for(i=0; ; i++) {
+ FILE *fd;
+ struct hwloc_firmware_dmi_mem_device_header header;
+ int err;
+
+ snprintf(path, sizeof(path), "/sys/firmware/dmi/entries/17-%u/raw", i);
+ fd = hwloc_fopen(path, "r", data->root_fd);
+ if (!fd)
+ break;
+
+ err = fread(&header, sizeof(header), 1, fd);
+ if (err != 1)
+ break;
+ if (header.length < sizeof(header)) {
+ /* invalid, or too old entry/spec that doesn't contain what we need */
+ fclose(fd);
+ break;
+ }
+
+ hwloc__get_firmware_dmi_memory_info_one(topology, i, path, fd, &header);
+
+ fclose(fd);
+ }
+}
+
+
+/***********************************
+ ****** Device tree Discovery ******
+ ***********************************/
+
+/* Reads the entire file and returns bytes read if bytes_read != NULL
+ * Returned pointer can be freed by using free(). */
+static void *
+hwloc_read_raw(const char *p, const char *p1, size_t *bytes_read, int root_fd)
+{
+ char fname[256];
+ char *ret = NULL;
+ struct stat fs;
+ int file = -1;
+
+ snprintf(fname, sizeof(fname), "%s/%s", p, p1);
+
+ file = hwloc_open(fname, root_fd);
+ if (-1 == file) {
+ goto out_no_close;
+ }
+ if (fstat(file, &fs)) {
+ goto out;
+ }
+
+ ret = (char *) malloc(fs.st_size);
+ if (NULL != ret) {
+ ssize_t cb = read(file, ret, fs.st_size);
+ if (cb == -1) {
+ free(ret);
+ ret = NULL;
+ } else {
+ if (NULL != bytes_read)
+ *bytes_read = cb;
+ }
+ }
+
+ out:
+ close(file);
+ out_no_close:
+ return ret;
+}
+
+/* Reads the entire file and returns it as a 0-terminated string
+ * Returned pointer can be freed by using free(). */
+static char *
+hwloc_read_str(const char *p, const char *p1, int root_fd)
+{
+ size_t cb = 0;
+ char *ret = hwloc_read_raw(p, p1, &cb, root_fd);
+ if ((NULL != ret) && (0 < cb) && (0 != ret[cb-1])) {
+ ret = realloc(ret, cb + 1);
+ ret[cb] = 0;
+ }
+ return ret;
+}
+
+/* Reads first 32bit bigendian value */
+static ssize_t
+hwloc_read_unit32be(const char *p, const char *p1, uint32_t *buf, int root_fd)
+{
+ size_t cb = 0;
+ uint32_t *tmp = hwloc_read_raw(p, p1, &cb, root_fd);
+ if (sizeof(*buf) != cb) {
+ errno = EINVAL;
+ free(tmp); /* tmp is either NULL or contains useless things */
+ return -1;
+ }
+ *buf = htonl(*tmp);
+ free(tmp);
+ return sizeof(*buf);
+}
+
+typedef struct {
+ unsigned int n, allocated;
+ struct {
+ hwloc_bitmap_t cpuset;
+ uint32_t phandle;
+ uint32_t l2_cache;
+ char *name;
+ } *p;
+} device_tree_cpus_t;
+
+static void
+add_device_tree_cpus_node(device_tree_cpus_t *cpus, hwloc_bitmap_t cpuset,
+ uint32_t l2_cache, uint32_t phandle, const char *name)
+{
+ if (cpus->n == cpus->allocated) {
+ if (!cpus->allocated)
+ cpus->allocated = 64;
+ else
+ cpus->allocated *= 2;
+ cpus->p = realloc(cpus->p, cpus->allocated * sizeof(cpus->p[0]));
+ }
+ cpus->p[cpus->n].phandle = phandle;
+ cpus->p[cpus->n].cpuset = (NULL == cpuset)?NULL:hwloc_bitmap_dup(cpuset);
+ cpus->p[cpus->n].l2_cache = l2_cache;
+ cpus->p[cpus->n].name = strdup(name);
+ ++cpus->n;
+}
+
+/* Walks over the cache list in order to detect nested caches and CPU mask for each */
+static int
+look_powerpc_device_tree_discover_cache(device_tree_cpus_t *cpus,
+ uint32_t phandle, unsigned int *level, hwloc_bitmap_t cpuset)
+{
+ unsigned int i;
+ int ret = -1;
+ if ((NULL == level) || (NULL == cpuset) || phandle == (uint32_t) -1)
+ return ret;
+ for (i = 0; i < cpus->n; ++i) {
+ if (phandle != cpus->p[i].l2_cache)
+ continue;
+ if (NULL != cpus->p[i].cpuset) {
+ hwloc_bitmap_or(cpuset, cpuset, cpus->p[i].cpuset);
+ ret = 0;
+ } else {
+ ++(*level);
+ if (0 == look_powerpc_device_tree_discover_cache(cpus,
+ cpus->p[i].phandle, level, cpuset))
+ ret = 0;
+ }
+ }
+ return ret;
+}
+
+static void
+try__add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
+ unsigned int level, hwloc_obj_cache_type_t type,
+ uint32_t cache_line_size, uint32_t cache_size, uint32_t cache_sets,
+ hwloc_bitmap_t cpuset)
+{
+ struct hwloc_obj *c = NULL;
+
+ if (0 == cache_size)
+ return;
+
+ c = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+ c->attr->cache.depth = level;
+ c->attr->cache.linesize = cache_line_size;
+ c->attr->cache.size = cache_size;
+ c->attr->cache.type = type;
+ if (cache_sets == 1)
+ /* likely wrong, make it unknown */
+ cache_sets = 0;
+ if (cache_sets && cache_line_size)
+ c->attr->cache.associativity = cache_size / (cache_sets * cache_line_size);
+ else
+ c->attr->cache.associativity = 0;
+ c->cpuset = hwloc_bitmap_dup(cpuset);
+ hwloc_debug_2args_bitmap("cache (%s) depth %d has cpuset %s\n",
+ type == HWLOC_OBJ_CACHE_UNIFIED ? "unified" : (type == HWLOC_OBJ_CACHE_DATA ? "data" : "instruction"),
+ level, c->cpuset);
+ hwloc_insert_object_by_cpuset(topology, c);
+}
+
+static void
+try_add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
+ struct hwloc_linux_backend_data_s *data,
+ const char *cpu, unsigned int level, hwloc_bitmap_t cpuset)
+{
+ /* d-cache-block-size - ignore */
+ /* d-cache-line-size - to read, in bytes */
+ /* d-cache-sets - ignore */
+ /* d-cache-size - to read, in bytes */
+ /* i-cache, same for instruction */
+ /* cache-unified only exist if data and instruction caches are unified */
+ /* d-tlb-sets - ignore */
+ /* d-tlb-size - ignore, always 0 on power6 */
+ /* i-tlb-*, same */
+ uint32_t d_cache_line_size = 0, d_cache_size = 0, d_cache_sets = 0;
+ uint32_t i_cache_line_size = 0, i_cache_size = 0, i_cache_sets = 0;
+ char unified_path[1024];
+ struct stat statbuf;
+ int unified;
+
+ snprintf(unified_path, sizeof(unified_path), "%s/cache-unified", cpu);
+ unified = (hwloc_stat(unified_path, &statbuf, data->root_fd) == 0);
+
+ hwloc_read_unit32be(cpu, "d-cache-line-size", &d_cache_line_size,
+ data->root_fd);
+ hwloc_read_unit32be(cpu, "d-cache-size", &d_cache_size,
+ data->root_fd);
+ hwloc_read_unit32be(cpu, "d-cache-sets", &d_cache_sets,
+ data->root_fd);
+ hwloc_read_unit32be(cpu, "i-cache-line-size", &i_cache_line_size,
+ data->root_fd);
+ hwloc_read_unit32be(cpu, "i-cache-size", &i_cache_size,
+ data->root_fd);
+ hwloc_read_unit32be(cpu, "i-cache-sets", &i_cache_sets,
+ data->root_fd);
+
+ if (!unified)
+ try__add_cache_from_device_tree_cpu(topology, level, HWLOC_OBJ_CACHE_INSTRUCTION,
+ i_cache_line_size, i_cache_size, i_cache_sets, cpuset);
+ try__add_cache_from_device_tree_cpu(topology, level, unified ? HWLOC_OBJ_CACHE_UNIFIED : HWLOC_OBJ_CACHE_DATA,
+ d_cache_line_size, d_cache_size, d_cache_sets, cpuset);
+}
+
+/*
+ * Discovers L1/L2/L3 cache information on IBM PowerPC systems for old kernels (RHEL5.*)
+ * which provide NUMA nodes information without any details
+ */
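+/* It walks /proc/device-tree/cpus/* looking at nodes whose device_type is
+ * "cpu" or "cache" and reads properties such as "d-cache-size", "l2-cache"
+ * or "next-level-cache". */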
+static void
+look_powerpc_device_tree(struct hwloc_topology *topology,
+ struct hwloc_linux_backend_data_s *data)
+{
+ device_tree_cpus_t cpus;
+ const char ofroot[] = "/proc/device-tree/cpus";
+ unsigned int i;
+ int root_fd = data->root_fd;
+ DIR *dt = hwloc_opendir(ofroot, root_fd);
+ struct dirent *dirent;
+
+ if (NULL == dt)
+ return;
+
+ /* only works for Power so far, and not useful on ARM */
+ if (strncmp(data->utsname.machine, "ppc", 3)) {
+ closedir(dt);
+ return;
+ }
+
+ cpus.n = 0;
+ cpus.p = NULL;
+ cpus.allocated = 0;
+
+ while (NULL != (dirent = readdir(dt))) {
+ char cpu[256];
+ char *device_type;
+ uint32_t reg = -1, l2_cache = -1, phandle = -1;
+
+ if ('.' == dirent->d_name[0])
+ continue;
+
+ snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, dirent->d_name);
+
+ device_type = hwloc_read_str(cpu, "device_type", root_fd);
+ if (NULL == device_type)
+ continue;
+
+ hwloc_read_unit32be(cpu, "reg", ®, root_fd);
+ if (hwloc_read_unit32be(cpu, "next-level-cache", &l2_cache, root_fd) == -1)
+ hwloc_read_unit32be(cpu, "l2-cache", &l2_cache, root_fd);
+ if (hwloc_read_unit32be(cpu, "phandle", &phandle, root_fd) == -1)
+ if (hwloc_read_unit32be(cpu, "ibm,phandle", &phandle, root_fd) == -1)
+ hwloc_read_unit32be(cpu, "linux,phandle", &phandle, root_fd);
+
+ if (0 == strcmp(device_type, "cache")) {
+ add_device_tree_cpus_node(&cpus, NULL, l2_cache, phandle, dirent->d_name);
+ }
+ else if (0 == strcmp(device_type, "cpu")) {
+ /* Found CPU */
+ hwloc_bitmap_t cpuset = NULL;
+ size_t cb = 0;
+ uint32_t *threads = hwloc_read_raw(cpu, "ibm,ppc-interrupt-server#s", &cb, root_fd);
+ uint32_t nthreads = cb / sizeof(threads[0]);
+
+ if (NULL != threads) {
+ cpuset = hwloc_bitmap_alloc();
+ for (i = 0; i < nthreads; ++i) {
+ if (hwloc_bitmap_isset(topology->levels[0][0]->complete_cpuset, ntohl(threads[i])))
+ hwloc_bitmap_set(cpuset, ntohl(threads[i]));
+ }
+ free(threads);
+ } else if ((unsigned int)-1 != reg) {
+ /* Doesn't work on ARM because cpu "reg" values do not start at 0.
+ * We know the first cpu "reg" is the lowest. The others are likely
+ * in order assuming the device-tree shows objects in order.
+ */
+ cpuset = hwloc_bitmap_alloc();
+ hwloc_bitmap_set(cpuset, reg);
+ }
+
+ if (NULL == cpuset) {
+ hwloc_debug("%s has no \"reg\" property, skipping\n", cpu);
+ } else {
+ struct hwloc_obj *core = NULL;
+ add_device_tree_cpus_node(&cpus, cpuset, l2_cache, phandle, dirent->d_name);
+
+ /* Add core */
+ core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, reg);
+ core->cpuset = hwloc_bitmap_dup(cpuset);
+ hwloc_insert_object_by_cpuset(topology, core);
+
+ /* Add L1 cache */
+ try_add_cache_from_device_tree_cpu(topology, data, cpu, 1, cpuset);
+
+ hwloc_bitmap_free(cpuset);
+ }
+ }
+ free(device_type);
+ }
+ closedir(dt);
+
+ /* No cores and L2 cache were found, exiting */
+ if (0 == cpus.n) {
+ hwloc_debug("No cores and L2 cache were found in %s, exiting\n", ofroot);
+ return;
+ }
+
+#ifdef HWLOC_DEBUG
+ for (i = 0; i < cpus.n; ++i) {
+ hwloc_debug("%i: %s ibm,phandle=%08X l2_cache=%08X ",
+ i, cpus.p[i].name, cpus.p[i].phandle, cpus.p[i].l2_cache);
+ if (NULL == cpus.p[i].cpuset) {
+ hwloc_debug("%s\n", "no cpuset");
+ } else {
+ hwloc_debug_bitmap("cpuset %s\n", cpus.p[i].cpuset);
+ }
+ }
+#endif
+
+ /* Scan L2/L3/... caches */
+ for (i = 0; i < cpus.n; ++i) {
+ unsigned int level = 2;
+ hwloc_bitmap_t cpuset;
+ /* Skip real CPUs */
+ if (NULL != cpus.p[i].cpuset)
+ continue;
+
+ /* Calculate cache level and CPU mask */
+ cpuset = hwloc_bitmap_alloc();
+ if (0 == look_powerpc_device_tree_discover_cache(&cpus,
+ cpus.p[i].phandle, &level, cpuset)) {
+ char cpu[256];
+ snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, cpus.p[i].name);
+ try_add_cache_from_device_tree_cpu(topology, data, cpu, level, cpuset);
+ }
+ hwloc_bitmap_free(cpuset);
+ }
+
+ /* Do cleanup */
+ for (i = 0; i < cpus.n; ++i) {
+ hwloc_bitmap_free(cpus.p[i].cpuset);
+ free(cpus.p[i].name);
+ }
+ free(cpus.p);
+}
+
+
+
+/**************************************
+ ****** Sysfs Topology Discovery ******
+ **************************************/
+
+static int
+look_sysfsnode(struct hwloc_topology *topology,
+ struct hwloc_linux_backend_data_s *data,
+ const char *path, unsigned *found)
+{
+ unsigned osnode;
+ unsigned nbnodes = 0;
+ DIR *dir;
+ struct dirent *dirent;
+ hwloc_bitmap_t nodeset;
+
+ *found = 0;
+
+ /* Get the list of nodes first */
+ dir = hwloc_opendir(path, data->root_fd);
+ if (dir)
+ {
+ nodeset = hwloc_bitmap_alloc();
+ while ((dirent = readdir(dir)) != NULL)
+ {
+ if (strncmp(dirent->d_name, "node", 4))
+ continue;
+ osnode = strtoul(dirent->d_name+4, NULL, 0);
+ hwloc_bitmap_set(nodeset, osnode);
+ nbnodes++;
+ }
+ closedir(dir);
+ }
+ else
+ return -1;
+
+ if (nbnodes <= 1)
+ {
+ hwloc_bitmap_free(nodeset);
+ return 0;
+ }
+
+ /* For convenience, put these declarations inside a block. */
+
+ {
+ hwloc_obj_t * nodes = calloc(nbnodes, sizeof(hwloc_obj_t));
+ unsigned *indexes = calloc(nbnodes, sizeof(unsigned));
+ float * distances;
+ int failednodes = 0;
+ unsigned index_;
+
+ if (NULL == nodes || NULL == indexes) {
+ free(nodes);
+ free(indexes);
+ hwloc_bitmap_free(nodeset);
+ nbnodes = 0;
+ goto out;
+ }
+
+ /* Unsparsify node indexes.
+ * We'll need them later because Linux groups sparse distances
+ * and keeps them in order in the sysfs distance files.
+ * It'll simplify things in the meantime.
+ */
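+ /* e.g. sparse nodes {0,2,3} become indexes[0]=0, indexes[1]=2, indexes[2]=3 */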
+ index_ = 0;
+ hwloc_bitmap_foreach_begin (osnode, nodeset) {
+ indexes[index_] = osnode;
+ index_++;
+ } hwloc_bitmap_foreach_end();
+ hwloc_bitmap_free(nodeset);
+
+#ifdef HWLOC_DEBUG
+ hwloc_debug("%s", "NUMA indexes: ");
+ for (index_ = 0; index_ < nbnodes; index_++) {
+ hwloc_debug(" %u", indexes[index_]);
+ }
+ hwloc_debug("%s", "\n");
+#endif
+
+ /* Create NUMA objects */
+ for (index_ = 0; index_ < nbnodes; index_++) {
+ char nodepath[SYSFS_NUMA_NODE_PATH_LEN];
+ hwloc_bitmap_t cpuset;
+ hwloc_obj_t node, res_obj;
+
+ osnode = indexes[index_];
+
+ sprintf(nodepath, "%s/node%u/cpumap", path, osnode);
+ cpuset = hwloc_parse_cpumap(nodepath, data->root_fd);
+ if (!cpuset) {
+ /* This NUMA object won't be inserted, we'll ignore distances */
+ failednodes++;
+ continue;
+ }
+
+ node = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, osnode);
+ node->cpuset = cpuset;
+ node->nodeset = hwloc_bitmap_alloc();
+ hwloc_bitmap_set(node->nodeset, osnode);
+
+ hwloc_sysfs_node_meminfo_info(topology, data, path, osnode, &node->memory);
+
+ hwloc_debug_1arg_bitmap("os node %u has cpuset %s\n",
+ osnode, node->cpuset);
+ res_obj = hwloc_insert_object_by_cpuset(topology, node);
+ if (node == res_obj) {
+ nodes[index_] = node;
+ } else {
+ /* We got merged somehow, could be a buggy BIOS reporting wrong NUMA node cpuset.
+ * This object disappeared, we'll ignore distances */
+ failednodes++;
+ }
+ }
+
+ if (failednodes) {
+ /* failed to read/create some nodes, don't bother reading/fixing
+ * a distance matrix that would likely be wrong anyway.
+ */
+ nbnodes -= failednodes;
+ distances = NULL;
+ } else {
+ distances = calloc(nbnodes*nbnodes, sizeof(float));
+ }
+
+ if (NULL == distances) {
+ free(nodes);
+ free(indexes);
+ goto out;
+ }
+
+ /* Get actual distances now */
+ for (index_ = 0; index_ < nbnodes; index_++) {
+ char nodepath[SYSFS_NUMA_NODE_PATH_LEN];
+
+ osnode = indexes[index_];
+
+ /* Linux nodeX/distance file contains distance from X to other localities (from ACPI SLIT table or so),
+ * store them in slots X*N...X*N+N-1 */
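+ /* e.g. node0/distance on a two-node machine reads something like "10 21" */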
+ sprintf(nodepath, "%s/node%u/distance", path, osnode);
+ hwloc_parse_node_distance(nodepath, nbnodes, distances+index_*nbnodes, data->root_fd);
+ }
+
+ hwloc_distances_set(topology, HWLOC_OBJ_NUMANODE, nbnodes, indexes, nodes, distances, 0 /* OS cannot force */);
+ }
+
+ out:
+ *found = nbnodes;
+ return 0;
+}
+
+/* Look at Linux' /sys/devices/system/cpu/cpu%d/topology/ */
+static int
+look_sysfscpu(struct hwloc_topology *topology,
+ struct hwloc_linux_backend_data_s *data,
+ const char *path,
+ struct hwloc_linux_cpuinfo_proc * cpuinfo_Lprocs, unsigned cpuinfo_numprocs)
+{
+ hwloc_bitmap_t cpuset; /* Set of cpus for which we have topology information */
+ hwloc_bitmap_t unknownset; /* Set of cpus to clear */
+#define CPU_TOPOLOGY_STR_LEN 128
+ char str[CPU_TOPOLOGY_STR_LEN];
+ DIR *dir;
+ int i,j;
+ FILE *fd;
+ unsigned caches_added, merge_buggy_core_siblings;
+ hwloc_obj_t packages = NULL; /* temporary list of packages before actual insert in the tree */
+
+ /* fill the cpuset of interesting cpus */
+ dir = hwloc_opendir(path, data->root_fd);
+ if (!dir)
+ return -1;
+ else {
+ struct dirent *dirent;
+ cpuset = hwloc_bitmap_alloc();
+ unknownset = hwloc_bitmap_alloc();
+
+ while ((dirent = readdir(dir)) != NULL) {
+ unsigned long cpu;
+ char online[2];
+
+ if (strncmp(dirent->d_name, "cpu", 3))
+ continue;
+ cpu = strtoul(dirent->d_name+3, NULL, 0);
+
+ /* Maybe we don't have topology information but at least it exists */
+ hwloc_bitmap_set(topology->levels[0][0]->complete_cpuset, cpu);
+
+ /* check whether this processor is online */
+ sprintf(str, "%s/cpu%lu/online", path, cpu);
+ fd = hwloc_fopen(str, "r", data->root_fd);
+ if (fd) {
+ if (fgets(online, sizeof(online), fd)) {
+ if (!atoi(online)) {
+ fclose(fd);
+ hwloc_debug("os proc %lu is offline\n", cpu);
+ hwloc_bitmap_clr(topology->levels[0][0]->allowed_cpuset, cpu);
+ hwloc_bitmap_set(unknownset, cpu);
+ continue;
+ }
+ }
+ fclose(fd);
+ }
+
+ /* check whether the kernel exports topology information for this cpu */
+ sprintf(str, "%s/cpu%lu/topology", path, cpu);
+ if (hwloc_access(str, X_OK, data->root_fd) < 0 && errno == ENOENT) {
+ hwloc_debug("os proc %lu has no accessible %s/cpu%lu/topology\n",
+ cpu, path, cpu);
+ hwloc_bitmap_clr(topology->levels[0][0]->allowed_cpuset, cpu);
+ hwloc_bitmap_set(unknownset, cpu);
+ continue;
+ }
+
+ hwloc_bitmap_set(cpuset, cpu);
+ }
+ closedir(dir);
+ }
+
+ topology->support.discovery->pu = 1;
+ hwloc_debug_1arg_bitmap("found %d cpu topologies, cpuset %s\n",
+ hwloc_bitmap_weight(cpuset), cpuset);
+
+ merge_buggy_core_siblings = (!strcmp(data->utsname.machine, "x86_64"))
+ || (data->utsname.machine[0] == 'i' && !strcmp(data->utsname.machine+2, "86"));
+ caches_added = 0;
+ hwloc_bitmap_foreach_begin(i, cpuset)
+ {
+ hwloc_bitmap_t packageset, coreset, bookset, threadset, savedcoreset;
+ unsigned mypackageid, mycoreid, mybookid;
+ int threadwithcoreid = 0;
+
+ /* look at the package */
+ mypackageid = 0; /* shut-up the compiler */
+ sprintf(str, "%s/cpu%d/topology/physical_package_id", path, i);
+ hwloc_parse_sysfs_unsigned(str, &mypackageid, data->root_fd);
+
+ sprintf(str, "%s/cpu%d/topology/core_siblings", path, i);
+ packageset = hwloc_parse_cpumap(str, data->root_fd);
+ if (packageset) {
+ hwloc_bitmap_andnot(packageset, packageset, unknownset);
+ if (hwloc_bitmap_first(packageset) == i) {
+ /* first cpu in this package, add the package */
+ struct hwloc_obj *package;
+
+ if (merge_buggy_core_siblings) {
+ /* check for another package with same physical_package_id */
+ hwloc_obj_t curpackage = packages;
+ while (curpackage) {
+ if (curpackage->os_index == mypackageid) {
+ /* found another package with same physical_package_id but different core_siblings.
+ * looks like a buggy kernel on Intel Xeon E5 v3 processor with two rings.
+ * merge these core_siblings to extend the existing first package object.
+ */
+ static int reported = 0;
+ if (!reported && !hwloc_hide_errors()) {
+ char *a, *b;
+ hwloc_bitmap_asprintf(&a, curpackage->cpuset);
+ hwloc_bitmap_asprintf(&b, packageset);
+ fprintf(stderr, "****************************************************************************\n");
+ fprintf(stderr, "* hwloc %s has detected buggy sysfs package information: Two packages have\n", HWLOC_VERSION);
+ fprintf(stderr, "* the same physical package id %u but different core_siblings %s and %s\n",
+ mypackageid, a, b);
+ fprintf(stderr, "* hwloc is merging these packages into a single one assuming your Linux kernel\n");
+ fprintf(stderr, "* does not support this processor correctly.\n");
+ fprintf(stderr, "* You may hide this warning by setting HWLOC_HIDE_ERRORS=1 in the environment.\n");
+ fprintf(stderr, "*\n");
+ fprintf(stderr, "* If hwloc does not report the right number of packages,\n");
+ fprintf(stderr, "* please report this error message to the hwloc user's mailing list,\n");
+ fprintf(stderr, "* along with the output+tarball generated by the hwloc-gather-topology script.\n");
+ fprintf(stderr, "****************************************************************************\n");
+ reported = 1;
+ free(a);
+ free(b);
+ }
+ hwloc_bitmap_or(curpackage->cpuset, curpackage->cpuset, packageset);
+ goto package_done;
+ }
+ curpackage = curpackage->next_cousin;
+ }
+ }
+
+ /* no package with same physical_package_id, create a new one */
+ package = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, mypackageid);
+ package->cpuset = packageset;
+ hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
+ mypackageid, packageset);
+ /* add cpuinfo */
+ if (cpuinfo_Lprocs) {
+ for(j=0; j<(int) cpuinfo_numprocs; j++)
+ if ((int) cpuinfo_Lprocs[j].Pproc == i) {
+ hwloc__move_infos(&package->infos, &package->infos_count,
+ &cpuinfo_Lprocs[j].infos, &cpuinfo_Lprocs[j].infos_count);
+ }
+ }
+ /* insert in a temporary list in case we have to modify the cpuset by merging other core_siblings later.
+ * we'll actually insert the tree at the end of the entire sysfs cpu loop.
+ */
+ package->next_cousin = packages;
+ packages = package;
+
+ packageset = NULL; /* don't free it */
+ }
+ }
+package_done:
+ hwloc_bitmap_free(packageset);
+
+ /* look at the core */
+ mycoreid = 0; /* shut-up the compiler */
+ sprintf(str, "%s/cpu%d/topology/core_id", path, i);
+ hwloc_parse_sysfs_unsigned(str, &mycoreid, data->root_fd);
+
+ sprintf(str, "%s/cpu%d/topology/thread_siblings", path, i);
+ coreset = hwloc_parse_cpumap(str, data->root_fd);
+ savedcoreset = coreset; /* store it for later work-arounds */
+ if (coreset) {
+ hwloc_bitmap_andnot(coreset, coreset, unknownset);
+ if (hwloc_bitmap_weight(coreset) > 1) {
+ /* check if this is hyper-threading or different coreids */
+ unsigned siblingid, siblingcoreid;
+ hwloc_bitmap_t set = hwloc_bitmap_dup(coreset);
+ hwloc_bitmap_clr(set, i);
+ siblingid = hwloc_bitmap_first(set);
+ siblingcoreid = mycoreid;
+ sprintf(str, "%s/cpu%d/topology/core_id", path, siblingid);
+ hwloc_parse_sysfs_unsigned(str, &siblingcoreid, data->root_fd);
+ threadwithcoreid = (siblingcoreid != mycoreid);
+ hwloc_bitmap_free(set);
+ }
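+ /* create the core object when i is the first thread of the core, or for every thread when siblings report different core ids */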
+ if (hwloc_bitmap_first(coreset) == i || threadwithcoreid) {
+ /* regular core */
+ struct hwloc_obj *core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, mycoreid);
+ if (threadwithcoreid) {
+ /* amd multicore compute-unit, create one core per thread */
+ core->cpuset = hwloc_bitmap_alloc();
+ hwloc_bitmap_set(core->cpuset, i);
+ } else {
+ core->cpuset = coreset;
+ }
+ hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
+ mycoreid, coreset);
+ hwloc_insert_object_by_cpuset(topology, core);
+ coreset = NULL; /* don't free it */
+ }
+ }
+
+ /* look at the books */
+ mybookid = 0; /* shut-up the compiler */
+ sprintf(str, "%s/cpu%d/topology/book_id", path, i);
+ if (hwloc_parse_sysfs_unsigned(str, &mybookid, data->root_fd) == 0) {
+ sprintf(str, "%s/cpu%d/topology/book_siblings", path, i);
+ bookset = hwloc_parse_cpumap(str, data->root_fd);
+ if (bookset) {
+ hwloc_bitmap_andnot(bookset, bookset, unknownset);
+ if (bookset && hwloc_bitmap_first(bookset) == i) {
+ struct hwloc_obj *book = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, mybookid);
+ book->cpuset = bookset;
+ hwloc_debug_1arg_bitmap("os book %u has cpuset %s\n",
+ mybookid, bookset);
+ hwloc_obj_add_info(book, "Type", "Book");
+ hwloc_insert_object_by_cpuset(topology, book);
+ bookset = NULL; /* don't free it */
+ }
+ }
+ }
+
+ {
+ /* look at the thread */
+ struct hwloc_obj *thread = hwloc_alloc_setup_object(HWLOC_OBJ_PU, i);
+ threadset = hwloc_bitmap_alloc();
+ hwloc_bitmap_only(threadset, i);
+ thread->cpuset = threadset;
+ hwloc_debug_1arg_bitmap("thread %d has cpuset %s\n",
+ i, threadset);
+ hwloc_insert_object_by_cpuset(topology, thread);
+ }
+
+ /* look at the caches */
+ for(j=0; j<10; j++) {
+#define SHARED_CPU_MAP_STRLEN 128
+ char mappath[SHARED_CPU_MAP_STRLEN];
+ char str2[20]; /* enough for a level number (one digit) or a type (Data/Instruction/Unified) */
+ hwloc_bitmap_t cacheset;
+ unsigned long kB = 0;
+ unsigned linesize = 0;
+ unsigned sets = 0, lines_per_tag = 1;
+ int depth; /* 0 for L1, .... */
+ hwloc_obj_cache_type_t type = HWLOC_OBJ_CACHE_UNIFIED; /* default */
+
+ /* get the cache level depth */
+ sprintf(mappath, "%s/cpu%d/cache/index%d/level", path, i, j);
+ fd = hwloc_fopen(mappath, "r", data->root_fd);
+ if (fd) {
+ char *res = fgets(str2,sizeof(str2), fd);
+ fclose(fd);
+ if (res)
+ depth = strtoul(str2, NULL, 10)-1;
+ else
+ continue;
+ } else
+ continue;
+
+ /* cache type */
+ sprintf(mappath, "%s/cpu%d/cache/index%d/type", path, i, j);
+ fd = hwloc_fopen(mappath, "r", data->root_fd);
+ if (fd) {
+ if (fgets(str2, sizeof(str2), fd)) {
+ fclose(fd);
+ if (!strncmp(str2, "Data", 4))
+ type = HWLOC_OBJ_CACHE_DATA;
+ else if (!strncmp(str2, "Unified", 7))
+ type = HWLOC_OBJ_CACHE_UNIFIED;
+ else if (!strncmp(str2, "Instruction", 11))
+ type = HWLOC_OBJ_CACHE_INSTRUCTION;
+ else
+ continue;
+ } else {
+ fclose(fd);
+ continue;
+ }
+ } else
+ continue;
+
+ /* get the cache size */
+ sprintf(mappath, "%s/cpu%d/cache/index%d/size", path, i, j);
+ fd = hwloc_fopen(mappath, "r", data->root_fd);
+ if (fd) {
+ if (fgets(str2,sizeof(str2), fd))
+ kB = atol(str2); /* in kB */
+ fclose(fd);
+ }
+
+ /* get the line size */
+ sprintf(mappath, "%s/cpu%d/cache/index%d/coherency_line_size", path, i, j);
+ fd = hwloc_fopen(mappath, "r", data->root_fd);
+ if (fd) {
+ if (fgets(str2,sizeof(str2), fd))
+ linesize = atol(str2); /* in bytes */
+ fclose(fd);
+ }
+
+ /* get the number of sets and lines per tag.
+ * don't take the associativity directly in "ways_of_associativity" because
+ * some archs (ia64, ppc) put 0 there when fully-associative, while others (x86) put something like -1 there.
+ */
+ sprintf(mappath, "%s/cpu%d/cache/index%d/number_of_sets", path, i, j);
+ fd = hwloc_fopen(mappath, "r", data->root_fd);
+ if (fd) {
+ if (fgets(str2,sizeof(str2), fd))
+ sets = atol(str2);
+ fclose(fd);
+ }
+ sprintf(mappath, "%s/cpu%d/cache/index%d/physical_line_partition", path, i, j);
+ fd = hwloc_fopen(mappath, "r", data->root_fd);
+ if (fd) {
+ if (fgets(str2,sizeof(str2), fd))
+ lines_per_tag = atol(str2);
+ fclose(fd);
+ }
+
+ sprintf(mappath, "%s/cpu%d/cache/index%d/shared_cpu_map", path, i, j);
+ cacheset = hwloc_parse_cpumap(mappath, data->root_fd);
+ if (cacheset) {
+ hwloc_bitmap_andnot(cacheset, cacheset, unknownset);
+ if (hwloc_bitmap_weight(cacheset) < 1) {
+ /* mask is wrong (this workaround is needed on many Itaniums) */
+ if (savedcoreset)
+ /* assume it's a core-specific cache */
+ hwloc_bitmap_copy(cacheset, savedcoreset);
+ else
+ /* assumes it's not shared */
+ hwloc_bitmap_only(cacheset, i);
+ }
+
+ if (hwloc_bitmap_first(cacheset) == i) {
+ /* first cpu in this cache, add the cache */
+ struct hwloc_obj *cache = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+ cache->attr->cache.size = kB << 10;
+ cache->attr->cache.depth = depth+1;
+ cache->attr->cache.linesize = linesize;
+ cache->attr->cache.type = type;
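+ /* derive associativity as size / (sets * line size * physical line partition) */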
+ if (!linesize || !lines_per_tag || !sets)
+ cache->attr->cache.associativity = 0; /* unknown */
+ else if (sets == 1)
+ cache->attr->cache.associativity = 0; /* likely wrong, make it unknown */
+ else
+ cache->attr->cache.associativity = (kB << 10) / linesize / lines_per_tag / sets;
+ cache->cpuset = cacheset;
+ hwloc_debug_1arg_bitmap("cache depth %d has cpuset %s\n",
+ depth, cacheset);
+ hwloc_insert_object_by_cpuset(topology, cache);
+ cacheset = NULL; /* don't free it */
+ ++caches_added;
+ }
+ }
+ hwloc_bitmap_free(cacheset);
+ }
+ hwloc_bitmap_free(coreset);
+ }
+ hwloc_bitmap_foreach_end();
+
+ /* actually insert in the tree now that package cpusets have been fixed-up */
+ while (packages) {
+ hwloc_obj_t next = packages->next_cousin;
+ packages->next_cousin = NULL;
+ hwloc_insert_object_by_cpuset(topology, packages);
+ packages = next;
+ }
+
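+ /* no caches were found in sysfs, try the powerpc device-tree instead */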
+ if (0 == caches_added)
+ look_powerpc_device_tree(topology, data);
+
+ hwloc_bitmap_free(cpuset);
+ hwloc_bitmap_free(unknownset);
+
+ return 0;
+}
+
+
+
+/****************************************
+ ****** cpuinfo Topology Discovery ******
+ ****************************************/
+
+static int
+hwloc_linux_parse_cpuinfo_x86(const char *prefix, const char *value,
+ struct hwloc_obj_info_s **infos, unsigned *infos_count,
+ int is_global __hwloc_attribute_unused)
+{
+ if (!strcmp("vendor_id", prefix)) {
+ hwloc__add_info(infos, infos_count, "CPUVendor", value);
+ } else if (!strcmp("model name", prefix)) {
+ hwloc__add_info(infos, infos_count, "CPUModel", value);
+ } else if (!strcmp("model", prefix)) {
+ hwloc__add_info(infos, infos_count, "CPUModelNumber", value);
+ } else if (!strcmp("cpu family", prefix)) {
+ hwloc__add_info(infos, infos_count, "CPUFamilyNumber", value);
+ } else if (!strcmp("stepping", prefix)) {
+ hwloc__add_info(infos, infos_count, "CPUStepping", value);
+ }
+ return 0;
+}
+
+static int
+hwloc_linux_parse_cpuinfo_ia64(const char *prefix, const char *value,
+ struct hwloc_obj_info_s **infos, unsigned *infos_count,
+ int is_global __hwloc_attribute_unused)
+{
+ if (!strcmp("vendor", prefix)) {
+ hwloc__add_info(infos, infos_count, "CPUVendor", value);
+ } else if (!strcmp("model name", prefix)) {
+ hwloc__add_info(infos, infos_count, "CPUModel", value);
+ } else if (!strcmp("model", prefix)) {
+ hwloc__add_info(infos, infos_count, "CPUModelNumber", value);
+ } else if (!strcmp("family", prefix)) {
+ hwloc__add_info(infos, infos_count, "CPUFamilyNumber", value);
+ }
+ return 0;
+}
+
+static int
+hwloc_linux_parse_cpuinfo_arm(const char *prefix, const char *value,
+ struct hwloc_obj_info_s **infos, unsigned *infos_count,
+ int is_global __hwloc_attribute_unused)
+{
+ if (!strcmp("Processor", prefix) /* old kernels with one Processor header */
+ || !strcmp("model name", prefix) /* new kernels with one model name per core */) {
+ hwloc__add_info(infos, infos_count, "CPUModel", value);
+ } else if (!strcmp("CPU implementer", prefix)) {
+ hwloc__add_info(infos, infos_count, "CPUImplementer", value);
+ } else if (!strcmp("CPU architecture", prefix)) {
+ hwloc__add_info(infos, infos_count, "CPUArchitecture", value);
+ } else if (!strcmp("CPU variant", prefix)) {
+ hwloc__add_info(infos, infos_count, "CPUVariant", value);
+ } else if (!strcmp("CPU part", prefix)) {
+ hwloc__add_info(infos, infos_count, "CPUPart", value);
+ } else if (!strcmp("CPU revision", prefix)) {
+ hwloc__add_info(infos, infos_count, "CPURevision", value);
+ } else if (!strcmp("Hardware", prefix)) {
+ hwloc__add_info(infos, infos_count, "HardwareName", value);
+ } else if (!strcmp("Revision", prefix)) {
+ hwloc__add_info(infos, infos_count, "HardwareRevision", value);
+ } else if (!strcmp("Serial", prefix)) {
+ hwloc__add_info(infos, infos_count, "HardwareSerial", value);
+ }
+ return 0;
+}
+
+static int
+hwloc_linux_parse_cpuinfo_ppc(const char *prefix, const char *value,
+ struct hwloc_obj_info_s **infos, unsigned *infos_count,
+ int is_global)
+{
+ /* common fields */
+ if (!strcmp("cpu", prefix)) {
+ hwloc__add_info(infos, infos_count, "CPUModel", value);
+ } else if (!strcmp("platform", prefix)) {
+ hwloc__add_info(infos, infos_count, "PlatformName", value);
+ } else if (!strcmp("model", prefix)) {
+ hwloc__add_info(infos, infos_count, "PlatformModel", value);
+ }
+ /* platform-specific fields */
+ else if (!strcasecmp("vendor", prefix)) {
+ hwloc__add_info(infos, infos_count, "PlatformVendor", value);
+ } else if (!strcmp("Board ID", prefix)) {
+ hwloc__add_info(infos, infos_count, "PlatformBoardID", value);
+ } else if (!strcmp("Board", prefix)
+ || !strcasecmp("Machine", prefix)) {
+ /* machine and board are similar to (and often more precise than) the model above */
+ char **valuep = hwloc__find_info_slot(infos, infos_count, "PlatformModel");
+ if (*valuep)
+ free(*valuep);
+ *valuep = strdup(value);
+ } else if (!strcasecmp("Revision", prefix)
+ || !strcmp("Hardware rev", prefix)) {
+ hwloc__add_info(infos, infos_count, is_global ? "PlatformRevision" : "CPURevision", value);
+ } else if (!strcmp("SVR", prefix)) {
+ hwloc__add_info(infos, infos_count, "SystemVersionRegister", value);
+ } else if (!strcmp("PVR", prefix)) {
+ hwloc__add_info(infos, infos_count, "ProcessorVersionRegister", value);
+ }
+ /* don't match 'board*' because there's also "board l2" on some platforms */
+ return 0;
+}
+
+/*
+ * avr32: "chip type\t:" => OK
+ * blackfin: "model name\t:" => OK
+ * h8300: "CPU:" => OK
+ * m68k: "CPU:" => OK
+ * mips: "cpu model\t\t:" => OK
+ * openrisc: "CPU:" => OK
+ * sparc: "cpu\t\t:" => OK
+ * tile: "model name\t:" => OK
+ * unicore32: "Processor\t:" => OK
+ * alpha: "cpu\t\t\t: Alpha" + "cpu model\t\t:" => "cpu" overwritten by "cpu model", no processor indexes
+ * cris: "cpu\t\t:" + "cpu model\t:" => only "cpu"
+ * frv: "CPU-Core:" + "CPU:" => only "CPU"
+ * mn10300: "cpu core :" + "model name :" => only "model name"
+ * parisc: "cpu family\t:" + "cpu\t\t:" => only "cpu"
+ *
+ * not supported because of conflicts with other arch minor lines:
+ * m32r: "cpu family\t:" => KO (adding "cpu family" would break "blackfin")
+ * microblaze: "CPU-Family:" => KO
+ * sh: "cpu family\t:" + "cpu type\t:" => KO
+ * xtensa: "model\t\t:" => KO
+ */
+static int
+hwloc_linux_parse_cpuinfo_generic(const char *prefix, const char *value,
+ struct hwloc_obj_info_s **infos, unsigned *infos_count,
+ int is_global __hwloc_attribute_unused)
+{
+ if (!strcmp("model name", prefix)
+ || !strcmp("Processor", prefix)
+ || !strcmp("chip type", prefix)
+ || !strcmp("cpu model", prefix)
+ || !strcasecmp("cpu", prefix)) {
+ /* keep the last one, assume it's more precise than the first one.
+ * we should have the Architecture keypair for basic information anyway.
+ */
+ char **valuep = hwloc__find_info_slot(infos, infos_count, "CPUModel");
+ if (*valuep)
+ free(*valuep);
+ *valuep = strdup(value);
+ }
+ return 0;
+}
+
+static int
+hwloc_linux_parse_cpuinfo(struct hwloc_linux_backend_data_s *data,
+ const char *path,
+ struct hwloc_linux_cpuinfo_proc ** Lprocs_p,
+ struct hwloc_obj_info_s **global_infos, unsigned *global_infos_count)
+{
+ FILE *fd;
+ char *str = NULL;
+ char *endptr;
+ unsigned len;
+ unsigned allocated_Lprocs = 0;
+ struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
+ unsigned numprocs = 0;
+ int curproc = -1;
+ int (*parse_cpuinfo_func)(const char *, const char *, struct hwloc_obj_info_s **, unsigned *, int) = NULL;
+
+ if (!(fd=hwloc_fopen(path,"r", data->root_fd)))
+ {
+ hwloc_debug("could not open %s\n", path);
+ return -1;
+ }
+
+# define PROCESSOR "processor"
+# define PACKAGEID "physical id" /* the longest one */
+# define COREID "core id"
+ len = 128; /* vendor/model can be very long */
+ str = malloc(len);
+ hwloc_debug("\n\n * Topology extraction from %s *\n\n", path);
+ while (fgets(str,len,fd)!=NULL) {
+ unsigned long Ppkg, Pcore, Pproc;
+ char *end, *dot, *prefix, *value;
+ int noend = 0;
+
+ /* remove the ending \n */
+ end = strchr(str, '\n');
+ if (end)
+ *end = 0;
+ else
+ noend = 1;
+ /* if empty line, skip and reset curproc */
+ if (!*str) {
+ curproc = -1;
+ continue;
+ }
+ /* skip lines with no colon separator */
+ dot = strchr(str, ':');
+ if (!dot)
+ continue;
+ /* skip lines not starting with a letter */
+ if ((*str > 'z' || *str < 'a')
+ && (*str > 'Z' || *str < 'A'))
+ continue;
+
+ /* mark the end of the prefix */
+ prefix = str;
+ end = dot;
+ while (end[-1] == ' ' || end[-1] == '\t') end--; /* need a strrspn() */
+ *end = 0;
+ /* find beginning of value, its end is already marked */
+ value = dot+1 + strspn(dot+1, " ");
+
+ /* defines for parsing numbers */
+# define getprocnb_begin(field, var) \
+ if (!strcmp(field,prefix)) { \
+ var = strtoul(value,&endptr,0); \
+ if (endptr==value) { \
+ hwloc_debug("no number in "field" field of %s\n", path); \
+ goto err; \
+ } else if (var==ULONG_MAX) { \
+ hwloc_debug("too big "field" number in %s\n", path); \
+ goto err; \
+ } \
+ hwloc_debug(field " %lu\n", var)
+# define getprocnb_end() \
+ }
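+ /* both macros expand to an if () { ... } block so the calls below can be chained with "else" */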
+ /* actually parse numbers */
+ getprocnb_begin(PROCESSOR, Pproc);
+ curproc = numprocs++;
+ if (numprocs > allocated_Lprocs) {
+ if (!allocated_Lprocs)
+ allocated_Lprocs = 8;
+ else
+ allocated_Lprocs *= 2;
+ Lprocs = realloc(Lprocs, allocated_Lprocs * sizeof(*Lprocs));
+ }
+ Lprocs[curproc].Pproc = Pproc;
+ Lprocs[curproc].Pcore = -1;
+ Lprocs[curproc].Ppkg = -1;
+ Lprocs[curproc].Lcore = -1;
+ Lprocs[curproc].Lpkg = -1;
+ Lprocs[curproc].infos = NULL;
+ Lprocs[curproc].infos_count = 0;
+ getprocnb_end() else
+ getprocnb_begin(PACKAGEID, Ppkg);
+ Lprocs[curproc].Ppkg = Ppkg;
+ getprocnb_end() else
+ getprocnb_begin(COREID, Pcore);
+ Lprocs[curproc].Pcore = Pcore;
+ getprocnb_end() else {
+
+ /* architecture specific or default routine for parsing cpumodel */
+ if (!parse_cpuinfo_func) {
+ parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_generic;
+ if (*data->utsname.machine) {
+ /* x86_32 x86_64 k1om => x86 */
+ if (!strcmp(data->utsname.machine, "x86_64")
+ || (data->utsname.machine[0] == 'i' && !strcmp(data->utsname.machine+2, "86"))
+ || !strcmp(data->utsname.machine, "k1om"))
+ parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_x86;
+ /* ia64 */
+ else if (!strcmp(data->utsname.machine, "ia64"))
+ parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ia64;
+ /* arm */
+ else if (!strncmp(data->utsname.machine, "arm", 3))
+ parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_arm;
+ else if (!strncmp(data->utsname.machine, "ppc", 3)
+ || !strncmp(data->utsname.machine, "power", 5))
+ parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ppc;
+ }
+ }
+ /* we can't assume that we already got a processor index line:
+ * alpha/frv/h8300/m68k/microblaze/sparc have no processor lines at all, only a global entry.
+ * tile has a global section with model name before the list of processor lines.
+ */
+ parse_cpuinfo_func(prefix, value,
+ curproc >= 0 ? &Lprocs[curproc].infos : global_infos,
+ curproc >= 0 ? &Lprocs[curproc].infos_count : global_infos_count,
+ curproc < 0);
+ }
+
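+ /* if the line did not fit in the buffer, discard the remainder up to and including the newline */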
+ if (noend) {
+ /* ignore end of line */
+ if (fscanf(fd,"%*[^\n]") == EOF)
+ break;
+ getc(fd);
+ }
+ }
+ fclose(fd);
+ free(str);
+
+ *Lprocs_p = Lprocs;
+ return numprocs;
+
+ err:
+ fclose(fd);
+ free(str);
+ free(Lprocs);
+ return -1;
+}
+
+static void
+hwloc_linux_free_cpuinfo(struct hwloc_linux_cpuinfo_proc * Lprocs, unsigned numprocs,
+ struct hwloc_obj_info_s *global_infos, unsigned global_infos_count)
+{
+ if (Lprocs) {
+ unsigned i;
+ for(i=0; i<numprocs; i++) {
+ hwloc__free_infos(Lprocs[i].infos, Lprocs[i].infos_count);
+ }
+ free(Lprocs);
+ }
+ hwloc__free_infos(global_infos, global_infos_count);
+}
+
+static int
+look_cpuinfo(struct hwloc_topology *topology,
+ struct hwloc_linux_backend_data_s *data,
+ const char *path)
+{
+ struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
+ struct hwloc_obj_info_s *global_infos = NULL;
+ unsigned global_infos_count = 0;
+ /* P for physical/OS index, L for logical (e.g. in the order we get them, not in the final hwloc logical order) */
+ unsigned *Lcore_to_Pcore;
+ unsigned *Lcore_to_Ppkg; /* needed because Lcore is equivalent to Pcore+Ppkg, not to Pcore alone */
+ unsigned *Lpkg_to_Ppkg;
+ int _numprocs;
+ unsigned numprocs;
+ unsigned numpkgs=0;
+ unsigned numcores=0;
+ unsigned long Lproc;
+ unsigned missingpkg;
+ unsigned missingcore;
+ unsigned i,j;
+
+ /* parse the entire cpuinfo first, fill the Lprocs array and numprocs */
+ _numprocs = hwloc_linux_parse_cpuinfo(data, path, &Lprocs, &global_infos, &global_infos_count);
+
+
+ /* setup root info */
+ hwloc__move_infos(&hwloc_get_root_obj(topology)->infos, &hwloc_get_root_obj(topology)->infos_count,
+ &global_infos, &global_infos_count);
+
+
+ if (_numprocs <= 0)
+ /* found no processor */
+ return -1;
+ numprocs = _numprocs;
+
+ /* initialize misc arrays, there can be at most numprocs entries */
+ Lcore_to_Pcore = malloc(numprocs * sizeof(*Lcore_to_Pcore));
+ Lcore_to_Ppkg = malloc(numprocs * sizeof(*Lcore_to_Ppkg));
+ Lpkg_to_Ppkg = malloc(numprocs * sizeof(*Lpkg_to_Ppkg));
+ for (i = 0; i < numprocs; i++) {
+ Lcore_to_Pcore[i] = -1;
+ Lcore_to_Ppkg[i] = -1;
+ Lpkg_to_Ppkg[i] = -1;
+ }
+
+ /* create PU objects */
+ for(Lproc=0; Lproc<numprocs; Lproc++) {
+ unsigned long Pproc = Lprocs[Lproc].Pproc;
+ hwloc_obj_t obj = hwloc_alloc_setup_object(HWLOC_OBJ_PU, Pproc);
+ obj->cpuset = hwloc_bitmap_alloc();
+ hwloc_bitmap_only(obj->cpuset, Pproc);
+ hwloc_debug_2args_bitmap("cpu %lu (os %lu) has cpuset %s\n",
+ Lproc, Pproc, obj->cpuset);
+ hwloc_insert_object_by_cpuset(topology, obj);
+ }
+
+ topology->support.discovery->pu = 1;
+
+ hwloc_debug("%s", "\n * Topology summary *\n");
+ hwloc_debug("%u processors\n", numprocs);
+
+ /* fill Lprocs[].Lpkg and Lpkg_to_Ppkg */
+ for(Lproc=0; Lproc<numprocs; Lproc++) {
+ long Ppkg = Lprocs[Lproc].Ppkg;
+ if (Ppkg != -1) {
+ unsigned long Pproc = Lprocs[Lproc].Pproc;
+ for (i=0; i<numpkgs; i++)
+ if ((unsigned) Ppkg == Lpkg_to_Ppkg[i])
+ break;
+ Lprocs[Lproc].Lpkg = i;
+ hwloc_debug("%lu on package %u (%lx)\n", Pproc, i, Ppkg);
+ if (i==numpkgs) {
+ Lpkg_to_Ppkg[numpkgs] = Ppkg;
+ numpkgs++;
+ }
+ }
+ }
+ /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
+ * provide bogus information. We should rather drop it. */
+ missingpkg=0;
+ for(j=0; j<numprocs; j++)
+ if (Lprocs[j].Ppkg == -1) {
+ missingpkg=1;
+ break;
+ }
+ /* create package objects */
+ hwloc_debug("%u pkgs%s\n", numpkgs, missingpkg ? ", but some missing package" : "");
+ if (!missingpkg && numpkgs>0) {
+ for (i = 0; i < numpkgs; i++) {
+ struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, Lpkg_to_Ppkg[i]);
+ int doneinfos = 0;
+ obj->cpuset = hwloc_bitmap_alloc();
+ for(j=0; j<numprocs; j++)
+ if ((unsigned) Lprocs[j].Lpkg == i) {
+ hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
+ if (!doneinfos) {
+ hwloc__move_infos(&obj->infos, &obj->infos_count, &Lprocs[j].infos, &Lprocs[j].infos_count);
+ doneinfos = 1;
+ }
+ }
+ hwloc_debug_1arg_bitmap("package %d has cpuset %s\n", i, obj->cpuset);
+ hwloc_insert_object_by_cpuset(topology, obj);
+ }
+ hwloc_debug("%s", "\n");
+ }
+
+ /* fill Lprocs[].Lcore, Lcore_to_Ppkg and Lcore_to_Pcore */
+ for(Lproc=0; Lproc<numprocs; Lproc++) {
+ long Pcore = Lprocs[Lproc].Pcore;
+ if (Pcore != -1) {
+ for (i=0; i<numcores; i++)
+ if ((unsigned) Pcore == Lcore_to_Pcore[i] && (unsigned) Lprocs[Lproc].Ppkg == Lcore_to_Ppkg[i])
+ break;
+ Lprocs[Lproc].Lcore = i;
+ if (i==numcores) {
+ Lcore_to_Ppkg[numcores] = Lprocs[Lproc].Ppkg;
+ Lcore_to_Pcore[numcores] = Pcore;
+ numcores++;
+ }
+ }
+ }
+ /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
+ * provide bogus information. We should rather drop it. */
+ missingcore=0;
+ for(j=0; j<numprocs; j++)
+ if (Lprocs[j].Pcore == -1) {
+ missingcore=1;
+ break;
+ }
+ /* create Core objects */
+ hwloc_debug("%u cores%s\n", numcores, missingcore ? ", but some missing core" : "");
+ if (!missingcore && numcores>0) {
+ for (i = 0; i < numcores; i++) {
+ struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, Lcore_to_Pcore[i]);
+ obj->cpuset = hwloc_bitmap_alloc();
+ for(j=0; j<numprocs; j++)
+ if ((unsigned) Lprocs[j].Lcore == i)
+ hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
+ hwloc_debug_1arg_bitmap("Core %d has cpuset %s\n", i, obj->cpuset);
+ hwloc_insert_object_by_cpuset(topology, obj);
+ }
+ hwloc_debug("%s", "\n");
+ }
+
+ free(Lcore_to_Pcore);
+ free(Lcore_to_Ppkg);
+ free(Lpkg_to_Ppkg);
+
+ hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
+
+ look_powerpc_device_tree(topology, data);
+ return 0;
+}
+
+
+
+/*************************************
+ ****** Main Topology Discovery ******
+ *************************************/
+
+static void
+hwloc__linux_get_mic_sn(struct hwloc_topology *topology, struct hwloc_linux_backend_data_s *data)
+{
+ FILE *file;
+ char line[64], *tmp, *end;
+ file = hwloc_fopen("/proc/elog", "r", data->root_fd);
+ if (!file)
+ return;
+ if (!fgets(line, sizeof(line), file))
+ goto out_with_file;
+ if (strncmp(line, "Card ", 5))
+ goto out_with_file;
+ tmp = line + 5;
+ end = strchr(tmp, ':');
+ if (!end)
+ goto out_with_file;
+ *end = '\0';
+ hwloc_obj_add_info(hwloc_get_root_obj(topology), "MICSerialNumber", tmp);
+
+ out_with_file:
+ fclose(file);
+}
+
+static void
+hwloc_linux_fallback_pu_level(struct hwloc_topology *topology)
+{
+ if (topology->is_thissystem)
+ hwloc_setup_pu_level(topology, hwloc_fallback_nbprocessors(topology));
+ else
+ /* fsys-root but not this system, no way, assume there's just 1
+ * processor :/ */
+ hwloc_setup_pu_level(topology, 1);
+}
+
+static void
+hwloc_gather_system_info(struct hwloc_topology *topology,
+ struct hwloc_linux_backend_data_s *data)
+{
+ FILE *file;
+ char line[128]; /* enough for utsname fields */
+ const char *env;
+
+ /* initialize to something sane */
+ memset(&data->utsname, 0, sizeof(data->utsname));
+
+ /* read thissystem info */
+ if (topology->is_thissystem)
+ uname(&data->utsname);
+
+ /* overwrite with optional /proc/hwloc-nofile-info */
+ file = hwloc_fopen("/proc/hwloc-nofile-info", "r", data->root_fd);
+ if (file) {
+ while (fgets(line, sizeof(line), file)) {
+ char *tmp = strchr(line, '\n');
+ if (!strncmp("OSName: ", line, 8)) {
+ if (tmp)
+ *tmp = '\0';
+ strncpy(data->utsname.sysname, line+8, sizeof(data->utsname.sysname));
+ data->utsname.sysname[sizeof(data->utsname.sysname)-1] = '\0';
+ } else if (!strncmp("OSRelease: ", line, 11)) {
+ if (tmp)
+ *tmp = '\0';
+ strncpy(data->utsname.release, line+11, sizeof(data->utsname.release));
+ data->utsname.release[sizeof(data->utsname.release)-1] = '\0';
+ } else if (!strncmp("OSVersion: ", line, 11)) {
+ if (tmp)
+ *tmp = '\0';
+ strncpy(data->utsname.version, line+11, sizeof(data->utsname.version));
+ data->utsname.version[sizeof(data->utsname.version)-1] = '\0';
+ } else if (!strncmp("HostName: ", line, 10)) {
+ if (tmp)
+ *tmp = '\0';
+ strncpy(data->utsname.nodename, line+10, sizeof(data->utsname.nodename));
+ data->utsname.nodename[sizeof(data->utsname.nodename)-1] = '\0';
+ } else if (!strncmp("Architecture: ", line, 14)) {
+ if (tmp)
+ *tmp = '\0';
+ strncpy(data->utsname.machine, line+14, sizeof(data->utsname.machine));
+ data->utsname.machine[sizeof(data->utsname.machine)-1] = '\0';
+ } else {
+ hwloc_debug("ignored /proc/hwloc-nofile-info line %s\n", line);
+ /* ignored */
+ }
+ }
+ fclose(file);
+ }
+
+ env = getenv("HWLOC_DUMP_NOFILE_INFO");
+ if (env && *env) {
+ file = fopen(env, "w");
+ if (file) {
+ if (*data->utsname.sysname)
+ fprintf(file, "OSName: %s\n", data->utsname.sysname);
+ if (*data->utsname.release)
+ fprintf(file, "OSRelease: %s\n", data->utsname.release);
+ if (*data->utsname.version)
+ fprintf(file, "OSVersion: %s\n", data->utsname.version);
+ if (*data->utsname.nodename)
+ fprintf(file, "HostName: %s\n", data->utsname.nodename);
+ if (*data->utsname.machine)
+ fprintf(file, "Architecture: %s\n", data->utsname.machine);
+ fclose(file);
+ }
+ }
+}
+
+static int
+hwloc_look_linuxfs(struct hwloc_backend *backend)
+{
+ struct hwloc_topology *topology = backend->topology;
+ struct hwloc_linux_backend_data_s *data = backend->private_data;
+ unsigned nbnodes;
+ char *cpuset_mntpnt, *cgroup_mntpnt, *cpuset_name = NULL;
+ int err;
+
+ if (topology->levels[0][0]->cpuset)
+ /* somebody discovered things */
+ return 0;
+
+ hwloc_gather_system_info(topology, data);
+
+ hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+
+ /* Gather the list of admin-disabled cpus and mems */
+ hwloc_find_linux_cpuset_mntpnt(&cgroup_mntpnt, &cpuset_mntpnt, data->root_fd);
+ if (cgroup_mntpnt || cpuset_mntpnt) {
+ cpuset_name = hwloc_read_linux_cpuset_name(data->root_fd, topology->pid);
+ if (cpuset_name) {
+ hwloc_admin_disable_set_from_cpuset(data, cgroup_mntpnt, cpuset_mntpnt, cpuset_name, "cpus", topology->levels[0][0]->allowed_cpuset);
+ hwloc_admin_disable_set_from_cpuset(data, cgroup_mntpnt, cpuset_mntpnt, cpuset_name, "mems", topology->levels[0][0]->allowed_nodeset);
+ }
+ free(cgroup_mntpnt);
+ free(cpuset_mntpnt);
+ }
+
+ /* Get the machine memory attributes */
+ hwloc_get_procfs_meminfo_info(topology, data, &topology->levels[0][0]->memory);
+
+ /* Gather NUMA information. Must be after hwloc_get_procfs_meminfo_info so that the hugepage size is known */
+ if (look_sysfsnode(topology, data, "/sys/bus/node/devices", &nbnodes) < 0)
+ look_sysfsnode(topology, data, "/sys/devices/system/node", &nbnodes);
+
+ /* if we found some numa nodes, the machine object has no local memory */
+ if (nbnodes) {
+ unsigned i;
+ topology->levels[0][0]->memory.local_memory = 0;
+ if (topology->levels[0][0]->memory.page_types)
+ for(i=0; i<topology->levels[0][0]->memory.page_types_len; i++)
+ topology->levels[0][0]->memory.page_types[i].count = 0;
+ }
+
+ /* Gather the list of cpus now */
+ if (getenv("HWLOC_LINUX_USE_CPUINFO")
+ || (hwloc_access("/sys/devices/system/cpu/cpu0/topology/core_siblings", R_OK, data->root_fd) < 0
+ && hwloc_access("/sys/devices/system/cpu/cpu0/topology/thread_siblings", R_OK, data->root_fd) < 0
+ && hwloc_access("/sys/bus/cpu/devices/cpu0/topology/thread_siblings", R_OK, data->root_fd) < 0
+ && hwloc_access("/sys/bus/cpu/devices/cpu0/topology/core_siblings", R_OK, data->root_fd) < 0)) {
+ /* revert to reading cpuinfo only if /sys/.../topology unavailable (before 2.6.16)
+ * or not containing anything interesting */
+ err = look_cpuinfo(topology, data, "/proc/cpuinfo");
+ if (err < 0)
+ hwloc_linux_fallback_pu_level(topology);
+
+ } else {
+ struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
+ struct hwloc_obj_info_s *global_infos = NULL;
+ unsigned global_infos_count = 0;
+ int numprocs = hwloc_linux_parse_cpuinfo(data, "/proc/cpuinfo", &Lprocs, &global_infos, &global_infos_count);
+ if (numprocs <= 0)
+ Lprocs = NULL;
+ if (look_sysfscpu(topology, data, "/sys/bus/cpu/devices", Lprocs, numprocs) < 0)
+ if (look_sysfscpu(topology, data, "/sys/devices/system/cpu", Lprocs, numprocs) < 0)
+ /* sysfs but we failed to read cpu topology, fallback */
+ hwloc_linux_fallback_pu_level(topology);
+ hwloc__move_infos(&hwloc_get_root_obj(topology)->infos, &hwloc_get_root_obj(topology)->infos_count,
+ &global_infos, &global_infos_count);
+ hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
+ }
+
+ /* Gather DMI info */
+ hwloc__get_dmi_id_info(data, topology->levels[0][0]);
+ if (hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO))
+ hwloc__get_firmware_dmi_memory_info(topology, data);
+
+ hwloc_obj_add_info(topology->levels[0][0], "Backend", "Linux");
+ if (cpuset_name) {
+ hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
+ free(cpuset_name);
+ }
+
+ hwloc__linux_get_mic_sn(topology, data);
+
+ /* data->utsname was filled with real uname or \0, we can safely pass it */
+ hwloc_add_uname_info(topology, &data->utsname);
+
+ return 1;
+}
+
+
+
+/****************************************
+ ***** Linux PCI backend callbacks ******
+ ****************************************
+ * Do not support changing the fsroot (use sysfs)
+ */
+
+static hwloc_obj_t
+hwloc_linux_add_os_device(struct hwloc_backend *backend, struct hwloc_obj *pcidev, hwloc_obj_osdev_type_t type, const char *name)
+{
+ struct hwloc_topology *topology = backend->topology;
+ struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1);
+ obj->name = strdup(name);
+ obj->logical_index = -1;
+ obj->attr->osdev.type = type;
+
+ hwloc_insert_object_by_parent(topology, pcidev, obj);
+ /* insert_object_by_parent() doesn't merge during insert, so obj is still valid */
+
+ return obj;
+}
+
+typedef void (*hwloc_linux_class_fillinfos_t)(struct hwloc_backend *backend, struct hwloc_obj *osdev, const char *osdevpath);
+
+/* cannot be used in fsroot-aware code, would have to move to a per-topology variable */
+
+static void
+hwloc_linux_check_deprecated_classlinks_model(struct hwloc_linux_backend_data_s *data)
+{
+ int root_fd = data->root_fd;
+ DIR *dir;
+ struct dirent *dirent;
+ char path[128];
+ struct stat st;
+
+ data->deprecated_classlinks_model = -1;
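+ /* -1 = unknown, 0 = modern <device>/<class>/<name> links, 1 = deprecated <device>/<class>:<name> links */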
+
+ dir = hwloc_opendir("/sys/class/net", root_fd);
+ if (!dir)
+ return;
+ while ((dirent = readdir(dir)) != NULL) {
+ if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, "..") || !strcmp(dirent->d_name, "lo"))
+ continue;
+ snprintf(path, sizeof(path), "/sys/class/net/%s/device/net/%s", dirent->d_name, dirent->d_name);
+ if (hwloc_stat(path, &st, root_fd) == 0) {
+ data->deprecated_classlinks_model = 0;
+ goto out;
+ }
+ snprintf(path, sizeof(path), "/sys/class/net/%s/device/net:%s", dirent->d_name, dirent->d_name);
+ if (hwloc_stat(path, &st, root_fd) == 0) {
+ data->deprecated_classlinks_model = 1;
+ goto out;
+ }
+ }
+out:
+ closedir(dir);
+}
+
+/* class objects that are immediately below pci devices:
+ * look for objects of the given classname below a sysfs (pcidev) directory
+ */
+static int
+hwloc_linux_class_readdir(struct hwloc_backend *backend,
+ struct hwloc_obj *pcidev, const char *devicepath,
+ hwloc_obj_osdev_type_t type, const char *classname,
+ hwloc_linux_class_fillinfos_t fillinfo)
+{
+ struct hwloc_linux_backend_data_s *data = backend->private_data;
+ int root_fd = data->root_fd;
+ size_t classnamelen = strlen(classname);
+ char path[256];
+ DIR *dir;
+ struct dirent *dirent;
+ hwloc_obj_t obj;
+ int res = 0, err;
+
+ if (data->deprecated_classlinks_model == -2)
+ hwloc_linux_check_deprecated_classlinks_model(data);
+
+ if (data->deprecated_classlinks_model != 1) {
+ /* modern sysfs: <device>/<class>/<name> */
+ struct stat st;
+ snprintf(path, sizeof(path), "%s/%s", devicepath, classname);
+
+ /* some very old kernels (2.6.9/RHEL4) have <device>/<class> symlink without any way to find <name>.
+ * make sure <device>/<class> is a directory to avoid this case.
+ */
+ err = hwloc_lstat(path, &st, root_fd);
+ if (err < 0 || !S_ISDIR(st.st_mode))
+ goto trydeprecated;
+
+ dir = hwloc_opendir(path, root_fd);
+ if (dir) {
+ data->deprecated_classlinks_model = 0;
+ while ((dirent = readdir(dir)) != NULL) {
+ if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
+ continue;
+ obj = hwloc_linux_add_os_device(backend, pcidev, type, dirent->d_name);
+ if (fillinfo) {
+ snprintf(path, sizeof(path), "%s/%s/%s", devicepath, classname, dirent->d_name);
+ fillinfo(backend, obj, path);
+ }
+ res++;
+ }
+ closedir(dir);
+ return res;
+ }
+ }
+
+trydeprecated:
+ if (data->deprecated_classlinks_model != 0) {
+ /* deprecated sysfs: <device>/<class>:<name> */
+ dir = hwloc_opendir(devicepath, root_fd);
+ if (dir) {
+ while ((dirent = readdir(dir)) != NULL) {
+ if (strncmp(dirent->d_name, classname, classnamelen) || dirent->d_name[classnamelen] != ':')
+ continue;
+ data->deprecated_classlinks_model = 1;
+ obj = hwloc_linux_add_os_device(backend, pcidev, type, dirent->d_name + classnamelen+1);
+ if (fillinfo) {
+ snprintf(path, sizeof(path), "%s/%s", devicepath, dirent->d_name);
+ fillinfo(backend, obj, path);
+ }
+ res++;
+ }
+ closedir(dir);
+ return res;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * look for net objects below a pcidev in sysfs
+ */
+static void
+hwloc_linux_net_class_fillinfos(struct hwloc_backend *backend,
+ struct hwloc_obj *obj, const char *osdevpath)
+{
+ struct hwloc_linux_backend_data_s *data = backend->private_data;
+ int root_fd = data->root_fd;
+ FILE *fd;
+ struct stat st;
+ char path[256];
+ snprintf(path, sizeof(path), "%s/address", osdevpath);
+ fd = hwloc_fopen(path, "r", root_fd);
+ if (fd) {
+ char address[128];
+ if (fgets(address, sizeof(address), fd)) {
+ char *eol = strchr(address, '\n');
+ if (eol)
+ *eol = 0;
+ hwloc_obj_add_info(obj, "Address", address);
+ }
+ fclose(fd);
+ }
+ snprintf(path, sizeof(path), "%s/device/infiniband", osdevpath);
+ if (!hwloc_stat(path, &st, root_fd)) {
+ snprintf(path, sizeof(path), "%s/dev_id", osdevpath);
+ fd = hwloc_fopen(path, "r", root_fd);
+ if (fd) {
+ char hexid[16];
+ if (fgets(hexid, sizeof(hexid), fd)) {
+ char *eoid;
+ unsigned long port;
+ port = strtoul(hexid, &eoid, 0);
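+ /* dev_id is zero-based, report it as a one-based port number */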
+ if (eoid != hexid) {
+ char portstr[16];
+ snprintf(portstr, sizeof(portstr), "%ld", port+1);
+ hwloc_obj_add_info(obj, "Port", portstr);
+ }
+ }
+ fclose(fd);
+ }
+ }
+}
+
+static int
+hwloc_linux_lookup_net_class(struct hwloc_backend *backend,
+ struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+ return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_NETWORK, "net", hwloc_linux_net_class_fillinfos);
+}
+
+/*
+ * look for infiniband objects below a pcidev in sysfs
+ */
+static void
+hwloc_linux_infiniband_class_fillinfos(struct hwloc_backend *backend,
+ struct hwloc_obj *obj, const char *osdevpath)
+{
+ struct hwloc_linux_backend_data_s *data = backend->private_data;
+ int root_fd = data->root_fd;
+ FILE *fd;
+ char path[256];
+ unsigned i,j;
+
+ snprintf(path, sizeof(path), "%s/node_guid", osdevpath);
+ fd = hwloc_fopen(path, "r", root_fd);
+ if (fd) {
+ char guidvalue[20];
+ if (fgets(guidvalue, sizeof(guidvalue), fd)) {
+ size_t len;
+ len = strspn(guidvalue, "0123456789abcdefx:");
+ assert(len == 19);
+ guidvalue[len] = '\0';
+ hwloc_obj_add_info(obj, "NodeGUID", guidvalue);
+ }
+ fclose(fd);
+ }
+
+ snprintf(path, sizeof(path), "%s/sys_image_guid", osdevpath);
+ fd = hwloc_fopen(path, "r", root_fd);
+ if (fd) {
+ char guidvalue[20];
+ if (fgets(guidvalue, sizeof(guidvalue), fd)) {
+ size_t len;
+ len = strspn(guidvalue, "0123456789abcdefx:");
+ assert(len == 19);
+ guidvalue[len] = '\0';
+ hwloc_obj_add_info(obj, "SysImageGUID", guidvalue);
+ }
+ fclose(fd);
+ }
+
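+ /* walk ports 1, 2, ... until a port's state file is missing */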
+ for(i=1; ; i++) {
+ snprintf(path, sizeof(path), "%s/ports/%u/state", osdevpath, i);
+ fd = hwloc_fopen(path, "r", root_fd);
+ if (fd) {
+ char statevalue[2];
+ if (fgets(statevalue, sizeof(statevalue), fd)) {
+ char statename[32];
+ statevalue[1] = '\0'; /* only keep the first byte/digit */
+ snprintf(statename, sizeof(statename), "Port%uState", i);
+ hwloc_obj_add_info(obj, statename, statevalue);
+ }
+ fclose(fd);
+ } else {
+ /* no such port */
+ break;
+ }
+
+ snprintf(path, sizeof(path), "%s/ports/%u/lid", osdevpath, i);
+ fd = hwloc_fopen(path, "r", root_fd);
+ if (fd) {
+ char lidvalue[11];
+ if (fgets(lidvalue, sizeof(lidvalue), fd)) {
+ char lidname[32];
+ size_t len;
+ len = strspn(lidvalue, "0123456789abcdefx");
+ lidvalue[len] = '\0';
+ snprintf(lidname, sizeof(lidname), "Port%uLID", i);
+ hwloc_obj_add_info(obj, lidname, lidvalue);
+ }
+ fclose(fd);
+ }
+
+ snprintf(path, sizeof(path), "%s/ports/%u/lid_mask_count", osdevpath, i);
+ fd = hwloc_fopen(path, "r", root_fd);
+ if (fd) {
+ char lidvalue[11];
+ if (fgets(lidvalue, sizeof(lidvalue), fd)) {
+ char lidname[32];
+ size_t len;
+ len = strspn(lidvalue, "0123456789");
+ lidvalue[len] = '\0';
+ snprintf(lidname, sizeof(lidname), "Port%uLMC", i);
+ hwloc_obj_add_info(obj, lidname, lidvalue);
+ }
+ fclose(fd);
+ }
+
+ for(j=0; ; j++) {
+ snprintf(path, sizeof(path), "%s/ports/%u/gids/%u", osdevpath, i, j);
+ fd = hwloc_fopen(path, "r", root_fd);
+ if (fd) {
+ char gidvalue[40];
+ if (fgets(gidvalue, sizeof(gidvalue), fd)) {
+ char gidname[32];
+ size_t len;
+ len = strspn(gidvalue, "0123456789abcdefx:");
+ assert(len == 39);
+ gidvalue[len] = '\0';
+ if (strncmp(gidvalue+20, "0000:0000:0000:0000", 19)) {
+ /* only keep initialized GIDs */
+ snprintf(gidname, sizeof(gidname), "Port%uGID%u", i, j);
+ hwloc_obj_add_info(obj, gidname, gidvalue);
+ }
+ }
+ fclose(fd);
+ } else {
+ /* no such gid */
+ break;
+ }
+ }
+ }
+}
+
+static int
+hwloc_linux_lookup_openfabrics_class(struct hwloc_backend *backend,
+ struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+ return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_OPENFABRICS, "infiniband", hwloc_linux_infiniband_class_fillinfos);
+}
+
+/* look for dma objects below a pcidev in sysfs */
+static int
+hwloc_linux_lookup_dma_class(struct hwloc_backend *backend,
+ struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+ return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_DMA, "dma", NULL);
+}
+
+/* look for drm objects below a pcidev in sysfs */
+static int
+hwloc_linux_lookup_drm_class(struct hwloc_backend *backend,
+ struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+ return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_GPU, "drm", NULL);
+
+ /* we could look at the "graphics" class too, but it doesn't help for proprietary drivers either */
+
+ /* GPU devices (even with a proprietary driver) seem to have a boot_vga field in their PCI device directory (since 2.6.30),
+ * so we could create a OS device for each PCI devices with such a field.
+ * boot_vga is actually created when class >> 8 == VGA (it contains 1 for boot vga device), so it's trivial anyway.
+ */
+}
+
+/*
+ * look for block objects below a pcidev in sysfs
+ */
+
+static void
+hwloc_linux_block_class_fillinfos(struct hwloc_backend *backend,
+ struct hwloc_obj *obj, const char *osdevpath)
+{
+ struct hwloc_linux_backend_data_s *data = backend->private_data;
+ int root_fd = data->root_fd;
+ FILE *fd;
+ char path[256];
+ char line[128];
+ char vendor[64] = "";
+ char model[64] = "";
+ char serial[64] = "";
+ char revision[64] = "";
+ char blocktype[64] = "";
+ unsigned major_id, minor_id;
+ char *tmp;
+
+ snprintf(path, sizeof(path), "%s/dev", osdevpath);
+ fd = hwloc_fopen(path, "r", root_fd);
+ if (!fd)
+ return;
+
+ if (NULL == fgets(line, sizeof(line), fd)) {
+ fclose(fd);
+ return;
+ }
+ fclose(fd);
+
+ if (sscanf(line, "%u:%u", &major_id, &minor_id) != 2)
+ return;
+ tmp = strchr(line, '\n');
+ if (tmp)
+ *tmp = '\0';
+ hwloc_obj_add_info(obj, "LinuxDeviceID", line);
+
+#ifdef HAVE_LIBUDEV_H
+ if (data->udev) {
+ struct udev_device *dev;
+ const char *prop;
+ dev = udev_device_new_from_subsystem_sysname(data->udev, "block", obj->name);
+ if (!dev)
+ return;
+ prop = udev_device_get_property_value(dev, "ID_VENDOR");
+ if (prop)
+ strcpy(vendor, prop);
+ prop = udev_device_get_property_value(dev, "ID_MODEL");
+ if (prop)
+ strcpy(model, prop);
+ prop = udev_device_get_property_value(dev, "ID_REVISION");
+ if (prop)
+ strcpy(revision, prop);
+ prop = udev_device_get_property_value(dev, "ID_SERIAL_SHORT");
+ if (prop)
+ strcpy(serial, prop);
+ prop = udev_device_get_property_value(dev, "ID_TYPE");
+ if (prop)
+ strcpy(blocktype, prop);
+
+ udev_device_unref(dev);
+ } else
+ /* fallback to reading files, works with any fsroot */
+#endif
+ {
+ snprintf(path, sizeof(path), "/run/udev/data/b%u:%u", major_id, minor_id);
+ fd = hwloc_fopen(path, "r", root_fd);
+ if (!fd)
+ return;
+
+ while (NULL != fgets(line, sizeof(line), fd)) {
+ tmp = strchr(line, '\n');
+ if (tmp)
+ *tmp = '\0';
+ if (!strncmp(line, "E:ID_VENDOR=", strlen("E:ID_VENDOR="))) {
+ strcpy(vendor, line+strlen("E:ID_VENDOR="));
+ } else if (!strncmp(line, "E:ID_MODEL=", strlen("E:ID_MODEL="))) {
+ strcpy(model, line+strlen("E:ID_MODEL="));
+ } else if (!strncmp(line, "E:ID_REVISION=", strlen("E:ID_REVISION="))) {
+ strcpy(revision, line+strlen("E:ID_REVISION="));
+ } else if (!strncmp(line, "E:ID_SERIAL_SHORT=", strlen("E:ID_SERIAL_SHORT="))) {
+ strcpy(serial, line+strlen("E:ID_SERIAL_SHORT="));
+ } else if (!strncmp(line, "E:ID_TYPE=", strlen("E:ID_TYPE="))) {
+ strcpy(blocktype, line+strlen("E:ID_TYPE="));
+ }
+ }
+ fclose(fd);
+ }
+
+ /* clear fake "ATA" vendor name */
+ if (!strcasecmp(vendor, "ATA"))
+ *vendor = '\0';
+ /* overwrite vendor name from model when possible */
+ if (!*vendor) {
+ if (!strncasecmp(model, "wd", 2))
+ strcpy(vendor, "Western Digital");
+ else if (!strncasecmp(model, "st", 2))
+ strcpy(vendor, "Seagate");
+ else if (!strncasecmp(model, "samsung", 7))
+ strcpy(vendor, "Samsung");
+ else if (!strncasecmp(model, "sandisk", 7))
+ strcpy(vendor, "SanDisk");
+ else if (!strncasecmp(model, "toshiba", 7))
+ strcpy(vendor, "Toshiba");
+ }
+
+ if (*vendor)
+ hwloc_obj_add_info(obj, "Vendor", vendor);
+ if (*model)
+ hwloc_obj_add_info(obj, "Model", model);
+ if (*revision)
+ hwloc_obj_add_info(obj, "Revision", revision);
+ if (*serial)
+ hwloc_obj_add_info(obj, "SerialNumber", serial);
+
+ if (!strcmp(blocktype, "disk"))
+ hwloc_obj_add_info(obj, "Type", "Disk");
+ else if (!strcmp(blocktype, "tape"))
+ hwloc_obj_add_info(obj, "Type", "Tape");
+ else if (!strcmp(blocktype, "cd") || !strcmp(blocktype, "floppy") || !strcmp(blocktype, "optical"))
+ hwloc_obj_add_info(obj, "Type", "Removable Media Device");
+ else /* generic, usb mass storage/rbc, usb mass storage/scsi */
+ hwloc_obj_add_info(obj, "Type", "Other");
+}
+
+/* block class objects are in
+ * host%d/target%d:%d:%d/%d:%d:%d:%d/
+ * or
+ * host%d/port-%d:%d/end_device-%d:%d/target%d:%d:%d/%d:%d:%d:%d/
+ * or
+ * ide%d/%d.%d/
+ * below pci devices */
+static int
+hwloc_linux_lookup_host_block_class(struct hwloc_backend *backend,
+ struct hwloc_obj *pcidev, char *path, size_t pathlen)
+{
+ struct hwloc_linux_backend_data_s *data = backend->private_data;
+ int root_fd = data->root_fd;
+ DIR *hostdir, *portdir, *targetdir;
+ struct dirent *hostdirent, *portdirent, *targetdirent;
+ size_t hostdlen, portdlen, targetdlen;
+ int dummy;
+ int res = 0;
+
+ hostdir = hwloc_opendir(path, root_fd);
+ if (!hostdir)
+ return 0;
+
+ while ((hostdirent = readdir(hostdir)) != NULL) {
+ if (sscanf(hostdirent->d_name, "port-%d:%d", &dummy, &dummy) == 2)
+ {
+ /* found host%d/port-%d:%d */
+ path[pathlen] = '/';
+ strcpy(&path[pathlen+1], hostdirent->d_name);
+ pathlen += hostdlen = 1+strlen(hostdirent->d_name);
+ portdir = hwloc_opendir(path, root_fd);
+ if (!portdir)
+ continue;
+ while ((portdirent = readdir(portdir)) != NULL) {
+ if (sscanf(portdirent->d_name, "end_device-%d:%d", &dummy, &dummy) == 2) {
+ /* found host%d/port-%d:%d/end_device-%d:%d */
+ path[pathlen] = '/';
+ strcpy(&path[pathlen+1], portdirent->d_name);
+ pathlen += portdlen = 1+strlen(portdirent->d_name);
+ res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
+ /* restore parent path */
+ pathlen -= portdlen;
+ path[pathlen] = '\0';
+ }
+ }
+ closedir(portdir);
+ /* restore parent path */
+ pathlen -= hostdlen;
+ path[pathlen] = '\0';
+ continue;
+ } else if (sscanf(hostdirent->d_name, "target%d:%d:%d", &dummy, &dummy, &dummy) == 3) {
+ /* found host%d/target%d:%d:%d */
+ path[pathlen] = '/';
+ strcpy(&path[pathlen+1], hostdirent->d_name);
+ pathlen += hostdlen = 1+strlen(hostdirent->d_name);
+ targetdir = hwloc_opendir(path, root_fd);
+ if (!targetdir)
+ continue;
+ while ((targetdirent = readdir(targetdir)) != NULL) {
+ if (sscanf(targetdirent->d_name, "%d:%d:%d:%d", &dummy, &dummy, &dummy, &dummy) != 4)
+ continue;
+ /* found host%d/target%d:%d:%d/%d:%d:%d:%d */
+ path[pathlen] = '/';
+ strcpy(&path[pathlen+1], targetdirent->d_name);
+ pathlen += targetdlen = 1+strlen(targetdirent->d_name);
+ /* lookup block class for real */
+ res += hwloc_linux_class_readdir(backend, pcidev, path, HWLOC_OBJ_OSDEV_BLOCK, "block", hwloc_linux_block_class_fillinfos);
+ /* restore parent path */
+ pathlen -= targetdlen;
+ path[pathlen] = '\0';
+ }
+ closedir(targetdir);
+ /* restore parent path */
+ pathlen -= hostdlen;
+ path[pathlen] = '\0';
+ }
+ }
+ closedir(hostdir);
+
+ return res;
+}
+
+static int
+hwloc_linux_lookup_block_class(struct hwloc_backend *backend,
+ struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+ struct hwloc_linux_backend_data_s *data = backend->private_data;
+ int root_fd = data->root_fd;
+ size_t pathlen;
+ DIR *devicedir, *hostdir;
+ struct dirent *devicedirent, *hostdirent;
+ size_t devicedlen, hostdlen;
+ char path[256];
+ int dummy;
+ int res = 0;
+
+ strcpy(path, pcidevpath);
+ pathlen = strlen(path);
+
+ devicedir = hwloc_opendir(pcidevpath, root_fd);
+ if (!devicedir)
+ return 0;
+
+ while ((devicedirent = readdir(devicedir)) != NULL) {
+ if (sscanf(devicedirent->d_name, "ide%d", &dummy) == 1) {
+ /* found ide%d */
+ path[pathlen] = '/';
+ strcpy(&path[pathlen+1], devicedirent->d_name);
+ pathlen += devicedlen = 1+strlen(devicedirent->d_name);
+ hostdir = hwloc_opendir(path, root_fd);
+ if (!hostdir)
+ continue;
+ while ((hostdirent = readdir(hostdir)) != NULL) {
+ if (sscanf(hostdirent->d_name, "%d.%d", &dummy, &dummy) == 2) {
+ /* found ide%d/%d.%d */
+ path[pathlen] = '/';
+ strcpy(&path[pathlen+1], hostdirent->d_name);
+ pathlen += hostdlen = 1+strlen(hostdirent->d_name);
+ /* lookup block class for real */
+ res += hwloc_linux_class_readdir(backend, pcidev, path, HWLOC_OBJ_OSDEV_BLOCK, "block", NULL);
+ /* restore parent path */
+ pathlen -= hostdlen;
+ path[pathlen] = '\0';
+ }
+ }
+ closedir(hostdir);
+ /* restore parent path */
+ pathlen -= devicedlen;
+ path[pathlen] = '\0';
+ } else if (sscanf(devicedirent->d_name, "host%d", &dummy) == 1) {
+ /* found host%d */
+ path[pathlen] = '/';
+ strcpy(&path[pathlen+1], devicedirent->d_name);
+ pathlen += devicedlen = 1+strlen(devicedirent->d_name);
+ res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
+ /* restore parent path */
+ pathlen -= devicedlen;
+ path[pathlen] = '\0';
+ } else if (sscanf(devicedirent->d_name, "ata%d", &dummy) == 1) {
+ /* found ata%d */
+ path[pathlen] = '/';
+ strcpy(&path[pathlen+1], devicedirent->d_name);
+ pathlen += devicedlen = 1+strlen(devicedirent->d_name);
+ hostdir = hwloc_opendir(path, root_fd);
+ if (!hostdir)
+ continue;
+ while ((hostdirent = readdir(hostdir)) != NULL) {
+ if (sscanf(hostdirent->d_name, "host%d", &dummy) == 1) {
+ /* found ata%d/host%d */
+ path[pathlen] = '/';
+ strcpy(&path[pathlen+1], hostdirent->d_name);
+ pathlen += hostdlen = 1+strlen(hostdirent->d_name);
+ /* lookup block class for real */
+ res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
+ /* restore parent path */
+ pathlen -= hostdlen;
+ path[pathlen] = '\0';
+ }
+ }
+ closedir(hostdir);
+ /* restore parent path */
+ pathlen -= devicedlen;
+ path[pathlen] = '\0';
+ }
+ }
+ closedir(devicedir);
+
+ return res;
+}
+
+static void
+hwloc_linux_mic_class_fillinfos(struct hwloc_backend *backend,
+ struct hwloc_obj *obj, const char *osdevpath)
+{
+ struct hwloc_linux_backend_data_s *data = backend->private_data;
+ int root_fd = data->root_fd;
+ FILE *fd;
+ char path[256];
+
+ hwloc_obj_add_info(obj, "CoProcType", "MIC");
+
+ snprintf(path, sizeof(path), "%s/family", osdevpath);
+ fd = hwloc_fopen(path, "r", root_fd);
+ if (fd) {
+ char family[64];
+ if (fgets(family, sizeof(family), fd)) {
+ char *eol = strchr(family, '\n');
+ if (eol)
+ *eol = 0;
+ hwloc_obj_add_info(obj, "MICFamily", family);
+ }
+ fclose(fd);
+ }
+
+ snprintf(path, sizeof(path), "%s/sku", osdevpath);
+ fd = hwloc_fopen(path, "r", root_fd);
+ if (fd) {
+ char sku[64];
+ if (fgets(sku, sizeof(sku), fd)) {
+ char *eol = strchr(sku, '\n');
+ if (eol)
+ *eol = 0;
+ hwloc_obj_add_info(obj, "MICSKU", sku);
+ }
+ fclose(fd);
+ }
+
+ snprintf(path, sizeof(path), "%s/serialnumber", osdevpath);
+ fd = hwloc_fopen(path, "r", root_fd);
+ if (fd) {
+ char sn[64];
+ if (fgets(sn, sizeof(sn), fd)) {
+ char *eol = strchr(sn, '\n');
+ if (eol)
+ *eol = 0;
+ hwloc_obj_add_info(obj, "MICSerialNumber", sn);
+ }
+ fclose(fd);
+ }
+
+ snprintf(path, sizeof(path), "%s/active_cores", osdevpath);
+ fd = hwloc_fopen(path, "r", root_fd);
+ if (fd) {
+ char string[10];
+ if (fgets(string, sizeof(string), fd)) {
+ unsigned long count = strtoul(string, NULL, 16);
+ snprintf(string, sizeof(string), "%lu", count);
+ hwloc_obj_add_info(obj, "MICActiveCores", string);
+ }
+ fclose(fd);
+ }
+
+ snprintf(path, sizeof(path), "%s/memsize", osdevpath);
+ fd = hwloc_fopen(path, "r", root_fd);
+ if (fd) {
+ char string[20];
+ if (fgets(string, sizeof(string), fd)) {
+ unsigned long count = strtoul(string, NULL, 16);
+ snprintf(string, sizeof(string), "%lu", count);
+ hwloc_obj_add_info(obj, "MICMemorySize", string);
+ }
+ fclose(fd);
+ }
+}
+
+static int
+hwloc_linux_lookup_mic_class(struct hwloc_backend *backend,
+ struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+ return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_COPROC, "mic", hwloc_linux_mic_class_fillinfos);
+}
+
+static int
+hwloc_linux_directlookup_mic_class(struct hwloc_backend *backend,
+ struct hwloc_obj *pcidev)
+{
+ struct hwloc_linux_backend_data_s *data = backend->private_data;
+ int root_fd = data->root_fd;
+ char path[256];
+ struct stat st;
+ hwloc_obj_t obj;
+ unsigned idx;
+ int res = 0;
+
+ if (!data->mic_directlookup_id_max)
+ /* already tried, nothing to do */
+ return 0;
+
+ if (data->mic_directlookup_id_max == (unsigned) -1) {
+ /* never tried, find out the max id */
+ DIR *dir;
+ struct dirent *dirent;
+
+ /* make sure we never do this lookup again */
+ data->mic_directlookup_id_max = 0;
+
+ /* read the entire class and find the max id of mic%u dirents */
+ dir = hwloc_opendir("/sys/devices/virtual/mic", root_fd);
+ if (!dir) {
+ dir = opendir("/sys/class/mic");
+ if (!dir)
+ return 0;
+ }
+ while ((dirent = readdir(dir)) != NULL) {
+ if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
+ continue;
+ if (sscanf(dirent->d_name, "mic%u", &idx) != 1)
+ continue;
+ if (idx >= data->mic_directlookup_id_max)
+ data->mic_directlookup_id_max = idx+1;
+ }
+ closedir(dir);
+ }
+
+ /* now iterate over the mic ids and see if one matches our pcidev */
+ for(idx=0; idx<data->mic_directlookup_id_max; idx++) {
+ snprintf(path, sizeof(path), "/sys/class/mic/mic%u/pci_%02x:%02x.%02x",
+ idx, pcidev->attr->pcidev.bus, pcidev->attr->pcidev.dev, pcidev->attr->pcidev.func);
+ if (hwloc_stat(path, &st, root_fd) < 0)
+ continue;
+ snprintf(path, sizeof(path), "mic%u", idx);
+ obj = hwloc_linux_add_os_device(backend, pcidev, HWLOC_OBJ_OSDEV_COPROC, path);
+ snprintf(path, sizeof(path), "/sys/class/mic/mic%u", idx);
+ hwloc_linux_mic_class_fillinfos(backend, obj, path);
+ res++;
+ }
+
+ return res;
+}
+
+/*
+ * backend callback for inserting objects inside a pci device
+ */
+static int
+hwloc_linux_backend_notify_new_object(struct hwloc_backend *backend, struct hwloc_backend *caller __hwloc_attribute_unused,
+ struct hwloc_obj *obj)
+{
+ struct hwloc_linux_backend_data_s *data = backend->private_data;
+ char pcidevpath[256];
+ int res = 0;
+
+ /* this callback is only used in the libpci backend for now */
+ assert(obj->type == HWLOC_OBJ_PCI_DEVICE);
+
+ snprintf(pcidevpath, sizeof(pcidevpath), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
+ obj->attr->pcidev.domain, obj->attr->pcidev.bus,
+ obj->attr->pcidev.dev, obj->attr->pcidev.func);
+
+ res += hwloc_linux_lookup_net_class(backend, obj, pcidevpath);
+ res += hwloc_linux_lookup_openfabrics_class(backend, obj, pcidevpath);
+ res += hwloc_linux_lookup_dma_class(backend, obj, pcidevpath);
+ res += hwloc_linux_lookup_drm_class(backend, obj, pcidevpath);
+ res += hwloc_linux_lookup_block_class(backend, obj, pcidevpath);
+
+ if (data->mic_need_directlookup == -1) {
+ struct stat st;
+ if (hwloc_stat("/sys/class/mic/mic0", &st, data->root_fd) == 0
+ && hwloc_stat("/sys/class/mic/mic0/device/mic/mic0", &st, data->root_fd) == -1)
+ /* hwloc_linux_lookup_mic_class will fail because pcidev sysfs directories
+ * do not have mic/mic%u symlinks to mic devices (old mic driver).
+ * if so, try from the mic class.
+ */
+ data->mic_need_directlookup = 1;
+ else
+ data->mic_need_directlookup = 0;
+ }
+ if (data->mic_need_directlookup)
+ res += hwloc_linux_directlookup_mic_class(backend, obj);
+ else
+ res += hwloc_linux_lookup_mic_class(backend, obj, pcidevpath);
+
+ return res;
+}
+
+/*
+ * backend callback for retrieving the location of a pci device
+ */
+static int
+hwloc_linux_backend_get_obj_cpuset(struct hwloc_backend *backend,
+ struct hwloc_backend *caller __hwloc_attribute_unused,
+ struct hwloc_obj *obj, hwloc_bitmap_t cpuset)
+{
+ struct hwloc_linux_backend_data_s *data = backend->private_data;
+ char path[256];
+ FILE *file;
+ int err;
+
+ /* this callback is only used in the libpci backend for now */
+ assert(obj->type == HWLOC_OBJ_PCI_DEVICE
+ || (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI));
+
+ snprintf(path, sizeof(path), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/local_cpus",
+ obj->attr->pcidev.domain, obj->attr->pcidev.bus,
+ obj->attr->pcidev.dev, obj->attr->pcidev.func);
+ file = hwloc_fopen(path, "r", data->root_fd);
+ if (file) {
+ err = hwloc_linux_parse_cpumap_file(file, cpuset);
+ fclose(file);
+ if (!err && !hwloc_bitmap_iszero(cpuset))
+ return 0;
+ }
+ return -1;
+}
+
+
+
+/*******************************
+ ******* Linux component *******
+ *******************************/
+
+static void
+hwloc_linux_backend_disable(struct hwloc_backend *backend)
+{
+ struct hwloc_linux_backend_data_s *data = backend->private_data;
+#ifdef HAVE_OPENAT
+ close(data->root_fd);
+#endif
+#ifdef HAVE_LIBUDEV_H
+ if (data->udev)
+ udev_unref(data->udev);
+#endif
+ free(data);
+}
+
+static struct hwloc_backend *
+hwloc_linux_component_instantiate(struct hwloc_disc_component *component,
+ const void *_data1,
+ const void *_data2 __hwloc_attribute_unused,
+ const void *_data3 __hwloc_attribute_unused)
+{
+ struct hwloc_backend *backend;
+ struct hwloc_linux_backend_data_s *data;
+ const char * fsroot_path = _data1;
+ int flags, root = -1;
+
+ backend = hwloc_backend_alloc(component);
+ if (!backend)
+ goto out;
+
+ data = malloc(sizeof(*data));
+ if (!data) {
+ errno = ENOMEM;
+ goto out_with_backend;
+ }
+
+ backend->private_data = data;
+ backend->discover = hwloc_look_linuxfs;
+ backend->get_obj_cpuset = hwloc_linux_backend_get_obj_cpuset;
+ backend->notify_new_object = hwloc_linux_backend_notify_new_object;
+ backend->disable = hwloc_linux_backend_disable;
+
+ /* default values */
+ data->is_real_fsroot = 1;
+ if (!fsroot_path)
+ fsroot_path = "/";
+
+#ifdef HAVE_OPENAT
+ root = open(fsroot_path, O_RDONLY | O_DIRECTORY);
+ if (root < 0)
+ goto out_with_data;
+
+ if (strcmp(fsroot_path, "/")) {
+ backend->is_thissystem = 0;
+ data->is_real_fsroot = 0;
+ }
+
+ /* Since this fd stays open after hwloc returns, mark it as
+ close-on-exec so that children don't inherit it. Stevens says
+ that we should GETFD before we SETFD, so we do. */
+ flags = fcntl(root, F_GETFD, 0);
+ if (-1 == flags ||
+ -1 == fcntl(root, F_SETFD, FD_CLOEXEC | flags)) {
+ close(root);
+ root = -1;
+ goto out_with_data;
+ }
+#else
+ if (strcmp(fsroot_path, "/")) {
+ errno = ENOSYS;
+ goto out_with_data;
+ }
+#endif
+ data->root_fd = root;
+
+#ifdef HAVE_LIBUDEV_H
+ data->udev = NULL;
+ if (data->is_real_fsroot) {
+ data->udev = udev_new();
+ }
+#endif
+
+ data->deprecated_classlinks_model = -2; /* never tried */
+ data->mic_need_directlookup = -1; /* not initialized */
+ data->mic_directlookup_id_max = -1; /* not initialized */
+
+ return backend;
+
+ out_with_data:
+ free(data);
+ out_with_backend:
+ free(backend);
+ out:
+ return NULL;
+}
+
+static struct hwloc_disc_component hwloc_linux_disc_component = {
+ HWLOC_DISC_COMPONENT_TYPE_CPU,
+ "linux",
+ HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+ hwloc_linux_component_instantiate,
+ 50,
+ NULL
+};
+
+const struct hwloc_component hwloc_linux_component = {
+ HWLOC_COMPONENT_ABI,
+ NULL, NULL,
+ HWLOC_COMPONENT_TYPE_DISC,
+ 0,
+ &hwloc_linux_disc_component
+};
+
+
+
+
+#ifdef HWLOC_HAVE_LINUXPCI
+
+/***********************************
+ ******* Linux PCI component *******
+ ***********************************/
+
+#define HWLOC_PCI_REVISION_ID 0x08
+#define HWLOC_PCI_CAP_ID_EXP 0x10
+#define HWLOC_PCI_CLASS_NOT_DEFINED 0x0000
+
+static int
+hwloc_look_linuxfs_pci(struct hwloc_backend *backend)
+{
+ struct hwloc_topology *topology = backend->topology;
+ struct hwloc_backend *tmpbackend;
+ hwloc_obj_t first_obj = NULL, last_obj = NULL;
+ int root_fd = -1;
+ DIR *dir;
+ struct dirent *dirent;
+ int res = 0;
+
+ if (!(hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
+ return 0;
+
+ if (hwloc_get_next_pcidev(topology, NULL)) {
+ hwloc_debug("%s", "PCI objects already added, ignoring linuxpci backend.\n");
+ return 0;
+ }
+
+ /* hackily find the linux backend to steal its fsroot */
+ tmpbackend = topology->backends;
+ while (tmpbackend) {
+ if (tmpbackend->component == &hwloc_linux_disc_component) {
+ root_fd = ((struct hwloc_linux_backend_data_s *) tmpbackend->private_data)->root_fd;
+ hwloc_debug("linuxpci backend stole linux backend root_fd %d\n", root_fd);
+ break; }
+ tmpbackend = tmpbackend->next;
+ }
+ /* take our own descriptor, either pointing to linux fsroot, or to / if not found */
+ if (root_fd >= 0)
+ root_fd = dup(root_fd);
+ else
+ root_fd = open("/", O_RDONLY | O_DIRECTORY);
+
+ dir = hwloc_opendir("/sys/bus/pci/devices/", root_fd);
+ if (!dir)
+ goto out_with_rootfd;
+
+ while ((dirent = readdir(dir)) != NULL) {
+ unsigned domain, bus, dev, func;
+ hwloc_obj_t obj;
+ struct hwloc_pcidev_attr_s *attr;
+ unsigned os_index;
+ char path[64];
+ char value[16];
+ size_t read;
+ FILE *file;
+
+ if (sscanf(dirent->d_name, "%04x:%02x:%02x.%01x", &domain, &bus, &dev, &func) != 4)
+ continue;
+
+ os_index = (domain << 20) + (bus << 12) + (dev << 4) + func;
+ obj = hwloc_alloc_setup_object(HWLOC_OBJ_PCI_DEVICE, os_index);
+ if (!obj)
+ break;
+ attr = &obj->attr->pcidev;
+
+ attr->domain = domain;
+ attr->bus = bus;
+ attr->dev = dev;
+ attr->func = func;
+
+ /* default (unknown) values */
+ attr->vendor_id = 0;
+ attr->device_id = 0;
+ attr->class_id = HWLOC_PCI_CLASS_NOT_DEFINED;
+ attr->revision = 0;
+ attr->subvendor_id = 0;
+ attr->subdevice_id = 0;
+ attr->linkspeed = 0;
+
+ snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/vendor", dirent->d_name);
+ file = hwloc_fopen(path, "r", root_fd);
+ if (file) {
+ read = fread(value, 1, sizeof(value), file);
+ fclose(file);
+ if (read)
+ attr->vendor_id = strtoul(value, NULL, 16);
+ }
+ snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/device", dirent->d_name);
+ file = hwloc_fopen(path, "r", root_fd);
+ if (file) {
+ read = fread(value, 1, sizeof(value), file);
+ fclose(file);
+ if (read)
+ attr->device_id = strtoul(value, NULL, 16);
+ }
+ snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/class", dirent->d_name);
+ file = hwloc_fopen(path, "r", root_fd);
+ if (file) {
+ read = fread(value, 1, sizeof(value), file);
+ fclose(file);
+ if (read)
+ attr->class_id = strtoul(value, NULL, 16) >> 8;
+ }
+ snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_vendor", dirent->d_name);
+ file = hwloc_fopen(path, "r", root_fd);
+ if (file) {
+ read = fread(value, 1, sizeof(value), file);
+ fclose(file);
+ if (read)
+ attr->subvendor_id = strtoul(value, NULL, 16);
+ }
+ snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_device", dirent->d_name);
+ file = hwloc_fopen(path, "r", root_fd);
+ if (file) {
+ read = fread(value, 1, sizeof(value), file);
+ fclose(file);
+ if (read)
+ attr->subdevice_id = strtoul(value, NULL, 16);
+ }
+
+ snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/config", dirent->d_name);
+ file = hwloc_fopen(path, "r", root_fd);
+ if (file) {
+#define CONFIG_SPACE_CACHESIZE 256
+ unsigned char config_space_cache[CONFIG_SPACE_CACHESIZE];
+ unsigned offset;
+
+ /* initialize the config space in case we fail to read it (missing permissions, etc). */
+ memset(config_space_cache, 0xff, CONFIG_SPACE_CACHESIZE);
+ read = fread(config_space_cache, 1, CONFIG_SPACE_CACHESIZE, file);
+ (void) read; /* we initialized config_space_cache in case we don't read enough, ignore the read length */
+ fclose(file);
+
+ /* is this a bridge? */
+ hwloc_pci_prepare_bridge(obj, config_space_cache);
+
+ /* get the revision */
+ attr->revision = config_space_cache[HWLOC_PCI_REVISION_ID];
+
+ /* try to get the link speed */
+ offset = hwloc_pci_find_cap(config_space_cache, HWLOC_PCI_CAP_ID_EXP);
+ if (offset > 0 && offset + 20 /* size of PCI express block up to link status */ <= CONFIG_SPACE_CACHESIZE)
+ hwloc_pci_find_linkspeed(config_space_cache, offset, &attr->linkspeed);
+ }
+
+ if (first_obj)
+ last_obj->next_sibling = obj;
+ else
+ first_obj = obj;
+ last_obj = obj;
+ }
+
+ closedir(dir);
+
+ dir = hwloc_opendir("/sys/bus/pci/slots/", root_fd);
+ if (dir) {
+ while ((dirent = readdir(dir)) != NULL) {
+ char path[64];
+ FILE *file;
+ if (dirent->d_name[0] == '.')
+ continue;
+ snprintf(path, sizeof(path), "/sys/bus/pci/slots/%s/address", dirent->d_name);
+ file = hwloc_fopen(path, "r", root_fd);
+ if (file) {
+ unsigned domain, bus, dev;
+ if (fscanf(file, "%x:%x:%x", &domain, &bus, &dev) == 3) {
+ hwloc_obj_t obj = first_obj;
+ while (obj) {
+ if (obj->attr->pcidev.domain == domain
+ && obj->attr->pcidev.bus == bus
+ && obj->attr->pcidev.dev == dev
+ && obj->attr->pcidev.func == 0) {
+ hwloc_obj_add_info(obj, "PCISlot", dirent->d_name);
+ break;
+ }
+ obj = obj->next_sibling;
+ }
+ }
+ fclose(file);
+ }
+ }
+ closedir(dir);
+ }
+
+ res = hwloc_insert_pci_device_list(backend, first_obj);
+
+ out_with_rootfd:
+ close(root_fd);
+ return res;
+}
+
+static struct hwloc_backend *
+hwloc_linuxpci_component_instantiate(struct hwloc_disc_component *component,
+ const void *_data1 __hwloc_attribute_unused,
+ const void *_data2 __hwloc_attribute_unused,
+ const void *_data3 __hwloc_attribute_unused)
+{
+ struct hwloc_backend *backend;
+
+ /* thissystem may not be fully initialized yet, we'll check flags in discover() */
+
+ backend = hwloc_backend_alloc(component);
+ if (!backend)
+ return NULL;
+ backend->flags = HWLOC_BACKEND_FLAG_NEED_LEVELS;
+ backend->discover = hwloc_look_linuxfs_pci;
+ return backend;
+}
+
+static struct hwloc_disc_component hwloc_linuxpci_disc_component = {
+ HWLOC_DISC_COMPONENT_TYPE_MISC,
+ "linuxpci",
+ HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+ hwloc_linuxpci_component_instantiate,
+ 19, /* after pci */
+ NULL
+};
+
+const struct hwloc_component hwloc_linuxpci_component = {
+ HWLOC_COMPONENT_ABI,
+ NULL, NULL,
+ HWLOC_COMPONENT_TYPE_DISC,
+ 0,
+ &hwloc_linuxpci_disc_component
+};
+
+#endif /* HWLOC_HAVE_LINUXPCI */
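
   Note: the os_index chosen in hwloc_look_linuxfs_pci() above packs
   domain/bus/device/function into a single integer,
   (domain << 20) + (bus << 12) + (dev << 4) + func. A standalone sketch of
   that packing and its inverse in plain C; the helper name pci_os_index is
   illustrative only, not part of hwloc:

    #include <stdio.h>

    /* pack a PCI address the same way as the linuxpci backend above */
    static unsigned pci_os_index(unsigned domain, unsigned bus,
                                 unsigned dev, unsigned func)
    {
        return (domain << 20) + (bus << 12) + (dev << 4) + func;
    }

    int main(void)
    {
        unsigned idx = pci_os_index(0x0000, 0x3f, 0x02, 0x1);
        /* unpack: func is 4 bits, dev and bus 8 bits each, the rest is the domain */
        printf("os_index=0x%x -> %04x:%02x:%02x.%x\n", idx,
               idx >> 20, (idx >> 12) & 0xff, (idx >> 4) & 0xff, idx & 0xf);
        return 0;
    }
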
diff --git a/ext/hwloc/hwloc/topology-noos.c b/ext/hwloc/hwloc/topology-noos.c
new file mode 100644
index 0000000..a926428
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-noos.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria. All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+
+static int
+hwloc_look_noos(struct hwloc_backend *backend)
+{
+ struct hwloc_topology *topology = backend->topology;
+
+ if (topology->levels[0][0]->cpuset)
+ /* somebody discovered things */
+ return 0;
+
+ hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+ hwloc_setup_pu_level(topology, hwloc_fallback_nbprocessors(topology));
+ if (topology->is_thissystem)
+ hwloc_add_uname_info(topology, NULL);
+ return 1;
+}
+
+static struct hwloc_backend *
+hwloc_noos_component_instantiate(struct hwloc_disc_component *component,
+ const void *_data1 __hwloc_attribute_unused,
+ const void *_data2 __hwloc_attribute_unused,
+ const void *_data3 __hwloc_attribute_unused)
+{
+ struct hwloc_backend *backend;
+ backend = hwloc_backend_alloc(component);
+ if (!backend)
+ return NULL;
+ backend->discover = hwloc_look_noos;
+ return backend;
+}
+
+static struct hwloc_disc_component hwloc_noos_disc_component = {
+ HWLOC_DISC_COMPONENT_TYPE_CPU,
+ "no_os",
+ HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+ hwloc_noos_component_instantiate,
+ 40, /* lower than native OS component, higher than globals */
+ NULL
+};
+
+const struct hwloc_component hwloc_noos_component = {
+ HWLOC_COMPONENT_ABI,
+ NULL, NULL,
+ HWLOC_COMPONENT_TYPE_DISC,
+ 0,
+ &hwloc_noos_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology-opencl.cb b/ext/hwloc/hwloc/topology-opencl.cb
new file mode 100644
index 0000000..85057c7
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-opencl.cb
@@ -0,0 +1,346 @@
+/*
+ * Copyright © 2012-2014 Inria. All rights reserved.
+ * Copyright © 2013 Université Bordeaux. All right reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <hwloc/plugins.h>
+
+/* private headers allowed for convenience because this plugin is built within hwloc */
+#include <private/misc.h>
+#include <private/debug.h>
+
+#include <CL/cl_ext.h>
+
+typedef enum hwloc_opencl_device_type_e {
+ HWLOC_OPENCL_DEVICE_AMD
+} hwloc_opencl_device_type_t;
+
+struct hwloc_opencl_backend_data_s {
+ unsigned nr_devices; /* -1 when unknown yet, first callback will setup */
+ struct hwloc_opencl_device_info_s {
+ hwloc_opencl_device_type_t type;
+
+ unsigned platformidx;
+ char platformname[64];
+ unsigned platformdeviceidx;
+ char devicename[64];
+ char devicevendor[64];
+ char devicetype[64];
+
+ unsigned computeunits;
+ unsigned long long globalmemsize;
+
+ union hwloc_opencl_device_info_u {
+ struct hwloc_opencl_device_info_amd_s {
+ unsigned pcidomain, pcibus, pcidev, pcifunc;
+ } amd;
+ } specific;
+ } * devices;
+};
+
+static void
+hwloc_opencl_query_devices(struct hwloc_opencl_backend_data_s *data)
+{
+ cl_platform_id *platform_ids = NULL;
+ cl_uint nr_platforms;
+ cl_device_id *device_ids = NULL;
+ cl_uint nr_devices, nr_total_devices, tmp;
+ cl_int clret;
+ unsigned curpfidx, curpfdvidx, i;
+
+ /* mark the number of devices as 0 in case we fail below,
+ * so that we don't try again later.
+ */
+ data->nr_devices = 0;
+
+ /* count platforms, allocate and get them */
+ clret = clGetPlatformIDs(0, NULL, &nr_platforms);
+ if (CL_SUCCESS != clret || !nr_platforms)
+ goto out;
+ hwloc_debug("%u OpenCL platforms\n", nr_platforms);
+ platform_ids = malloc(nr_platforms * sizeof(*platform_ids));
+ if (!platform_ids)
+ goto out;
+ clret = clGetPlatformIDs(nr_platforms, platform_ids, &nr_platforms);
+ if (CL_SUCCESS != clret || !nr_platforms)
+ goto out_with_platform_ids;
+
+ /* how many devices, total? */
+ tmp = 0;
+ for(i=0; i<nr_platforms; i++) {
+ clret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, 0, NULL, &nr_devices);
+ if (CL_SUCCESS != clret)
+ goto out_with_platform_ids;
+ tmp += nr_devices;
+ }
+ nr_total_devices = tmp;
+ hwloc_debug("%u OpenCL devices total\n", nr_total_devices);
+ /* allocate structs */
+ device_ids = malloc(nr_total_devices * sizeof(*device_ids));
+ data->devices = malloc(nr_total_devices * sizeof(*data->devices));
+ if (!data->devices || !device_ids)
+ goto out_with_device_ids;
+ /* actually query device ids */
+ tmp = 0;
+ for(i=0; i<nr_platforms; i++) {
+ clret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, nr_total_devices - tmp, device_ids + tmp, &nr_devices);
+ if (CL_SUCCESS != clret)
+ goto out_with_device_ids;
+ tmp += nr_devices;
+ }
+
+ /* query individual devices */
+ curpfidx = 0;
+ curpfdvidx = 0;
+ for(i=0; i<nr_total_devices; i++) {
+ struct hwloc_opencl_device_info_s *info = &data->devices[data->nr_devices];
+ cl_platform_id platform_id = 0;
+ cl_device_type type;
+#ifdef CL_DEVICE_TOPOLOGY_AMD
+ cl_device_topology_amd amdtopo;
+#endif
+ cl_ulong globalmemsize;
+ cl_uint computeunits;
+
+ hwloc_debug("Looking device %p\n", device_ids[i]);
+
+ info->platformname[0] = '\0';
+ clret = clGetDeviceInfo(device_ids[i], CL_DEVICE_PLATFORM, sizeof(platform_id), &platform_id, NULL);
+ if (CL_SUCCESS != clret)
+ continue;
+ clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(info->platformname), info->platformname, NULL);
+
+ info->devicename[0] = '\0';
+#ifdef CL_DEVICE_BOARD_NAME_AMD
+ clGetDeviceInfo(device_ids[i], CL_DEVICE_BOARD_NAME_AMD, sizeof(info->devicename), info->devicename, NULL);
+#else
+ clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(info->devicename), info->devicename, NULL);
+#endif
+ info->devicevendor[0] = '\0';
+ clGetDeviceInfo(device_ids[i], CL_DEVICE_VENDOR, sizeof(info->devicevendor), info->devicevendor, NULL);
+
+ clGetDeviceInfo(device_ids[i], CL_DEVICE_TYPE, sizeof(type), &type, NULL);
+ switch (type) {
+ case CL_DEVICE_TYPE_CPU: /* FIXME: cannot happen in PCI devices? */
+ strcpy(info->devicetype, "CPU");
+ break;
+ case CL_DEVICE_TYPE_GPU:
+ strcpy(info->devicetype, "GPU");
+ break;
+ case CL_DEVICE_TYPE_ACCELERATOR:
+ strcpy(info->devicetype, "Accelerator");
+ break;
+ default:
+ strcpy(info->devicetype, "Unknown");
+ break;
+ }
+
+ clGetDeviceInfo(device_ids[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(globalmemsize), &globalmemsize, NULL);
+ info->globalmemsize = globalmemsize / 1024;
+
+ clGetDeviceInfo(device_ids[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(computeunits), &computeunits, NULL);
+ info->computeunits = computeunits;
+
+ hwloc_debug("platform %s device %s vendor %s type %s\n", info->platformname, info->devicename, info->devicevendor, info->devicetype);
+
+ /* find our indexes */
+ while (platform_id != platform_ids[curpfidx]) {
+ curpfidx++;
+ curpfdvidx = 0;
+ }
+ info->platformidx = curpfidx;
+ info->platformdeviceidx = curpfdvidx;
+ curpfdvidx++;
+
+ hwloc_debug("This is opencl%dd%d\n", info->platformidx, info->platformdeviceidx);
+
+#ifdef CL_DEVICE_TOPOLOGY_AMD
+ clret = clGetDeviceInfo(device_ids[i], CL_DEVICE_TOPOLOGY_AMD, sizeof(amdtopo), &amdtopo, NULL);
+ if (CL_SUCCESS != clret) {
+ hwloc_debug("no AMD-specific device information: %d\n", clret);
+ continue;
+ }
+ if (CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD != amdtopo.raw.type) {
+ hwloc_debug("not a PCIe device: %u\n", amdtopo.raw.type);
+ continue;
+ }
+
+ info->type = HWLOC_OPENCL_DEVICE_AMD;
+ info->specific.amd.pcidomain = 0;
+ info->specific.amd.pcibus = amdtopo.pcie.bus;
+ info->specific.amd.pcidev = amdtopo.pcie.device;
+ info->specific.amd.pcifunc = amdtopo.pcie.function;
+
+ hwloc_debug("OpenCL device on PCI 0000:%02x:%02x.%u\n", amdtopo.pcie.bus, amdtopo.pcie.device, amdtopo.pcie.function);
+
+ /* validate this device */
+ data->nr_devices++;
+#endif /* CL_DEVICE_TOPOLOGY_AMD */
+ }
+ free(device_ids);
+ free(platform_ids);
+ return;
+
+out_with_device_ids:
+ free(device_ids);
+ free(data->devices);
+ data->devices = NULL;
+out_with_platform_ids:
+ free(platform_ids);
+out:
+ return;
+}
+
+static int
+hwloc_opencl_backend_notify_new_object(struct hwloc_backend *backend, struct hwloc_backend *caller __hwloc_attribute_unused,
+ struct hwloc_obj *pcidev)
+{
+ struct hwloc_topology *topology = backend->topology;
+ struct hwloc_opencl_backend_data_s *data = backend->private_data;
+ unsigned i;
+
+ if (!(hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
+ return 0;
+
+ if (!hwloc_topology_is_thissystem(topology)) {
+ hwloc_debug("%s", "\nno OpenCL detection (not thissystem)\n");
+ return 0;
+ }
+
+ if (HWLOC_OBJ_PCI_DEVICE != pcidev->type)
+ return 0;
+
+ if (data->nr_devices == (unsigned) -1) {
+ /* first call, lookup all devices */
+ hwloc_opencl_query_devices(data);
+ /* if it fails, data->nr_devices = 0 so we won't do anything below and in next callbacks */
+ }
+
+ if (!data->nr_devices)
+ /* found no devices */
+ return 0;
+
+ /* now the devices array is ready to use */
+ for(i=0; i<data->nr_devices; i++) {
+ struct hwloc_opencl_device_info_s *info = &data->devices[i];
+ hwloc_obj_t osdev;
+ char buffer[64];
+
+ assert(info->type == HWLOC_OPENCL_DEVICE_AMD);
+ if (info->specific.amd.pcidomain != pcidev->attr->pcidev.domain)
+ continue;
+ if (info->specific.amd.pcibus != pcidev->attr->pcidev.bus)
+ continue;
+ if (info->specific.amd.pcidev != pcidev->attr->pcidev.dev)
+ continue;
+ if (info->specific.amd.pcifunc != pcidev->attr->pcidev.func)
+ continue;
+
+ osdev = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1);
+ snprintf(buffer, sizeof(buffer), "opencl%dd%d", info->platformidx, info->platformdeviceidx);
+ osdev->name = strdup(buffer);
+ osdev->depth = (unsigned) HWLOC_TYPE_DEPTH_UNKNOWN;
+ osdev->attr->osdev.type = HWLOC_OBJ_OSDEV_COPROC;
+
+ hwloc_obj_add_info(osdev, "CoProcType", "OpenCL");
+ hwloc_obj_add_info(osdev, "Backend", "OpenCL");
+ hwloc_obj_add_info(osdev, "OpenCLDeviceType", info->devicetype);
+
+ if (info->devicevendor[0] != '\0')
+ hwloc_obj_add_info(osdev, "GPUVendor", info->devicevendor);
+ if (info->devicename[0] != '\0')
+ hwloc_obj_add_info(osdev, "GPUModel", info->devicename);
+
+ snprintf(buffer, sizeof(buffer), "%u", info->platformidx);
+ hwloc_obj_add_info(osdev, "OpenCLPlatformIndex", buffer);
+ if (info->platformname[0] != '\0')
+ hwloc_obj_add_info(osdev, "OpenCLPlatformName", info->platformname);
+
+ snprintf(buffer, sizeof(buffer), "%u", info->platformdeviceidx);
+ hwloc_obj_add_info(osdev, "OpenCLPlatformDeviceIndex", buffer);
+
+ snprintf(buffer, sizeof(buffer), "%u", info->computeunits);
+ hwloc_obj_add_info(osdev, "OpenCLComputeUnits", buffer);
+
+ snprintf(buffer, sizeof(buffer), "%llu", info->globalmemsize);
+ hwloc_obj_add_info(osdev, "OpenCLGlobalMemorySize", buffer);
+
+ hwloc_insert_object_by_parent(topology, pcidev, osdev);
+ return 1;
+ }
+
+ return 0;
+}
+
+static void
+hwloc_opencl_backend_disable(struct hwloc_backend *backend)
+{
+ struct hwloc_opencl_backend_data_s *data = backend->private_data;
+ free(data->devices);
+ free(data);
+}
+
+static struct hwloc_backend *
+hwloc_opencl_component_instantiate(struct hwloc_disc_component *component,
+ const void *_data1 __hwloc_attribute_unused,
+ const void *_data2 __hwloc_attribute_unused,
+ const void *_data3 __hwloc_attribute_unused)
+{
+ struct hwloc_backend *backend;
+ struct hwloc_opencl_backend_data_s *data;
+
+ /* thissystem may not be fully initialized yet, we'll check flags in discover() */
+
+ backend = hwloc_backend_alloc(component);
+ if (!backend)
+ return NULL;
+
+ data = malloc(sizeof(*data));
+ if (!data) {
+ free(backend);
+ return NULL;
+ }
+ /* the first callback will initialize those */
+ data->nr_devices = (unsigned) -1; /* unknown yet */
+ data->devices = NULL;
+
+ backend->private_data = data;
+ backend->disable = hwloc_opencl_backend_disable;
+
+ backend->notify_new_object = hwloc_opencl_backend_notify_new_object;
+ return backend;
+}
+
+static struct hwloc_disc_component hwloc_opencl_disc_component = {
+ HWLOC_DISC_COMPONENT_TYPE_MISC,
+ "opencl",
+ HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+ hwloc_opencl_component_instantiate,
+ 10, /* after pci */
+ NULL
+};
+
+static int
+hwloc_opencl_component_init(unsigned long flags)
+{
+ if (flags)
+ return -1;
+ if (hwloc_plugin_check_namespace("opencl", "hwloc_backend_alloc") < 0)
+ return -1;
+ return 0;
+}
+
+#ifdef HWLOC_INSIDE_PLUGIN
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_opencl_component;
+#endif
+
+const struct hwloc_component hwloc_opencl_component = {
+ HWLOC_COMPONENT_ABI,
+ hwloc_opencl_component_init, NULL,
+ HWLOC_COMPONENT_TYPE_DISC,
+ 0,
+ &hwloc_opencl_disc_component
+};
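
   Note: the OpenCL backend above describes each coprocessor through string
   info pairs ("CoProcType", "GPUModel", "OpenCLComputeUnits", ...). A hedged
   sketch of reading them back through the public API, assuming I/O discovery
   is enabled with HWLOC_TOPOLOGY_FLAG_IO_DEVICES exactly as the backend
   itself requires:

    #include <stdio.h>
    #include <string.h>
    #include <hwloc.h>

    int main(void)
    {
        hwloc_topology_t topology;
        hwloc_obj_t osdev = NULL;

        hwloc_topology_init(&topology);
        /* I/O discovery is off by default; the backend above checks this flag */
        hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IO_DEVICES);
        hwloc_topology_load(topology);

        while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
            const char *type = hwloc_obj_get_info_by_name(osdev, "CoProcType");
            const char *model = hwloc_obj_get_info_by_name(osdev, "GPUModel");
            if (type && !strcmp(type, "OpenCL"))
                printf("%s: %s\n", osdev->name, model ? model : "(model unknown)");
        }

        hwloc_topology_destroy(topology);
        return 0;
    }
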
diff --git a/ext/hwloc/hwloc/topology-osf.cb b/ext/hwloc/hwloc/topology-osf.cb
new file mode 100644
index 0000000..5715888
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-osf.cb
@@ -0,0 +1,392 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria. All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+
+#include <sys/types.h>
+#ifdef HAVE_DIRENT_H
+#include <dirent.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <pthread.h>
+
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+
+#include <numa.h>
+#include <radset.h>
+#include <cpuset.h>
+#include <sys/mman.h>
+
+/*
+ * TODO
+ *
+ * nsg_init(), nsg_attach_pid(), RAD_MIGRATE/RAD_WAIT
+ * assign_pid_to_pset()
+ *
+ * pthread_use_only_cpu too?
+ */
+
+static int
+prepare_radset(hwloc_topology_t topology __hwloc_attribute_unused, radset_t *radset, hwloc_const_bitmap_t hwloc_set)
+{
+ unsigned cpu;
+ cpuset_t target_cpuset;
+ cpuset_t cpuset, xor_cpuset;
+ radid_t radid;
+ int ret = 0;
+ int ret_errno = 0;
+ int nbnodes = rad_get_num();
+
+ cpusetcreate(&target_cpuset);
+ cpuemptyset(target_cpuset);
+ hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+ cpuaddset(target_cpuset, cpu);
+ hwloc_bitmap_foreach_end();
+
+ cpusetcreate(&cpuset);
+ cpusetcreate(&xor_cpuset);
+ for (radid = 0; radid < nbnodes; radid++) {
+ cpuemptyset(cpuset);
+ if (rad_get_cpus(radid, cpuset)==-1) {
+ fprintf(stderr,"rad_get_cpus(%d) failed: %s\n",radid,strerror(errno));
+ continue;
+ }
+ cpuxorset(target_cpuset, cpuset, xor_cpuset);
+ if (cpucountset(xor_cpuset) == 0) {
+ /* Found it */
+ radsetcreate(radset);
+ rademptyset(*radset);
+ radaddset(*radset, radid);
+ ret = 1;
+ goto out;
+ }
+ }
+ /* radset containing exactly this set of CPUs not found */
+ ret_errno = EXDEV;
+
+out:
+ cpusetdestroy(&target_cpuset);
+ cpusetdestroy(&cpuset);
+ cpusetdestroy(&xor_cpuset);
+ errno = ret_errno;
+ return ret;
+}
+
+/* Note: get_cpubind not available on OSF */
+
+static int
+hwloc_osf_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+ radset_t radset;
+
+ if (hwloc_bitmap_isequal(hwloc_set, hwloc_topology_get_complete_cpuset(topology))) {
+ if ((errno = pthread_rad_detach(thread)))
+ return -1;
+ return 0;
+ }
+
+ /* Apparently OSF migrates pages */
+ if (flags & HWLOC_CPUBIND_NOMEMBIND) {
+ errno = ENOSYS;
+ return -1;
+ }
+
+ if (!prepare_radset(topology, &radset, hwloc_set))
+ return -1;
+
+ if (flags & HWLOC_CPUBIND_STRICT) {
+ if ((errno = pthread_rad_bind(thread, radset, RAD_INSIST | RAD_WAIT)))
+ return -1;
+ } else {
+ if ((errno = pthread_rad_attach(thread, radset, RAD_WAIT)))
+ return -1;
+ }
+ radsetdestroy(&radset);
+
+ return 0;
+}
+
+static int
+hwloc_osf_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+ radset_t radset;
+
+ if (hwloc_bitmap_isequal(hwloc_set, hwloc_topology_get_complete_cpuset(topology))) {
+ if (rad_detach_pid(pid))
+ return -1;
+ return 0;
+ }
+
+ /* Apparently OSF migrates pages */
+ if (flags & HWLOC_CPUBIND_NOMEMBIND) {
+ errno = ENOSYS;
+ return -1;
+ }
+
+ if (!prepare_radset(topology, &radset, hwloc_set))
+ return -1;
+
+ if (flags & HWLOC_CPUBIND_STRICT) {
+ if (rad_bind_pid(pid, radset, RAD_INSIST | RAD_WAIT))
+ return -1;
+ } else {
+ if (rad_attach_pid(pid, radset, RAD_WAIT))
+ return -1;
+ }
+ radsetdestroy(&radset);
+
+ return 0;
+}
+
+static int
+hwloc_osf_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+ return hwloc_osf_set_thread_cpubind(topology, pthread_self(), hwloc_set, flags);
+}
+
+static int
+hwloc_osf_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+ return hwloc_osf_set_proc_cpubind(topology, getpid(), hwloc_set, flags);
+}
+
+static int
+hwloc_osf_prepare_mattr(hwloc_topology_t topology __hwloc_attribute_unused, memalloc_attr_t *mattr, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags __hwloc_attribute_unused)
+{
+ unsigned long osf_policy;
+ int node;
+
+ switch (policy) {
+ case HWLOC_MEMBIND_FIRSTTOUCH:
+ osf_policy = MPOL_THREAD;
+ break;
+ case HWLOC_MEMBIND_DEFAULT:
+ case HWLOC_MEMBIND_BIND:
+ osf_policy = MPOL_DIRECTED;
+ break;
+ case HWLOC_MEMBIND_INTERLEAVE:
+ osf_policy = MPOL_STRIPPED;
+ break;
+ case HWLOC_MEMBIND_REPLICATE:
+ osf_policy = MPOL_REPLICATED;
+ break;
+ default:
+ errno = ENOSYS;
+ return -1;
+ }
+
+ memset(mattr, 0, sizeof(*mattr));
+ mattr->mattr_policy = osf_policy;
+ mattr->mattr_rad = RAD_NONE;
+ radsetcreate(&mattr->mattr_radset);
+ rademptyset(mattr->mattr_radset);
+
+ hwloc_bitmap_foreach_begin(node, nodeset)
+ radaddset(mattr->mattr_radset, node);
+ hwloc_bitmap_foreach_end();
+ return 0;
+}
+
+static int
+hwloc_osf_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+ memalloc_attr_t mattr;
+ int behavior = 0;
+ int ret;
+
+ if (flags & HWLOC_MEMBIND_MIGRATE)
+ behavior |= MADV_CURRENT;
+ if (flags & HWLOC_MEMBIND_STRICT)
+ behavior |= MADV_INSIST;
+
+ if (hwloc_osf_prepare_mattr(topology, &mattr, nodeset, policy, flags))
+ return -1;
+
+ ret = nmadvise(addr, len, MADV_CURRENT, &mattr);
+ radsetdestroy(&mattr.mattr_radset);
+ return ret;
+}
+
+static void *
+hwloc_osf_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+ memalloc_attr_t mattr;
+ void *ptr;
+
+ if (hwloc_osf_prepare_mattr(topology, &mattr, nodeset, policy, flags))
+ return hwloc_alloc_or_fail(topology, len, flags);
+
+ /* TODO: rather use acreate/amalloc ? */
+ ptr = nmmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+ 0, &mattr);
+ radsetdestroy(&mattr.mattr_radset);
+ return ptr;
+}
+
+static int
+hwloc_look_osf(struct hwloc_backend *backend)
+{
+ struct hwloc_topology *topology = backend->topology;
+ cpu_cursor_t cursor;
+ unsigned nbnodes;
+ radid_t radid, radid2;
+ radset_t radset, radset2;
+ cpuid_t cpuid;
+ cpuset_t cpuset;
+ struct hwloc_obj *obj;
+ unsigned distance;
+
+ if (topology->levels[0][0]->cpuset)
+ /* somebody discovered things */
+ return 0;
+
+ hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+
+ nbnodes = rad_get_num();
+
+ cpusetcreate(&cpuset);
+ radsetcreate(&radset);
+ radsetcreate(&radset2);
+ {
+ hwloc_obj_t *nodes = calloc(nbnodes, sizeof(hwloc_obj_t));
+ unsigned *indexes = calloc(nbnodes, sizeof(unsigned));
+ float *distances = calloc(nbnodes*nbnodes, sizeof(float));
+ unsigned nfound;
+ numa_attr_t attr;
+
+ attr.nattr_type = R_RAD;
+ attr.nattr_descr.rd_radset = radset;
+ attr.nattr_flags = 0;
+
+ for (radid = 0; radid < (radid_t) nbnodes; radid++) {
+ rademptyset(radset);
+ radaddset(radset, radid);
+ cpuemptyset(cpuset);
+ if (rad_get_cpus(radid, cpuset)==-1) {
+ fprintf(stderr,"rad_get_cpus(%d) failed: %s\n",radid,strerror(errno));
+ continue;
+ }
+
+ indexes[radid] = radid;
+ nodes[radid] = obj = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, radid);
+ obj->nodeset = hwloc_bitmap_alloc();
+ hwloc_bitmap_set(obj->nodeset, radid);
+ obj->cpuset = hwloc_bitmap_alloc();
+ obj->memory.local_memory = rad_get_physmem(radid) * hwloc_getpagesize();
+ obj->memory.page_types_len = 2;
+ obj->memory.page_types = malloc(2*sizeof(*obj->memory.page_types));
+ memset(obj->memory.page_types, 0, 2*sizeof(*obj->memory.page_types));
+ obj->memory.page_types[0].size = hwloc_getpagesize();
+#ifdef HAVE__SC_LARGE_PAGESIZE
+ obj->memory.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
+#endif
+
+ cursor = SET_CURSOR_INIT;
+ while((cpuid = cpu_foreach(cpuset, 0, &cursor)) != CPU_NONE)
+ hwloc_bitmap_set(obj->cpuset, cpuid);
+
+ hwloc_debug_1arg_bitmap("node %d has cpuset %s\n",
+ radid, obj->cpuset);
+
+ hwloc_insert_object_by_cpuset(topology, obj);
+
+ nfound = 0;
+ for (radid2 = 0; radid2 < (radid_t) nbnodes; radid2++)
+ distances[radid*nbnodes+radid2] = RAD_DIST_REMOTE;
+ for (distance = RAD_DIST_LOCAL; distance < RAD_DIST_REMOTE; distance++) {
+ attr.nattr_distance = distance;
+ /* get set of NUMA nodes at distance <= DISTANCE */
+ if (nloc(&attr, radset2)) {
+ fprintf(stderr,"nloc failed: %s\n", strerror(errno));
+ continue;
+ }
+ cursor = SET_CURSOR_INIT;
+ while ((radid2 = rad_foreach(radset2, 0, &cursor)) != RAD_NONE) {
+ if (distances[radid*nbnodes+radid2] == RAD_DIST_REMOTE) {
+ distances[radid*nbnodes+radid2] = (float) distance;
+ nfound++;
+ }
+ }
+ if (nfound == nbnodes)
+ /* Finished finding distances, no need to go up to RAD_DIST_REMOTE */
+ break;
+ }
+ }
+
+ hwloc_distances_set(topology, HWLOC_OBJ_NUMANODE, nbnodes, indexes, nodes, distances, 0 /* OS cannot force */);
+ }
+ radsetdestroy(&radset2);
+ radsetdestroy(&radset);
+ cpusetdestroy(&cpuset);
+
+ /* add PU objects */
+ hwloc_setup_pu_level(topology, hwloc_fallback_nbprocessors(topology));
+
+ hwloc_obj_add_info(topology->levels[0][0], "Backend", "OSF");
+ if (topology->is_thissystem)
+ hwloc_add_uname_info(topology, NULL);
+ return 1;
+}
+
+void
+hwloc_set_osf_hooks(struct hwloc_binding_hooks *hooks,
+ struct hwloc_topology_support *support)
+{
+ hooks->set_thread_cpubind = hwloc_osf_set_thread_cpubind;
+ hooks->set_thisthread_cpubind = hwloc_osf_set_thisthread_cpubind;
+ hooks->set_proc_cpubind = hwloc_osf_set_proc_cpubind;
+ hooks->set_thisproc_cpubind = hwloc_osf_set_thisproc_cpubind;
+ hooks->set_area_membind = hwloc_osf_set_area_membind;
+ hooks->alloc_membind = hwloc_osf_alloc_membind;
+ hooks->alloc = hwloc_alloc_mmap;
+ hooks->free_membind = hwloc_free_mmap;
+ support->membind->firsttouch_membind = 1;
+ support->membind->bind_membind = 1;
+ support->membind->interleave_membind = 1;
+ support->membind->replicate_membind = 1;
+}
+
+static struct hwloc_backend *
+hwloc_osf_component_instantiate(struct hwloc_disc_component *component,
+ const void *_data1 __hwloc_attribute_unused,
+ const void *_data2 __hwloc_attribute_unused,
+ const void *_data3 __hwloc_attribute_unused)
+{
+ struct hwloc_backend *backend;
+ backend = hwloc_backend_alloc(component);
+ if (!backend)
+ return NULL;
+ backend->discover = hwloc_look_osf;
+ return backend;
+}
+
+static struct hwloc_disc_component hwloc_osf_disc_component = {
+ HWLOC_DISC_COMPONENT_TYPE_CPU,
+ "osf",
+ HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+ hwloc_osf_component_instantiate,
+ 50,
+ NULL
+};
+
+const struct hwloc_component hwloc_osf_component = {
+ HWLOC_COMPONENT_ABI,
+ NULL, NULL,
+ HWLOC_COMPONENT_TYPE_DISC,
+ 0,
+ &hwloc_osf_disc_component
+};
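
   Note: the OSF file above only supplies the OS-specific binding hooks
   (pthread_rad_attach(), nmadvise(), nmmap()); applications reach them
   through hwloc's portable entry points. A minimal sketch binding the calling
   thread to the first PU via the public hwloc_set_cpubind(); error handling
   omitted, illustration only:

    #include <hwloc.h>

    int main(void)
    {
        hwloc_topology_t topology;
        hwloc_obj_t pu;

        hwloc_topology_init(&topology);
        hwloc_topology_load(topology);

        /* hwloc_set_cpubind() dispatches to the per-OS hook, e.g. the
         * radset-based OSF implementation above or the Linux one */
        pu = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, 0);
        if (pu)
            hwloc_set_cpubind(topology, pu->cpuset, HWLOC_CPUBIND_THREAD);

        hwloc_topology_destroy(topology);
        return 0;
    }
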
diff --git a/ext/hwloc/hwloc/topology-synthetic.c b/ext/hwloc/hwloc/topology-synthetic.c
new file mode 100644
index 0000000..237729a
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-synthetic.c
@@ -0,0 +1,1128 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria. All rights reserved.
+ * Copyright © 2009-2010 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/misc.h>
+#include <private/debug.h>
+
+#include <limits.h>
+#include <assert.h>
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+
+struct hwloc_synthetic_level_data_s {
+ unsigned arity;
+ unsigned long totalwidth;
+ hwloc_obj_type_t type;
+ unsigned depth; /* For caches/groups */
+ hwloc_obj_cache_type_t cachetype; /* For caches */
+ hwloc_uint64_t memorysize; /* For caches/memory */
+
+ /* the indexes= attribute before parsing */
+ const char *index_string;
+ unsigned long index_string_length;
+ /* the array of explicit indexes after parsing */
+ unsigned *index_array;
+
+ /* used while filling the topology */
+ unsigned next_os_index; /* id of the next object for that level */
+};
+
+struct hwloc_synthetic_backend_data_s {
+ /* synthetic backend parameters */
+ char *string;
+#define HWLOC_SYNTHETIC_MAX_DEPTH 128
+ struct hwloc_synthetic_level_data_s level[HWLOC_SYNTHETIC_MAX_DEPTH];
+};
+
+struct hwloc_synthetic_intlv_loop_s {
+ unsigned step;
+ unsigned nb;
+ unsigned level_depth;
+};
+
+static void
+hwloc_synthetic_process_level_indexes(struct hwloc_synthetic_backend_data_s *data,
+ unsigned curleveldepth,
+ int verbose)
+{
+ struct hwloc_synthetic_level_data_s *curlevel = &data->level[curleveldepth];
+ unsigned long total = curlevel->totalwidth;
+ const char *attr = curlevel->index_string;
+ unsigned long length = curlevel->index_string_length;
+ unsigned *array = NULL;
+ struct hwloc_synthetic_intlv_loop_s * loops = NULL;
+ unsigned long i;
+
+ if (!attr)
+ return;
+
+ array = calloc(total, sizeof(*array));
+ if (!array) {
+ if (verbose)
+ fprintf(stderr, "Failed to allocate synthetic index array of size %lu\n", total);
+ goto out;
+ }
+
+ i = strspn(attr, "0123456789,");
+ if (i == length) {
+ /* explicit array of indexes */
+
+ for(i=0; i<total; i++) {
+ const char *next;
+ unsigned idx = strtoul(attr, (char **) &next, 10);
+ if (next == attr) {
+ if (verbose)
+ fprintf(stderr, "Failed to read synthetic index #%lu at '%s'\n", i, attr);
+ goto out_with_array;
+ }
+
+ array[i] = idx;
+ if (i != total-1) {
+ if (*next != ',') {
+ if (verbose)
+ fprintf(stderr, "Missing comma after synthetic index #%lu at '%s'\n", i, attr);
+ goto out_with_array;
+ }
+ attr = next+1;
+ } else {
+ attr = next;
+ }
+ }
+ curlevel->index_array = array;
+
+ } else {
+ /* interleaving */
+ unsigned nr_loops = 1, cur_loop;
+ unsigned minstep = total;
+ unsigned long nbs = 1;
+ unsigned j, mul;
+ const char *tmp;
+
+ tmp = attr;
+ while (tmp) {
+ tmp = strchr(tmp, ':');
+ if (!tmp || tmp >= attr+length)
+ break;
+ nr_loops++;
+ tmp++;
+ }
+ /* nr_loops colon-separated fields, but we may need one more at the end */
+ loops = malloc((nr_loops+1)*sizeof(*loops));
+ if (!loops) {
+ if (verbose)
+ fprintf(stderr, "Failed to allocate synthetic index interleave loop array of size %u\n", nr_loops);
+ goto out_with_array;
+ }
+
+ if (*attr >= '0' && *attr <= '9') {
+ /* interleaving as x*y:z*t:... */
+ unsigned step, nb;
+
+ tmp = attr;
+ cur_loop = 0;
+ while (tmp) {
+ char *tmp2, *tmp3;
+ step = (unsigned) strtol(tmp, &tmp2, 0);
+ if (tmp2 == tmp || *tmp2 != '*') {
+ if (verbose)
+ fprintf(stderr, "Failed to read synthetic index interleaving loop '%s' without number before '*'\n", tmp);
+ goto out_with_loops;
+ }
+ if (!step) {
+ if (verbose)
+ fprintf(stderr, "Invalid interleaving loop with step 0 at '%s'\n", tmp);
+ goto out_with_loops;
+ }
+ tmp2++;
+ nb = (unsigned) strtol(tmp2, &tmp3, 0);
+ if (tmp3 == tmp2 || (*tmp3 && *tmp3 != ':' && *tmp3 != ')' && *tmp3 != ' ')) {
+ if (verbose)
+ fprintf(stderr, "Failed to read synthetic index interleaving loop '%s' without number between '*' and ':'\n", tmp);
+ goto out_with_loops;
+ }
+ if (!nb) {
+ if (verbose)
+ fprintf(stderr, "Invalid interleaving loop with number 0 at '%s'\n", tmp2);
+ goto out_with_loops;
+ }
+ loops[cur_loop].step = step;
+ loops[cur_loop].nb = nb;
+ if (step < minstep)
+ minstep = step;
+ nbs *= nb;
+ cur_loop++;
+ if (*tmp3 == ')' || *tmp3 == ' ')
+ break;
+ tmp = (const char*) (tmp3+1);
+ }
+
+ } else {
+ /* interleaving as type1:type2:... */
+ hwloc_obj_type_t type;
+ hwloc_obj_cache_type_t cachetypeattr;
+ int depthattr;
+ int err;
+
+ /* find level depths for each interleaving loop */
+ tmp = attr;
+ cur_loop = 0;
+ while (tmp) {
+ err = hwloc_obj_type_sscanf(tmp, &type, &depthattr, &cachetypeattr, sizeof(cachetypeattr));
+ if (err < 0) {
+ if (verbose)
+ fprintf(stderr, "Failed to read synthetic index interleaving loop type '%s'\n", tmp);
+ goto out_with_loops;
+ }
+ if (type == HWLOC_OBJ_MISC || type == HWLOC_OBJ_BRIDGE || type == HWLOC_OBJ_PCI_DEVICE || type == HWLOC_OBJ_OS_DEVICE) {
+ if (verbose)
+ fprintf(stderr, "Misc object type disallowed in synthetic index interleaving loop type '%s'\n", tmp);
+ goto out_with_loops;
+ }
+ for(i=0; i<curleveldepth; i++) {
+ if (type != data->level[i].type)
+ continue;
+ if ((type == HWLOC_OBJ_GROUP || type == HWLOC_OBJ_CACHE)
+ && depthattr != -1
+ && (unsigned) depthattr != data->level[i].depth)
+ continue;
+ if (type == HWLOC_OBJ_CACHE
+ && cachetypeattr != (hwloc_obj_cache_type_t) -1
+ && cachetypeattr != data->level[i].cachetype)
+ continue;
+ loops[cur_loop].level_depth = i;
+ break;
+ }
+ if (i == curleveldepth) {
+ if (verbose)
+ fprintf(stderr, "Failed to find level for synthetic index interleaving loop type '%s' above '%s'\n",
+ tmp, hwloc_obj_type_string(curlevel->type));
+ goto out_with_loops;
+ }
+ tmp = strchr(tmp, ':');
+ if (!tmp || tmp > attr+length)
+ break;
+ tmp++;
+ cur_loop++;
+ }
+
+ /* compute actual loop step/nb */
+ for(cur_loop=0; cur_loop<nr_loops; cur_loop++) {
+ unsigned mydepth = loops[cur_loop].level_depth;
+ unsigned prevdepth = 0;
+ unsigned step, nb;
+ for(i=0; i<nr_loops; i++) {
+ if (loops[i].level_depth == mydepth && i != cur_loop) {
+ if (verbose)
+ fprintf(stderr, "Invalid duplicate interleaving loop type in synthetic index '%s'\n", attr);
+ goto out_with_loops;
+ }
+ if (loops[i].level_depth < mydepth
+ && loops[i].level_depth > prevdepth)
+ prevdepth = loops[i].level_depth;
+ }
+ step = curlevel->totalwidth / data->level[mydepth].totalwidth; /* number of objects below us */
+ nb = data->level[mydepth].totalwidth / data->level[prevdepth].totalwidth; /* number of us within parent */
+
+ loops[cur_loop].step = step;
+ loops[cur_loop].nb = nb;
+ assert(nb);
+ assert(step);
+ if (step < minstep)
+ minstep = step;
+ nbs *= nb;
+ }
+ }
+ assert(nbs);
+
+ if (nbs != total) {
+ /* one loop of total/nbs steps is missing, add it if it's just the smallest one */
+ if (minstep == total/nbs) {
+ loops[nr_loops].step = 1;
+ loops[nr_loops].nb = total/nbs;
+ nr_loops++;
+ } else {
+ if (verbose)
+ fprintf(stderr, "Invalid index interleaving total width %lu instead of %lu\n", nbs, total);
+ goto out_with_loops;
+ }
+ }
+
+ /* generate the array of indexes */
+ mul = 1;
+ for(i=0; i<nr_loops; i++) {
+ unsigned step = loops[i].step;
+ unsigned nb = loops[i].nb;
+ for(j=0; j<total; j++)
+ array[j] += ((j / step) % nb) * mul;
+ mul *= nb;
+ }
+
+ /* check that we have the right values (cannot pass total, cannot give duplicate 0) */
+ for(j=0; j<total; j++) {
+ if (array[j] >= total) {
+ if (verbose)
+ fprintf(stderr, "Invalid index interleaving generates out-of-range index %u\n", array[j]);
+ goto out_with_loops;
+ }
+ if (!array[j] && j) {
+ if (verbose)
+ fprintf(stderr, "Invalid index interleaving generates duplicate index values\n");
+ goto out_with_loops;
+ }
+ }
+
+ free(loops);
+ curlevel->index_array = array;
+ }
+
+ return;
+
+ out_with_loops:
+ free(loops);
+ out_with_array:
+ free(array);
+ out:
+ return;
+}
+
+static hwloc_uint64_t
+hwloc_synthetic_parse_memory_attr(const char *attr, const char **endp)
+{
+ const char *endptr;
+ hwloc_uint64_t size;
+ size = strtoull(attr, (char **) &endptr, 0);
+ if (!hwloc_strncasecmp(endptr, "TB", 2)) {
+ size <<= 40;
+ endptr += 2;
+ } else if (!hwloc_strncasecmp(endptr, "GB", 2)) {
+ size <<= 30;
+ endptr += 2;
+ } else if (!hwloc_strncasecmp(endptr, "MB", 2)) {
+ size <<= 20;
+ endptr += 2;
+ } else if (!hwloc_strncasecmp(endptr, "kB", 2)) {
+ size <<= 10;
+ endptr += 2;
+ }
+ *endp = endptr;
+ return size;
+}
+
+static int
+hwloc_synthetic_parse_level_attrs(const char *attrs, const char **next_posp,
+ struct hwloc_synthetic_level_data_s *curlevel,
+ int verbose)
+{
+ hwloc_obj_type_t type = curlevel->type;
+ const char *next_pos;
+ hwloc_uint64_t memorysize = 0;
+ const char *index_string = NULL;
+ unsigned long index_string_length = 0;
+
+ next_pos = (const char *) strchr(attrs, ')');
+ if (!next_pos) {
+ if (verbose)
+ fprintf(stderr, "Missing attribute closing bracket in synthetic string doesn't have a number of objects at '%s'\n", attrs);
+ errno = EINVAL;
+ return -1;
+ }
+
+ while (')' != *attrs) {
+ if (HWLOC_OBJ_CACHE == type && !strncmp("size=", attrs, 5)) {
+ memorysize = hwloc_synthetic_parse_memory_attr(attrs+5, &attrs);
+
+ } else if (HWLOC_OBJ_CACHE != type && !strncmp("memory=", attrs, 7)) {
+ memorysize = hwloc_synthetic_parse_memory_attr(attrs+7, &attrs);
+
+ } else if (!strncmp("indexes=", attrs, 8)) {
+ index_string = attrs+8;
+ attrs += 8;
+ index_string_length = strcspn(attrs, " )");
+ attrs += index_string_length;
+
+ } else {
+ if (verbose)
+ fprintf(stderr, "Unknown attribute at '%s'\n", attrs);
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (' ' == *attrs)
+ attrs++;
+ else if (')' != *attrs) {
+ if (verbose)
+ fprintf(stderr, "Missing parameter separator at '%s'\n", attrs);
+ errno = EINVAL;
+ return -1;
+ }
+ }
+
+ curlevel->memorysize = memorysize;
+ curlevel->index_string = index_string;
+ curlevel->index_string_length = index_string_length;
+ *next_posp = next_pos+1;
+ return 0;
+}
+
+/* Read from description a series of integers describing a symmetrical
+ topology and update the hwloc_synthetic_backend_data_s accordingly. On
+ success, return zero. */
+static int
+hwloc_backend_synthetic_init(struct hwloc_synthetic_backend_data_s *data,
+ const char *description)
+{
+ const char *pos, *next_pos;
+ unsigned long item, count;
+ unsigned i;
+ int cache_depth = 0, group_depth = 0;
+ int nb_machine_levels = 0, nb_node_levels = 0;
+ int nb_pu_levels = 0;
+ int verbose = 0;
+ const char *env = getenv("HWLOC_SYNTHETIC_VERBOSE");
+ int err;
+ unsigned long totalarity = 1;
+
+ if (env)
+ verbose = atoi(env);
+
+ /* default values before we add root attributes */
+ data->level[0].totalwidth = 1;
+ data->level[0].type = HWLOC_OBJ_MACHINE;
+ data->level[0].index_string = NULL;
+ data->level[0].index_array = NULL;
+ data->level[0].memorysize = 0;
+ if (*description == '(') {
+ err = hwloc_synthetic_parse_level_attrs(description+1, &description, &data->level[0], verbose);
+ if (err < 0)
+ return err;
+ }
+
+ for (pos = description, count = 1; *pos; pos = next_pos) {
+#define HWLOC_OBJ_TYPE_UNKNOWN ((hwloc_obj_type_t) -1)
+ hwloc_obj_type_t type = HWLOC_OBJ_TYPE_UNKNOWN;
+ int typedepth = -1;
+ hwloc_obj_cache_type_t cachetype = (hwloc_obj_cache_type_t) -1;
+
+ /* initialize parent arity to 0 so that the levels are not infinite */
+ data->level[count-1].arity = 0;
+
+ while (*pos == ' ')
+ pos++;
+
+ if (!*pos)
+ break;
+
+ if (*pos < '0' || *pos > '9') {
+ if (hwloc_obj_type_sscanf(pos, &type, &typedepth, &cachetype, sizeof(cachetype)) < 0) {
+ if (verbose)
+ fprintf(stderr, "Synthetic string with unknown object type at '%s'\n", pos);
+ errno = EINVAL;
+ goto error;
+ }
+ if (type == HWLOC_OBJ_MISC || type == HWLOC_OBJ_BRIDGE || type == HWLOC_OBJ_PCI_DEVICE || type == HWLOC_OBJ_OS_DEVICE) {
+ if (verbose)
+ fprintf(stderr, "Synthetic string with disallowed object type at '%s'\n", pos);
+ errno = EINVAL;
+ goto error;
+ }
+
+ next_pos = strchr(pos, ':');
+ if (!next_pos) {
+ if (verbose)
+ fprintf(stderr,"Synthetic string doesn't have a `:' after object type at '%s'\n", pos);
+ errno = EINVAL;
+ goto error;
+ }
+ pos = next_pos + 1;
+ }
+ data->level[count].type = type;
+ data->level[count].depth = (unsigned) typedepth;
+ data->level[count].cachetype = cachetype;
+
+ item = strtoul(pos, (char **)&next_pos, 0);
+ if (next_pos == pos) {
+ if (verbose)
+ fprintf(stderr,"Synthetic string doesn't have a number of objects at '%s'\n", pos);
+ errno = EINVAL;
+ goto error;
+ }
+ data->level[count-1].arity = (unsigned)item;
+
+ totalarity *= item;
+ data->level[count].totalwidth = totalarity;
+ data->level[count].index_string = NULL;
+ data->level[count].index_array = NULL;
+ data->level[count].memorysize = 0;
+ if (*next_pos == '(') {
+ err = hwloc_synthetic_parse_level_attrs(next_pos+1, &next_pos, &data->level[count], verbose);
+ if (err < 0)
+ goto error;
+ }
+
+ if (count + 1 >= HWLOC_SYNTHETIC_MAX_DEPTH) {
+ if (verbose)
+ fprintf(stderr,"Too many synthetic levels, max %d\n", HWLOC_SYNTHETIC_MAX_DEPTH);
+ errno = EINVAL;
+ goto error;
+ }
+ if (item > UINT_MAX) {
+ if (verbose)
+ fprintf(stderr,"Too big arity, max %u\n", UINT_MAX);
+ errno = EINVAL;
+ goto error;
+ }
+
+ count++;
+ }
+
+ if (count <= 0) {
+ if (verbose)
+ fprintf(stderr, "Synthetic string doesn't contain any object\n");
+ errno = EINVAL;
+ goto error;
+ }
+
+ for(i=count-1; i>0; i--) {
+ struct hwloc_synthetic_level_data_s *curlevel = &data->level[i];
+ hwloc_obj_type_t type;
+
+ type = curlevel->type;
+
+ if (type == HWLOC_OBJ_TYPE_UNKNOWN) {
+ if (i == count-1)
+ type = HWLOC_OBJ_PU;
+ else {
+ switch (data->level[i+1].type) {
+ case HWLOC_OBJ_PU: type = HWLOC_OBJ_CORE; break;
+ case HWLOC_OBJ_CORE: type = HWLOC_OBJ_CACHE; break;
+ case HWLOC_OBJ_CACHE: type = HWLOC_OBJ_PACKAGE; break;
+ case HWLOC_OBJ_PACKAGE: type = HWLOC_OBJ_NUMANODE; break;
+ case HWLOC_OBJ_NUMANODE:
+ case HWLOC_OBJ_MACHINE:
+ case HWLOC_OBJ_GROUP: type = HWLOC_OBJ_GROUP; break;
+ default:
+ assert(0);
+ }
+ }
+ curlevel->type = type;
+ }
+ switch (type) {
+ case HWLOC_OBJ_PU:
+ nb_pu_levels++;
+ break;
+ case HWLOC_OBJ_CACHE:
+ cache_depth++;
+ break;
+ case HWLOC_OBJ_GROUP:
+ group_depth++;
+ break;
+ case HWLOC_OBJ_NUMANODE:
+ nb_node_levels++;
+ break;
+ case HWLOC_OBJ_MACHINE:
+ nb_machine_levels++;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!nb_pu_levels) {
+ if (verbose)
+ fprintf(stderr, "Synthetic string missing ending number of PUs\n");
+ errno = EINVAL;
+ return -1;
+ }
+ if (nb_pu_levels > 1) {
+ if (verbose)
+ fprintf(stderr, "Synthetic string can not have several PU levels\n");
+ errno = EINVAL;
+ return -1;
+ }
+ if (nb_node_levels > 1) {
+ if (verbose)
+ fprintf(stderr, "Synthetic string can not have several NUMA node levels\n");
+ errno = EINVAL;
+ return -1;
+ }
+ if (nb_machine_levels > 1) {
+ if (verbose)
+ fprintf(stderr, "Synthetic string can not have several machine levels\n");
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (nb_machine_levels)
+ data->level[0].type = HWLOC_OBJ_SYSTEM;
+ else {
+ data->level[0].type = HWLOC_OBJ_MACHINE;
+ nb_machine_levels++;
+ }
+
+ /* enforce a NUMA level */
+ if (!nb_node_levels) {
+ /* insert a NUMA level and the machine level */
+ if (data->level[1].type == HWLOC_OBJ_MACHINE)
+ /* there's an explicit machine level after the automatic system root, insert below both */
+ i = 2;
+ else
+ /* insert below the automatic machine root */
+ i = 1;
+ if (verbose)
+ fprintf(stderr, "Inserting a NUMA level with a single object at depth %u\n", i);
+ /* move existing levels by one */
+ memmove(&data->level[i+1], &data->level[i], (count-i)*sizeof(struct hwloc_synthetic_level_data_s));
+ data->level[i].type = HWLOC_OBJ_NUMANODE;
+ data->level[i].index_string = NULL;
+ data->level[i].index_array = NULL;
+ data->level[i].memorysize = 0;
+ data->level[i].totalwidth = data->level[i-1].totalwidth;
+ /* update arity to insert a single NUMA node per parent */
+ data->level[i].arity = data->level[i-1].arity;
+ data->level[i-1].arity = 1;
+ count++;
+ }
+
+ if (cache_depth == 1)
+ /* if there is a single cache level, make it L2 */
+ cache_depth = 2;
+
+ for (i=0; i<count; i++) {
+ struct hwloc_synthetic_level_data_s *curlevel = &data->level[i];
+ hwloc_obj_type_t type = curlevel->type;
+
+ if (type == HWLOC_OBJ_GROUP) {
+ if (curlevel->depth == (unsigned)-1)
+ curlevel->depth = group_depth--;
+
+ } else if (type == HWLOC_OBJ_CACHE) {
+ if (curlevel->depth == (unsigned)-1)
+ curlevel->depth = cache_depth--;
+ if (curlevel->cachetype == (hwloc_obj_cache_type_t) -1)
+ curlevel->cachetype = curlevel->depth == 1 ? HWLOC_OBJ_CACHE_DATA : HWLOC_OBJ_CACHE_UNIFIED;
+ if (!curlevel->memorysize) {
+ if (1 == curlevel->depth)
+ /* 32Kb in L1 */
+ curlevel->memorysize = 32*1024;
+ else
+ /* *4 at each level, starting from 1MB for L2, unified */
+ curlevel->memorysize = 256*1024 << (2*curlevel->depth);
+ }
+
+ } else if (type == HWLOC_OBJ_NUMANODE && !curlevel->memorysize) {
+ /* 1GB in memory nodes. */
+ curlevel->memorysize = 1024*1024*1024;
+ }
+
+ hwloc_synthetic_process_level_indexes(data, i, verbose);
+ }
+
+ data->string = strdup(description);
+ data->level[count-1].arity = 0;
+ return 0;
+
+ error:
+ for(i=0; i<HWLOC_SYNTHETIC_MAX_DEPTH; i++) {
+ struct hwloc_synthetic_level_data_s *curlevel = &data->level[i];
+ free(curlevel->index_array);
+ if (!curlevel->arity)
+ break;
+ }
+ return -1;
+}
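
   Note: in practice the description string parsed above is a space-separated
   list of levels, each "Type:arity" with an optional "(...)" attribute block;
   a leading "(...)" before the first level sets root attributes. "size=" is
   accepted on Cache levels, "memory=" on other levels, and "indexes=" takes
   either an explicit comma-separated list covering the whole level or
   interleaving loops of the form step*nb[:step*nb...]. Bare arities get
   default types assigned bottom-up starting at PU, and a NUMA node level is
   inserted automatically when none is given. A few strings this parser
   accepts (illustrative examples, not taken from the patch):

    2 4 2
    NUMANode:2 Core:4 PU:2
    (memory=16GB) NUMANode:2 Cache:2(size=8MB) Core:4 PU:2

   Setting HWLOC_SYNTHETIC_VERBOSE=1 in the environment makes the parser
   report why a given string was rejected.
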
+
+static void
+hwloc_synthetic__post_look_hooks(struct hwloc_synthetic_level_data_s *curlevel,
+ hwloc_obj_t obj)
+{
+ switch (obj->type) {
+ case HWLOC_OBJ_GROUP:
+ obj->attr->group.depth = curlevel->depth;
+ break;
+ case HWLOC_OBJ_SYSTEM:
+ break;
+ case HWLOC_OBJ_MACHINE:
+ break;
+ case HWLOC_OBJ_NUMANODE:
+ break;
+ case HWLOC_OBJ_PACKAGE:
+ break;
+ case HWLOC_OBJ_CACHE:
+ obj->attr->cache.depth = curlevel->depth;
+ obj->attr->cache.linesize = 64;
+ obj->attr->cache.type = curlevel->cachetype;
+ obj->attr->cache.size = curlevel->memorysize;
+ break;
+ case HWLOC_OBJ_CORE:
+ break;
+ case HWLOC_OBJ_PU:
+ break;
+ case HWLOC_OBJ_BRIDGE:
+ case HWLOC_OBJ_PCI_DEVICE:
+ case HWLOC_OBJ_OS_DEVICE:
+ case HWLOC_OBJ_MISC:
+ case HWLOC_OBJ_TYPE_MAX:
+ /* Should never happen */
+ assert(0);
+ break;
+ }
+ if (curlevel->memorysize && HWLOC_OBJ_CACHE != obj->type) {
+ obj->memory.local_memory = curlevel->memorysize;
+ obj->memory.page_types_len = 1;
+ obj->memory.page_types = malloc(sizeof(*obj->memory.page_types));
+ memset(obj->memory.page_types, 0, sizeof(*obj->memory.page_types));
+ obj->memory.page_types[0].size = 4096;
+ obj->memory.page_types[0].count = curlevel->memorysize / 4096;
+ }
+}
+
+/*
+ * Recursively build objects whose cpu start at first_cpu
+ * - level gives where to look in the type, arity and id arrays
+ * - the id array is used as a variable to get unique IDs for a given level.
+ * - generated memory should be added to *memory_kB.
+ * - generated cpus should be added to parent_cpuset.
+ * - next cpu number to be used should be returned.
+ */
+static void
+hwloc__look_synthetic(struct hwloc_topology *topology,
+ struct hwloc_synthetic_backend_data_s *data,
+ int level,
+ hwloc_bitmap_t parent_cpuset)
+{
+ hwloc_obj_t obj;
+ unsigned i;
+ struct hwloc_synthetic_level_data_s *curlevel = &data->level[level];
+ hwloc_obj_type_t type = curlevel->type;
+ unsigned os_index;
+
+ /* pre-hooks */
+ switch (type) {
+ case HWLOC_OBJ_GROUP:
+ break;
+ case HWLOC_OBJ_MACHINE:
+ break;
+ case HWLOC_OBJ_NUMANODE:
+ break;
+ case HWLOC_OBJ_PACKAGE:
+ break;
+ case HWLOC_OBJ_CACHE:
+ break;
+ case HWLOC_OBJ_CORE:
+ break;
+ case HWLOC_OBJ_PU:
+ break;
+ case HWLOC_OBJ_SYSTEM:
+ case HWLOC_OBJ_BRIDGE:
+ case HWLOC_OBJ_PCI_DEVICE:
+ case HWLOC_OBJ_OS_DEVICE:
+ case HWLOC_OBJ_MISC:
+ case HWLOC_OBJ_TYPE_MAX:
+ /* Should never happen */
+ assert(0);
+ break;
+ }
+
+ os_index = curlevel->next_os_index++;
+ if (curlevel->index_array)
+ os_index = curlevel->index_array[os_index];
+ obj = hwloc_alloc_setup_object(type, os_index);
+ obj->cpuset = hwloc_bitmap_alloc();
+
+ if (!curlevel->arity) {
+ hwloc_bitmap_set(obj->cpuset, os_index);
+ } else {
+ for (i = 0; i < curlevel->arity; i++)
+ hwloc__look_synthetic(topology, data, level + 1, obj->cpuset);
+ }
+
+ if (type == HWLOC_OBJ_NUMANODE) {
+ obj->nodeset = hwloc_bitmap_alloc();
+ hwloc_bitmap_set(obj->nodeset, os_index);
+ }
+
+ hwloc_bitmap_or(parent_cpuset, parent_cpuset, obj->cpuset);
+
+ hwloc_synthetic__post_look_hooks(curlevel, obj);
+
+ hwloc_insert_object_by_cpuset(topology, obj);
+}
+
+static int
+hwloc_look_synthetic(struct hwloc_backend *backend)
+{
+ struct hwloc_topology *topology = backend->topology;
+ struct hwloc_synthetic_backend_data_s *data = backend->private_data;
+ hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
+ unsigned i;
+
+ assert(!topology->levels[0][0]->cpuset);
+
+ hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+
+ topology->support.discovery->pu = 1;
+
+ /* start with os_index 0 for each level */
+ for (i = 0; data->level[i].arity > 0; i++)
+ data->level[i].next_os_index = 0;
+ /* ... including the last one */
+ data->level[i].next_os_index = 0;
+
+ /* update first level type according to the synthetic type array */
+ topology->levels[0][0]->type = data->level[0].type;
+ hwloc_synthetic__post_look_hooks(&data->level[0], topology->levels[0][0]);
+
+ for (i = 0; i < data->level[0].arity; i++)
+ hwloc__look_synthetic(topology, data, 1, cpuset);
+
+ hwloc_bitmap_free(cpuset);
+
+ hwloc_obj_add_info(topology->levels[0][0], "Backend", "Synthetic");
+ hwloc_obj_add_info(topology->levels[0][0], "SyntheticDescription", data->string);
+ return 1;
+}
+
+static void
+hwloc_synthetic_backend_disable(struct hwloc_backend *backend)
+{
+ struct hwloc_synthetic_backend_data_s *data = backend->private_data;
+ unsigned i;
+ for(i=0; i<HWLOC_SYNTHETIC_MAX_DEPTH; i++) {
+ struct hwloc_synthetic_level_data_s *curlevel = &data->level[i];
+ free(curlevel->index_array);
+ if (!curlevel->arity)
+ break;
+ }
+ free(data->string);
+ free(data);
+}
+
+static struct hwloc_backend *
+hwloc_synthetic_component_instantiate(struct hwloc_disc_component *component,
+ const void *_data1,
+ const void *_data2 __hwloc_attribute_unused,
+ const void *_data3 __hwloc_attribute_unused)
+{
+ struct hwloc_backend *backend;
+ struct hwloc_synthetic_backend_data_s *data;
+ int err;
+
+ if (!_data1) {
+ const char *env = getenv("HWLOC_SYNTHETIC");
+ if (env) {
+ /* 'synthetic' was given in HWLOC_COMPONENTS without a description */
+ _data1 = env;
+ } else {
+ errno = EINVAL;
+ goto out;
+ }
+ }
+
+ backend = hwloc_backend_alloc(component);
+ if (!backend)
+ goto out;
+
+ data = malloc(sizeof(*data));
+ if (!data) {
+ errno = ENOMEM;
+ goto out_with_backend;
+ }
+
+ err = hwloc_backend_synthetic_init(data, (const char *) _data1);
+ if (err < 0)
+ goto out_with_data;
+
+ backend->private_data = data;
+ backend->discover = hwloc_look_synthetic;
+ backend->disable = hwloc_synthetic_backend_disable;
+ backend->is_thissystem = 0;
+
+ return backend;
+
+ out_with_data:
+ free(data);
+ out_with_backend:
+ free(backend);
+ out:
+ return NULL;
+}
+
+static struct hwloc_disc_component hwloc_synthetic_disc_component = {
+ HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+ "synthetic",
+ ~0,
+ hwloc_synthetic_component_instantiate,
+ 30,
+ NULL
+};
+
+const struct hwloc_component hwloc_synthetic_component = {
+ HWLOC_COMPONENT_ABI,
+ NULL, NULL,
+ HWLOC_COMPONENT_TYPE_DISC,
+ 0,
+ &hwloc_synthetic_disc_component
+};
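
   Note: the export path below is the inverse of the parser above:
   hwloc_topology_export_synthetic() prints the current topology as such a
   description string, and fails with EINVAL when the topology is not
   symmetric (see the symmetric_subtree check below). A short usage sketch
   against that public entry point, assuming it is declared in hwloc.h as in
   this hwloc version:

    #include <stdio.h>
    #include <hwloc.h>

    int main(void)
    {
        hwloc_topology_t topology;
        char buf[1024];

        hwloc_topology_init(&topology);
        hwloc_topology_load(topology);

        /* returns a negative value on error, e.g. EINVAL for asymmetric trees */
        if (hwloc_topology_export_synthetic(topology, buf, sizeof(buf), 0) >= 0)
            printf("synthetic description: %s\n", buf);

        hwloc_topology_destroy(topology);
        return 0;
    }
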
+
+static int hwloc_topology_export_synthetic_indexes(struct hwloc_topology * topology,
+ hwloc_obj_t obj,
+ char *buffer, size_t buflen)
+{
+ unsigned depth = obj->depth;
+ unsigned total = topology->level_nbobjects[depth];
+ unsigned step = 1;
+ unsigned nr_loops = 0;
+ struct hwloc_synthetic_intlv_loop_s *loops = NULL;
+ hwloc_obj_t cur;
+ unsigned i, j;
+ ssize_t tmplen = buflen;
+ char *tmp = buffer;
+ int res, ret = 0;
+
+ /* must start with 0 */
+ if (obj->os_index)
+ goto exportall;
+
+ while (step != total) {
+ /* must be a divider of the total */
+ if (total % step)
+ goto exportall;
+
+ /* look for os_index == step */
+ for(i=1; i<total; i++)
+ if (topology->levels[depth][i]->os_index == step)
+ break;
+ if (i == total)
+ goto exportall;
+ for(j=2; j<total/i; j++)
+ if (topology->levels[depth][i*j]->os_index != step*j)
+ break;
+
+ nr_loops++;
+ loops = realloc(loops, nr_loops*sizeof(*loops));
+ if (!loops)
+ goto exportall;
+ loops[nr_loops-1].step = i;
+ loops[nr_loops-1].nb = j;
+ step *= j;
+ }
+
+ /* check this interleaving */
+ for(i=0; i<total; i++) {
+ unsigned ind = 0;
+ unsigned mul = 1;
+ for(j=0; j<nr_loops; j++) {
+ ind += (i / loops[j].step) % loops[j].nb * mul;
+ mul *= loops[j].nb;
+ }
+ if (topology->levels[depth][i]->os_index != ind)
+ goto exportall;
+ }
+
+ /* success, print it */
+ for(j=0; j<nr_loops; j++) {
+ res = hwloc_snprintf(tmp, tmplen, "%u*%u%s", loops[j].step, loops[j].nb,
+ j == nr_loops-1 ? ")" : ":");
+ if (res < 0) {
+ free(loops);
+ return -1;
+ }
+ ret += res;
+ if (res >= tmplen)
+ res = tmplen>0 ? tmplen - 1 : 0;
+ tmp += res;
+ tmplen -= res;
+ }
+
+ if (loops)
+ free(loops);
+
+ return ret;
+
+ exportall:
+ if (loops)
+ free(loops);
+
+ /* dump all indexes */
+ cur = obj;
+ while (cur) {
+ res = snprintf(tmp, tmplen, "%u%s", cur->os_index,
+ cur->next_cousin ? "," : ")");
+ if (res < 0)
+ return -1;
+ ret += res;
+ if (res >= tmplen)
+ res = tmplen>0 ? tmplen - 1 : 0;
+ tmp += res;
+ tmplen -= res;
+ cur = cur->next_cousin;
+ }
+ return ret;
+}
+
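+/* Attribute strings are appended to each synthetic type token, e.g. (illustration)
+ * "(size=8388608)" for a cache or "(memory=16777216)" for local memory, plus an
+ * "indexes=..." list when OS indexes differ from logical indexes. */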
+static int hwloc_topology_export_synthetic_obj_attr(struct hwloc_topology * topology,
+ hwloc_obj_t obj,
+ char *buffer, size_t buflen)
+{
+ const char * separator = " ";
+ const char * prefix = "(";
+ char cachesize[64] = "";
+ char memsize[64] = "";
+ int needindexes = 0;
+
+ if (HWLOC_OBJ_CACHE == obj->type && obj->attr->cache.size) {
+ snprintf(cachesize, sizeof(cachesize), "%ssize=%llu",
+ prefix, (unsigned long long) obj->attr->cache.size);
+ prefix = separator;
+ }
+ if (obj->memory.local_memory) {
+ snprintf(memsize, sizeof(memsize), "%smemory=%llu",
+ prefix, (unsigned long long) obj->memory.local_memory);
+ prefix = separator;
+ }
+ if (obj->type == HWLOC_OBJ_PU || obj->type == HWLOC_OBJ_NUMANODE) {
+ hwloc_obj_t cur = obj;
+ while (cur) {
+ if (cur->os_index != cur->logical_index) {
+ needindexes = 1;
+ break;
+ }
+ cur = cur->next_cousin;
+ }
+ }
+ if (*cachesize || *memsize || needindexes) {
+ ssize_t tmplen = buflen;
+ char *tmp = buffer;
+ int res, ret = 0;
+
+ res = hwloc_snprintf(tmp, tmplen, "%s%s%s", cachesize, memsize, needindexes ? "" : ")");
+ if (res < 0)
+ return -1;
+ ret += res;
+ if (res >= tmplen)
+ res = tmplen>0 ? tmplen - 1 : 0;
+ tmp += res;
+ tmplen -= res;
+
+ if (needindexes) {
+ res = snprintf(tmp, tmplen, "%sindexes=", prefix);
+ if (res < 0)
+ return -1;
+ ret += res;
+ if (res >= tmplen)
+ res = tmplen>0 ? tmplen - 1 : 0;
+ tmp += res;
+ tmplen -= res;
+
+ res = hwloc_topology_export_synthetic_indexes(topology, obj, tmp, tmplen);
+ if (res < 0)
+ return -1;
+ ret += res;
+ if (res >= tmplen)
+ res = tmplen>0 ? tmplen - 1 : 0;
+ tmp += res;
+ tmplen -= res;
+ }
+ return ret;
+ } else {
+ return 0;
+ }
+}
+
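+/* Walk down the leftmost branch of a symmetric tree and print one "type:arity"
+ * token per level, e.g. (illustration) something like "NUMANode:2 Package:1 Core:4 PU:2",
+ * with per-level attributes appended unless the NO_ATTRS flag is given. */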
+int
+hwloc_topology_export_synthetic(struct hwloc_topology * topology,
+ char *buffer, size_t buflen,
+ unsigned long flags)
+{
+ hwloc_obj_t obj = hwloc_get_root_obj(topology);
+ ssize_t tmplen = buflen;
+ char *tmp = buffer;
+ int res, ret = 0;
+ int arity;
+ const char * separator = " ";
+ const char * prefix = "";
+
+ if (flags & ~(HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES|HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ /* TODO: add a flag to ignore symmetric_subtree and I/Os.
+ * just assume things are symmetric with the left branches of the tree.
+ * but the number of objects per level may be wrong, what to do with OS index array in this case?
+ * only allow ignoring symmetric_subtree if the level width remains OK?
+ */
+
+ /* TODO: add a root object by default, with a prefix such as tree=
+ * so that we can backward-compatibly recognize whether there's a root or not.
+ * and add a flag to disable it.
+ */
+
+ /* TODO: flag to force all indexes, not only for PU and NUMA? */
+
+ if (!obj->symmetric_subtree) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (!(flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)) {
+ /* root attributes */
+ res = hwloc_topology_export_synthetic_obj_attr(topology, obj, tmp, tmplen);
+ if (res < 0)
+ return -1;
+ ret += res;
+ if (ret > 0)
+ prefix = separator;
+ if (res >= tmplen)
+ res = tmplen>0 ? tmplen - 1 : 0;
+ tmp += res;
+ tmplen -= res;
+ }
+
+ arity = obj->arity;
+ while (arity) {
+ /* for each level */
+ obj = obj->first_child;
+ if (flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES) {
+ res = hwloc_snprintf(tmp, tmplen, "%s%s:%u", prefix, hwloc_obj_type_string(obj->type), arity);
+ } else {
+ char types[64];
+ hwloc_obj_type_snprintf(types, sizeof(types), obj, 1);
+ res = hwloc_snprintf(tmp, tmplen, "%s%s:%u", prefix, types, arity);
+ }
+ if (res < 0)
+ return -1;
+ ret += res;
+ if (res >= tmplen)
+ res = tmplen>0 ? tmplen - 1 : 0;
+ tmp += res;
+ tmplen -= res;
+
+ if (!(flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)) {
+ /* obj attributes */
+ res = hwloc_topology_export_synthetic_obj_attr(topology, obj, tmp, tmplen);
+ if (res < 0)
+ return -1;
+ ret += res;
+ if (res >= tmplen)
+ res = tmplen>0 ? tmplen - 1 : 0;
+ tmp += res;
+ tmplen -= res;
+ }
+
+ /* next level */
+ prefix = separator;
+ arity = obj->arity;
+ }
+
+ return ret;
+}
diff --git a/ext/hwloc/hwloc/topology-x86.c b/ext/hwloc/hwloc/topology-x86.c
new file mode 100644
index 0000000..1234ce4
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-x86.c
@@ -0,0 +1,1386 @@
+/*
+ * Copyright © 2010-2015 Inria. All rights reserved.
+ * Copyright © 2010-2013 Université Bordeaux
+ * Copyright © 2010-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ *
+ *
+ * This backend is only used when the operating system does not export
+ * the necessary hardware topology information to user-space applications.
+ * Currently, only the FreeBSD backend relies on this x86 backend.
+ *
+ * Other backends such as Linux have their own way to retrieve various
+ * pieces of hardware topology information from the operating system
+ * on various architectures, without having to use this x86-specific code.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+#include <private/misc.h>
+
+#include <private/cpuid-x86.h>
+
+#include <sys/types.h>
+#include <dirent.h>
+
+struct hwloc_x86_backend_data_s {
+ unsigned nbprocs;
+ hwloc_bitmap_t apicid_set;
+ int apicid_unique;
+ char *src_cpuiddump_path;
+};
+
+/************************************
+ * Management of cpuid dump as input
+ */
+
+struct cpuiddump {
+ unsigned nr;
+ struct cpuiddump_entry {
+ unsigned inmask; /* which of ine[abcd]x are set on input */
+ unsigned ineax;
+ unsigned inebx;
+ unsigned inecx;
+ unsigned inedx;
+ unsigned outeax;
+ unsigned outebx;
+ unsigned outecx;
+ unsigned outedx;
+ } *entries;
+};
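+/* Each pu%u dump file holds one line per recorded cpuid call, in the form
+ * "<inmask> <ineax> <inebx> <inecx> <inedx> => <outeax> <outebx> <outecx> <outedx>"
+ * (hexadecimal, '#' starts a comment line); inmask tells which input registers
+ * must match (bit 0 eax, bit 1 ebx, bit 2 ecx, bit 3 edx). */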
+
+static void
+cpuiddump_free(struct cpuiddump *cpuiddump)
+{
+ if (cpuiddump->nr)
+ free(cpuiddump->entries);
+ free(cpuiddump);
+}
+
+static struct cpuiddump *
+cpuiddump_read(const char *dirpath, unsigned idx)
+{
+ struct cpuiddump *cpuiddump;
+ struct cpuiddump_entry *cur;
+ char *filename;
+ size_t filenamelen = strlen(dirpath) + 15;
+ FILE *file;
+ char line[128];
+ unsigned nr;
+
+ cpuiddump = malloc(sizeof(*cpuiddump));
+ cpuiddump->nr = 0; /* return a cpuiddump that will raise errors because it matches nothing */
+
+ filename = malloc(filenamelen);
+ snprintf(filename, filenamelen, "%s/pu%u", dirpath, idx);
+ file = fopen(filename, "r");
+ if (!file) {
+ fprintf(stderr, "Could not read dumped cpuid file %s\n", filename);
+ free(filename);
+ return cpuiddump;
+ }
+ free(filename);
+
+ nr = 0;
+ while (fgets(line, sizeof(line), file))
+ nr++;
+ cpuiddump->entries = malloc(nr * sizeof(struct cpuiddump_entry));
+
+ fseek(file, 0, SEEK_SET);
+ cur = &cpuiddump->entries[0];
+ nr = 0;
+ while (fgets(line, sizeof(line), file)) {
+ if (*line == '#')
+ continue;
+ if (sscanf(line, "%x %x %x %x %x => %x %x %x %x",
+ &cur->inmask,
+ &cur->ineax, &cur->inebx, &cur->inecx, &cur->inedx,
+ &cur->outeax, &cur->outebx, &cur->outecx, &cur->outedx) == 9) {
+ cur++;
+ nr++;
+ }
+ }
+ cpuiddump->nr = nr;
+ fclose(file);
+ return cpuiddump;
+}
+
+static void
+cpuiddump_find_by_input(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx, struct cpuiddump *cpuiddump)
+{
+ unsigned i;
+
+ for(i=0; i<cpuiddump->nr; i++) {
+ struct cpuiddump_entry *entry = &cpuiddump->entries[i];
+ if ((entry->inmask & 0x1) && *eax != entry->ineax)
+ continue;
+ if ((entry->inmask & 0x2) && *ebx != entry->inebx)
+ continue;
+ if ((entry->inmask & 0x4) && *ecx != entry->inecx)
+ continue;
+ if ((entry->inmask & 0x8) && *edx != entry->inedx)
+ continue;
+ *eax = entry->outeax;
+ *ebx = entry->outebx;
+ *ecx = entry->outecx;
+ *edx = entry->outedx;
+ return;
+ }
+
+ fprintf(stderr, "Couldn't find %x,%x,%x,%x in dumped cpuid, returning 0s.\n",
+ *eax, *ebx, *ecx, *edx);
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+}
+
+static void cpuid_or_from_dump(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx, struct cpuiddump *src_cpuiddump)
+{
+ if (src_cpuiddump) {
+ cpuiddump_find_by_input(eax, ebx, ecx, edx, src_cpuiddump);
+ } else {
+ hwloc_x86_cpuid(eax, ebx, ecx, edx);
+ }
+}
+
+/*******************************
+ * Core detection routines and structures
+ */
+
+#define has_topoext(features) ((features)[6] & (1 << 22))
+#define has_x2apic(features) ((features)[4] & (1 << 21))
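+/* features[] mirrors the Linux kernel cpufeature word layout as filled in
+ * hwloc_look_x86(): word 0 is leaf 0x1 edx, word 4 is leaf 0x1 ecx
+ * (bit 21 = x2APIC), words 1 and 6 are leaf 0x80000001 edx/ecx
+ * (ecx bit 22 = TOPOEXT), word 9 is leaf 0x7 ebx. */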
+
+struct cacheinfo {
+ unsigned type;
+ unsigned level;
+ unsigned nbthreads_sharing;
+
+ unsigned linesize;
+ unsigned linepart;
+ int ways;
+ unsigned sets;
+ unsigned long size;
+ char inclusiveness;
+
+};
+
+struct procinfo {
+ unsigned present;
+ unsigned apicid;
+ unsigned max_log_proc;
+ unsigned max_nbcores;
+ unsigned max_nbthreads;
+ unsigned packageid;
+ unsigned nodeid;
+ unsigned unitid;
+ unsigned logprocid;
+ unsigned threadid;
+ unsigned coreid;
+ unsigned *otherids;
+ unsigned levels;
+ unsigned numcaches;
+ struct cacheinfo *cache;
+ char cpuvendor[13];
+ char cpumodel[3*4*4+1];
+ unsigned cpustepping;
+ unsigned cpumodelnumber;
+ unsigned cpufamilynumber;
+};
+
+enum cpuid_type {
+ intel,
+ amd,
+ unknown
+};
+
+static void fill_amd_cache(struct procinfo *infos, unsigned level, int type, unsigned cpuid)
+{
+ struct cacheinfo *cache;
+ unsigned cachenum;
+ unsigned long size = 0;
+
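+ /* Legacy AMD cache descriptors pack size/associativity/lines-per-tag/linesize
+ * into one register; for example (illustration) ecx = 0x20020140 from leaf
+ * 0x80000005 decodes below to a 32KB, 2-way L1d with 64-byte lines. */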
+ if (level == 1)
+ size = ((cpuid >> 24)) << 10;
+ else if (level == 2)
+ size = ((cpuid >> 16)) << 10;
+ else if (level == 3)
+ size = ((cpuid >> 18)) << 19;
+ if (!size)
+ return;
+
+ cachenum = infos->numcaches++;
+ infos->cache = realloc(infos->cache, infos->numcaches*sizeof(*infos->cache));
+ cache = &infos->cache[cachenum];
+
+ cache->type = type;
+ cache->level = level;
+ if (level <= 2)
+ cache->nbthreads_sharing = 1;
+ else
+ cache->nbthreads_sharing = infos->max_log_proc;
+ cache->linesize = cpuid & 0xff;
+ cache->linepart = 0;
+ if (level == 1) {
+ cache->inclusiveness = 0; /* inclusiveness on old AMD: L1 is assumed non-inclusive */
+
+ cache->ways = (cpuid >> 16) & 0xff;
+ if (cache->ways == 0xff)
+ /* Fully associative */
+ cache->ways = -1;
+ } else {
+ cache->inclusiveness = 1; /* inclusiveness on old AMD: L2/L3 are assumed inclusive */
+
+ static const unsigned ways_tab[] = { 0, 1, 2, 0, 4, 0, 8, 0, 16, 0, 32, 48, 64, 96, 128, -1 };
+ unsigned ways = (cpuid >> 12) & 0xf;
+ cache->ways = ways_tab[ways];
+ }
+ cache->size = size;
+ cache->sets = 0;
+
+ hwloc_debug("cache L%u t%u linesize %u ways %u size %luKB\n", cache->level, cache->nbthreads_sharing, cache->linesize, cache->ways, cache->size >> 10);
+}
+
+/* Fetch information from the processor itself using cpuid and store it in
+ * infos for summarize() to analyze globally */
+static void look_proc(struct hwloc_backend *backend, struct procinfo *infos, unsigned highest_cpuid, unsigned highest_ext_cpuid, unsigned *features, enum cpuid_type cpuid_type, struct cpuiddump *src_cpuiddump)
+{
+ struct hwloc_x86_backend_data_s *data = backend->private_data;
+ unsigned eax, ebx, ecx = 0, edx;
+ unsigned cachenum;
+ struct cacheinfo *cache;
+ unsigned regs[4];
+ unsigned _model, _extendedmodel, _family, _extendedfamily;
+
+ infos->present = 1;
+
+ /* on return from this function, the following fields must be set in infos:
+ * packageid, nodeid, unitid, coreid, threadid, or -1
+ * apicid
+ * levels and levels slots in otherids[]
+ * numcaches and numcaches slots in caches[]
+ *
+ * max_log_proc, max_nbthreads, max_nbcores, logprocid
+ * are only used temporarily inside this function and its callees.
+ */
+
+ /* Get apicid, max_log_proc, packageid, logprocid from cpuid 0x01 */
+ eax = 0x01;
+ cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+ infos->apicid = ebx >> 24;
+ if (edx & (1 << 28))
+ infos->max_log_proc = 1 << hwloc_flsl(((ebx >> 16) & 0xff) - 1);
+ else
+ infos->max_log_proc = 1;
+ hwloc_debug("APIC ID 0x%02x max_log_proc %u\n", infos->apicid, infos->max_log_proc);
+ infos->packageid = infos->apicid / infos->max_log_proc;
+ infos->logprocid = infos->apicid % infos->max_log_proc;
+ hwloc_debug("phys %u thread %u\n", infos->packageid, infos->logprocid);
+
+ /* Get cpu model/family/stepping numbers from same cpuid */
+ _model = (eax>>4) & 0xf;
+ _extendedmodel = (eax>>16) & 0xf;
+ _family = (eax>>8) & 0xf;
+ _extendedfamily = (eax>>20) & 0xff;
+ if ((cpuid_type == intel || cpuid_type == amd) && _family == 0xf) {
+ infos->cpufamilynumber = _family + _extendedfamily;
+ } else {
+ infos->cpufamilynumber = _family;
+ }
+ if ((cpuid_type == intel && (_family == 0x6 || _family == 0xf))
+ || (cpuid_type == amd && _family == 0xf)) {
+ infos->cpumodelnumber = _model + (_extendedmodel << 4);
+ } else {
+ infos->cpumodelnumber = _model;
+ }
+ infos->cpustepping = eax & 0xf;
+
+ /* Get cpu vendor string from cpuid 0x00 */
+ memset(regs, 0, sizeof(regs));
+ regs[0] = 0;
+ cpuid_or_from_dump(&regs[0], &regs[1], &regs[3], &regs[2], src_cpuiddump);
+ memcpy(infos->cpuvendor, regs+1, 4*3);
+ /* infos was calloc'ed, already ends with \0 */
+
+ /* Get cpu model string from cpuid 0x80000002-4 */
+ if (highest_ext_cpuid >= 0x80000004) {
+ memset(regs, 0, sizeof(regs));
+ regs[0] = 0x80000002;
+ cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
+ memcpy(infos->cpumodel, regs, 4*4);
+ regs[0] = 0x80000003;
+ cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
+ memcpy(infos->cpumodel + 4*4, regs, 4*4);
+ regs[0] = 0x80000004;
+ cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
+ memcpy(infos->cpumodel + 4*4*2, regs, 4*4);
+ /* infos was calloc'ed, already ends with \0 */
+ }
+
+ /* Get core/thread information from cpuid 0x80000008
+ * (not supported on Intel)
+ */
+ if (cpuid_type != intel && highest_ext_cpuid >= 0x80000008) {
+ unsigned coreidsize;
+ eax = 0x80000008;
+ cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+ coreidsize = (ecx >> 12) & 0xf;
+ hwloc_debug("core ID size: %u\n", coreidsize);
+ if (!coreidsize) {
+ infos->max_nbcores = (ecx & 0xff) + 1;
+ } else
+ infos->max_nbcores = 1 << coreidsize;
+ hwloc_debug("Thus max # of cores: %u\n", infos->max_nbcores);
+ /* Still no multithreaded AMD */
+ infos->max_nbthreads = 1 ;
+ hwloc_debug("and max # of threads: %u\n", infos->max_nbthreads);
+ /* The legacy max_log_proc is deprecated, it can be smaller than max_nbcores,
+ * which is the maximum number of cores that the processor could theoretically support
+ * (see "Multiple Core Calculation" in the AMD CPUID specification).
+ * Recompute packageid/logprocid/threadid/coreid accordingly.
+ */
+ infos->packageid = infos->apicid / infos->max_nbcores;
+ infos->logprocid = infos->apicid % infos->max_nbcores;
+ infos->threadid = infos->logprocid % infos->max_nbthreads;
+ infos->coreid = infos->logprocid / infos->max_nbthreads;
+ hwloc_debug("this is thread %u of core %u\n", infos->threadid, infos->coreid);
+ }
+
+ infos->numcaches = 0;
+ infos->cache = NULL;
+
+ /* Get apicid, nodeid, unitid from cpuid 0x8000001e
+ * and cache information from cpuid 0x8000001d
+ * (AMD topology extension)
+ */
+ if (cpuid_type != intel && has_topoext(features)) {
+ unsigned apic_id, node_id, nodes_per_proc, unit_id, cores_per_unit;
+
+ eax = 0x8000001e;
+ cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+ infos->apicid = apic_id = eax;
+ infos->nodeid = node_id = ecx & 0xff;
+ nodes_per_proc = ((ecx >> 8) & 7) + 1;
+ if (nodes_per_proc > 2) {
+ hwloc_debug("warning: undefined value %d, assuming it means %d\n", nodes_per_proc, nodes_per_proc);
+ }
+ infos->unitid = unit_id = ebx & 0xff;
+ cores_per_unit = ((ebx >> 8) & 3) + 1;
+ hwloc_debug("x2APIC %08x, %d nodes, node %d, %d cores in unit %d\n", apic_id, nodes_per_proc, node_id, cores_per_unit, unit_id);
+
+ for (cachenum = 0; ; cachenum++) {
+ unsigned type;
+ eax = 0x8000001d;
+ ecx = cachenum;
+ cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+ type = eax & 0x1f;
+ if (type == 0)
+ break;
+ infos->numcaches++;
+ }
+
+ cache = infos->cache = malloc(infos->numcaches * sizeof(*infos->cache));
+
+ for (cachenum = 0; ; cachenum++) {
+ unsigned long linesize, linepart, ways, sets;
+ unsigned type;
+ eax = 0x8000001d;
+ ecx = cachenum;
+ cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+
+ type = eax & 0x1f;
+
+ if (type == 0)
+ break;
+
+ cache->type = type;
+ cache->level = (eax >> 5) & 0x7;
+ /* Note: actually number of cores */
+ cache->nbthreads_sharing = ((eax >> 14) & 0xfff) + 1;
+
+ cache->linesize = linesize = (ebx & 0xfff) + 1;
+ cache->linepart = linepart = ((ebx >> 12) & 0x3ff) + 1;
+ ways = ((ebx >> 22) & 0x3ff) + 1;
+
+ if (eax & (1 << 9))
+ /* Fully associative */
+ cache->ways = -1;
+ else
+ cache->ways = ways;
+ cache->sets = sets = ecx + 1;
+ cache->size = linesize * linepart * ways * sets;
+ cache->inclusiveness = edx & 0x2;
+
+
+ hwloc_debug("cache %u type %u L%u t%u c%u linesize %lu linepart %lu ways %lu sets %lu, size %uKB\n", cachenum, cache->type, cache->level, cache->nbthreads_sharing, infos->max_nbcores, linesize, linepart, ways, sets, cache->size >> 10);
+
+ cache++;
+ }
+ } else {
+ /* If there's no topoext,
+ * get cache information from cpuid 0x80000005 and 0x80000006
+ * (not supported on Intel)
+ */
+ if (cpuid_type != intel && highest_ext_cpuid >= 0x80000005) {
+ eax = 0x80000005;
+ cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+ fill_amd_cache(infos, 1, 1, ecx); /* L1d */
+ fill_amd_cache(infos, 1, 2, edx); /* L1i */
+ }
+ if (cpuid_type != intel && highest_ext_cpuid >= 0x80000006) {
+ eax = 0x80000006;
+ cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+ if (ecx & 0xf000)
+ /* This is actually supported on Intel but LinePerTag isn't returned in bits 8-11.
+ * Could be useful if some Intels (at least before Core micro-architecture)
+ * support this leaf without leaf 0x4.
+ */
+ fill_amd_cache(infos, 2, 3, ecx); /* L2u */
+ if (edx & 0xf000)
+ fill_amd_cache(infos, 3, 3, edx); /* L3u */
+ /* FIXME: AMD MagnyCours family 0x10 model 0x9 with 8 cores or more actually
+ * have the L3 split in two halves, and associativity is divided as well (48)
+ */
+ }
+ }
+
+ /* Get thread/core + cache information from cpuid 0x04
+ * (not supported on AMD)
+ */
+ if (cpuid_type != amd && highest_cpuid >= 0x04) {
+ for (cachenum = 0; ; cachenum++) {
+ unsigned type;
+ eax = 0x04;
+ ecx = cachenum;
+ cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+
+ type = eax & 0x1f;
+
+ hwloc_debug("cache %u type %u\n", cachenum, type);
+
+ if (type == 0)
+ break;
+ infos->numcaches++;
+
+ if (!cachenum) {
+ /* by the way, get thread/core information from the first cache */
+ infos->max_nbcores = ((eax >> 26) & 0x3f) + 1;
+ infos->max_nbthreads = infos->max_log_proc / infos->max_nbcores;
+ hwloc_debug("thus %u threads\n", infos->max_nbthreads);
+ infos->threadid = infos->logprocid % infos->max_nbthreads;
+ infos->coreid = infos->logprocid / infos->max_nbthreads;
+ hwloc_debug("this is thread %u of core %u\n", infos->threadid, infos->coreid);
+ }
+ }
+
+ cache = infos->cache = malloc(infos->numcaches * sizeof(*infos->cache));
+
+ for (cachenum = 0; ; cachenum++) {
+ unsigned long linesize, linepart, ways, sets;
+ unsigned type;
+ eax = 0x04;
+ ecx = cachenum;
+ cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+
+ type = eax & 0x1f;
+
+ if (type == 0)
+ break;
+
+ cache->type = type;
+ cache->level = (eax >> 5) & 0x7;
+ cache->nbthreads_sharing = ((eax >> 14) & 0xfff) + 1;
+
+ cache->linesize = linesize = (ebx & 0xfff) + 1;
+ cache->linepart = linepart = ((ebx >> 12) & 0x3ff) + 1;
+ ways = ((ebx >> 22) & 0x3ff) + 1;
+ if (eax & (1 << 9))
+ /* Fully associative */
+ cache->ways = -1;
+ else
+ cache->ways = ways;
+ cache->sets = sets = ecx + 1;
+ cache->size = linesize * linepart * ways * sets;
+ cache->inclusiveness = edx & 0x2;
+
+ hwloc_debug("cache %u type %u L%u t%u c%u linesize %lu linepart %lu ways %lu sets %lu, size %uKB\n", cachenum, cache->type, cache->level, cache->nbthreads_sharing, infos->max_nbcores, linesize, linepart, ways, sets, cache->size >> 10);
+
+ cache++;
+ }
+ }
+
+ /* Get package/core/thread information from cpuid 0x0b
+ * (Intel x2APIC)
+ */
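+ /* Leaf 0x0b reports, per level, how many low APIC-ID bits to shift away; id is
+ * the bit-field between the previous and the next shift. For example
+ * (illustration), with an SMT shift of 1 and a core shift of 5, APIC ID 0x2d
+ * decodes to thread 1 of core 6, and the remaining bits (1) give the package. */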
+ if (cpuid_type == intel && has_x2apic(features)) {
+ unsigned level, apic_nextshift, apic_number, apic_type, apic_id = 0, apic_shift = 0, id;
+ for (level = 0; ; level++) {
+ ecx = level;
+ eax = 0x0b;
+ cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+ if (!eax && !ebx)
+ break;
+ }
+ if (level) {
+ infos->levels = level;
+ infos->otherids = malloc(level * sizeof(*infos->otherids));
+ for (level = 0; ; level++) {
+ ecx = level;
+ eax = 0x0b;
+ cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+ if (!eax && !ebx)
+ break;
+ apic_nextshift = eax & 0x1f;
+ apic_number = ebx & 0xffff;
+ apic_type = (ecx & 0xff00) >> 8;
+ apic_id = edx;
+ id = (apic_id >> apic_shift) & ((1 << (apic_nextshift - apic_shift)) - 1);
+ hwloc_debug("x2APIC %08x %d: nextshift %d num %2d type %d id %2d\n", apic_id, level, apic_nextshift, apic_number, apic_type, id);
+ infos->apicid = apic_id;
+ infos->otherids[level] = UINT_MAX;
+ switch (apic_type) {
+ case 1:
+ infos->threadid = id;
+ break;
+ case 2:
+ infos->coreid = id;
+ break;
+ default:
+ hwloc_debug("x2APIC %d: unknown type %d\n", level, apic_type);
+ infos->otherids[level] = apic_id >> apic_shift;
+ break;
+ }
+ apic_shift = apic_nextshift;
+ }
+ infos->apicid = apic_id;
+ infos->packageid = apic_id >> apic_shift;
+ hwloc_debug("x2APIC remainder: %d\n", infos->packageid);
+ hwloc_debug("this is thread %u of core %u\n", infos->threadid, infos->coreid);
+ }
+ }
+
+ if (hwloc_bitmap_isset(data->apicid_set, infos->apicid))
+ data->apicid_unique = 0;
+ else
+ hwloc_bitmap_set(data->apicid_set, infos->apicid);
+}
+
+static void
+hwloc_x86_add_cpuinfos(hwloc_obj_t obj, struct procinfo *info, int nodup)
+{
+ char number[8];
+ hwloc_obj_add_info_nodup(obj, "CPUVendor", info->cpuvendor, nodup);
+ snprintf(number, sizeof(number), "%u", info->cpufamilynumber);
+ hwloc_obj_add_info_nodup(obj, "CPUFamilyNumber", number, nodup);
+ snprintf(number, sizeof(number), "%u", info->cpumodelnumber);
+ hwloc_obj_add_info_nodup(obj, "CPUModelNumber", number, nodup);
+ if (info->cpumodel[0]) {
+ const char *c = info->cpumodel;
+ while (*c == ' ')
+ c++;
+ hwloc_obj_add_info_nodup(obj, "CPUModel", c, nodup);
+ }
+ snprintf(number, sizeof(number), "%u", info->cpustepping);
+ hwloc_obj_add_info_nodup(obj, "CPUStepping", number, nodup);
+}
+
+/* Analyse information stored in infos, and build/annotate topology levels accordingly */
+static void summarize(struct hwloc_backend *backend, struct procinfo *infos, int fulldiscovery)
+{
+ struct hwloc_topology *topology = backend->topology;
+ struct hwloc_x86_backend_data_s *data = backend->private_data;
+ unsigned nbprocs = data->nbprocs;
+ hwloc_bitmap_t complete_cpuset = hwloc_bitmap_alloc();
+ unsigned i, j, l, level, type;
+ unsigned nbpackages = 0;
+ int one = -1;
+ unsigned next_group_depth = topology->next_group_depth;
+
+ for (i = 0; i < nbprocs; i++)
+ if (infos[i].present) {
+ hwloc_bitmap_set(complete_cpuset, i);
+ one = i;
+ }
+
+ if (one == -1) {
+ hwloc_bitmap_free(complete_cpuset);
+ return;
+ }
+
+ /* Ideally, when fulldiscovery=0, we could add any object that doesn't exist yet.
+ * But what if the x86 and the native backends disagree because one is buggy? Which one to trust?
+ * Only annotate existing objects for now.
+ */
+
+ /* Annotate previously-existing objects */
+ if(!fulldiscovery){
+ hwloc_obj_t pu;
+ nbpackages = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PACKAGE);
+ for(pu = hwloc_get_next_obj_by_type(topology,HWLOC_OBJ_PU ,NULL);
+ pu!=NULL;
+ pu = hwloc_get_next_obj_by_type(topology,HWLOC_OBJ_PU ,pu)){
+ unsigned infoId = pu->os_index;
+ if(infoId == (unsigned) -1)
+ continue;
+
+ int numCaches = infos[infoId].numcaches;
+ struct cacheinfo **caches = malloc(numCaches*sizeof(struct cacheinfo*));
+ int i;
+ for(i = 0 ;i<numCaches;i++){
+ caches[i] = &(infos[infoId].cache[i]);
+ }
+
+
+ hwloc_obj_t object;
+ for(object = pu;object!=NULL;object = object->parent) {
+ switch(object->type){
+ /* Annotate previously-existing cache objects */
+ case HWLOC_OBJ_CACHE:
+ {
+ if (hwloc_obj_get_info_by_name(object,"inclusiveness"))
+ break;
+ unsigned char type = 0;
+ switch(object->attr->cache.type){
+ case HWLOC_OBJ_CACHE_DATA : type = 1;
+ break;
+ case HWLOC_OBJ_CACHE_INSTRUCTION : type = 2;
+ break;
+ case HWLOC_OBJ_CACHE_UNIFIED : type = 3;
+ break;
+ }
+ int cacheId = -1;
+ for(i=0;i<numCaches;i++)
+ if(caches[i]->level == object->attr->cache.depth){ /* the level always matches; prefer a cache of the same type, otherwise fall back to any cache of that level */
+ cacheId = i;
+ if(caches[i]->type == type)
+ break;
+ }
+ if(cacheId >= 0)
+ hwloc_obj_add_info(object,"inclusiveness",caches[cacheId]->inclusiveness?"true":"false");
+
+ }
+ break;
+ case HWLOC_OBJ_PACKAGE:
+ {
+ /* Annotate previously-existing package objects */
+ // FIXME: ideally, we should check all bits in case x86 and the native backend disagree.
+
+ /* We already know the package from topology-linux. We only check that the package detected by x86 does not disagree */
+ if (infos[infoId].packageid == object->os_index || object->os_index == (unsigned) -1) {
+ hwloc_x86_add_cpuinfos(object, &infos[infoId], 1);
+ }
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ free(caches);
+ }
+ }
+
+
+ /* Look for packages */
+ if (fulldiscovery) {
+ hwloc_bitmap_t packages_cpuset = hwloc_bitmap_dup(complete_cpuset);
+ hwloc_bitmap_t package_cpuset;
+ hwloc_obj_t package;
+
+ while ((i = hwloc_bitmap_first(packages_cpuset)) != (unsigned) -1) {
+ unsigned packageid = infos[i].packageid;
+
+ package_cpuset = hwloc_bitmap_alloc();
+ for (j = i; j < nbprocs; j++) {
+ if (infos[j].packageid == packageid) {
+ hwloc_bitmap_set(package_cpuset, j);
+ hwloc_bitmap_clr(packages_cpuset, j);
+ }
+ }
+ package = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, packageid);
+ package->cpuset = package_cpuset;
+
+ hwloc_x86_add_cpuinfos(package, &infos[i], 0);
+
+ hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
+ packageid, package_cpuset);
+ hwloc_insert_object_by_cpuset(topology, package);
+ nbpackages++;
+ }
+ hwloc_bitmap_free(packages_cpuset);
+
+ }
+
+ /* If there was no package, annotate the Machine instead */
+ if ((!nbpackages) && infos[0].cpumodel[0]) {
+ hwloc_x86_add_cpuinfos(hwloc_get_root_obj(topology), &infos[0], 1);
+ }
+
+ /* Look for Numa nodes inside packages */
+ if (fulldiscovery) {
+ hwloc_bitmap_t nodes_cpuset = hwloc_bitmap_dup(complete_cpuset);
+ hwloc_bitmap_t node_cpuset;
+ hwloc_obj_t node;
+
+ /* FIXME: if there's memory inside the root object, divide it into NUMA nodes? */
+
+ while ((i = hwloc_bitmap_first(nodes_cpuset)) != (unsigned) -1) {
+ unsigned packageid = infos[i].packageid;
+ unsigned nodeid = infos[i].nodeid;
+
+ if (nodeid == (unsigned)-1) {
+ hwloc_bitmap_clr(nodes_cpuset, i);
+ continue;
+ }
+
+ node_cpuset = hwloc_bitmap_alloc();
+ for (j = i; j < nbprocs; j++) {
+ if (infos[j].nodeid == (unsigned) -1) {
+ hwloc_bitmap_clr(nodes_cpuset, j);
+ continue;
+ }
+
+ if (infos[j].packageid == packageid && infos[j].nodeid == nodeid) {
+ hwloc_bitmap_set(node_cpuset, j);
+ hwloc_bitmap_clr(nodes_cpuset, j);
+ }
+ }
+ node = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, nodeid);
+ node->cpuset = node_cpuset;
+ node->nodeset = hwloc_bitmap_alloc();
+ hwloc_bitmap_set(node->nodeset, nodeid);
+ hwloc_debug_1arg_bitmap("os node %u has cpuset %s\n",
+ nodeid, node_cpuset);
+ hwloc_insert_object_by_cpuset(topology, node);
+ }
+ hwloc_bitmap_free(nodes_cpuset);
+ }
+
+ /* Look for Compute units inside packages */
+ if (fulldiscovery) {
+ hwloc_bitmap_t units_cpuset = hwloc_bitmap_dup(complete_cpuset);
+ hwloc_bitmap_t unit_cpuset;
+ hwloc_obj_t unit;
+
+ while ((i = hwloc_bitmap_first(units_cpuset)) != (unsigned) -1) {
+ unsigned packageid = infos[i].packageid;
+ unsigned unitid = infos[i].unitid;
+
+ if (unitid == (unsigned)-1) {
+ hwloc_bitmap_clr(units_cpuset, i);
+ continue;
+ }
+
+ unit_cpuset = hwloc_bitmap_alloc();
+ for (j = i; j < nbprocs; j++) {
+ if (infos[j].unitid == (unsigned) -1) {
+ hwloc_bitmap_clr(units_cpuset, j);
+ continue;
+ }
+
+ if (infos[j].packageid == packageid && infos[j].unitid == unitid) {
+ hwloc_bitmap_set(unit_cpuset, j);
+ hwloc_bitmap_clr(units_cpuset, j);
+ }
+ }
+ unit = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, unitid);
+ unit->cpuset = unit_cpuset;
+ hwloc_debug_1arg_bitmap("os unit %u has cpuset %s\n",
+ unitid, unit_cpuset);
+ hwloc_insert_object_by_cpuset(topology, unit);
+ }
+ hwloc_bitmap_free(units_cpuset);
+ }
+
+ /* Look for unknown objects */
+ if (infos[one].otherids) {
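+ /* level counts down from levels-1 to 0; since it is unsigned, the wrap-around
+ * past 0 makes the `<= levels-1' test fail and terminates the loop. */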
+ for (level = infos[one].levels-1; level <= infos[one].levels-1; level--) {
+ if (infos[one].otherids[level] != UINT_MAX) {
+ hwloc_bitmap_t unknowns_cpuset = hwloc_bitmap_dup(complete_cpuset);
+ hwloc_bitmap_t unknown_cpuset;
+ hwloc_obj_t unknown_obj;
+
+ while ((i = hwloc_bitmap_first(unknowns_cpuset)) != (unsigned) -1) {
+ unsigned unknownid = infos[i].otherids[level];
+
+ unknown_cpuset = hwloc_bitmap_alloc();
+ for (j = i; j < nbprocs; j++) {
+ if (infos[j].otherids[level] == unknownid) {
+ hwloc_bitmap_set(unknown_cpuset, j);
+ hwloc_bitmap_clr(unknowns_cpuset, j);
+ }
+ }
+ unknown_obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, unknownid);
+ unknown_obj->cpuset = unknown_cpuset;
+ unknown_obj->attr->group.depth = topology->next_group_depth + level;
+ if (next_group_depth <= topology->next_group_depth + level)
+ next_group_depth = topology->next_group_depth + level + 1;
+ hwloc_debug_2args_bitmap("os unknown%d %u has cpuset %s\n",
+ level, unknownid, unknown_cpuset);
+ hwloc_insert_object_by_cpuset(topology, unknown_obj);
+ }
+ hwloc_bitmap_free(unknowns_cpuset);
+ }
+ }
+ }
+
+ /* Look for cores */
+ if (fulldiscovery) {
+ hwloc_bitmap_t cores_cpuset = hwloc_bitmap_dup(complete_cpuset);
+ hwloc_bitmap_t core_cpuset;
+ hwloc_obj_t core;
+
+ while ((i = hwloc_bitmap_first(cores_cpuset)) != (unsigned) -1) {
+ unsigned packageid = infos[i].packageid;
+ unsigned coreid = infos[i].coreid;
+
+ if (coreid == (unsigned) -1) {
+ hwloc_bitmap_clr(cores_cpuset, i);
+ continue;
+ }
+
+ core_cpuset = hwloc_bitmap_alloc();
+ for (j = i; j < nbprocs; j++) {
+ if (infos[j].coreid == (unsigned) -1) {
+ hwloc_bitmap_clr(cores_cpuset, j);
+ continue;
+ }
+
+ if (infos[j].packageid == packageid && infos[j].coreid == coreid) {
+ hwloc_bitmap_set(core_cpuset, j);
+ hwloc_bitmap_clr(cores_cpuset, j);
+ }
+ }
+ core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, coreid);
+ core->cpuset = core_cpuset;
+ hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
+ coreid, core_cpuset);
+ hwloc_insert_object_by_cpuset(topology, core);
+ }
+ hwloc_bitmap_free(cores_cpuset);
+ }
+
+ /* Look for caches */
+ /* First find max level */
+ level = 0;
+ for (i = 0; i < nbprocs; i++)
+ for (j = 0; j < infos[i].numcaches; j++)
+ if (infos[i].cache[j].level > level)
+ level = infos[i].cache[j].level;
+
+ /* Look for known types */
+ if (fulldiscovery) while (level > 0) {
+ for (type = 1; type <= 3; type++) {
+ /* Look for caches of that type at level level */
+ {
+ hwloc_bitmap_t caches_cpuset = hwloc_bitmap_dup(complete_cpuset);
+ hwloc_bitmap_t cache_cpuset;
+ hwloc_obj_t cache;
+
+ while ((i = hwloc_bitmap_first(caches_cpuset)) != (unsigned) -1) {
+ unsigned packageid = infos[i].packageid;
+
+ for (l = 0; l < infos[i].numcaches; l++) {
+ if (infos[i].cache[l].level == level && infos[i].cache[l].type == type)
+ break;
+ }
+ if (l == infos[i].numcaches) {
+ /* proc i has no L<level> cache of that type */
+ hwloc_bitmap_clr(caches_cpuset, i);
+ continue;
+ }
+
+ /* Found a matching cache, now look for others sharing it */
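+ /* PUs are grouped by apicid / nbthreads_sharing: e.g. (illustration) for an
+ * L3 shared by 8 threads, APIC IDs 0-7 fall into cacheid 0 and 8-15 into 1. */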
+ {
+ unsigned cacheid = infos[i].apicid / infos[i].cache[l].nbthreads_sharing;
+
+ cache_cpuset = hwloc_bitmap_alloc();
+ for (j = i; j < nbprocs; j++) {
+ unsigned l2;
+ for (l2 = 0; l2 < infos[j].numcaches; l2++) {
+ if (infos[j].cache[l2].level == level && infos[j].cache[l2].type == type)
+ break;
+ }
+ if (l2 == infos[j].numcaches) {
+ /* proc j has no L<level> cache of that type */
+ hwloc_bitmap_clr(caches_cpuset, j);
+ continue;
+ }
+ if (infos[j].packageid == packageid && infos[j].apicid / infos[j].cache[l2].nbthreads_sharing == cacheid) {
+ hwloc_bitmap_set(cache_cpuset, j);
+ hwloc_bitmap_clr(caches_cpuset, j);
+ }
+ }
+ cache = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, cacheid);
+ cache->attr->cache.depth = level;
+ cache->attr->cache.size = infos[i].cache[l].size;
+ cache->attr->cache.linesize = infos[i].cache[l].linesize;
+ cache->attr->cache.associativity = infos[i].cache[l].ways;
+ switch (infos[i].cache[l].type) {
+ case 1:
+ cache->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
+ break;
+ case 2:
+ cache->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
+ break;
+ case 3:
+ cache->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+ break;
+ }
+ hwloc_obj_add_info(cache,"inclusiveness",infos[i].cache[l].inclusiveness?"true":"false");
+ cache->cpuset = cache_cpuset;
+ hwloc_debug_2args_bitmap("os L%u cache %u has cpuset %s\n",
+ level, cacheid, cache_cpuset);
+ hwloc_insert_object_by_cpuset(topology, cache);
+ }
+ }
+ hwloc_bitmap_free(caches_cpuset);
+ }
+ }
+ level--;
+ }
+
+ for (i = 0; i < nbprocs; i++) {
+ free(infos[i].cache);
+ if (infos[i].otherids)
+ free(infos[i].otherids);
+ }
+
+ hwloc_bitmap_free(complete_cpuset);
+ topology->next_group_depth = next_group_depth;
+}
+
+static int
+look_procs(struct hwloc_backend *backend, struct procinfo *infos, int fulldiscovery,
+ unsigned highest_cpuid, unsigned highest_ext_cpuid, unsigned *features, enum cpuid_type cpuid_type,
+ int (*get_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags),
+ int (*set_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags))
+{
+ struct hwloc_x86_backend_data_s *data = backend->private_data;
+ struct hwloc_topology *topology = backend->topology;
+ unsigned nbprocs = data->nbprocs;
+ hwloc_bitmap_t orig_cpuset = NULL;
+ hwloc_bitmap_t set = NULL;
+ unsigned i;
+
+ if (!data->src_cpuiddump_path) {
+ orig_cpuset = hwloc_bitmap_alloc();
+ if (get_cpubind(topology, orig_cpuset, HWLOC_CPUBIND_STRICT)) {
+ hwloc_bitmap_free(orig_cpuset);
+ return -1;
+ }
+ set = hwloc_bitmap_alloc();
+ }
+
+ for (i = 0; i < nbprocs; i++) {
+ struct cpuiddump *src_cpuiddump = NULL;
+ if (data->src_cpuiddump_path) {
+ src_cpuiddump = cpuiddump_read(data->src_cpuiddump_path, i);
+ } else {
+ hwloc_bitmap_only(set, i);
+ hwloc_debug("binding to CPU%d\n", i);
+ if (set_cpubind(topology, set, HWLOC_CPUBIND_STRICT)) {
+ hwloc_debug("could not bind to CPU%d: %s\n", i, strerror(errno));
+ continue;
+ }
+ }
+
+ look_proc(backend, &infos[i], highest_cpuid, highest_ext_cpuid, features, cpuid_type, src_cpuiddump);
+
+ if (data->src_cpuiddump_path) {
+ cpuiddump_free(src_cpuiddump);
+ }
+ }
+
+ if (!data->src_cpuiddump_path) {
+ set_cpubind(topology, orig_cpuset, 0);
+ hwloc_bitmap_free(set);
+ hwloc_bitmap_free(orig_cpuset);
+ }
+
+ if (!data->apicid_unique)
+ fulldiscovery = 0;
+ summarize(backend, infos, fulldiscovery);
+ return fulldiscovery; /* success, but objects added only if fulldiscovery */
+}
+
+#if defined HWLOC_FREEBSD_SYS && defined HAVE_CPUSET_SETID
+#include <sys/param.h>
+#include <sys/cpuset.h>
+typedef cpusetid_t hwloc_x86_os_state_t;
+static void hwloc_x86_os_state_save(hwloc_x86_os_state_t *state, struct cpuiddump *src_cpuiddump)
+{
+ if (!src_cpuiddump) {
+ /* temporarily make all cpus available during discovery */
+ cpuset_getid(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, state);
+ cpuset_setid(CPU_WHICH_PID, -1, 0);
+ }
+}
+static void hwloc_x86_os_state_restore(hwloc_x86_os_state_t *state, struct cpuiddump *src_cpuiddump)
+{
+ if (!src_cpuiddump) {
+ /* restore initial cpuset */
+ cpuset_setid(CPU_WHICH_PID, -1, *state);
+ }
+}
+#else /* !defined HWLOC_FREEBSD_SYS || !defined HAVE_CPUSET_SETID */
+typedef void * hwloc_x86_os_state_t;
+static void hwloc_x86_os_state_save(hwloc_x86_os_state_t *state __hwloc_attribute_unused, struct cpuiddump *src_cpuiddump __hwloc_attribute_unused) { }
+static void hwloc_x86_os_state_restore(hwloc_x86_os_state_t *state __hwloc_attribute_unused, struct cpuiddump *src_cpuiddump __hwloc_attribute_unused) { }
+#endif /* !defined HWLOC_FREEBSD_SYS || !defined HAVE_CPUSET_SETID */
+
+
+#define INTEL_EBX ('G' | ('e'<<8) | ('n'<<16) | ('u'<<24))
+#define INTEL_EDX ('i' | ('n'<<8) | ('e'<<16) | ('I'<<24))
+#define INTEL_ECX ('n' | ('t'<<8) | ('e'<<16) | ('l'<<24))
+
+#define AMD_EBX ('A' | ('u'<<8) | ('t'<<16) | ('h'<<24))
+#define AMD_EDX ('e' | ('n'<<8) | ('t'<<16) | ('i'<<24))
+#define AMD_ECX ('c' | ('A'<<8) | ('M'<<16) | ('D'<<24))
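+/* cpuid leaf 0 returns the vendor string packed into ebx,edx,ecx:
+ * "Genu","ineI","ntel" for Intel and "Auth","enti","cAMD" for AMD. */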
+
+/* fake cpubind for when nbprocs=1 and no binding support */
+static int fake_get_cpubind(hwloc_topology_t topology __hwloc_attribute_unused,
+ hwloc_cpuset_t set __hwloc_attribute_unused,
+ int flags __hwloc_attribute_unused)
+{
+ return 0;
+}
+static int fake_set_cpubind(hwloc_topology_t topology __hwloc_attribute_unused,
+ hwloc_const_cpuset_t set __hwloc_attribute_unused,
+ int flags __hwloc_attribute_unused)
+{
+ return 0;
+}
+
+static
+int hwloc_look_x86(struct hwloc_backend *backend, int fulldiscovery)
+{
+ struct hwloc_x86_backend_data_s *data = backend->private_data;
+ unsigned nbprocs = data->nbprocs;
+ unsigned eax, ebx, ecx = 0, edx;
+ unsigned i;
+ unsigned highest_cpuid;
+ unsigned highest_ext_cpuid;
+ /* This stores cpuid features with the same indexing as Linux */
+ unsigned features[10] = { 0 };
+ struct procinfo *infos = NULL;
+ enum cpuid_type cpuid_type = unknown;
+ hwloc_x86_os_state_t os_state;
+ struct hwloc_binding_hooks hooks;
+ struct hwloc_topology_support support;
+ struct hwloc_topology_membind_support memsupport __hwloc_attribute_unused;
+ int (*get_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags) = NULL;
+ int (*set_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags) = NULL;
+ struct cpuiddump *src_cpuiddump = NULL;
+ int ret = -1;
+
+ if (data->src_cpuiddump_path) {
+ /* just read cpuid from the dump */
+ src_cpuiddump = cpuiddump_read(data->src_cpuiddump_path, 0);
+ } else {
+ /* otherwise check if binding works */
+ memset(&hooks, 0, sizeof(hooks));
+ support.membind = &memsupport;
+ hwloc_set_native_binding_hooks(&hooks, &support);
+ if (hooks.get_thisproc_cpubind && hooks.set_thisproc_cpubind) {
+ get_cpubind = hooks.get_thisproc_cpubind;
+ set_cpubind = hooks.set_thisproc_cpubind;
+ } else if (hooks.get_thisthread_cpubind && hooks.set_thisthread_cpubind) {
+ get_cpubind = hooks.get_thisthread_cpubind;
+ set_cpubind = hooks.set_thisthread_cpubind;
+ } else {
+ /* we need binding support if there are multiple PUs */
+ if (nbprocs > 1)
+ goto out;
+ get_cpubind = fake_get_cpubind;
+ set_cpubind = fake_set_cpubind;
+ }
+ }
+
+ if (!src_cpuiddump && !hwloc_have_x86_cpuid())
+ goto out;
+
+ infos = calloc(nbprocs, sizeof(struct procinfo));
+ if (NULL == infos)
+ goto out;
+ for (i = 0; i < nbprocs; i++) {
+ infos[i].nodeid = (unsigned) -1;
+ infos[i].packageid = (unsigned) -1;
+ infos[i].unitid = (unsigned) -1;
+ infos[i].coreid = (unsigned) -1;
+ infos[i].threadid = (unsigned) -1;
+ }
+
+ eax = 0x00;
+ cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+ highest_cpuid = eax;
+ if (ebx == INTEL_EBX && ecx == INTEL_ECX && edx == INTEL_EDX)
+ cpuid_type = intel;
+ if (ebx == AMD_EBX && ecx == AMD_ECX && edx == AMD_EDX)
+ cpuid_type = amd;
+
+ hwloc_debug("highest cpuid %x, cpuid type %u\n", highest_cpuid, cpuid_type);
+ if (highest_cpuid < 0x01) {
+ goto out_with_infos;
+ }
+
+ eax = 0x01;
+ cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+ features[0] = edx;
+ features[4] = ecx;
+
+ eax = 0x80000000;
+ cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+ highest_ext_cpuid = eax;
+
+ hwloc_debug("highest extended cpuid %x\n", highest_ext_cpuid);
+
+ if (highest_cpuid >= 0x7) {
+ eax = 0x7;
+ cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+ features[9] = ebx;
+ }
+
+ if (cpuid_type != intel && highest_ext_cpuid >= 0x80000001) {
+ eax = 0x80000001;
+ cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+ features[1] = edx;
+ features[6] = ecx;
+ }
+
+ hwloc_x86_os_state_save(&os_state, src_cpuiddump);
+
+ ret = look_procs(backend, infos, fulldiscovery,
+ highest_cpuid, highest_ext_cpuid, features, cpuid_type,
+ get_cpubind, set_cpubind);
+ if (ret >= 0)
+ /* success, we're done */
+ goto out_with_os_state;
+
+ if (nbprocs == 1) {
+ /* only one processor, no need to bind */
+ look_proc(backend, &infos[0], highest_cpuid, highest_ext_cpuid, features, cpuid_type, src_cpuiddump);
+ summarize(backend, infos, fulldiscovery);
+ ret = fulldiscovery;
+ }
+
+out_with_os_state:
+ hwloc_x86_os_state_restore(&os_state, src_cpuiddump);
+
+out_with_infos:
+ if (NULL != infos) {
+ free(infos);
+ }
+
+out:
+ if (src_cpuiddump)
+ cpuiddump_free(src_cpuiddump);
+ return ret;
+}
+
+static int
+hwloc_x86_discover(struct hwloc_backend *backend)
+{
+ struct hwloc_x86_backend_data_s *data = backend->private_data;
+ struct hwloc_topology *topology = backend->topology;
+ int alreadypus = 0;
+ int ret;
+
+ if (!data->src_cpuiddump_path) {
+ data->nbprocs = hwloc_fallback_nbprocessors(topology);
+
+ if (!topology->is_thissystem) {
+ hwloc_debug("%s", "\nno x86 detection (not thissystem)\n");
+ return 0;
+ }
+ }
+
+ if (topology->levels[0][0]->cpuset) {
+ /* somebody else discovered things */
+ if (topology->nb_levels == 2 && topology->level_nbobjects[1] == data->nbprocs) {
+ /* only PUs were discovered, as many as we would have found ourselves; complete the topology with everything else */
+ alreadypus = 1;
+ goto fulldiscovery;
+ }
+
+ /* several object types were added, we can't easily complete, just annotate a bit */
+ ret = hwloc_look_x86(backend, 0);
+ if (ret)
+ hwloc_obj_add_info(topology->levels[0][0], "Backend", "x86");
+ return 0;
+ } else {
+ /* topology is empty, initialize it */
+ hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+ }
+
+fulldiscovery:
+ hwloc_look_x86(backend, 1);
+ /* if failed, just continue and create PUs */
+
+ if (!alreadypus)
+ hwloc_setup_pu_level(topology, data->nbprocs);
+
+ hwloc_obj_add_info(topology->levels[0][0], "Backend", "x86");
+
+ if (!data->src_cpuiddump_path) { /* CPUID dump works for both x86 and x86_64 */
+#ifdef HAVE_UNAME
+ hwloc_add_uname_info(topology, NULL); /* we already know is_thissystem() is true */
+#else
+ /* uname isn't available, manually set up the "Architecture" info */
+#ifdef HWLOC_X86_64_ARCH
+ hwloc_obj_add_info(topology->levels[0][0], "Architecture", "x86_64");
+#else
+ hwloc_obj_add_info(topology->levels[0][0], "Architecture", "x86");
+#endif
+#endif
+ }
+
+ return 1;
+}
+
+static int
+hwloc_x86_check_cpuiddump_input(const char *src_cpuiddump_path, hwloc_bitmap_t set)
+{
+ struct dirent *dirent;
+ DIR *dir;
+ char *path;
+ FILE *file;
+ char line [32];
+
+ dir = opendir(src_cpuiddump_path);
+ if (!dir)
+ return -1;
+
+ path = malloc(strlen(src_cpuiddump_path) + strlen("/hwloc-cpuid-info") + 1);
+ if (!path)
+ goto out_with_dir;
+
+ sprintf(path, "%s/hwloc-cpuid-info", src_cpuiddump_path);
+ file = fopen(path, "r");
+ if (!file) {
+ fprintf(stderr, "Couldn't open dumped cpuid summary %s\n", path);
+ free(path);
+ goto out_with_dir;
+ }
+ if (!fgets(line, sizeof(line), file)) {
+ fprintf(stderr, "Found read dumped cpuid summary in %s\n", path);
+ fclose(file);
+ free(path);
+ goto out_with_dir;
+ }
+ fclose(file);
+ if (strcmp(line, "Architecture: x86\n")) {
+ fprintf(stderr, "Found non-x86 dumped cpuid summary in %s: %s\n", path, line);
+ free(path);
+ goto out_with_dir;
+ }
+ free(path);
+
+ while ((dirent = readdir(dir)) != NULL) {
+ if (!strncmp(dirent->d_name, "pu", 2)) {
+ char *end;
+ unsigned long idx = strtoul(dirent->d_name+2, &end, 10);
+ if (!*end)
+ hwloc_bitmap_set(set, idx);
+ else
+ fprintf(stderr, "Ignoring invalid dirent `%s' in dumped cpuid directory `%s'\n",
+ dirent->d_name, src_cpuiddump_path);
+ }
+ }
+ closedir(dir);
+
+ if (hwloc_bitmap_iszero(set)) {
+ fprintf(stderr, "Did not find any valid pu%%u entry in dumped cpuid directory `%s'\n",
+ src_cpuiddump_path);
+ return -1;
+ } else if (hwloc_bitmap_last(set) != hwloc_bitmap_weight(set) - 1) {
+ /* The x86 backend enforces a contiguous set of PUs starting at 0 so far */
+ fprintf(stderr, "Found non-contiguous pu%%u range in dumped cpuid directory `%s'\n",
+ src_cpuiddump_path);
+ return -1;
+ }
+
+ return 0;
+
+out_with_dir:
+ closedir(dir);
+ return -1;
+}
+
+static void
+hwloc_x86_backend_disable(struct hwloc_backend *backend)
+{
+ struct hwloc_x86_backend_data_s *data = backend->private_data;
+ hwloc_bitmap_free(data->apicid_set);
+ if (data->src_cpuiddump_path)
+ free(data->src_cpuiddump_path);
+ free(data);
+}
+
+static struct hwloc_backend *
+hwloc_x86_component_instantiate(struct hwloc_disc_component *component,
+ const void *_data1 __hwloc_attribute_unused,
+ const void *_data2 __hwloc_attribute_unused,
+ const void *_data3 __hwloc_attribute_unused)
+{
+ struct hwloc_backend *backend;
+ struct hwloc_x86_backend_data_s *data;
+ const char *src_cpuiddump_path;
+
+ backend = hwloc_backend_alloc(component);
+ if (!backend)
+ goto out;
+
+ data = malloc(sizeof(*data));
+ if (!data) {
+ errno = ENOMEM;
+ goto out_with_backend;
+ }
+
+ backend->private_data = data;
+ backend->flags = HWLOC_BACKEND_FLAG_NEED_LEVELS;
+ backend->discover = hwloc_x86_discover;
+ backend->disable = hwloc_x86_backend_disable;
+
+ /* default values */
+ data->apicid_set = hwloc_bitmap_alloc();
+ data->apicid_unique = 1;
+ data->src_cpuiddump_path = NULL;
+
+ src_cpuiddump_path = getenv("HWLOC_CPUID_PATH");
+ if (src_cpuiddump_path) {
+ hwloc_bitmap_t set = hwloc_bitmap_alloc();
+ if (!hwloc_x86_check_cpuiddump_input(src_cpuiddump_path, set)) {
+ backend->is_thissystem = 0;
+ data->src_cpuiddump_path = strdup(src_cpuiddump_path);
+ data->nbprocs = hwloc_bitmap_weight(set);
+ } else {
+ fprintf(stderr, "Ignoring dumped cpuid directory.\n");
+ }
+ hwloc_bitmap_free(set);
+ }
+
+ return backend;
+
+ out_with_backend:
+ free(backend);
+ out:
+ return NULL;
+}
+
+static struct hwloc_disc_component hwloc_x86_disc_component = {
+ HWLOC_DISC_COMPONENT_TYPE_CPU,
+ "x86",
+ HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+ hwloc_x86_component_instantiate,
+ 45, /* between native and no_os */
+ NULL
+};
+
+const struct hwloc_component hwloc_x86_component = {
+ HWLOC_COMPONENT_ABI,
+ NULL, NULL,
+ HWLOC_COMPONENT_TYPE_DISC,
+ 0,
+ &hwloc_x86_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology.c b/ext/hwloc/hwloc/topology.c
new file mode 100644
index 0000000..a67d036
--- /dev/null
+++ b/ext/hwloc/hwloc/topology.c
@@ -0,0 +1,3436 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria. All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+
+#define _ATFILE_SOURCE
+#include <assert.h>
+#include <sys/types.h>
+#ifdef HAVE_DIRENT_H
+#include <dirent.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <float.h>
+
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+#include <private/misc.h>
+
+#ifdef HAVE_MACH_MACH_INIT_H
+#include <mach/mach_init.h>
+#endif
+#ifdef HAVE_MACH_MACH_HOST_H
+#include <mach/mach_host.h>
+#endif
+
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+
+#ifdef HAVE_SYS_SYSCTL_H
+#include <sys/sysctl.h>
+#endif
+
+#ifdef HWLOC_WIN_SYS
+#include <windows.h>
+#endif
+
+unsigned hwloc_get_api_version(void)
+{
+ return HWLOC_API_VERSION;
+}
+
+int hwloc_hide_errors(void)
+{
+ static int hide = 0;
+ static int checked = 0;
+ if (!checked) {
+ const char *envvar = getenv("HWLOC_HIDE_ERRORS");
+ if (envvar)
+ hide = atoi(envvar);
+ checked = 1;
+ }
+ return hide;
+}
+
+void hwloc_report_os_error(const char *msg, int line)
+{
+ static int reported = 0;
+
+ if (!reported && !hwloc_hide_errors()) {
+ fprintf(stderr, "****************************************************************************\n");
+ fprintf(stderr, "* hwloc %s has encountered what looks like an error from the operating system.\n", HWLOC_VERSION);
+ fprintf(stderr, "*\n");
+ fprintf(stderr, "* %s\n", msg);
+ fprintf(stderr, "* Error occurred in topology.c line %d\n", line);
+ fprintf(stderr, "*\n");
+ fprintf(stderr, "* The following FAQ entry in the hwloc documentation may help:\n");
+ fprintf(stderr, "* What should I do when hwloc reports \"operating system\" warnings?\n");
+ fprintf(stderr, "* Otherwise please report this error message to the hwloc user's mailing list,\n");
+#ifdef HWLOC_LINUX_SYS
+ fprintf(stderr, "* along with the output+tarball generated by the hwloc-gather-topology script.\n");
+#else
+ fprintf(stderr, "* along with any relevant topology information from your platform.\n");
+#endif
+ fprintf(stderr, "****************************************************************************\n");
+ reported = 1;
+ }
+}
+
+#if defined(HAVE_SYSCTLBYNAME)
+int hwloc_get_sysctlbyname(const char *name, int64_t *ret)
+{
+ union {
+ int32_t i32;
+ int64_t i64;
+ } n;
+ size_t size = sizeof(n);
+ if (sysctlbyname(name, &n, &size, NULL, 0))
+ return -1;
+ switch (size) {
+ case sizeof(n.i32):
+ *ret = n.i32;
+ break;
+ case sizeof(n.i64):
+ *ret = n.i64;
+ break;
+ default:
+ return -1;
+ }
+ return 0;
+}
+#endif
+
+#if defined(HAVE_SYSCTL)
+int hwloc_get_sysctl(int name[], unsigned namelen, int *ret)
+{
+ int n;
+ size_t size = sizeof(n);
+ if (sysctl(name, namelen, &n, &size, NULL, 0))
+ return -1;
+ if (size != sizeof(n))
+ return -1;
+ *ret = n;
+ return 0;
+}
+#endif
+
+/* Return the OS-provided number of processors. Unlike other methods such as
+ reading sysfs on Linux, this method is not virtualizable; thus it's only
+ used as a fall-back method, allowing virtual backends (FSROOT, etc) to
+ have the desired effect. */
+unsigned
+hwloc_fallback_nbprocessors(struct hwloc_topology *topology) {
+ int n;
+#if HAVE_DECL__SC_NPROCESSORS_ONLN
+ n = sysconf(_SC_NPROCESSORS_ONLN);
+#elif HAVE_DECL__SC_NPROC_ONLN
+ n = sysconf(_SC_NPROC_ONLN);
+#elif HAVE_DECL__SC_NPROCESSORS_CONF
+ n = sysconf(_SC_NPROCESSORS_CONF);
+#elif HAVE_DECL__SC_NPROC_CONF
+ n = sysconf(_SC_NPROC_CONF);
+#elif defined(HAVE_HOST_INFO) && HAVE_HOST_INFO
+ struct host_basic_info info;
+ mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
+ host_info(mach_host_self(), HOST_BASIC_INFO, (integer_t*) &info, &count);
+ n = info.avail_cpus;
+#elif defined(HAVE_SYSCTLBYNAME)
+ int64_t nn;
+ if (hwloc_get_sysctlbyname("hw.ncpu", &nn))
+ nn = -1;
+ n = nn;
+#elif defined(HAVE_SYSCTL) && HAVE_DECL_CTL_HW && HAVE_DECL_HW_NCPU
+ static int name[2] = {CTL_HW, HW_NCPU};
+ if (hwloc_get_sysctl(name, sizeof(name)/sizeof(*name), &n))
+ n = -1;
+#elif defined(HWLOC_WIN_SYS)
+ SYSTEM_INFO sysinfo;
+ GetSystemInfo(&sysinfo);
+ n = sysinfo.dwNumberOfProcessors;
+#else
+#ifdef __GNUC__
+#warning No known way to discover number of available processors on this system
+#warning hwloc_fallback_nbprocessors will default to 1
+#endif
+ n = -1;
+#endif
+ if (n >= 1)
+ topology->support.discovery->pu = 1;
+ else
+ n = 1;
+ return n;
+}
+
+/*
+ * Use the given number of processors to set a PU level.
+ */
+void
+hwloc_setup_pu_level(struct hwloc_topology *topology,
+ unsigned nb_pus)
+{
+ struct hwloc_obj *obj;
+ unsigned oscpu,cpu;
+
+ hwloc_debug("%s", "\n\n * CPU cpusets *\n\n");
+ for (cpu=0,oscpu=0; cpu<nb_pus; oscpu++)
+ {
+ obj = hwloc_alloc_setup_object(HWLOC_OBJ_PU, oscpu);
+ obj->cpuset = hwloc_bitmap_alloc();
+ hwloc_bitmap_only(obj->cpuset, oscpu);
+
+ hwloc_debug_2args_bitmap("cpu %u (os %u) has cpuset %s\n",
+ cpu, oscpu, obj->cpuset);
+ hwloc_insert_object_by_cpuset(topology, obj);
+
+ cpu++;
+ }
+}
+
+#ifdef HWLOC_DEBUG
+/* Just for debugging. */
+static void
+hwloc_debug_print_object(int indent __hwloc_attribute_unused, hwloc_obj_t obj)
+{
+ char type[64], idx[10], attr[1024], *cpuset = NULL;
+ hwloc_debug("%*s", 2*indent, "");
+ hwloc_obj_type_snprintf(type, sizeof(type), obj, 1);
+ if (obj->os_index != (unsigned) -1)
+ snprintf(idx, sizeof(idx), "#%u", obj->os_index);
+ else
+ *idx = '\0';
+ hwloc_obj_attr_snprintf(attr, sizeof(attr), obj, " ", 1);
+ hwloc_debug("%s%s%s%s%s", type, idx, *attr ? "(" : "", attr, *attr ? ")" : "");
+ if (obj->name)
+ hwloc_debug(" name %s", obj->name);
+ if (obj->cpuset) {
+ hwloc_bitmap_asprintf(&cpuset, obj->cpuset);
+ hwloc_debug(" cpuset %s", cpuset);
+ free(cpuset);
+ }
+ if (obj->complete_cpuset) {
+ hwloc_bitmap_asprintf(&cpuset, obj->complete_cpuset);
+ hwloc_debug(" complete %s", cpuset);
+ free(cpuset);
+ }
+ if (obj->allowed_cpuset) {
+ hwloc_bitmap_asprintf(&cpuset, obj->allowed_cpuset);
+ hwloc_debug(" allowed %s", cpuset);
+ free(cpuset);
+ }
+ if (obj->nodeset) {
+ hwloc_bitmap_asprintf(&cpuset, obj->nodeset);
+ hwloc_debug(" nodeset %s", cpuset);
+ free(cpuset);
+ }
+ if (obj->complete_nodeset) {
+ hwloc_bitmap_asprintf(&cpuset, obj->complete_nodeset);
+ hwloc_debug(" completeN %s", cpuset);
+ free(cpuset);
+ }
+ if (obj->allowed_nodeset) {
+ hwloc_bitmap_asprintf(&cpuset, obj->allowed_nodeset);
+ hwloc_debug(" allowedN %s", cpuset);
+ free(cpuset);
+ }
+ if (obj->arity)
+ hwloc_debug(" arity %u", obj->arity);
+ hwloc_debug("%s", "\n");
+}
+
+static void
+hwloc_debug_print_objects(int indent __hwloc_attribute_unused, hwloc_obj_t obj)
+{
+ hwloc_obj_t child;
+ hwloc_debug_print_object(indent, obj);
+ for (child = obj->first_child; child; child = child->next_sibling)
+ hwloc_debug_print_objects(indent + 1, child);
+ for (child = obj->io_first_child; child; child = child->next_sibling)
+ hwloc_debug_print_objects(indent + 1, child);
+ for (child = obj->misc_first_child; child; child = child->next_sibling)
+ hwloc_debug_print_objects(indent + 1, child);
+}
+#else /* !HWLOC_DEBUG */
+#define hwloc_debug_print_object(indent, obj) do { /* nothing */ } while (0)
+#define hwloc_debug_print_objects(indent, obj) do { /* nothing */ } while (0)
+#endif /* !HWLOC_DEBUG */
+
+void hwloc__free_infos(struct hwloc_obj_info_s *infos, unsigned count)
+{
+ unsigned i;
+ for(i=0; i<count; i++) {
+ free(infos[i].name);
+ free(infos[i].value);
+ }
+ free(infos);
+}
+
+void hwloc__add_info(struct hwloc_obj_info_s **infosp, unsigned *countp, const char *name, const char *value)
+{
+ unsigned count = *countp;
+ struct hwloc_obj_info_s *infos = *infosp;
+#define OBJECT_INFO_ALLOC 8
+ /* nothing allocated initially, (re-)allocate by multiple of 8 */
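+ /* alloccount rounds count+1 up to a multiple of 8, e.g. count == 8 yields 16;
+ * the requested size thus only changes every 8 insertions */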
+ unsigned alloccount = (count + 1 + (OBJECT_INFO_ALLOC-1)) & ~(OBJECT_INFO_ALLOC-1);
+ if (count != alloccount)
+ infos = realloc(infos, alloccount*sizeof(*infos));
+ infos[count].name = strdup(name);
+ infos[count].value = value ? strdup(value) : NULL;
+ *infosp = infos;
+ *countp = count+1;
+}
+
+char ** hwloc__find_info_slot(struct hwloc_obj_info_s **infosp, unsigned *countp, const char *name)
+{
+ unsigned i;
+ for(i=0; i<*countp; i++) {
+ if (!strcmp((*infosp)[i].name, name))
+ return &(*infosp)[i].value;
+ }
+ hwloc__add_info(infosp, countp, name, NULL);
+ return &(*infosp)[*countp-1].value;
+}
+
+void hwloc__move_infos(struct hwloc_obj_info_s **dst_infosp, unsigned *dst_countp,
+ struct hwloc_obj_info_s **src_infosp, unsigned *src_countp)
+{
+ unsigned dst_count = *dst_countp;
+ struct hwloc_obj_info_s *dst_infos = *dst_infosp;
+ unsigned src_count = *src_countp;
+ struct hwloc_obj_info_s *src_infos = *src_infosp;
+ unsigned i;
+#define OBJECT_INFO_ALLOC 8
+ /* nothing allocated initially, (re-)allocate by multiple of 8 */
+ unsigned alloccount = (dst_count + src_count + (OBJECT_INFO_ALLOC-1)) & ~(OBJECT_INFO_ALLOC-1);
+ if (dst_count != alloccount)
+ dst_infos = realloc(dst_infos, alloccount*sizeof(*dst_infos));
+ for(i=0; i<src_count; i++, dst_count++) {
+ dst_infos[dst_count].name = src_infos[i].name;
+ dst_infos[dst_count].value = src_infos[i].value;
+ }
+ *dst_infosp = dst_infos;
+ *dst_countp = dst_count;
+ free(src_infos);
+ *src_infosp = NULL;
+ *src_countp = 0;
+}
+
+void hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const char *value)
+{
+ hwloc__add_info(&obj->infos, &obj->infos_count, name, value);
+}
+
+void hwloc_obj_add_info_nodup(hwloc_obj_t obj, const char *name, const char *value, int nodup)
+{
+ if (nodup && hwloc_obj_get_info_by_name(obj, name))
+ return;
+ hwloc__add_info(&obj->infos, &obj->infos_count, name, value);
+}
+
+static int hwloc_obj_type_is_special (hwloc_obj_type_t type)
+{
+ HWLOC_BUILD_ASSERT(HWLOC_OBJ_MISC + 1 == HWLOC_OBJ_BRIDGE);
+ HWLOC_BUILD_ASSERT(HWLOC_OBJ_BRIDGE + 1 == HWLOC_OBJ_PCI_DEVICE);
+ HWLOC_BUILD_ASSERT(HWLOC_OBJ_PCI_DEVICE + 1 == HWLOC_OBJ_OS_DEVICE);
+ return type >= HWLOC_OBJ_MISC && type <= HWLOC_OBJ_OS_DEVICE;
+}
+static int hwloc_obj_type_is_io (hwloc_obj_type_t type)
+{
+ HWLOC_BUILD_ASSERT(HWLOC_OBJ_BRIDGE + 1 == HWLOC_OBJ_PCI_DEVICE);
+ HWLOC_BUILD_ASSERT(HWLOC_OBJ_PCI_DEVICE + 1 == HWLOC_OBJ_OS_DEVICE);
+ return type >= HWLOC_OBJ_BRIDGE && type <= HWLOC_OBJ_OS_DEVICE;
+}
+
+/* Traverse the children of a parent in a safe way: re-read the next pointer as
+ * appropriate so that deleting the current child does not crash the loop: */
+#define for_each_child_safe(child, parent, pchild) \
+ for (pchild = &(parent)->first_child, child = *pchild; \
+ child; \
+ /* Check whether the current child was not dropped. */ \
+ (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+ /* Get pointer to next child. */ \
+ child = *pchild)
+#define for_each_io_child_safe(child, parent, pchild) \
+ for (pchild = &(parent)->io_first_child, child = *pchild; \
+ child; \
+ /* Check whether the current child was not dropped. */ \
+ (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+ /* Get pointer to next child. */ \
+ child = *pchild)
+#define for_each_misc_child_safe(child, parent, pchild) \
+ for (pchild = &(parent)->misc_first_child, child = *pchild; \
+ child; \
+ /* Check whether the current child was not dropped. */ \
+ (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+ /* Get pointer to next child. */ \
+ child = *pchild)
+
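+#if 0
+/* Editor's illustrative sketch (not part of upstream hwloc): because the macro
+ * re-reads *pchild before advancing, the loop body may unlink the current
+ * child without breaking the traversal. should_drop() is a hypothetical
+ * predicate; unlink_and_free_object_and_children() is defined further below.
+ */
+static void drop_matching_children(hwloc_obj_t parent)
+{
+ hwloc_obj_t child, *pchild;
+ for_each_child_safe(child, parent, pchild)
+ if (should_drop(child))
+ unlink_and_free_object_and_children(pchild);
+}
+#endif
+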
+/* Free an object and all its content. */
+void
+hwloc_free_unlinked_object(hwloc_obj_t obj)
+{
+ switch (obj->type) {
+ default:
+ break;
+ }
+ hwloc__free_infos(obj->infos, obj->infos_count);
+ hwloc_clear_object_distances(obj);
+ free(obj->memory.page_types);
+ free(obj->attr);
+ free(obj->children);
+ free(obj->name);
+ hwloc_bitmap_free(obj->cpuset);
+ hwloc_bitmap_free(obj->complete_cpuset);
+ hwloc_bitmap_free(obj->allowed_cpuset);
+ hwloc_bitmap_free(obj->nodeset);
+ hwloc_bitmap_free(obj->complete_nodeset);
+ hwloc_bitmap_free(obj->allowed_nodeset);
+ free(obj);
+}
+
+/* insert the (non-empty) list of siblings starting at firstnew as new children of newparent,
+ * and return the address of the pointer to the next one
+ */
+static hwloc_obj_t *
+insert_siblings_list(hwloc_obj_t *firstp, hwloc_obj_t firstnew, hwloc_obj_t newparent)
+{
+ hwloc_obj_t tmp;
+ assert(firstnew);
+ *firstp = tmp = firstnew;
+ tmp->parent = newparent;
+ while (tmp->next_sibling) {
+ tmp = tmp->next_sibling;
+ tmp->parent = newparent;
+ }
+ return &tmp->next_sibling;
+}
+
+static void
+append_siblings_list(hwloc_obj_t *firstp, hwloc_obj_t firstnew, hwloc_obj_t newparent)
+{
+ hwloc_obj_t *tmpp, tmp;
+ /* find the end of the list */
+ for(tmpp = firstp ; *tmpp; tmpp = &((*tmpp)->next_sibling));
+ *tmpp = firstnew;
+ /* update parent pointers */
+ for(tmp = firstnew; tmp; tmp = tmp->next_sibling)
+ tmp->parent = newparent;
+}
+
+/* Remove an object from its parent and free it.
+ * Only updates next_sibling/first_child pointers,
+ * so may only be used during early discovery.
+ *
+ * The object's children are re-inserted into its parent.
+ * If children should be inserted somewhere else (e.g. when merging with a child),
+ * the caller should move them before calling this function.
+ */
+static void
+unlink_and_free_single_object(hwloc_obj_t *pparent)
+{
+ hwloc_obj_t old = *pparent;
+ hwloc_obj_t *lastp;
+
+ if (old->type == HWLOC_OBJ_MISC) {
+ /* Misc object */
+
+ /* no normal children */
+ assert(!old->first_child);
+
+ /* no I/O children */
+ assert(!old->io_first_child);
+
+ if (old->misc_first_child)
+ /* insert old misc object children as new siblings below parent instead of old */
+ lastp = insert_siblings_list(pparent, old->misc_first_child, old->parent);
+ else
+ lastp = pparent;
+ /* append old siblings back */
+ *lastp = old->next_sibling;
+
+ } else if (hwloc_obj_type_is_io(old->type)) {
+ /* I/O object */
+
+ /* no normal children */
+ assert(!old->first_child);
+
+ if (old->io_first_child)
+ /* insert old I/O object children as new siblings below parent instead of old */
+ lastp = insert_siblings_list(pparent, old->io_first_child, old->parent);
+ else
+ lastp = pparent;
+ /* append old siblings back */
+ *lastp = old->next_sibling;
+
+ /* append old Misc children to parent */
+ if (old->misc_first_child)
+ append_siblings_list(&old->parent->misc_first_child, old->misc_first_child, old->parent);
+
+ } else {
+ /* Normal object */
+
+ if (old->first_child)
+ /* insert old object children as new siblings below parent instead of old */
+ lastp = insert_siblings_list(pparent, old->first_child, old->parent);
+ else
+ lastp = pparent;
+ /* append old siblings back */
+ *lastp = old->next_sibling;
+
+ /* append old I/O and Misc children to parent
+ * old->parent cannot be NULL (we never remove the root); Misc children should have been moved by the caller earlier if needed.
+ */
+ if (old->io_first_child)
+ append_siblings_list(&old->parent->io_first_child, old->io_first_child, old->parent);
+ if (old->misc_first_child)
+ append_siblings_list(&old->parent->misc_first_child, old->misc_first_child, old->parent);
+ }
+
+ hwloc_free_unlinked_object(old);
+}
+
+/* Remove an object and its children from its parent and free them.
+ * Only updates next_sibling/first_child pointers,
+ * so may only be used during early discovery.
+ */
+static void
+unlink_and_free_object_and_children(hwloc_obj_t *pobj)
+{
+ hwloc_obj_t obj = *pobj, child, *pchild;
+
+ for_each_child_safe(child, obj, pchild)
+ unlink_and_free_object_and_children(pchild);
+ for_each_io_child_safe(child, obj, pchild)
+ unlink_and_free_object_and_children(pchild);
+ for_each_misc_child_safe(child, obj, pchild)
+ unlink_and_free_object_and_children(pchild);
+
+ *pobj = obj->next_sibling;
+ hwloc_free_unlinked_object(obj);
+}
+
+static void
+hwloc__duplicate_object(struct hwloc_obj *newobj,
+ struct hwloc_obj *src)
+{
+ size_t len;
+ unsigned i;
+
+ newobj->type = src->type;
+ newobj->os_index = src->os_index;
+
+ if (src->name)
+ newobj->name = strdup(src->name);
+ newobj->userdata = src->userdata;
+
+ memcpy(&newobj->memory, &src->memory, sizeof(struct hwloc_obj_memory_s));
+ if (src->memory.page_types_len) {
+ len = src->memory.page_types_len * sizeof(struct hwloc_obj_memory_page_type_s);
+ newobj->memory.page_types = malloc(len);
+ memcpy(newobj->memory.page_types, src->memory.page_types, len);
+ }
+
+ memcpy(newobj->attr, src->attr, sizeof(*newobj->attr));
+
+ newobj->cpuset = hwloc_bitmap_dup(src->cpuset);
+ newobj->complete_cpuset = hwloc_bitmap_dup(src->complete_cpuset);
+ newobj->allowed_cpuset = hwloc_bitmap_dup(src->allowed_cpuset);
+ newobj->nodeset = hwloc_bitmap_dup(src->nodeset);
+ newobj->complete_nodeset = hwloc_bitmap_dup(src->complete_nodeset);
+ newobj->allowed_nodeset = hwloc_bitmap_dup(src->allowed_nodeset);
+
+ /* don't duplicate distances, they'll be recreated at the end of the topology build */
+
+ for(i=0; i<src->infos_count; i++)
+ hwloc__add_info(&newobj->infos, &newobj->infos_count, src->infos[i].name, src->infos[i].value);
+}
+
+void
+hwloc__duplicate_objects(struct hwloc_topology *newtopology,
+ struct hwloc_obj *newparent,
+ struct hwloc_obj *src)
+{
+ hwloc_obj_t newobj;
+ hwloc_obj_t child;
+
+ newobj = hwloc_alloc_setup_object(src->type, src->os_index);
+ hwloc__duplicate_object(newobj, src);
+
+ for(child = src->first_child; child; child = child->next_sibling)
+ hwloc__duplicate_objects(newtopology, newobj, child);
+ for(child = src->io_first_child; child; child = child->next_sibling)
+ hwloc__duplicate_objects(newtopology, newobj, child);
+ for(child = src->misc_first_child; child; child = child->next_sibling)
+ hwloc__duplicate_objects(newtopology, newobj, child);
+
+ /* no need to check the children order here, the source topology
+ * is supposed to be OK already, and we have debug asserts.
+ */
+ hwloc_insert_object_by_parent(newtopology, newparent, newobj);
+}
+
+int
+hwloc_topology_dup(hwloc_topology_t *newp,
+ hwloc_topology_t old)
+{
+ hwloc_topology_t new;
+ hwloc_obj_t newroot;
+ hwloc_obj_t oldroot = hwloc_get_root_obj(old);
+ hwloc_obj_t child;
+
+ if (!old->is_loaded) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ hwloc_topology_init(&new);
+
+ new->flags = old->flags;
+ memcpy(new->ignored_types, old->ignored_types, sizeof(old->ignored_types));
+ new->is_thissystem = old->is_thissystem;
+ new->is_loaded = 1;
+ new->pid = old->pid;
+
+ memcpy(&new->binding_hooks, &old->binding_hooks, sizeof(old->binding_hooks));
+
+ memcpy(new->support.discovery, old->support.discovery, sizeof(*old->support.discovery));
+ memcpy(new->support.cpubind, old->support.cpubind, sizeof(*old->support.cpubind));
+ memcpy(new->support.membind, old->support.membind, sizeof(*old->support.membind));
+
+ new->userdata_export_cb = old->userdata_export_cb;
+ new->userdata_import_cb = old->userdata_import_cb;
+
+ newroot = hwloc_get_root_obj(new);
+ hwloc__duplicate_object(newroot, oldroot);
+
+ for(child = oldroot->first_child; child; child = child->next_sibling)
+ hwloc__duplicate_objects(new, newroot, child);
+ for(child = oldroot->io_first_child; child; child = child->next_sibling)
+ hwloc__duplicate_objects(new, newroot, child);
+ for(child = oldroot->misc_first_child; child; child = child->next_sibling)
+ hwloc__duplicate_objects(new, newroot, child);
+
+ if (old->first_osdist) {
+ struct hwloc_os_distances_s *olddist = old->first_osdist;
+ while (olddist) {
+ struct hwloc_os_distances_s *newdist = malloc(sizeof(*newdist));
+ newdist->type = olddist->type;
+ newdist->nbobjs = olddist->nbobjs;
+ newdist->indexes = malloc(newdist->nbobjs * sizeof(*newdist->indexes));
+ memcpy(newdist->indexes, olddist->indexes, newdist->nbobjs * sizeof(*newdist->indexes));
+ newdist->objs = NULL; /* will be recomputed when needed */
+ newdist->distances = malloc(newdist->nbobjs * newdist->nbobjs * sizeof(*newdist->distances));
+ memcpy(newdist->distances, olddist->distances, newdist->nbobjs * newdist->nbobjs * sizeof(*newdist->distances));
+
+ newdist->forced = olddist->forced;
+ if (new->first_osdist) {
+ new->last_osdist->next = newdist;
+ newdist->prev = new->last_osdist;
+ } else {
+ new->first_osdist = newdist;
+ newdist->prev = NULL;
+ }
+ new->last_osdist = newdist;
+ newdist->next = NULL;
+
+ olddist = olddist->next;
+ }
+ } else
+ new->first_osdist = new->last_osdist = NULL;
+
+ /* no need to duplicate backends, topology is already loaded */
+ new->backends = NULL;
+
+ hwloc_connect_children(new->levels[0][0]);
+ if (hwloc_connect_levels(new) < 0)
+ goto out;
+ new->modified = 0;
+
+ hwloc_distances_finalize_os(new);
+ hwloc_distances_finalize_logical(new);
+
+#ifndef HWLOC_DEBUG
+ if (getenv("HWLOC_DEBUG_CHECK"))
+#endif
+ hwloc_topology_check(new);
+
+ *newp = new;
+ return 0;
+
+ out:
+ hwloc_topology_clear(new);
+ hwloc_distances_destroy(new);
+ hwloc_topology_setup_defaults(new);
+ return -1;
+}
+
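+#if 0
+/* Editor's illustrative usage sketch (not part of upstream hwloc): duplicate a
+ * loaded topology and release both copies independently. hwloc_topology_load()
+ * and hwloc_topology_destroy() are the usual public entry points.
+ */
+static int duplicate_current_topology(void)
+{
+ hwloc_topology_t topo, copy;
+ int err = 0;
+ hwloc_topology_init(&topo);
+ hwloc_topology_load(topo);
+ if (hwloc_topology_dup(&copy, topo) < 0)
+ err = -1;
+ else
+ hwloc_topology_destroy(copy);
+ hwloc_topology_destroy(topo);
+ return err;
+}
+#endif
+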
+/*
+ * How to compare objects based on types.
+ *
+ * Note that HIGHER/DEEPER is only a (consistent) heuristic, used to sort
+ * objects with the same cpuset consistently.
+ * Only EQUAL / not EQUAL can be relied upon.
+ */
+
+enum hwloc_type_cmp_e {
+ HWLOC_TYPE_HIGHER,
+ HWLOC_TYPE_DEEPER,
+ HWLOC_TYPE_EQUAL
+};
+
+/* WARNING: The indexes of this array MUST match the ordering of
+ the obj_order_type[] array, below. Specifically, the values must
+ be laid out such that:
+
+ obj_order_type[obj_type_order[N]] = N
+
+ for all HWLOC_OBJ_* values of N. Put differently:
+
+ obj_type_order[A] = B
+
+ where the A values are in order of the hwloc_obj_type_t enum, and
+ the B values are the corresponding indexes of obj_order_type.
+
+ We can't use C99 designated initializers to set this up in a slightly
+ safer manner -- bummer. :-(
+
+ *************************************************************
+ *** DO NOT CHANGE THE ORDERING OF THIS ARRAY WITHOUT TRIPLE
+ *** CHECKING ITS CORRECTNESS!
+ *************************************************************
+ */
+static const unsigned obj_type_order[] = {
+ /* first entry is HWLOC_OBJ_SYSTEM */ 0,
+ /* next entry is HWLOC_OBJ_MACHINE */ 1,
+ /* next entry is HWLOC_OBJ_NUMANODE */ 3,
+ /* next entry is HWLOC_OBJ_PACKAGE */ 4,
+ /* next entry is HWLOC_OBJ_CACHE */ 5,
+ /* next entry is HWLOC_OBJ_CORE */ 6,
+ /* next entry is HWLOC_OBJ_PU */ 10,
+ /* next entry is HWLOC_OBJ_GROUP */ 2,
+ /* next entry is HWLOC_OBJ_MISC */ 11,
+ /* next entry is HWLOC_OBJ_BRIDGE */ 7,
+ /* next entry is HWLOC_OBJ_PCI_DEVICE */ 8,
+ /* next entry is HWLOC_OBJ_OS_DEVICE */ 9
+};
+
+static const hwloc_obj_type_t obj_order_type[] = {
+ HWLOC_OBJ_SYSTEM,
+ HWLOC_OBJ_MACHINE,
+ HWLOC_OBJ_GROUP,
+ HWLOC_OBJ_NUMANODE,
+ HWLOC_OBJ_PACKAGE,
+ HWLOC_OBJ_CACHE,
+ HWLOC_OBJ_CORE,
+ HWLOC_OBJ_BRIDGE,
+ HWLOC_OBJ_PCI_DEVICE,
+ HWLOC_OBJ_OS_DEVICE,
+ HWLOC_OBJ_PU,
+ HWLOC_OBJ_MISC,
+};
+
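+#if 0
+/* Editor's illustrative sketch (not part of upstream hwloc): a one-shot debug
+ * check of the invariant documented above, obj_order_type[obj_type_order[N]] == N
+ * for every object type N.
+ */
+static void check_type_order_tables(void)
+{
+ unsigned t;
+ for (t = HWLOC_OBJ_SYSTEM; t < HWLOC_OBJ_TYPE_MAX; t++)
+ assert(obj_order_type[obj_type_order[t]] == (hwloc_obj_type_t) t);
+}
+#endif
+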
+/* priority to be used when merging identical parent/children object
+ * (in merge_useless_child), keep the highest priority one.
+ *
+ * Always keep Machine/PU/PCIDev/OSDev
+ * then System/Node
+ * then Core
+ * then Package
+ * then Cache
+ * then always drop Group/Misc/Bridge.
+ *
+ * Some types will never actually be involved in such merging.
+ */
+static const int obj_type_priority[] = {
+ /* first entry is HWLOC_OBJ_SYSTEM */ 80,
+ /* next entry is HWLOC_OBJ_MACHINE */ 90,
+ /* next entry is HWLOC_OBJ_NUMANODE */ 100,
+ /* next entry is HWLOC_OBJ_PACKAGE */ 40,
+ /* next entry is HWLOC_OBJ_CACHE */ 20,
+ /* next entry is HWLOC_OBJ_CORE */ 60,
+ /* next entry is HWLOC_OBJ_PU */ 100,
+ /* next entry is HWLOC_OBJ_GROUP */ 0,
+ /* next entry is HWLOC_OBJ_MISC */ 0,
+ /* next entry is HWLOC_OBJ_BRIDGE */ 0,
+ /* next entry is HWLOC_OBJ_PCI_DEVICE */ 100,
+ /* next entry is HWLOC_OBJ_OS_DEVICE */ 100
+};
+
+static unsigned __hwloc_attribute_const
+hwloc_get_type_order(hwloc_obj_type_t type)
+{
+ return obj_type_order[type];
+}
+
+#if !defined(NDEBUG)
+static hwloc_obj_type_t hwloc_get_order_type(int order)
+{
+ return obj_order_type[order];
+}
+#endif
+
+int hwloc_compare_types (hwloc_obj_type_t type1, hwloc_obj_type_t type2)
+{
+ unsigned order1 = hwloc_get_type_order(type1);
+ unsigned order2 = hwloc_get_type_order(type2);
+
+ /* I/O objects are only comparable with each other and with Machine and System */
+ if (hwloc_obj_type_is_io(type1)
+ && !hwloc_obj_type_is_io(type2) && type2 != HWLOC_OBJ_SYSTEM && type2 != HWLOC_OBJ_MACHINE)
+ return HWLOC_TYPE_UNORDERED;
+ if (hwloc_obj_type_is_io(type2)
+ && !hwloc_obj_type_is_io(type1) && type1 != HWLOC_OBJ_SYSTEM && type1 != HWLOC_OBJ_MACHINE)
+ return HWLOC_TYPE_UNORDERED;
+
+ return order1 - order2;
+}
+
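+#if 0
+/* Editor's illustrative sketch (not part of upstream hwloc): consequences of
+ * the ordering table above. A negative result means the first type is found
+ * higher in the tree than the second.
+ */
+static void compare_types_examples(void)
+{
+ assert(hwloc_compare_types(HWLOC_OBJ_PACKAGE, HWLOC_OBJ_CORE) < 0); /* Package above Core */
+ assert(hwloc_compare_types(HWLOC_OBJ_CACHE, HWLOC_OBJ_CACHE) == 0);
+ assert(hwloc_compare_types(HWLOC_OBJ_PU, HWLOC_OBJ_OS_DEVICE) == HWLOC_TYPE_UNORDERED);
+}
+#endif
+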
+static enum hwloc_type_cmp_e
+hwloc_type_cmp(hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+ hwloc_obj_type_t type1 = obj1->type;
+ hwloc_obj_type_t type2 = obj2->type;
+ int compare;
+
+ compare = hwloc_compare_types(type1, type2);
+ if (compare == HWLOC_TYPE_UNORDERED)
+ return HWLOC_TYPE_EQUAL; /* we cannot do better */
+ if (compare > 0)
+ return HWLOC_TYPE_DEEPER;
+ if (compare < 0)
+ return HWLOC_TYPE_HIGHER;
+
+ /* Caches have the same types but can have different depths. */
+ if (type1 == HWLOC_OBJ_CACHE) {
+ if (obj1->attr->cache.depth < obj2->attr->cache.depth)
+ return HWLOC_TYPE_DEEPER;
+ else if (obj1->attr->cache.depth > obj2->attr->cache.depth)
+ return HWLOC_TYPE_HIGHER;
+ else if (obj1->attr->cache.type > obj2->attr->cache.type)
+ /* consider icache deeper than dcache and dcache deeper than unified */
+ return HWLOC_TYPE_DEEPER;
+ else if (obj1->attr->cache.type < obj2->attr->cache.type)
+ /* consider icache deeper than dcache and dcache deeper than unified */
+ return HWLOC_TYPE_HIGHER;
+ }
+
+ /* Group objects have the same types but can have different depths. */
+ if (type1 == HWLOC_OBJ_GROUP) {
+ if (obj1->attr->group.depth == (unsigned) -1
+ || obj2->attr->group.depth == (unsigned) -1)
+ return HWLOC_TYPE_EQUAL;
+ if (obj1->attr->group.depth < obj2->attr->group.depth)
+ return HWLOC_TYPE_DEEPER;
+ else if (obj1->attr->group.depth > obj2->attr->group.depth)
+ return HWLOC_TYPE_HIGHER;
+ }
+
+ /* Bridge objects have the same type but can have different depths. */
+ if (type1 == HWLOC_OBJ_BRIDGE) {
+ if (obj1->attr->bridge.depth < obj2->attr->bridge.depth)
+ return HWLOC_TYPE_DEEPER;
+ else if (obj1->attr->bridge.depth > obj2->attr->bridge.depth)
+ return HWLOC_TYPE_HIGHER;
+ }
+
+ return HWLOC_TYPE_EQUAL;
+}
+
+/*
+ * How to compare objects based on cpusets.
+ */
+
+enum hwloc_obj_cmp_e {
+ HWLOC_OBJ_EQUAL = HWLOC_BITMAP_EQUAL, /**< \brief Equal */
+ HWLOC_OBJ_INCLUDED = HWLOC_BITMAP_INCLUDED, /**< \brief Strictly included into */
+ HWLOC_OBJ_CONTAINS = HWLOC_BITMAP_CONTAINS, /**< \brief Strictly contains */
+ HWLOC_OBJ_INTERSECTS = HWLOC_BITMAP_INTERSECTS, /**< \brief Intersects, but no inclusion! */
+ HWLOC_OBJ_DIFFERENT = HWLOC_BITMAP_DIFFERENT /**< \brief No intersection */
+};
+
+static int
+hwloc_obj_cmp_sets(hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+ hwloc_bitmap_t set1, set2;
+ int res = HWLOC_OBJ_DIFFERENT;
+
+ assert(!hwloc_obj_type_is_special(obj1->type));
+ assert(!hwloc_obj_type_is_special(obj2->type));
+
+ /* compare cpusets first */
+ if (obj1->complete_cpuset && obj2->complete_cpuset) {
+ set1 = obj1->complete_cpuset;
+ set2 = obj2->complete_cpuset;
+ } else {
+ set1 = obj1->cpuset;
+ set2 = obj2->cpuset;
+ }
+ if (set1 && set2 && !hwloc_bitmap_iszero(set1) && !hwloc_bitmap_iszero(set2)) {
+ res = hwloc_bitmap_compare_inclusion(set1, set2);
+ if (res == HWLOC_OBJ_INTERSECTS)
+ return HWLOC_OBJ_INTERSECTS;
+ }
+
+ /* then compare nodesets, and combine the results */
+ if (obj1->complete_nodeset && obj2->complete_nodeset) {
+ set1 = obj1->complete_nodeset;
+ set2 = obj2->complete_nodeset;
+ } else {
+ set1 = obj1->nodeset;
+ set2 = obj2->nodeset;
+ }
+ if (set1 && set2 && !hwloc_bitmap_iszero(set1) && !hwloc_bitmap_iszero(set2)) {
+ int noderes = hwloc_bitmap_compare_inclusion(set1, set2);
+ /* deal with conflicting cpusets/nodesets inclusions */
+ if (noderes == HWLOC_OBJ_INCLUDED) {
+ if (res == HWLOC_OBJ_CONTAINS)
+ /* contradicting order for cpusets and nodesets */
+ return HWLOC_OBJ_INTERSECTS;
+ res = HWLOC_OBJ_INCLUDED;
+
+ } else if (noderes == HWLOC_OBJ_CONTAINS) {
+ if (res == HWLOC_OBJ_INCLUDED)
+ /* contradicting order for cpusets and nodesets */
+ return HWLOC_OBJ_INTERSECTS;
+ res = HWLOC_OBJ_CONTAINS;
+
+ } else if (noderes == HWLOC_OBJ_INTERSECTS) {
+ return HWLOC_OBJ_INTERSECTS;
+
+ } else {
+ /* nodesets are different, keep the cpuset order */
+ /* FIXME: with upcoming multiple levels of NUMA, we may have to report INCLUDED or CONTAINED here */
+
+ }
+ }
+
+ return res;
+}
+
+static int
+hwloc_obj_cmp_types(hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+ /* Same sets, subsort by type to have a consistent ordering. */
+ int typeres = hwloc_type_cmp(obj1, obj2);
+ if (typeres == HWLOC_TYPE_DEEPER)
+ return HWLOC_OBJ_INCLUDED;
+ if (typeres == HWLOC_TYPE_HIGHER)
+ return HWLOC_OBJ_CONTAINS;
+
+ /* Same sets and types! Let's hope it's coherent. */
+ return HWLOC_OBJ_EQUAL;
+}
+
+/* Compare object cpusets based on complete_cpuset if defined (always correctly ordered),
+ * or fall back to the main cpusets (only correctly ordered during early insert, before disallowed bits are cleared).
+ *
+ * This is the sane way to compare objects within a horizontal level.
+ */
+int
+hwloc__object_cpusets_compare_first(hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+ if (obj1->complete_cpuset && obj2->complete_cpuset)
+ return hwloc_bitmap_compare_first(obj1->complete_cpuset, obj2->complete_cpuset);
+ else
+ return hwloc_bitmap_compare_first(obj1->cpuset, obj2->cpuset);
+}
+
+/* format the obj info to print in error messages */
+static void
+hwloc__report_error_format_obj(char *buf, size_t buflen, hwloc_obj_t obj)
+{
+ char typestr[64];
+ char *cpusetstr;
+ hwloc_obj_type_snprintf(typestr, sizeof(typestr), obj, 0);
+ hwloc_bitmap_asprintf(&cpusetstr, obj->cpuset);
+ if (obj->os_index != (unsigned) -1)
+ snprintf(buf, buflen, "%s (P#%u cpuset %s)",
+ typestr, obj->os_index, cpusetstr);
+ else
+ snprintf(buf, buflen, "%s (cpuset %s)",
+ typestr, cpusetstr);
+ free(cpusetstr);
+}
+
+/*
+ * How to insert objects into the topology.
+ *
+ * Note: during detection, only the first_child and next_sibling pointers are
+ * kept up to date. Others are computed only once topology detection is
+ * complete.
+ */
+
+#define merge_index(new, old, field, type) \
+ if ((old)->field == (type) -1) \
+ (old)->field = (new)->field;
+#define merge_sizes(new, old, field) \
+ if (!(old)->field) \
+ (old)->field = (new)->field;
+#ifdef HWLOC_DEBUG
+#define check_sizes(new, old, field) \
+ if ((new)->field) \
+ assert((old)->field == (new)->field)
+#else
+#define check_sizes(new, old, field)
+#endif
+
+static void
+merge_insert_equal(hwloc_obj_t new, hwloc_obj_t old)
+{
+ merge_index(new, old, os_index, unsigned);
+
+ if (new->distances_count) {
+ if (old->distances_count) {
+ old->distances_count += new->distances_count;
+ old->distances = realloc(old->distances, old->distances_count * sizeof(*old->distances));
+ memcpy(old->distances + new->distances_count, new->distances, new->distances_count * sizeof(*old->distances));
+ free(new->distances);
+ } else {
+ old->distances_count = new->distances_count;
+ old->distances = new->distances;
+ }
+ new->distances_count = 0;
+ new->distances = NULL;
+ }
+
+ if (new->infos_count) {
+ hwloc__move_infos(&old->infos, &old->infos_count,
+ &new->infos, &new->infos_count);
+ }
+
+ if (new->name) {
+ if (old->name)
+ free(old->name);
+ old->name = new->name;
+ new->name = NULL;
+ }
+
+ /* Ignore userdata. It will be NULL before load().
+ * It may be non-NULL if alloc+insert_group() after load().
+ */
+
+ switch(new->type) {
+ case HWLOC_OBJ_NUMANODE:
+ /* Do not check these, it may change between calls */
+ merge_sizes(new, old, memory.local_memory);
+ merge_sizes(new, old, memory.total_memory);
+ /* if both objects have a page_types array, just keep the biggest one for now */
+ if (new->memory.page_types_len && old->memory.page_types_len)
+ hwloc_debug("%s", "merging page_types by keeping the biggest one only\n");
+ if (new->memory.page_types_len < old->memory.page_types_len) {
+ free(new->memory.page_types);
+ } else {
+ free(old->memory.page_types);
+ old->memory.page_types_len = new->memory.page_types_len;
+ old->memory.page_types = new->memory.page_types;
+ new->memory.page_types = NULL;
+ new->memory.page_types_len = 0;
+ }
+ break;
+ case HWLOC_OBJ_CACHE:
+ merge_sizes(new, old, attr->cache.size);
+ check_sizes(new, old, attr->cache.size);
+ merge_sizes(new, old, attr->cache.linesize);
+ check_sizes(new, old, attr->cache.linesize);
+ break;
+ default:
+ break;
+ }
+}
+
+/* Try to insert OBJ in CUR, recurse if needed.
+ * Returns the object if it was inserted,
+ * the remaining object if it was merged,
+ * or NULL if insertion failed.
+ */
+static struct hwloc_obj *
+hwloc___insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t cur, hwloc_obj_t obj,
+ hwloc_report_error_t report_error)
+{
+ hwloc_obj_t child, next_child = NULL;
+ /* These always point to the next_sibling link where the next CUR/OBJ child will be appended. */
+ hwloc_obj_t *cur_children = &cur->first_child;
+ hwloc_obj_t *obj_children = &obj->first_child;
+ /* Pointer where OBJ should be put */
+ hwloc_obj_t *putp = NULL; /* OBJ position isn't found yet */
+
+ /* Make sure we haven't gone too deep. */
+ if (!hwloc_bitmap_isincluded(obj->cpuset, cur->cpuset)) {
+ fprintf(stderr,"recursion has gone too deep?!\n");
+ return NULL;
+ }
+
+ /* Iteration with prefetching to be completely safe against CHILD removal.
+ * The list is already sorted by cpuset, and there's no intersection between siblings.
+ */
+ for (child = cur->first_child, child ? next_child = child->next_sibling : NULL;
+ child;
+ child = next_child, child ? next_child = child->next_sibling : NULL) {
+
+ int res = hwloc_obj_cmp_sets(obj, child);
+
+ if (res == HWLOC_OBJ_EQUAL) {
+ if (obj->type == HWLOC_OBJ_GROUP) {
+ /* Groups are ignored with KEEP_STRUCTURE; IGNORE_TYPE_ALWAYS was handled earlier. Non-ignored Groups are not possible here. */
+ assert(topology->ignored_types[HWLOC_OBJ_GROUP] == HWLOC_IGNORE_TYPE_KEEP_STRUCTURE);
+ /* Remove the Group now. The normal ignore code path wouldn't tell us whether the Group was removed or not,
+ * while some callers need to know (at least hwloc_topology_insert_group_object()).
+ *
+ * Keep EQUAL so that the Group gets merged.
+ */
+ } else {
+ /* otherwise compare actual types to decide of the inclusion */
+ res = hwloc_obj_cmp_types(obj, child);
+ }
+ }
+
+ switch (res) {
+ case HWLOC_OBJ_EQUAL:
+ /* Can be two objects with same type. Or one Group and anything else. */
+ if (obj->type == child->type
+ && (obj->type == HWLOC_OBJ_PU || obj->type == HWLOC_OBJ_NUMANODE)
+ && obj->os_index != child->os_index) {
+ static int reported = 0;
+ if (!reported && !hwloc_hide_errors()) {
+ fprintf(stderr, "Cannot merge similar %s objects with different OS indexes %u and %u\n",
+ hwloc_obj_type_string(obj->type), child->os_index, obj->os_index);
+ reported = 1;
+ }
+ return NULL;
+ }
+ merge_insert_equal(obj, child);
+ /* Already present, no need to insert. */
+ return child;
+
+ case HWLOC_OBJ_INCLUDED:
+ /* OBJ is strictly contained in some child of CUR, go deeper. */
+ return hwloc___insert_object_by_cpuset(topology, child, obj, report_error);
+
+ case HWLOC_OBJ_INTERSECTS:
+ if (report_error) {
+ char childstr[512];
+ char objstr[512];
+ char msg[1024];
+ hwloc__report_error_format_obj(objstr, sizeof(objstr), obj);
+ hwloc__report_error_format_obj(childstr, sizeof(childstr), child);
+ snprintf(msg, sizeof(msg), "%s intersects with %s without inclusion!", objstr, childstr);
+ report_error(msg, __LINE__);
+ }
+ goto putback;
+
+ case HWLOC_OBJ_DIFFERENT:
+ /* OBJ should be a child of CUR before CHILD, mark its position if not found yet. */
+ if (!putp && hwloc__object_cpusets_compare_first(obj, child) < 0)
+ /* Don't insert yet, there could be intersect errors later */
+ putp = cur_children;
+ /* Advance cur_children. */
+ cur_children = &child->next_sibling;
+ break;
+
+ case HWLOC_OBJ_CONTAINS:
+ /* OBJ contains CHILD, remove CHILD from CUR */
+ *cur_children = child->next_sibling;
+ child->next_sibling = NULL;
+ /* Put CHILD in OBJ */
+ *obj_children = child;
+ obj_children = &child->next_sibling;
+ child->parent = obj;
+ break;
+ }
+ }
+ /* cur_children/obj_children now point to the last CUR/OBJ child's next_sibling pointer, which must be NULL. */
+ assert(!*obj_children);
+ assert(!*cur_children);
+
+ /* Put OBJ where it belongs, or last in CUR's children list. */
+ if (!putp)
+ putp = cur_children;
+ obj->next_sibling = *putp;
+ *putp = obj;
+ obj->parent = cur;
+
+ topology->modified = 1;
+ return obj;
+
+ putback:
+ /* Put OBJ's children back into CUR and return an error. */
+ if (putp)
+ cur_children = putp; /* No need to try to insert before where OBJ was supposed to go */
+ else
+ cur_children = &cur->first_child; /* Start from the beginning */
+ /* We can insert in order, but there can be holes in the middle. */
+ while ((child = obj->first_child) != NULL) {
+ /* Remove from OBJ */
+ obj->first_child = child->next_sibling;
+ child->parent = cur;
+ /* Find child position in CUR, and insert. */
+ while (*cur_children && hwloc__object_cpusets_compare_first(*cur_children, child) < 0)
+ cur_children = &(*cur_children)->next_sibling;
+ child->next_sibling = *cur_children;
+ *cur_children = child;
+ }
+ return NULL;
+}
+
+/* insertion routine that lets you change the error reporting callback */
+struct hwloc_obj *
+hwloc__insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj,
+ hwloc_report_error_t report_error)
+{
+ struct hwloc_obj *result;
+
+ assert(!hwloc_obj_type_is_special(obj->type));
+
+ /* Start at the top. */
+ result = hwloc___insert_object_by_cpuset(topology, topology->levels[0][0], obj, report_error);
+ if (result != obj) {
+ /* either failed to insert, or got merged, free the original object */
+ hwloc_free_unlinked_object(obj);
+ } else {
+ /* Add the cpuset to the top */
+ hwloc_bitmap_or(topology->levels[0][0]->complete_cpuset, topology->levels[0][0]->complete_cpuset, obj->cpuset);
+ if (obj->nodeset)
+ hwloc_bitmap_or(topology->levels[0][0]->complete_nodeset, topology->levels[0][0]->complete_nodeset, obj->nodeset);
+ }
+ return result;
+}
+
+/* the default insertion routine warns in case of error.
+ * it's used by most backends */
+struct hwloc_obj *
+hwloc_insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj)
+{
+ return hwloc__insert_object_by_cpuset(topology, obj, hwloc_report_os_error);
+}
+
+void
+hwloc_insert_object_by_parent(struct hwloc_topology *topology, hwloc_obj_t parent, hwloc_obj_t obj)
+{
+ hwloc_obj_t *current;
+
+ if (obj->type == HWLOC_OBJ_MISC) {
+ /* Append to the end of the Misc list */
+ for (current = &parent->misc_first_child; *current; current = &(*current)->next_sibling);
+ } else if (hwloc_obj_type_is_io(obj->type)) {
+ /* Append to the end of the I/O list */
+ for (current = &parent->io_first_child; *current; current = &(*current)->next_sibling);
+ } else {
+ /* Append to the end of the list.
+ * The caller takes care of inserting children in the right cpuset order, without intersection between them.
+ * Duplicating doesn't need to check the order since the source topology is supposed to be OK already.
+ * XML reorders if needed, and fails on intersecting siblings.
+ * Other callers just insert random objects such as I/O or Misc, no cpuset issue there.
+ */
+ for (current = &parent->first_child; *current; current = &(*current)->next_sibling);
+ }
+
+ *current = obj;
+ obj->parent = parent;
+ obj->next_sibling = NULL;
+ topology->modified = 1;
+}
+
+hwloc_obj_t
+hwloc_topology_alloc_group_object(struct hwloc_topology *topology __hwloc_attribute_unused)
+{
+ hwloc_obj_t obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
+ if (!obj)
+ return NULL;
+ obj->attr->group.depth = -1;
+ return obj;
+}
+
+hwloc_obj_t
+hwloc_topology_insert_group_object(struct hwloc_topology *topology, hwloc_obj_t obj)
+{
+ hwloc_obj_t res;
+
+ if (!topology->is_loaded) {
+ /* this could actually work, we would just need to disable connect_children/levels below */
+ hwloc_free_unlinked_object(obj);
+ errno = EINVAL;
+ return NULL;
+ }
+
+ if (topology->ignored_types[HWLOC_OBJ_GROUP] == HWLOC_IGNORE_TYPE_ALWAYS) {
+ hwloc_free_unlinked_object(obj);
+ errno = EINVAL;
+ return NULL;
+ }
+
+ if ((!obj->cpuset || hwloc_bitmap_iszero(obj->cpuset))
+ && (!obj->complete_cpuset || hwloc_bitmap_iszero(obj->complete_cpuset))
+ && (!obj->nodeset || hwloc_bitmap_iszero(obj->nodeset))
+ && (!obj->complete_nodeset || hwloc_bitmap_iszero(obj->complete_nodeset))) {
+ hwloc_free_unlinked_object(obj);
+ errno = EINVAL;
+ return NULL;
+ }
+
+ res = hwloc__insert_object_by_cpuset(topology, obj, NULL /* do not show errors on stdout */);
+ if (!res)
+ return NULL;
+ if (res != obj)
+ /* merged */
+ return res;
+
+ /* properly inserted */
+ hwloc_obj_add_children_sets(obj);
+ hwloc_connect_children(topology->levels[0][0]);
+ if (hwloc_connect_levels(topology) < 0)
+ return NULL;
+ topology->modified = 0;
+ return obj;
+}
+
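+#if 0
+/* Editor's illustrative usage sketch (not part of upstream hwloc): group the
+ * first two PUs of a loaded topology under a new Group object. hwloc_bitmap_set()
+ * is part of the public bitmap API; PU indexes 0 and 1 are arbitrary examples.
+ */
+static hwloc_obj_t group_first_two_pus(hwloc_topology_t topology)
+{
+ hwloc_obj_t group = hwloc_topology_alloc_group_object(topology);
+ if (!group)
+ return NULL;
+ group->cpuset = hwloc_bitmap_alloc();
+ hwloc_bitmap_set(group->cpuset, 0);
+ hwloc_bitmap_set(group->cpuset, 1);
+ return hwloc_topology_insert_group_object(topology, group);
+}
+#endif
+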
+static void hwloc_connect_misc_level(hwloc_topology_t topology);
+
+hwloc_obj_t
+hwloc_topology_insert_misc_object(struct hwloc_topology *topology, hwloc_obj_t parent, const char *name)
+{
+ hwloc_obj_t obj;
+
+ if (topology->ignored_types[HWLOC_OBJ_MISC] == HWLOC_IGNORE_TYPE_ALWAYS) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ if (!topology->is_loaded) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ obj = hwloc_alloc_setup_object(HWLOC_OBJ_MISC, -1);
+ if (name)
+ obj->name = strdup(name);
+
+ hwloc_insert_object_by_parent(topology, parent, obj);
+
+ hwloc_connect_children(parent); /* FIXME: only connect misc children */
+ hwloc_connect_misc_level(topology);
+ topology->modified = 0;
+
+ return obj;
+}
+
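+#if 0
+/* Editor's illustrative usage sketch (not part of upstream hwloc): annotate the
+ * root of a loaded topology with a Misc object named "EditorNote".
+ */
+static hwloc_obj_t annotate_root(hwloc_topology_t topology)
+{
+ hwloc_obj_t root = hwloc_get_root_obj(topology);
+ return hwloc_topology_insert_misc_object(topology, root, "EditorNote");
+}
+#endif
+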
+static int hwloc_memory_page_type_compare(const void *_a, const void *_b)
+{
+ const struct hwloc_obj_memory_page_type_s *a = _a;
+ const struct hwloc_obj_memory_page_type_s *b = _b;
+ /* consider 0 as larger so that 0-size page_types go to the end */
+ if (!a->size)
+ return b->size ? 1 : 0;
+ if (!b->size)
+ return -1;
+ /* don't cast a-b to int since those are ullongs */
+ if (b->size == a->size)
+ return 0;
+ return a->size < b->size ? -1 : 1;
+}
+
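+/* Editor's illustration (not part of upstream hwloc): with the intended order
+ * (0 treated as largest), page sizes such as {2MiB, 0, 4KiB} sort to
+ * {4KiB, 2MiB, 0}, so the zero-size placeholder ends up last and is trimmed in
+ * propagate_total_memory() below.
+ */
+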
+/* Propagate memory counts */
+static void
+propagate_total_memory(hwloc_obj_t obj)
+{
+ hwloc_obj_t *temp, child;
+ unsigned i;
+
+ /* reset total before counting local and children memory */
+ obj->memory.total_memory = 0;
+
+ /* Propagate memory up. */
+ for_each_child_safe(child, obj, temp) {
+ propagate_total_memory(child);
+ obj->memory.total_memory += child->memory.total_memory;
+ }
+ /* No memory under I/O or Misc */
+
+ obj->memory.total_memory += obj->memory.local_memory;
+
+ /* By the way, sort the page_type array.
+ * Cannot do it on insert since some backends (e.g. XML) add page_types after inserting the object.
+ */
+ qsort(obj->memory.page_types, obj->memory.page_types_len, sizeof(*obj->memory.page_types), hwloc_memory_page_type_compare);
+ /* Ignore 0-size page_types, they are at the end */
+ for(i=obj->memory.page_types_len; i>=1; i--)
+ if (obj->memory.page_types[i-1].size)
+ break;
+ obj->memory.page_types_len = i;
+}
+
+/* Collect the cpuset of all the PU objects. */
+static void
+collect_proc_cpuset(hwloc_obj_t obj, hwloc_obj_t sys)
+{
+ hwloc_obj_t child, *temp;
+
+ if (sys) {
+ /* We are already given a pointer to a system object */
+ if (obj->type == HWLOC_OBJ_PU)
+ hwloc_bitmap_or(sys->cpuset, sys->cpuset, obj->cpuset);
+ } else {
+ if (obj->cpuset) {
+ /* This object is the root of a machine */
+ sys = obj;
+ /* Assume no PU for now */
+ hwloc_bitmap_zero(obj->cpuset);
+ }
+ }
+
+ for_each_child_safe(child, obj, temp)
+ collect_proc_cpuset(child, sys);
+ /* No PU under I/O or Misc */
+}
+
+/* While traversing down and up, propagate the disallowed cpus by
+ * and'ing them to and from the first object that has a cpuset */
+static void
+propagate_unused_cpuset(hwloc_obj_t obj, hwloc_obj_t sys)
+{
+ hwloc_obj_t child, *temp;
+
+ if (obj->cpuset) {
+ if (sys) {
+ /* We are already given a pointer to a system object, update it and update ourselves */
+ hwloc_bitmap_t mask = hwloc_bitmap_alloc();
+
+ /* Apply the topology cpuset */
+ hwloc_bitmap_and(obj->cpuset, obj->cpuset, sys->cpuset);
+
+ /* Update complete cpuset down */
+ if (obj->complete_cpuset) {
+ hwloc_bitmap_and(obj->complete_cpuset, obj->complete_cpuset, sys->complete_cpuset);
+ } else {
+ obj->complete_cpuset = hwloc_bitmap_dup(sys->complete_cpuset);
+ hwloc_bitmap_and(obj->complete_cpuset, obj->complete_cpuset, obj->cpuset);
+ }
+
+ /* Update allowed cpusets */
+ if (obj->allowed_cpuset) {
+ /* Update ours */
+ hwloc_bitmap_and(obj->allowed_cpuset, obj->allowed_cpuset, sys->allowed_cpuset);
+
+ /* Update the given cpuset, but only what we know */
+ hwloc_bitmap_copy(mask, obj->cpuset);
+ hwloc_bitmap_not(mask, mask);
+ hwloc_bitmap_or(mask, mask, obj->allowed_cpuset);
+ hwloc_bitmap_and(sys->allowed_cpuset, sys->allowed_cpuset, mask);
+ } else {
+ /* Just take it as such */
+ obj->allowed_cpuset = hwloc_bitmap_dup(sys->allowed_cpuset);
+ hwloc_bitmap_and(obj->allowed_cpuset, obj->allowed_cpuset, obj->cpuset);
+ }
+
+ hwloc_bitmap_free(mask);
+ } else {
+ /* This object is the root of a machine */
+ sys = obj;
+ /* Apply complete_cpuset to cpuset and allowed_cpuset, it
+ * will automatically be applied below */
+ if (obj->complete_cpuset)
+ hwloc_bitmap_and(obj->cpuset, obj->cpuset, obj->complete_cpuset);
+ else
+ obj->complete_cpuset = hwloc_bitmap_dup(obj->cpuset);
+ if (obj->allowed_cpuset)
+ hwloc_bitmap_and(obj->allowed_cpuset, obj->allowed_cpuset, obj->complete_cpuset);
+ else
+ obj->allowed_cpuset = hwloc_bitmap_dup(obj->complete_cpuset);
+ }
+ }
+
+ for_each_child_safe(child, obj, temp)
+ propagate_unused_cpuset(child, sys);
+ /* No PU under I/O or Misc */
+}
+
+/* Set up an object's cpusets/nodesets by OR'ing those of its children. */
+HWLOC_DECLSPEC int
+hwloc_obj_add_other_obj_sets(hwloc_obj_t dst, hwloc_obj_t src)
+{
+#define ADD_OTHER_OBJ_SET(_dst, _src, _set) \
+ if ((_src)->_set) { \
+ if (!(_dst)->_set) \
+ (_dst)->_set = hwloc_bitmap_alloc(); \
+ hwloc_bitmap_or((_dst)->_set, (_dst)->_set, (_src)->_set); \
+ }
+ ADD_OTHER_OBJ_SET(dst, src, cpuset);
+ ADD_OTHER_OBJ_SET(dst, src, complete_cpuset);
+ ADD_OTHER_OBJ_SET(dst, src, allowed_cpuset);
+ ADD_OTHER_OBJ_SET(dst, src, nodeset);
+ ADD_OTHER_OBJ_SET(dst, src, complete_nodeset);
+ ADD_OTHER_OBJ_SET(dst, src, allowed_nodeset);
+ return 0;
+}
+
+HWLOC_DECLSPEC int
+hwloc_obj_add_children_sets(hwloc_obj_t obj)
+{
+ hwloc_obj_t child;
+ assert(obj->cpuset != NULL);
+ child = obj->first_child;
+ while (child) {
+ assert(child->cpuset != NULL);
+ hwloc_obj_add_other_obj_sets(obj, child);
+ child = child->next_sibling;
+ }
+ /* No need to look at Misc children, they contain no PU. */
+ return 0;
+}
+
+/* Propagate nodesets up and down */
+static void
+propagate_nodeset(hwloc_obj_t obj, hwloc_obj_t sys)
+{
+ hwloc_obj_t child, *temp;
+ hwloc_bitmap_t parent_nodeset = NULL;
+ int parent_weight = 0;
+
+ if (!sys && obj->nodeset) {
+ sys = obj;
+ if (!obj->complete_nodeset)
+ obj->complete_nodeset = hwloc_bitmap_dup(obj->nodeset);
+ if (!obj->allowed_nodeset)
+ obj->allowed_nodeset = hwloc_bitmap_dup(obj->complete_nodeset);
+ }
+
+ if (sys) {
+ if (obj->nodeset) {
+ /* Some existing nodeset coming from above, to possibly propagate down */
+ parent_nodeset = obj->nodeset;
+ parent_weight = hwloc_bitmap_weight(parent_nodeset);
+ } else
+ obj->nodeset = hwloc_bitmap_alloc();
+ }
+
+ for_each_child_safe(child, obj, temp) {
+ /* Propagate singleton nodesets down */
+ if (parent_weight == 1) {
+ if (!child->nodeset)
+ child->nodeset = hwloc_bitmap_dup(obj->nodeset);
+ else if (!hwloc_bitmap_isequal(child->nodeset, parent_nodeset)) {
+ hwloc_debug_bitmap("Oops, parent nodeset %s", parent_nodeset);
+ hwloc_debug_bitmap(" is different from child nodeset %s, ignoring the child one\n", child->nodeset);
+ hwloc_bitmap_copy(child->nodeset, parent_nodeset);
+ }
+ }
+
+ /* Recurse */
+ propagate_nodeset(child, sys);
+
+ /* Propagate children nodesets up */
+ if (sys && child->nodeset)
+ hwloc_bitmap_or(obj->nodeset, obj->nodeset, child->nodeset);
+ }
+ /* No nodeset under I/O or Misc */
+}
+
+/* Propagate allowed and complete nodesets */
+static void
+propagate_nodesets(hwloc_obj_t obj)
+{
+ hwloc_bitmap_t mask = hwloc_bitmap_alloc();
+ hwloc_obj_t child, *temp;
+
+ for_each_child_safe(child, obj, temp) {
+ if (obj->nodeset) {
+ /* Update complete nodesets down */
+ if (child->complete_nodeset) {
+ hwloc_bitmap_and(child->complete_nodeset, child->complete_nodeset, obj->complete_nodeset);
+ } else if (child->nodeset) {
+ child->complete_nodeset = hwloc_bitmap_dup(obj->complete_nodeset);
+ hwloc_bitmap_and(child->complete_nodeset, child->complete_nodeset, child->nodeset);
+ } /* else the child doesn't have nodeset information, we can not provide a complete nodeset */
+
+ /* Update allowed nodesets down */
+ if (child->allowed_nodeset) {
+ hwloc_bitmap_and(child->allowed_nodeset, child->allowed_nodeset, obj->allowed_nodeset);
+ } else if (child->nodeset) {
+ child->allowed_nodeset = hwloc_bitmap_dup(obj->allowed_nodeset);
+ hwloc_bitmap_and(child->allowed_nodeset, child->allowed_nodeset, child->nodeset);
+ }
+ }
+
+ propagate_nodesets(child);
+
+ if (obj->nodeset) {
+ /* Update allowed nodesets up */
+ if (child->nodeset && child->allowed_nodeset) {
+ hwloc_bitmap_copy(mask, child->nodeset);
+ hwloc_bitmap_andnot(mask, mask, child->allowed_nodeset);
+ hwloc_bitmap_andnot(obj->allowed_nodeset, obj->allowed_nodeset, mask);
+ }
+ }
+ }
+ hwloc_bitmap_free(mask);
+ /* No nodeset under I/O or Misc */
+
+ if (obj->nodeset) {
+ /* Apply complete nodeset to nodeset and allowed_nodeset */
+ if (obj->complete_nodeset)
+ hwloc_bitmap_and(obj->nodeset, obj->nodeset, obj->complete_nodeset);
+ else
+ obj->complete_nodeset = hwloc_bitmap_dup(obj->nodeset);
+ if (obj->allowed_nodeset)
+ hwloc_bitmap_and(obj->allowed_nodeset, obj->allowed_nodeset, obj->complete_nodeset);
+ else
+ obj->allowed_nodeset = hwloc_bitmap_dup(obj->complete_nodeset);
+ }
+}
+
+static void
+remove_unused_sets(hwloc_obj_t obj)
+{
+ hwloc_obj_t child, *temp;
+
+ if (obj->cpuset) {
+ hwloc_bitmap_and(obj->cpuset, obj->cpuset, obj->allowed_cpuset);
+ }
+ if (obj->nodeset) {
+ hwloc_bitmap_and(obj->nodeset, obj->nodeset, obj->allowed_nodeset);
+ }
+ if (obj->type == HWLOC_OBJ_NUMANODE && obj->os_index != (unsigned) -1 &&
+ !hwloc_bitmap_isset(obj->allowed_nodeset, obj->os_index)) {
+ unsigned i;
+ hwloc_debug("Dropping memory from disallowed node %u\n", obj->os_index);
+ obj->memory.local_memory = 0;
+ obj->memory.total_memory = 0;
+ for(i=0; i<obj->memory.page_types_len; i++)
+ obj->memory.page_types[i].count = 0;
+ }
+
+ for_each_child_safe(child, obj, temp)
+ remove_unused_sets(child);
+ /* No cpuset under I/O or Misc */
+}
+
+void
+hwloc__reorder_children(hwloc_obj_t parent)
+{
+ /* move the children list aside */
+ hwloc_obj_t *prev, child, children = parent->first_child;
+ parent->first_child = NULL;
+ while (children) {
+ /* dequeue child */
+ child = children;
+ children = child->next_sibling;
+ /* find where to enqueue it */
+ prev = &parent->first_child;
+ while (*prev && hwloc__object_cpusets_compare_first(child, *prev) > 0)
+ prev = &((*prev)->next_sibling);
+ /* enqueue */
+ child->next_sibling = *prev;
+ *prev = child;
+ }
+ /* No ordering to enforce for Misc children. */
+}
+
+/* Remove objects that are ignored in any case.
+ * Returns 1 if *pparent was replaced, which means the caller needs to reorder its children.
+ * Returns 0 otherwise.
+ */
+static int
+ignore_type_always(hwloc_topology_t topology, hwloc_obj_t *pparent)
+{
+ hwloc_obj_t parent = *pparent, child, *pchild;
+ int dropped_children = 0;
+ int dropped = 0;
+
+ /* count dropped normal children only; the others don't require reordering */
+ for_each_child_safe(child, parent, pchild)
+ dropped_children += ignore_type_always(topology, pchild);
+ for_each_io_child_safe(child, parent, pchild) /* There can be Misc under I/O */
+ ignore_type_always(topology, pchild);
+ for_each_misc_child_safe(child, parent, pchild)
+ ignore_type_always(topology, pchild);
+
+ if ((parent != topology->levels[0][0] &&
+ topology->ignored_types[parent->type] == HWLOC_IGNORE_TYPE_ALWAYS)
+ || (parent->type == HWLOC_OBJ_CACHE && parent->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION
+ && !(topology->flags & HWLOC_TOPOLOGY_FLAG_ICACHES))) {
+ hwloc_debug("%s", "\nDropping ignored object ");
+ hwloc_debug_print_object(0, parent);
+ unlink_and_free_single_object(pparent);
+ topology->modified = 1;
+ dropped = 1;
+
+ } else if (dropped_children) {
+ /* we keep this object but its children changed, reorder them by complete_cpuset */
+ hwloc__reorder_children(parent);
+ }
+
+ return dropped;
+}
+
+/* Remove all children whose cpuset is empty, except NUMA nodes
+ * since we want to keep memory information, and except PCI bridges and devices.
+ */
+static void
+remove_empty(hwloc_topology_t topology, hwloc_obj_t *pobj)
+{
+ hwloc_obj_t obj = *pobj, child, *pchild;
+
+ for_each_child_safe(child, obj, pchild)
+ remove_empty(topology, pchild);
+ /* No cpuset under I/O or Misc */
+
+ if (obj->type != HWLOC_OBJ_NUMANODE
+ && !obj->first_child /* only remove if all children were removed above, so that we don't remove parents of NUMAnode */
+ && !obj->io_first_child /* only remove if no I/O is attached there */
+ && hwloc_bitmap_iszero(obj->cpuset)) {
+ /* Remove empty children (even if it has Misc children) */
+ hwloc_debug("%s", "\nRemoving empty object ");
+ hwloc_debug_print_object(0, obj);
+ unlink_and_free_single_object(pobj);
+ topology->modified = 1;
+ }
+}
+
+/* Remove objects that are ignored with keep structure flag.
+ * Returns 1 if *pparent was replaced, which means the caller needs to reorder its children.
+ * Returns 0 otherwise.
+ */
+static int
+ignore_type_keep_structure(hwloc_topology_t topology, hwloc_obj_t *pparent)
+{
+ hwloc_obj_t parent = *pparent, child, *pchild;
+ int replacechild = 0, replaceparent = 0, droppedchildren = 0;
+
+ if (!parent->first_child) /* can't use arity yet */
+ /* There are no children, nothing to merge. */
+ return 0;
+
+ /* count dropped normal children only; the others don't require reordering */
+ for_each_child_safe(child, parent, pchild)
+ droppedchildren += ignore_type_keep_structure(topology, pchild);
+ for_each_io_child_safe(child, parent, pchild)
+ ignore_type_keep_structure(topology, pchild);
+ for_each_misc_child_safe(child, parent, pchild)
+ ignore_type_keep_structure(topology, pchild);
+
+ if (droppedchildren)
+ hwloc__reorder_children(parent);
+
+ child = parent->first_child;
+ /* we don't merge if there are multiple "important" children. */
+ if (child->next_sibling) /* can't use arity yet */
+ return 0;
+
+ /* Check whether parent and/or child can be replaced */
+ if (topology->ignored_types[parent->type] == HWLOC_IGNORE_TYPE_KEEP_STRUCTURE) {
+ /* Parent can be ignored in favor of the child. */
+ replaceparent = 1;
+ }
+ if (topology->ignored_types[child->type] == HWLOC_IGNORE_TYPE_KEEP_STRUCTURE) {
+ /* Child can be ignored in favor of the parent. */
+ replacechild = 1;
+ }
+
+ /* Decide which one to actually replace */
+ if (replaceparent && replacechild) {
+ /* If both may be replaced, look at obj_type_priority */
+ if (obj_type_priority[parent->type] > obj_type_priority[child->type])
+ replaceparent = 0;
+ else
+ replacechild = 0;
+ }
+
+ if (replaceparent) {
+ /* Replace parent with child */
+ hwloc_debug("%s", "\nIgnoring parent ");
+ hwloc_debug_print_object(0, parent);
+ /* move children to child, so that unlink_and_free_single_object() doesn't move them to the grandparent */
+ if (parent->io_first_child) {
+ append_siblings_list(&child->io_first_child, parent->io_first_child, child);
+ parent->io_first_child = NULL;
+ }
+ if (parent->misc_first_child) {
+ append_siblings_list(&child->misc_first_child, parent->misc_first_child, child);
+ parent->misc_first_child = NULL;
+ }
+ unlink_and_free_single_object(pparent);
+ topology->modified = 1;
+
+ } else if (replacechild) {
+ /* Replace child with parent */
+ hwloc_debug("%s", "\nIgnoring child ");
+ hwloc_debug_print_object(0, child);
+ unlink_and_free_single_object(&parent->first_child);
+ topology->modified = 1;
+ }
+
+ return replaceparent ? 1 : 0;
+}
+
+static void
+hwloc_drop_all_io(hwloc_topology_t topology, hwloc_obj_t root)
+{
+ hwloc_obj_t child, *pchild;
+ for_each_child_safe(child, root, pchild) {
+ hwloc_drop_all_io(topology, child);
+ }
+ for_each_io_child_safe(child, root, pchild) {
+ unlink_and_free_object_and_children(pchild);
+ topology->modified = 1;
+ }
+ /* No I/O under Misc */
+}
+
+/*
+ * If IO_DEVICES and WHOLE_IO are not set, we drop everything.
+ * If WHOLE_IO is not set, we drop non-interesting devices,
+ * and bridges that have no children.
+ * If IO_BRIDGES is also not set, we also drop all bridges
+ * except the hostbridges.
+ */
+static void
+hwloc_drop_useless_io(hwloc_topology_t topology, hwloc_obj_t root)
+{
+ hwloc_obj_t child, *pchild;
+
+ /* recurse into normal children */
+ for_each_child_safe(child, root, pchild) {
+ hwloc_drop_useless_io(topology, child);
+ }
+
+ /* filter I/O children and recurse */
+ for_each_io_child_safe(child, root, pchild) {
+ /* remove useless children if needed */
+ if (!(topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_IO)
+ && child->type == HWLOC_OBJ_PCI_DEVICE) {
+ unsigned classid = child->attr->pcidev.class_id;
+ unsigned baseclass = classid >> 8;
+ if (baseclass != 0x03 /* PCI_BASE_CLASS_DISPLAY */
+ && baseclass != 0x02 /* PCI_BASE_CLASS_NETWORK */
+ && baseclass != 0x01 /* PCI_BASE_CLASS_STORAGE */
+ && baseclass != 0x0b /* PCI_BASE_CLASS_PROCESSOR */
+ && classid != 0x0c06 /* PCI_CLASS_SERIAL_INFINIBAND */
+ && baseclass != 0x12 /* Processing Accelerators */) {
+ unlink_and_free_object_and_children(pchild);
+ topology->modified = 1;
+ continue;
+ }
+ }
+ /* recurse to ignore grand-children etc */
+ hwloc_drop_useless_io(topology, child);
+ /* now remove useless bridges if needed */
+ if (child->type == HWLOC_OBJ_BRIDGE) {
+ if (!child->io_first_child) {
+ /* bridges with no children are removed if WHOLE_IO isn't given */
+ if (!(topology->flags & (HWLOC_TOPOLOGY_FLAG_WHOLE_IO))) {
+ unlink_and_free_single_object(pchild);
+ topology->modified = 1;
+ continue;
+ }
+ } else if (child->attr->bridge.upstream_type != HWLOC_OBJ_BRIDGE_HOST) {
+ /* only hostbridges are kept if WHOLE_IO or IO_BRIDGES are not given */
+ if (!(topology->flags & (HWLOC_TOPOLOGY_FLAG_IO_BRIDGES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO))) {
+ unlink_and_free_single_object(pchild);
+ topology->modified = 1;
+ continue;
+ }
+ }
+ }
+ }
+
+ /* No I/O under Misc */
+}
+
+static void
+hwloc_propagate_bridge_depth(hwloc_topology_t topology, hwloc_obj_t root, unsigned depth)
+{
+ hwloc_obj_t child;
+ for(child = root->first_child; child; child = child->next_sibling) {
+ assert(!depth); /* no normal children under I/O */
+ hwloc_propagate_bridge_depth(topology, child, 0);
+ }
+ for(child = root->io_first_child; child; child = child->next_sibling) {
+ if (child->type == HWLOC_OBJ_BRIDGE) {
+ child->attr->bridge.depth = depth;
+ hwloc_propagate_bridge_depth(topology, child, depth+1);
+ } else if (!hwloc_obj_type_is_io(child->type)) {
+ hwloc_propagate_bridge_depth(topology, child, 0);
+ }
+ }
+ /* No I/O under Misc children */
+}
+
+static void
+hwloc_propagate_symmetric_subtree(hwloc_topology_t topology, hwloc_obj_t root)
+{
+ hwloc_obj_t child, *array;
+ int ok;
+
+ /* assume we're not symmetric by default */
+ root->symmetric_subtree = 0;
+
+ /* if no child, we are symmetric */
+ if (!root->arity) {
+ root->symmetric_subtree = 1;
+ return;
+ }
+
+ /* look at normal children only, I/O and Misc are ignored.
+ * return if any child is not symmetric.
+ */
+ ok = 1;
+ for(child = root->first_child; child; child = child->next_sibling) {
+ hwloc_propagate_symmetric_subtree(topology, child);
+ if (!child->symmetric_subtree)
+ ok = 0;
+ }
+ if (!ok)
+ return;
+ /* Misc and I/O children do not care about symmetric_subtree */
+
+ /* now check that children subtrees are identical.
+ * just walk down the first child in each tree and compare their depth and arities
+ */
+ array = malloc(root->arity * sizeof(*array));
+ memcpy(array, root->children, root->arity * sizeof(*array));
+ while (1) {
+ unsigned i;
+ /* check current level arities and depth */
+ for(i=1; i<root->arity; i++)
+ if (array[i]->depth != array[0]->depth
+ || array[i]->arity != array[0]->arity) {
+ free(array);
+ return;
+ }
+ if (!array[0]->arity)
+ /* no more children level, we're ok */
+ break;
+ /* look at first child of each element now */
+ for(i=0; i<root->arity; i++)
+ array[i] = array[i]->first_child;
+ }
+ free(array);
+
+ /* everything went fine, we're symmetric */
+ root->symmetric_subtree = 1;
+}
+
+/*
+ * Initialize handy pointers in the whole topology.
+ * The topology only had first_child and next_sibling pointers.
+ * When this function returns, all parent/children pointers are initialized.
+ * The remaining fields (levels, cousins, logical_index, depth, ...) will
+ * be setup later in hwloc_connect_levels().
+ *
+ * Can be called several times, so may have to update the array.
+ */
+void
+hwloc_connect_children(hwloc_obj_t parent)
+{
+ unsigned n, oldn = parent->arity;
+ hwloc_obj_t child, prev_child;
+ int ok;
+
+ /* Main children list */
+
+ ok = 1;
+ prev_child = NULL;
+ for (n = 0, child = parent->first_child;
+ child;
+ n++, prev_child = child, child = child->next_sibling) {
+ child->sibling_rank = n;
+ child->prev_sibling = prev_child;
+ /* already OK in the array? */
+ if (n >= oldn || parent->children[n] != child)
+ ok = 0;
+ /* recurse */
+ hwloc_connect_children(child);
+ }
+ parent->last_child = prev_child;
+ parent->arity = n;
+ if (!n) {
+ /* no need for an array anymore */
+ free(parent->children);
+ parent->children = NULL;
+ goto io;
+ }
+ if (ok)
+ /* array is already OK (even if too large) */
+ goto io;
+
+ /* alloc a larger array if needed */
+ if (oldn < n) {
+ free(parent->children);
+ parent->children = malloc(n * sizeof(*parent->children));
+ }
+ /* refill */
+ for (n = 0, child = parent->first_child;
+ child;
+ n++, child = child->next_sibling) {
+ parent->children[n] = child;
+ }
+
+ /* I/O children list */
+ io:
+
+ prev_child = NULL;
+ for (n = 0, child = parent->io_first_child;
+ child;
+ n++, prev_child = child, child = child->next_sibling) {
+ child->parent = parent;
+ child->sibling_rank = n;
+ child->prev_sibling = prev_child;
+ hwloc_connect_children(child);
+ }
+ parent->io_arity = n;
+
+ /* Misc children list */
+
+ prev_child = NULL;
+ for (n = 0, child = parent->misc_first_child;
+ child;
+ n++, prev_child = child, child = child->next_sibling) {
+ child->parent = parent;
+ child->sibling_rank = n;
+ child->prev_sibling = prev_child;
+ hwloc_connect_children(child);
+ }
+ parent->misc_arity = n;
+}
+
+/*
+ * Check whether there is an object below ROOT that has the same type as OBJ
+ */
+static int
+find_same_type(hwloc_obj_t root, hwloc_obj_t obj)
+{
+ hwloc_obj_t child;
+
+ if (hwloc_type_cmp(root, obj) == HWLOC_TYPE_EQUAL)
+ return 1;
+
+ for (child = root->first_child; child; child = child->next_sibling)
+ if (find_same_type(child, obj))
+ return 1;
+
+ return 0;
+}
+
+/* traverse the array of current objects and compare them with top_obj.
+ * if equal, take the object and put its children into the remaining objs.
+ * if not equal, put the object into the remaining objs.
+ */
+static int
+hwloc_level_take_objects(hwloc_obj_t top_obj,
+ hwloc_obj_t *current_objs, unsigned n_current_objs,
+ hwloc_obj_t *taken_objs, unsigned n_taken_objs __hwloc_attribute_unused,
+ hwloc_obj_t *remaining_objs, unsigned n_remaining_objs __hwloc_attribute_unused)
+{
+ unsigned taken_i = 0;
+ unsigned new_i = 0;
+ unsigned i, j;
+
+ for (i = 0; i < n_current_objs; i++)
+ if (hwloc_type_cmp(top_obj, current_objs[i]) == HWLOC_TYPE_EQUAL) {
+ /* Take it, add main children. */
+ taken_objs[taken_i++] = current_objs[i];
+ for (j = 0; j < current_objs[i]->arity; j++)
+ remaining_objs[new_i++] = current_objs[i]->children[j];
+ } else {
+ /* Leave it. */
+ remaining_objs[new_i++] = current_objs[i];
+ }
+
+#ifdef HWLOC_DEBUG
+ /* Make sure we didn't mess up. */
+ assert(taken_i == n_taken_objs);
+ assert(new_i == n_current_objs - n_taken_objs + n_remaining_objs);
+#endif
+
+ return new_i;
+}
+
+static unsigned
+hwloc_build_level_from_list(struct hwloc_obj *first, struct hwloc_obj ***levelp)
+{
+ unsigned i, nb;
+ struct hwloc_obj * obj;
+
+ /* count */
+ obj = first;
+ i = 0;
+ while (obj) {
+ i++;
+ obj = obj->next_cousin;
+ }
+ nb = i;
+
+ /* allocate and fill level */
+ *levelp = malloc(nb * sizeof(struct hwloc_obj *));
+ obj = first;
+ i = 0;
+ while (obj) {
+ obj->logical_index = i;
+ (*levelp)[i] = obj;
+ i++;
+ obj = obj->next_cousin;
+ }
+
+ return nb;
+}
+
+/* Append I/O objects to their lists */
+static void
+hwloc_list_io_objects(hwloc_topology_t topology, hwloc_obj_t obj)
+{
+ hwloc_obj_t child, *temp;
+
+ if (hwloc_obj_type_is_io(obj->type)) {
+ /* make sure we don't have remaining stale pointers from a previous load */
+ obj->next_cousin = NULL;
+ obj->prev_cousin = NULL;
+
+ if (obj->type == HWLOC_OBJ_BRIDGE) {
+ obj->depth = HWLOC_TYPE_DEPTH_BRIDGE;
+ /* Insert in the main bridge list */
+ if (topology->first_bridge) {
+ obj->prev_cousin = topology->last_bridge;
+ obj->prev_cousin->next_cousin = obj;
+ topology->last_bridge = obj;
+ } else {
+ topology->first_bridge = topology->last_bridge = obj;
+ }
+
+ } else if (obj->type == HWLOC_OBJ_PCI_DEVICE) {
+ obj->depth = HWLOC_TYPE_DEPTH_PCI_DEVICE;
+ /* Insert in the main pcidev list */
+ if (topology->first_pcidev) {
+ obj->prev_cousin = topology->last_pcidev;
+ obj->prev_cousin->next_cousin = obj;
+ topology->last_pcidev = obj;
+ } else {
+ topology->first_pcidev = topology->last_pcidev = obj;
+ }
+
+ } else if (obj->type == HWLOC_OBJ_OS_DEVICE) {
+ obj->depth = HWLOC_TYPE_DEPTH_OS_DEVICE;
+ /* Insert in the main osdev list */
+ if (topology->first_osdev) {
+ obj->prev_cousin = topology->last_osdev;
+ obj->prev_cousin->next_cousin = obj;
+ topology->last_osdev = obj;
+ } else {
+ topology->first_osdev = topology->last_osdev = obj;
+ }
+ }
+ }
+
+ for_each_child_safe(child, obj, temp)
+ hwloc_list_io_objects(topology, child);
+ for_each_io_child_safe(child, obj, temp)
+ hwloc_list_io_objects(topology, child);
+ /* No I/O under Misc */
+}
+
+/* Build I/O levels */
+static void
+hwloc_connect_io_levels(hwloc_topology_t topology)
+{
+ free(topology->bridge_level);
+ topology->bridge_level = NULL;
+ topology->bridge_nbobjects = 0;
+ topology->first_bridge = topology->last_bridge = NULL;
+ topology->type_depth[HWLOC_OBJ_BRIDGE] = HWLOC_TYPE_DEPTH_BRIDGE;
+
+ free(topology->pcidev_level);
+ topology->pcidev_level = NULL;
+ topology->pcidev_nbobjects = 0;
+ topology->first_pcidev = topology->last_pcidev = NULL;
+ topology->type_depth[HWLOC_OBJ_PCI_DEVICE] = HWLOC_TYPE_DEPTH_PCI_DEVICE;
+
+ free(topology->osdev_level);
+ topology->osdev_level = NULL;
+ topology->osdev_nbobjects = 0;
+ topology->first_osdev = topology->last_osdev = NULL;
+ topology->type_depth[HWLOC_OBJ_OS_DEVICE] = HWLOC_TYPE_DEPTH_OS_DEVICE;
+
+ hwloc_list_io_objects(topology, topology->levels[0][0]);
+ topology->bridge_nbobjects = hwloc_build_level_from_list(topology->first_bridge, &topology->bridge_level);
+ topology->pcidev_nbobjects = hwloc_build_level_from_list(topology->first_pcidev, &topology->pcidev_level);
+ topology->osdev_nbobjects = hwloc_build_level_from_list(topology->first_osdev, &topology->osdev_level);
+}
+
+/* Append Misc objects to their list */
+static void
+hwloc_list_misc_objects(hwloc_topology_t topology, hwloc_obj_t obj)
+{
+ hwloc_obj_t child, *temp;
+
+ if (obj->type == HWLOC_OBJ_MISC) {
+ obj->depth = HWLOC_TYPE_DEPTH_MISC;
+ /* Insert in the main Misc list */
+ if (topology->first_misc) {
+ obj->prev_cousin = topology->last_misc;
+ obj->prev_cousin->next_cousin = obj;
+ topology->last_misc = obj;
+ } else {
+ topology->first_misc = topology->last_misc = obj;
+ }
+ }
+
+ for_each_child_safe(child, obj, temp)
+ hwloc_list_misc_objects(topology, child);
+ for_each_io_child_safe(child, obj, temp)
+ hwloc_list_misc_objects(topology, child);
+ for_each_misc_child_safe(child, obj, temp)
+ hwloc_list_misc_objects(topology, child);
+}
+
+/* Build Misc level */
+static void
+hwloc_connect_misc_level(hwloc_topology_t topology)
+{
+ free(topology->misc_level);
+ topology->misc_level = NULL;
+ topology->misc_nbobjects = 0;
+ topology->first_misc = topology->last_misc = NULL;
+ topology->type_depth[HWLOC_OBJ_MISC] = HWLOC_TYPE_DEPTH_MISC;
+
+ hwloc_list_misc_objects(topology, topology->levels[0][0]);
+ topology->misc_nbobjects = hwloc_build_level_from_list(topology->first_misc, &topology->misc_level);
+}
+
+/*
+ * Do the remaining work that hwloc_connect_children() did not do earlier.
+ */
+int
+hwloc_connect_levels(hwloc_topology_t topology)
+{
+ unsigned l, i=0;
+ hwloc_obj_t *objs, *taken_objs, *new_objs, top_obj, root;
+ unsigned n_objs, n_taken_objs, n_new_objs;
+
+ /* reset non-root levels (root was initialized during init and will not change here) */
+ for(l=1; l<HWLOC_DEPTH_MAX; l++)
+ free(topology->levels[l]);
+ memset(topology->levels+1, 0, (HWLOC_DEPTH_MAX-1)*sizeof(*topology->levels));
+ memset(topology->level_nbobjects+1, 0, (HWLOC_DEPTH_MAX-1)*sizeof(*topology->level_nbobjects));
+ topology->nb_levels = 1;
+ /* don't touch next_group_depth, the Group objects are still here */
+
+ /* initialize all depth to unknown */
+ for (l = HWLOC_OBJ_SYSTEM; l < HWLOC_OBJ_TYPE_MAX; l++)
+ topology->type_depth[l] = HWLOC_TYPE_DEPTH_UNKNOWN;
+
+ /* initialize root type depth */
+ root = topology->levels[0][0];
+ root->depth = 0;
+ topology->type_depth[root->type] = 0;
+ /* root level */
+ root->logical_index = 0;
+ root->prev_cousin = NULL;
+ root->next_cousin = NULL;
+ /* root as a child of nothing */
+ root->parent = NULL;
+ root->sibling_rank = 0;
+ root->prev_sibling = NULL;
+ root->next_sibling = NULL;
+
+ /* Start with children of the whole system. */
+ n_objs = topology->levels[0][0]->arity;
+ objs = malloc(n_objs * sizeof(objs[0]));
+ if (!objs) {
+ errno = ENOMEM;
+ return -1;
+ }
+ memcpy(objs, topology->levels[0][0]->children, n_objs*sizeof(objs[0]));
+
+ /* Keep building levels while there are objects left in OBJS. */
+ while (n_objs) {
+ /* At this point, the objs array contains only objects that may go into levels */
+
+ /* First find which type of object is the topmost.
+ * Don't use PU if there are other types since we want to keep PU at the bottom.
+ */
+
+ /* Look for the first non-PU object, and use the first PU if we really find nothing else */
+ for (i = 0; i < n_objs; i++)
+ if (objs[i]->type != HWLOC_OBJ_PU)
+ break;
+ top_obj = i == n_objs ? objs[0] : objs[i];
+
+ /* See if this is actually the topmost object */
+ for (i = 0; i < n_objs; i++) {
+ if (hwloc_type_cmp(top_obj, objs[i]) != HWLOC_TYPE_EQUAL) {
+ if (find_same_type(objs[i], top_obj)) {
+ /* OBJS[i] is strictly above an object of the same type as TOP_OBJ, so it
+ * is above TOP_OBJ. */
+ top_obj = objs[i];
+ }
+ }
+ }
+
+ /* Now pick all objects of the same type, build a level with them, and
+ * replace them with their children. */
+
+ /* First count them. */
+ n_taken_objs = 0;
+ n_new_objs = 0;
+ for (i = 0; i < n_objs; i++)
+ if (hwloc_type_cmp(top_obj, objs[i]) == HWLOC_TYPE_EQUAL) {
+ n_taken_objs++;
+ n_new_objs += objs[i]->arity;
+ }
+
+ /* New level. */
+ taken_objs = malloc((n_taken_objs + 1) * sizeof(taken_objs[0]));
+ /* New list of pending objects. */
+ if (n_objs - n_taken_objs + n_new_objs) {
+ new_objs = malloc((n_objs - n_taken_objs + n_new_objs) * sizeof(new_objs[0]));
+ } else {
+#ifdef HWLOC_DEBUG
+ assert(!n_new_objs);
+ assert(n_objs == n_taken_objs);
+#endif
+ new_objs = NULL;
+ }
+
+ n_new_objs = hwloc_level_take_objects(top_obj,
+ objs, n_objs,
+ taken_objs, n_taken_objs,
+ new_objs, n_new_objs);
+
+ /* Ok, put numbers in the level and link cousins. */
+ for (i = 0; i < n_taken_objs; i++) {
+ taken_objs[i]->depth = topology->nb_levels;
+ taken_objs[i]->logical_index = i;
+ if (i) {
+ taken_objs[i]->prev_cousin = taken_objs[i-1];
+ taken_objs[i-1]->next_cousin = taken_objs[i];
+ }
+ }
+ taken_objs[0]->prev_cousin = NULL;
+ taken_objs[n_taken_objs-1]->next_cousin = NULL;
+
+ /* One more level! */
+ if (top_obj->type == HWLOC_OBJ_CACHE)
+ hwloc_debug("--- Cache level depth %u", top_obj->attr->cache.depth);
+ else
+ hwloc_debug("--- %s level", hwloc_obj_type_string(top_obj->type));
+ hwloc_debug(" has number %u\n\n", topology->nb_levels);
+
+ if (topology->type_depth[top_obj->type] == HWLOC_TYPE_DEPTH_UNKNOWN)
+ topology->type_depth[top_obj->type] = topology->nb_levels;
+ else
+ topology->type_depth[top_obj->type] = HWLOC_TYPE_DEPTH_MULTIPLE; /* mark as multiple */
+
+ taken_objs[n_taken_objs] = NULL;
+
+ topology->level_nbobjects[topology->nb_levels] = n_taken_objs;
+ topology->levels[topology->nb_levels] = taken_objs;
+
+ topology->nb_levels++;
+
+ free(objs);
+
+ /* Switch to new_objs */
+ objs = new_objs;
+ n_objs = n_new_objs;
+ }
+
+ /* It's empty now. */
+ if (objs)
+ free(objs);
+
+ hwloc_connect_io_levels(topology);
+ hwloc_connect_misc_level(topology);
+
+ hwloc_propagate_symmetric_subtree(topology, topology->levels[0][0]);
+
+ return 0;
+}
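+
+/*
+ * Once hwloc_connect_levels() has run, the level arrays built here are what
+ * the public accessors in traversal.c (hwloc_topology_get_depth(),
+ * hwloc_get_nbobjs_by_depth(), hwloc_get_obj_by_depth()) expose.  A minimal
+ * usage sketch; the helper name is made up and a standalone program would
+ * need <hwloc.h> and <stdio.h>:
+ */
+#if 0 /* usage sketch, not compiled */
+static void example_print_levels(hwloc_topology_t topology) /* hypothetical name */
+{
+ unsigned depth, i;
+ char type[64];
+ for (depth = 0; depth < hwloc_topology_get_depth(topology); depth++) {
+ unsigned nbobjs = hwloc_get_nbobjs_by_depth(topology, depth);
+ for (i = 0; i < nbobjs; i++) {
+ hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, i);
+ hwloc_obj_type_snprintf(type, sizeof(type), obj, 0);
+ printf("depth %u, logical index %u: %s (os_index %u)\n", depth, i, type, obj->os_index);
+ }
+ }
+}
+#endif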
+
+void hwloc_alloc_obj_cpusets(hwloc_obj_t obj)
+{
+ if (!obj->cpuset)
+ obj->cpuset = hwloc_bitmap_alloc_full();
+ if (!obj->complete_cpuset)
+ obj->complete_cpuset = hwloc_bitmap_alloc();
+ if (!obj->allowed_cpuset)
+ obj->allowed_cpuset = hwloc_bitmap_alloc_full();
+ if (!obj->nodeset)
+ obj->nodeset = hwloc_bitmap_alloc();
+ if (!obj->complete_nodeset)
+ obj->complete_nodeset = hwloc_bitmap_alloc();
+ if (!obj->allowed_nodeset)
+ obj->allowed_nodeset = hwloc_bitmap_alloc_full();
+}
+
+/* Main discovery loop */
+static int
+hwloc_discover(struct hwloc_topology *topology)
+{
+ struct hwloc_backend *backend;
+ int gotsomeio = 0;
+ unsigned discoveries = 0;
+
+ topology->modified = 0; /* no need to reconnect yet */
+
+ /* discover() callbacks should use hwloc_insert to add objects initialized
+ * through hwloc_alloc_setup_object.
+ * For node levels, nodeset and memory must be initialized.
+ * For cache levels, memory and type/depth must be initialized.
+ * For group levels, depth must be initialized.
+ */
+
+ /* There must be at least a PU object for each logical processor, at worst
+ * produced by hwloc_setup_pu_level()
+ */
+
+ /* To be able to just use hwloc_insert_object_by_cpuset to insert the object
+ * in the topology according to the cpuset, the cpuset field must be
+ * initialized.
+ */
+
+ /* A priori, all processors are visible in the topology, and allowed
+ * for the application.
+ *
+ * - If some processors exist but topology information is unknown for them
+ * (and thus the backend couldn't create objects for them), they should be
+ * added to the complete_cpuset field of the lowest object where the object
+ * could reside.
+ *
+ * - If some processors are not allowed for the application (e.g. for
+ * administration reasons), they should be dropped from the allowed_cpuset
+ * field.
+ *
+ * The same applies to the node sets complete_nodeset and allowed_nodeset.
+ *
+ * If such a field doesn't exist yet, it can be allocated and initialized to
+ * zero (for complete), or to full (for allowed). The values are
+ * automatically propagated to the whole tree after detection.
+ */
+
+ /*
+ * Discover CPUs first
+ */
+ backend = topology->backends;
+ while (NULL != backend) {
+ int err;
+ if (backend->component->type != HWLOC_DISC_COMPONENT_TYPE_CPU
+ && backend->component->type != HWLOC_DISC_COMPONENT_TYPE_GLOBAL)
+ /* not yet */
+ goto next_cpubackend;
+ if (!backend->discover)
+ goto next_cpubackend;
+
+ if (topology->modified && (backend->flags & HWLOC_BACKEND_FLAG_NEED_LEVELS)) {
+ hwloc_debug("Backend %s forcing a reconnect of levels\n", backend->component->name);
+ hwloc_connect_children(topology->levels[0][0]);
+ if (hwloc_connect_levels(topology) < 0)
+ return -1;
+ topology->modified = 0;
+ }
+
+ err = backend->discover(backend);
+ if (err >= 0) {
+ if (backend->component->type == HWLOC_DISC_COMPONENT_TYPE_GLOBAL)
+ gotsomeio += err;
+ discoveries++;
+ }
+ hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+next_cpubackend:
+ backend = backend->next;
+ }
+
+ if (!discoveries) {
+ hwloc_debug("%s", "No CPU backend enabled or no discovery succeeded\n");
+ errno = EINVAL;
+ return -1;
+ }
+
+ /* Update objects' cpusets and nodesets now that the CPU/GLOBAL backends have populated PUs and nodes */
+
+ hwloc_debug("%s", "\nRestrict topology cpusets to existing PU and NODE objects\n");
+ collect_proc_cpuset(topology->levels[0][0], NULL);
+
+ hwloc_debug("%s", "\nPropagate disallowed cpus down and up\n");
+ propagate_unused_cpuset(topology->levels[0][0], NULL);
+
+ /* Backends must allocate root->*nodeset.
+ *
+ * Most of them call hwloc_alloc_obj_cpusets() on the root to do so.
+ * root->complete_nodeset is empty by default, and filled by the core
+ * when NUMA nodes are added with insert_by_cpuset().
+ * root->allowed_nodeset is everything by default, unless reduced by backends.
+ *
+ * The XML backend takes care of everything to properly support old XML input
+ * with missing nodesets and/or NUMA nodes. It checks nodesets and fixes them if needed.
+ */
+ assert(topology->levels[0][0]->nodeset);
+ assert(topology->levels[0][0]->complete_nodeset);
+ assert(topology->levels[0][0]->allowed_nodeset);
+ /* If there's no NUMA node, add one with all the memory */
+ if (hwloc_bitmap_iszero(topology->levels[0][0]->complete_nodeset)) {
+ hwloc_obj_t node = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, 0);
+ node->cpuset = hwloc_bitmap_dup(topology->levels[0][0]->cpuset); /* requires root cpuset to be initialized above */
+ node->complete_cpuset = hwloc_bitmap_dup(topology->levels[0][0]->complete_cpuset); /* requires root cpuset to be initialized above */
+ node->allowed_cpuset = hwloc_bitmap_dup(topology->levels[0][0]->allowed_cpuset); /* requires root cpuset to be initialized above */
+ node->nodeset = hwloc_bitmap_alloc();
+ /* other nodesets will be filled below */
+ hwloc_bitmap_set(node->nodeset, 0);
+ memcpy(&node->memory, &topology->levels[0][0]->memory, sizeof(node->memory));
+ memset(&topology->levels[0][0]->memory, 0, sizeof(node->memory));
+ hwloc_insert_object_by_cpuset(topology, node);
+ }
+ hwloc_debug("%s", "\nPropagate nodesets\n");
+ propagate_nodeset(topology->levels[0][0], NULL);
+ propagate_nodesets(topology->levels[0][0]);
+
+ hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+ if (!(topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM)) {
+ hwloc_debug("%s", "\nRemoving unauthorized sets from all sets\n");
+ remove_unused_sets(topology->levels[0][0]);
+ hwloc_debug_print_objects(0, topology->levels[0][0]);
+ }
+
+ /*
+ * All object cpusets and nodesets are properly set now.
+ */
+
+ /*
+ * Group levels by distances
+ */
+ hwloc_distances_finalize_os(topology);
+ hwloc_group_by_distances(topology);
+
+ /* Now connect handy pointers to make remaining discovery easier. */
+ hwloc_debug("%s", "\nOk, finished tweaking, now connect\n");
+ if (topology->modified) {
+ hwloc_connect_children(topology->levels[0][0]);
+ if (hwloc_connect_levels(topology) < 0)
+ return -1;
+ topology->modified = 0;
+ }
+ hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+ /*
+ * Additional discovery with other backends
+ */
+
+ backend = topology->backends;
+ while (NULL != backend) {
+ int err;
+ if (backend->component->type == HWLOC_DISC_COMPONENT_TYPE_CPU
+ || backend->component->type == HWLOC_DISC_COMPONENT_TYPE_GLOBAL)
+ /* already done above */
+ goto next_noncpubackend;
+ if (!backend->discover)
+ goto next_noncpubackend;
+
+ if (topology->modified && (backend->flags & HWLOC_BACKEND_FLAG_NEED_LEVELS)) {
+ hwloc_debug("Backend %s forcing a reconnect of levels\n", backend->component->name);
+ hwloc_connect_children(topology->levels[0][0]);
+ if (hwloc_connect_levels(topology) < 0)
+ return -1;
+ topology->modified = 0;
+ }
+
+ err = backend->discover(backend);
+ if (err >= 0) {
+ gotsomeio += err;
+ }
+ hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+next_noncpubackend:
+ backend = backend->next;
+ }
+
+ /* if we got anything, filter interesting objects and update the tree */
+ if (gotsomeio) {
+ if (!(topology->flags & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
+ /* drop all I/O children */
+ hwloc_drop_all_io(topology, topology->levels[0][0]);
+ else
+ hwloc_drop_useless_io(topology, topology->levels[0][0]);
+ hwloc_debug("%s", "\nNow reconnecting\n");
+ hwloc_debug_print_objects(0, topology->levels[0][0]);
+ hwloc_propagate_bridge_depth(topology, topology->levels[0][0], 0);
+ }
+
+ /* Remove some stuff */
+
+ hwloc_debug("%s", "\nRemoving ignored objects\n");
+ ignore_type_always(topology, &topology->levels[0][0]);
+ hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+ hwloc_debug("%s", "\nRemoving empty objects except numa nodes and PCI devices\n");
+ remove_empty(topology, &topology->levels[0][0]);
+ if (!topology->levels[0][0]) {
+ fprintf(stderr, "Topology became empty, aborting!\n");
+ abort();
+ }
+ hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+ hwloc_debug("%s", "\nRemoving objects whose type has HWLOC_IGNORE_TYPE_KEEP_STRUCTURE and have only one child or are the only child\n");
+ ignore_type_keep_structure(topology, &topology->levels[0][0]);
+ hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+ /* Reconnect things after all these changes */
+ if (topology->modified) {
+ /* Often raised because of Groups inserted for I/Os */
+ hwloc_connect_children(topology->levels[0][0]);
+ if (hwloc_connect_levels(topology) < 0)
+ return -1;
+ topology->modified = 0;
+ }
+
+ /* accumulate children memory in total_memory fields (only once parent is set) */
+ hwloc_debug("%s", "\nPropagate total memory up\n");
+ propagate_total_memory(topology->levels[0][0]);
+
+ /*
+ * Now that objects are numbered, take distance matrices from backends and put them in the main topology.
+ *
+ * Some objects may have disappeared (in removed_empty or removed_ignored) since we setup os distances
+ * (hwloc_distances_finalize_os()) above. Reset them so as to not point to disappeared objects anymore.
+ */
+ hwloc_distances_restrict_os(topology);
+ hwloc_distances_finalize_os(topology);
+ hwloc_distances_finalize_logical(topology);
+
+ /* add some identification attributes if not loading from XML */
+ if (topology->backends
+ && strcmp(topology->backends->component->name, "xml")) {
+ char *value;
+ /* add a hwlocVersion */
+ hwloc_obj_add_info(topology->levels[0][0], "hwlocVersion", VERSION);
+ /* add a ProcessName */
+ value = hwloc_progname(topology);
+ if (value) {
+ hwloc_obj_add_info(topology->levels[0][0], "ProcessName", value);
+ free(value);
+ }
+ }
+
+ /*
+ * Now set binding hooks according to topology->is_thissystem
+ * and what the native OS backend offers.
+ */
+ hwloc_set_binding_hooks(topology);
+
+ return 0;
+}
+
+/* To be called before discovery is actually launched;
+ * resets everything in case a previous load initialized some stuff.
+ */
+void
+hwloc_topology_setup_defaults(struct hwloc_topology *topology)
+{
+ struct hwloc_obj *root_obj;
+
+ /* reset support */
+ memset(&topology->binding_hooks, 0, sizeof(topology->binding_hooks));
+ memset(topology->support.discovery, 0, sizeof(*topology->support.discovery));
+ memset(topology->support.cpubind, 0, sizeof(*topology->support.cpubind));
+ memset(topology->support.membind, 0, sizeof(*topology->support.membind));
+
+ /* Only the System object on top by default */
+ topology->nb_levels = 1; /* there's at least SYSTEM */
+ topology->next_group_depth = 0;
+ topology->levels[0] = malloc (sizeof (hwloc_obj_t));
+ topology->level_nbobjects[0] = 1;
+ /* NULLify other levels so that we can detect and free old ones in hwloc_connect_levels() if needed */
+ memset(topology->levels+1, 0, (HWLOC_DEPTH_MAX-1)*sizeof(*topology->levels));
+ topology->bridge_level = NULL;
+ topology->pcidev_level = NULL;
+ topology->osdev_level = NULL;
+ topology->first_bridge = topology->last_bridge = NULL;
+ topology->first_pcidev = topology->last_pcidev = NULL;
+ topology->first_osdev = topology->last_osdev = NULL;
+ topology->misc_level = NULL;
+ topology->first_misc = topology->last_misc = NULL;
+
+ /* Create the actual machine object, but don't touch its attributes yet
+ * since the OS backend may still change the object into something else
+ * (for instance System)
+ */
+ root_obj = hwloc_alloc_setup_object(HWLOC_OBJ_MACHINE, 0);
+ topology->levels[0][0] = root_obj;
+}
+
+int
+hwloc_topology_init (struct hwloc_topology **topologyp)
+{
+ struct hwloc_topology *topology;
+ int i;
+
+ topology = malloc (sizeof (struct hwloc_topology));
+ if(!topology)
+ return -1;
+
+ hwloc_components_init(topology);
+
+ /* Setup topology context */
+ topology->is_loaded = 0;
+ topology->flags = 0;
+ topology->is_thissystem = 1;
+ topology->pid = 0;
+ topology->userdata = NULL;
+
+ topology->support.discovery = malloc(sizeof(*topology->support.discovery));
+ topology->support.cpubind = malloc(sizeof(*topology->support.cpubind));
+ topology->support.membind = malloc(sizeof(*topology->support.membind));
+
+ /* Only ignore useless cruft by default */
+ for(i = HWLOC_OBJ_SYSTEM; i < HWLOC_OBJ_TYPE_MAX; i++)
+ topology->ignored_types[i] = HWLOC_IGNORE_TYPE_NEVER;
+ topology->ignored_types[HWLOC_OBJ_GROUP] = HWLOC_IGNORE_TYPE_KEEP_STRUCTURE;
+
+ hwloc_distances_init(topology);
+
+ topology->userdata_export_cb = NULL;
+ topology->userdata_import_cb = NULL;
+
+ /* Make the topology look like something coherent but empty */
+ hwloc_topology_setup_defaults(topology);
+
+ *topologyp = topology;
+ return 0;
+}
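+
+/*
+ * The usual lifecycle around hwloc_topology_init(): init, optionally
+ * configure, load, use, destroy.  A minimal sketch; the function name is
+ * made up and a standalone program would include <hwloc.h> and <stdio.h>:
+ */
+#if 0 /* usage sketch, not compiled */
+static int example_lifecycle(void) /* hypothetical name */
+{
+ hwloc_topology_t topology;
+ if (hwloc_topology_init(&topology) < 0)
+ return -1;
+ /* optional configuration (flags, ignores, xml/synthetic source) goes here */
+ if (hwloc_topology_load(topology) < 0) {
+ hwloc_topology_destroy(topology);
+ return -1;
+ }
+ printf("topology depth = %u\n", hwloc_topology_get_depth(topology));
+ hwloc_topology_destroy(topology);
+ return 0;
+}
+#endif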
+
+int
+hwloc_topology_set_pid(struct hwloc_topology *topology __hwloc_attribute_unused,
+ hwloc_pid_t pid __hwloc_attribute_unused)
+{
+ /* this does *not* change the backend */
+#ifdef HWLOC_LINUX_SYS
+ topology->pid = pid;
+ return 0;
+#else /* HWLOC_LINUX_SYS */
+ errno = ENOSYS;
+ return -1;
+#endif /* HWLOC_LINUX_SYS */
+}
+
+int
+hwloc_topology_set_synthetic(struct hwloc_topology *topology, const char *description)
+{
+ return hwloc_disc_component_force_enable(topology,
+ 0 /* api */,
+ -1, "synthetic",
+ description, NULL, NULL);
+}
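+
+/*
+ * Sketch of forcing the synthetic backend before load.  The description
+ * string is only an assumed example of the synthetic syntax, and the helper
+ * name is made up:
+ */
+#if 0 /* usage sketch, not compiled */
+static int example_synthetic(hwloc_topology_t topology) /* hypothetical; call between init and load */
+{
+ /* "node:2 core:2 pu:2" is an assumed example description */
+ if (hwloc_topology_set_synthetic(topology, "node:2 core:2 pu:2") < 0)
+ return -1;
+ return hwloc_topology_load(topology); /* fake machine: 2 NUMA nodes x 2 cores x 2 PUs */
+}
+#endif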
+
+int
+hwloc_topology_set_xml(struct hwloc_topology *topology,
+ const char *xmlpath)
+{
+ return hwloc_disc_component_force_enable(topology,
+ 0 /* api */,
+ -1, "xml",
+ xmlpath, NULL, NULL);
+}
+
+int
+hwloc_topology_set_xmlbuffer(struct hwloc_topology *topology,
+ const char *xmlbuffer,
+ int size)
+{
+ return hwloc_disc_component_force_enable(topology,
+ 0 /* api */,
+ -1, "xml", NULL,
+ xmlbuffer, (void*) (uintptr_t) size);
+}
+
+int
+hwloc_topology_set_flags (struct hwloc_topology *topology, unsigned long flags)
+{
+ if (topology->is_loaded) {
+ /* actually harmless */
+ errno = EBUSY;
+ return -1;
+ }
+ topology->flags = flags;
+ return 0;
+}
+
+unsigned long
+hwloc_topology_get_flags (struct hwloc_topology *topology)
+{
+ return topology->flags;
+}
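+
+/*
+ * Flags must be set between init and load.  A sketch combining the two
+ * flags used elsewhere in this file (I/O discovery and the whole-system
+ * view); the helper name is made up:
+ */
+#if 0 /* usage sketch, not compiled */
+static int example_load_with_io(hwloc_topology_t *topologyp) /* hypothetical name */
+{
+ if (hwloc_topology_init(topologyp) < 0)
+ return -1;
+ hwloc_topology_set_flags(*topologyp,
+ HWLOC_TOPOLOGY_FLAG_IO_DEVICES | HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM);
+ return hwloc_topology_load(*topologyp);
+}
+#endif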
+
+int
+hwloc_topology_ignore_type(struct hwloc_topology *topology, hwloc_obj_type_t type)
+{
+ if (type >= HWLOC_OBJ_TYPE_MAX) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (type == HWLOC_OBJ_PU || type == HWLOC_OBJ_NUMANODE) {
+ /* we need the PU and NUMA levels */
+ errno = EINVAL;
+ return -1;
+ } else if (hwloc_obj_type_is_io(type)) {
+ /* I/O devices aren't in any level, use topology flags to ignore them */
+ errno = EINVAL;
+ return -1;
+ }
+
+ topology->ignored_types[type] = HWLOC_IGNORE_TYPE_ALWAYS;
+ return 0;
+}
+
+int
+hwloc_topology_ignore_type_keep_structure(struct hwloc_topology *topology, hwloc_obj_type_t type)
+{
+ if (type >= HWLOC_OBJ_TYPE_MAX) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (type == HWLOC_OBJ_PU || type == HWLOC_OBJ_NUMANODE || type == HWLOC_OBJ_MISC) {
+ /* We need the PU and NUMA levels.
+ * Misc objects are outside the main topology structure, so ignoring them makes no sense.
+ */
+ errno = EINVAL;
+ return -1;
+ } else if (hwloc_obj_type_is_io(type)) {
+ /* I/O devices aren't in any level, use topology flags to ignore them */
+ errno = EINVAL;
+ return -1;
+ }
+
+ topology->ignored_types[type] = HWLOC_IGNORE_TYPE_KEEP_STRUCTURE;
+ return 0;
+}
+
+int
+hwloc_topology_ignore_all_keep_structure(struct hwloc_topology *topology)
+{
+ unsigned type;
+ for(type = HWLOC_OBJ_SYSTEM; type < HWLOC_OBJ_TYPE_MAX; type++)
+ if (type != HWLOC_OBJ_PU && type != HWLOC_OBJ_NUMANODE
+ && !hwloc_obj_type_is_io((hwloc_obj_type_t) type))
+ topology->ignored_types[type] = HWLOC_IGNORE_TYPE_KEEP_STRUCTURE;
+ return 0;
+}
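+
+/*
+ * Ignore requests are recorded here and applied during discovery by
+ * ignore_type_always()/ignore_type_keep_structure().  A sketch that drops
+ * Cache objects entirely and keeps Package objects only where they add
+ * structure; the helper name is made up:
+ */
+#if 0 /* usage sketch, not compiled */
+static int example_simplify(hwloc_topology_t topology) /* hypothetical; call between init and load */
+{
+ hwloc_topology_ignore_type(topology, HWLOC_OBJ_CACHE);
+ hwloc_topology_ignore_type_keep_structure(topology, HWLOC_OBJ_PACKAGE);
+ return hwloc_topology_load(topology);
+}
+#endif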
+
+/* traverse the tree and free everything.
+ * only use first_child/next_sibling so that it works before load()
+ * and may be used when switching between backends.
+ */
+static void
+hwloc_topology_clear_tree (struct hwloc_topology *topology, struct hwloc_obj *root)
+{
+ hwloc_obj_t child;
+ child = root->first_child;
+ while (child) {
+ hwloc_obj_t nextchild = child->next_sibling;
+ hwloc_topology_clear_tree (topology, child);
+ child = nextchild;
+ }
+ child = root->io_first_child;
+ while (child) {
+ hwloc_obj_t nextchild = child->next_sibling;
+ hwloc_topology_clear_tree (topology, child);
+ child = nextchild;
+ }
+ child = root->misc_first_child;
+ while (child) {
+ hwloc_obj_t nextchild = child->next_sibling;
+ hwloc_topology_clear_tree (topology, child);
+ child = nextchild;
+ }
+ hwloc_free_unlinked_object (root);
+}
+
+void
+hwloc_topology_clear (struct hwloc_topology *topology)
+{
+ unsigned l;
+ hwloc_topology_clear_tree (topology, topology->levels[0][0]);
+ for (l=0; l<topology->nb_levels; l++) {
+ free(topology->levels[l]);
+ topology->levels[l] = NULL;
+ }
+ free(topology->bridge_level);
+ free(topology->pcidev_level);
+ free(topology->osdev_level);
+ free(topology->misc_level);
+}
+
+void
+hwloc_topology_destroy (struct hwloc_topology *topology)
+{
+ hwloc_backends_disable_all(topology);
+ hwloc_components_destroy_all(topology);
+
+ hwloc_topology_clear(topology);
+ hwloc_distances_destroy(topology);
+
+ free(topology->support.discovery);
+ free(topology->support.cpubind);
+ free(topology->support.membind);
+ free(topology);
+}
+
+int
+hwloc_topology_load (struct hwloc_topology *topology)
+{
+ int err;
+
+ if (topology->is_loaded) {
+ errno = EBUSY;
+ return -1;
+ }
+
+ /* Only apply variables if we have not changed the backend yet.
+ * Only the last one will be kept.
+ * Check for XML last (that's the one that may be set system-wide by administrators)
+ * so that it's only used if other variables are not set,
+ * to allow users to override easily.
+ */
+ if (!topology->backends) {
+ const char *synthetic_env = getenv("HWLOC_SYNTHETIC");
+ if (synthetic_env)
+ hwloc_disc_component_force_enable(topology,
+ 1 /* env force */,
+ -1, "synthetic",
+ synthetic_env, NULL, NULL);
+ }
+ if (!topology->backends) {
+ const char *fsroot_path_env = getenv("HWLOC_FSROOT");
+ if (fsroot_path_env)
+ hwloc_disc_component_force_enable(topology,
+ 1 /* env force */,
+ HWLOC_DISC_COMPONENT_TYPE_CPU, "linux",
+ fsroot_path_env, NULL, NULL);
+ }
+ if (!topology->backends) {
+ const char *xmlpath_env = getenv("HWLOC_XMLFILE");
+ if (xmlpath_env)
+ hwloc_disc_component_force_enable(topology,
+ 1 /* env force */,
+ -1, "xml",
+ xmlpath_env, NULL, NULL);
+ }
+
+ /* instantiate all possible other backends now */
+ hwloc_disc_components_enable_others(topology);
+ /* now that backends are enabled, update the thissystem flag */
+ hwloc_backends_is_thissystem(topology);
+
+ /* Get distance matrices from the environment and store them (as indexes) in the topology.
+ * Indexes will be converted into objects later, once the tree is filled.
+ */
+ hwloc_distances_set_from_env(topology);
+
+ /* actual topology discovery */
+ err = hwloc_discover(topology);
+ if (err < 0)
+ goto out;
+
+#ifndef HWLOC_DEBUG
+ if (getenv("HWLOC_DEBUG_CHECK"))
+#endif
+ hwloc_topology_check(topology);
+
+ topology->is_loaded = 1;
+ return 0;
+
+ out:
+ hwloc_topology_clear(topology);
+ hwloc_distances_destroy(topology);
+ hwloc_topology_setup_defaults(topology);
+ hwloc_backends_disable_all(topology);
+ return -1;
+}
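+
+/*
+ * The environment variables checked above select a backend without code
+ * changes, e.g. (paths and description are placeholders):
+ *
+ *   HWLOC_XMLFILE=/path/to/saved-topology.xml ./myprog
+ *   HWLOC_SYNTHETIC="node:2 core:4 pu:2" ./myprog
+ *
+ * A sketch of the equivalent programmatic override; the helper name and
+ * path are made up:
+ */
+#if 0 /* usage sketch, not compiled */
+static int example_load_saved_xml(hwloc_topology_t *topologyp) /* hypothetical name */
+{
+ if (hwloc_topology_init(topologyp) < 0)
+ return -1;
+ if (hwloc_topology_set_xml(*topologyp, "/path/to/saved-topology.xml") < 0
+ || hwloc_topology_load(*topologyp) < 0) {
+ hwloc_topology_destroy(*topologyp);
+ return -1;
+ }
+ return 0; /* describes the saved machine, not necessarily this one */
+}
+#endif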
+
+/* adjust object cpusets according to the given droppedcpuset,
+ * drop objects whose cpusets become empty,
+ * and mark dropped nodes in droppednodeset
+ */
+static void
+restrict_object(hwloc_topology_t topology, unsigned long flags, hwloc_obj_t *pobj, hwloc_const_cpuset_t droppedcpuset, hwloc_nodeset_t droppednodeset, int droppingparent)
+{
+ hwloc_obj_t obj = *pobj, child, *pchild;
+ int dropping;
+ int modified = hwloc_bitmap_intersects(obj->complete_cpuset, droppedcpuset);
+
+ hwloc_clear_object_distances(obj);
+
+ hwloc_bitmap_andnot(obj->cpuset, obj->cpuset, droppedcpuset);
+ hwloc_bitmap_andnot(obj->complete_cpuset, obj->complete_cpuset, droppedcpuset);
+ hwloc_bitmap_andnot(obj->allowed_cpuset, obj->allowed_cpuset, droppedcpuset);
+
+ dropping = droppingparent || hwloc_bitmap_iszero(obj->cpuset);
+
+ if (modified) {
+ for_each_child_safe(child, obj, pchild)
+ restrict_object(topology, flags, pchild, droppedcpuset, droppednodeset, dropping);
+ /* Nothing to restrict under I/O or Misc */
+ }
+
+ if (dropping) {
+ hwloc_debug("%s", "\nRemoving object during restrict");
+ hwloc_debug_print_object(0, obj);
+ if (obj->type == HWLOC_OBJ_NUMANODE)
+ hwloc_bitmap_set(droppednodeset, obj->os_index);
+ if (obj->io_first_child && !(flags & HWLOC_RESTRICT_FLAG_ADAPT_IO))
+ unlink_and_free_object_and_children(&obj->io_first_child);
+ if (obj->misc_first_child && !(flags & HWLOC_RESTRICT_FLAG_ADAPT_MISC))
+ unlink_and_free_object_and_children(&obj->misc_first_child);
+ unlink_and_free_single_object(pobj);
+ topology->modified = 1;
+ /* Do not remove children; if they were to be removed, they would have been already. */
+ }
+}
+
+/* adjust object nodesets according to the given droppednodeset
+ */
+static void
+restrict_object_nodeset(hwloc_topology_t topology, hwloc_obj_t *pobj, hwloc_nodeset_t droppednodeset)
+{
+ hwloc_obj_t obj = *pobj, child, *pchild;
+
+ /* if this object isn't modified, don't bother looking at children */
+ if (!hwloc_bitmap_intersects(obj->complete_nodeset, droppednodeset))
+ return;
+
+ hwloc_bitmap_andnot(obj->nodeset, obj->nodeset, droppednodeset);
+ hwloc_bitmap_andnot(obj->complete_nodeset, obj->complete_nodeset, droppednodeset);
+ hwloc_bitmap_andnot(obj->allowed_nodeset, obj->allowed_nodeset, droppednodeset);
+
+ for_each_child_safe(child, obj, pchild)
+ restrict_object_nodeset(topology, pchild, droppednodeset);
+ /* Nothing to restrict under I/O and Misc */
+}
+
+int
+hwloc_topology_restrict(struct hwloc_topology *topology, hwloc_const_cpuset_t cpuset, unsigned long flags)
+{
+ hwloc_bitmap_t droppedcpuset, droppednodeset;
+
+ /* make sure we'll keep something in the topology */
+ if (!hwloc_bitmap_intersects(cpuset, topology->levels[0][0]->cpuset)) {
+ errno = EINVAL; /* easy failure, just don't touch the topology */
+ return -1;
+ }
+
+ droppedcpuset = hwloc_bitmap_alloc();
+ droppednodeset = hwloc_bitmap_alloc();
+
+ /* drop objects based on the complement of the given cpuset, and fill the 'dropped' nodeset */
+ hwloc_bitmap_not(droppedcpuset, cpuset);
+ restrict_object(topology, flags, &topology->levels[0][0], droppedcpuset, droppednodeset, 0 /* root cannot be removed */);
+ /* update nodesets according to dropped nodeset */
+ restrict_object_nodeset(topology, &topology->levels[0][0], droppednodeset);
+
+ hwloc_bitmap_free(droppedcpuset);
+ hwloc_bitmap_free(droppednodeset);
+
+ hwloc_connect_children(topology->levels[0][0]);
+ if (hwloc_connect_levels(topology) < 0)
+ goto out;
+ topology->modified = 0;
+
+ propagate_total_memory(topology->levels[0][0]);
+ hwloc_distances_restrict(topology, flags);
+ hwloc_distances_finalize_os(topology);
+ hwloc_distances_finalize_logical(topology);
+ return 0;
+
+ out:
+ /* unrecoverable failure, re-init the topology */
+ hwloc_topology_clear(topology);
+ hwloc_distances_destroy(topology);
+ hwloc_topology_setup_defaults(topology);
+ return -1;
+}
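+
+/*
+ * Sketch of restricting a loaded topology to its first few PUs while
+ * adapting I/O and Misc objects instead of dropping them; the helper name
+ * is made up and the PU level is assumed to be the deepest level:
+ */
+#if 0 /* usage sketch, not compiled */
+static int example_restrict_to_first_pus(hwloc_topology_t topology, unsigned nbpus) /* hypothetical name */
+{
+ unsigned pudepth = hwloc_topology_get_depth(topology) - 1; /* PUs are the last level */
+ hwloc_bitmap_t keep = hwloc_bitmap_alloc();
+ unsigned i;
+ int err;
+ for (i = 0; i < nbpus && i < hwloc_get_nbobjs_by_depth(topology, pudepth); i++)
+ hwloc_bitmap_or(keep, keep, hwloc_get_obj_by_depth(topology, pudepth, i)->cpuset);
+ err = hwloc_topology_restrict(topology, keep,
+ HWLOC_RESTRICT_FLAG_ADAPT_IO | HWLOC_RESTRICT_FLAG_ADAPT_MISC);
+ hwloc_bitmap_free(keep);
+ return err;
+}
+#endif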
+
+int
+hwloc_topology_is_thissystem(struct hwloc_topology *topology)
+{
+ return topology->is_thissystem;
+}
+
+unsigned
+hwloc_topology_get_depth(struct hwloc_topology *topology)
+{
+ return topology->nb_levels;
+}
+
+const struct hwloc_topology_support *
+hwloc_topology_get_support(struct hwloc_topology * topology)
+{
+ return &topology->support;
+}
+
+void hwloc_topology_set_userdata(struct hwloc_topology * topology, const void *userdata)
+{
+ topology->userdata = (void *) userdata;
+}
+
+void * hwloc_topology_get_userdata(struct hwloc_topology * topology)
+{
+ return topology->userdata;
+}
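+
+/*
+ * The userdata pointer is an opaque per-topology slot for the caller; hwloc
+ * never dereferences it.  A sketch attaching an application context; the
+ * struct and helper names are made up:
+ */
+#if 0 /* usage sketch, not compiled */
+struct example_ctx { int verbose; }; /* hypothetical application context */
+static void example_userdata(hwloc_topology_t topology, struct example_ctx *ctx) /* hypothetical name */
+{
+ struct example_ctx *c;
+ hwloc_topology_set_userdata(topology, ctx);
+ /* ... later, in code that only receives the topology ... */
+ c = hwloc_topology_get_userdata(topology);
+ if (c->verbose)
+ printf("depth=%u\n", hwloc_topology_get_depth(topology));
+}
+#endif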
+
+/****************
+ * Debug Checks *
+ ****************/
+
+static void
+hwloc__check_object(hwloc_topology_t topology, hwloc_obj_t obj);
+
+/* check the children of a parent object */
+static void
+hwloc__check_children(hwloc_topology_t topology, hwloc_obj_t parent)
+{
+ unsigned j;
+
+ if (!parent->arity) {
+ /* check whether that parent has no children for real */
+ assert(!parent->children);
+ assert(!parent->first_child);
+ assert(!parent->last_child);
+ return;
+ }
+ /* check whether that parent has children for real */
+ assert(parent->children);
+ assert(parent->first_child);
+ assert(parent->last_child);
+
+ /* sibling checks */
+ for(j=0; j<parent->arity; j++) {
+ hwloc_obj_t child = parent->children[j];
+ assert(child->parent == parent);
+ assert(child->sibling_rank == j);
+ if (j)
+ assert(child->prev_sibling == parent->children[j-1]);
+ else
+ assert(!child->prev_sibling);
+ if (j == parent->arity-1)
+ assert(!child->next_sibling);
+ else
+ assert(child->next_sibling == parent->children[j+1]);
+ if (!hwloc_obj_type_is_io(child->type))
+ assert(child->depth > parent->depth);
+ /* recurse */
+ hwloc__check_object(topology, child);
+ }
+ assert(parent->first_child == parent->children[0]);
+ assert(parent->last_child == parent->children[parent->arity-1]);
+
+ /* we already checked in the caller that objects have either all sets or none */
+
+ {
+ /* check that parent->cpuset == exclusive OR of children
+ * (can be wrong for complete_cpuset since disallowed/offline/unknown PUs can be removed)
+ */
+ hwloc_bitmap_t remaining_parent_cpuset = hwloc_bitmap_dup(parent->cpuset);
+ hwloc_bitmap_t remaining_parent_nodeset = hwloc_bitmap_dup(parent->nodeset);
+ for(j=0; j<parent->arity; j++) {
+ if (!parent->children[j]->cpuset)
+ continue;
+ /* check that the child cpuset is included in the remainder of the parent */
+ assert(hwloc_bitmap_isincluded(parent->children[j]->cpuset, remaining_parent_cpuset));
+ hwloc_bitmap_andnot(remaining_parent_cpuset, remaining_parent_cpuset, parent->children[j]->cpuset);
+ /* check that the child nodeset is included in the parent's (multiple children may have the same nodeset when we're below a NUMA node) */
+ assert(hwloc_bitmap_isincluded(parent->children[j]->nodeset, parent->nodeset));
+ hwloc_bitmap_andnot(remaining_parent_nodeset, remaining_parent_nodeset, parent->children[j]->nodeset);
+ }
+
+ if (parent->type == HWLOC_OBJ_PU) {
+ /* if parent is a PU (with Misc children for instance),
+ * its os_index bit may remain in cpuset. */
+ assert(hwloc_bitmap_weight(remaining_parent_cpuset) == 1);
+ assert(hwloc_bitmap_first(remaining_parent_cpuset) == (int)parent->os_index);
+ } else {
+ /* nothing remains */
+ assert(hwloc_bitmap_iszero(remaining_parent_cpuset));
+ }
+ hwloc_bitmap_free(remaining_parent_cpuset);
+
+ if (parent->type == HWLOC_OBJ_NUMANODE)
+ /* if parent is a NUMA node, its os_index bit may remain.
+ * or it could already have been removed by a child. */
+ hwloc_bitmap_clr(remaining_parent_nodeset, parent->os_index);
+ if (parent->type == HWLOC_OBJ_PU) {
+ /* if parent is a PU (with Misc children for instance),
+ * one bit may remain in nodeset. */
+ assert(hwloc_bitmap_weight(remaining_parent_nodeset) == 1);
+ } else {
+ /* nothing remains */
+ assert(hwloc_bitmap_iszero(remaining_parent_nodeset));
+ }
+ hwloc_bitmap_free(remaining_parent_nodeset);
+ }
+
+ /* check that children's complete_cpusets are properly ordered; empty ones may be anywhere
+ * (can be wrong for main cpuset since removed PUs can break the ordering).
+ */
+ {
+ int firstchild;
+ int prev_firstchild = -1; /* -1 works fine with first comparisons below */
+ for(j=0; j<parent->arity; j++) {
+ if (!parent->children[j]->complete_cpuset
+ || hwloc_bitmap_iszero(parent->children[j]->complete_cpuset))
+ continue;
+
+ firstchild = hwloc_bitmap_first(parent->children[j]->complete_cpuset);
+ assert(prev_firstchild < firstchild);
+ prev_firstchild = firstchild;
+ }
+ }
+}
+
+static void
+hwloc__check_io_children(hwloc_topology_t topology, hwloc_obj_t parent)
+{
+ unsigned j;
+ hwloc_obj_t child, prev;
+
+ if (!parent->io_arity) {
+ /* check whether that parent has no children for real */
+ assert(!parent->io_first_child);
+ return;
+ }
+ /* check whether that parent has children for real */
+ assert(parent->io_first_child);
+
+ for(prev = NULL, child = parent->io_first_child, j = 0;
+ child;
+ prev = child, child = child->next_sibling, j++) {
+ /* all children must be I/O */
+ assert(hwloc_obj_type_is_io(child->type));
+
+ /* check siblings */
+ assert(child->parent == parent);
+ assert(child->sibling_rank == j);
+ if (prev)
+ assert(prev->next_sibling == child);
+ assert(child->prev_sibling == prev);
+ if (j == parent->io_arity-1)
+ assert(child->next_sibling == NULL);
+
+ /* only I/O and Misc children, recurse */
+ assert(!child->first_child);
+ hwloc__check_object(topology, child);
+ }
+ /* check arity */
+ assert(j == parent->io_arity);
+}
+
+static void
+hwloc__check_misc_children(hwloc_topology_t topology, hwloc_obj_t parent)
+{
+ unsigned j;
+ hwloc_obj_t child, prev;
+
+ if (!parent->misc_arity) {
+ /* check whether that parent has no children for real */
+ assert(!parent->misc_first_child);
+ return;
+ }
+ /* check whether that parent has children for real */
+ assert(parent->misc_first_child);
+
+ for(prev = NULL, child = parent->misc_first_child, j = 0;
+ child;
+ prev = child, child = child->next_sibling, j++) {
+ /* all children must be Misc */
+ assert(child->type == HWLOC_OBJ_MISC);
+
+ /* check siblings */
+ assert(child->parent == parent);
+ assert(child->sibling_rank == j);
+ if (prev)
+ assert(prev->next_sibling == child);
+ assert(child->prev_sibling == prev);
+ if (j == parent->misc_arity-1)
+ assert(child->next_sibling == NULL);
+
+ /* only Misc children, recurse */
+ assert(!child->first_child);
+ assert(!child->io_first_child);
+ hwloc__check_object(topology, child);
+ }
+ /* check arity */
+ assert(j == parent->misc_arity);
+}
+
+static void
+hwloc__check_object(hwloc_topology_t topology, hwloc_obj_t obj)
+{
+ /* check sets and depth */
+ if (hwloc_obj_type_is_special(obj->type)) {
+ assert(!obj->cpuset);
+ if (obj->type == HWLOC_OBJ_BRIDGE)
+ assert(obj->depth == (unsigned) HWLOC_TYPE_DEPTH_BRIDGE);
+ else if (obj->type == HWLOC_OBJ_PCI_DEVICE)
+ assert(obj->depth == (unsigned) HWLOC_TYPE_DEPTH_PCI_DEVICE);
+ else if (obj->type == HWLOC_OBJ_OS_DEVICE)
+ assert(obj->depth == (unsigned) HWLOC_TYPE_DEPTH_OS_DEVICE);
+ else if (obj->type == HWLOC_OBJ_MISC)
+ assert(obj->depth == (unsigned) HWLOC_TYPE_DEPTH_MISC);
+ } else {
+ assert(obj->cpuset);
+ assert((int) obj->depth >= 0);
+ }
+
+ /* there are other cpusets and nodesets if and only if there's a main cpuset */
+ assert(!!obj->cpuset == !!obj->complete_cpuset);
+ assert(!!obj->cpuset == !!obj->allowed_cpuset);
+ assert(!!obj->cpuset == !!obj->nodeset);
+ assert(!!obj->nodeset == !!obj->complete_nodeset);
+ assert(!!obj->nodeset == !!obj->allowed_nodeset);
+
+ /* check that the complete and allowed sets are consistent with the main sets */
+ if (obj->cpuset) {
+ assert(hwloc_bitmap_isincluded(obj->cpuset, obj->complete_cpuset));
+ assert(hwloc_bitmap_isincluded(obj->nodeset, obj->complete_nodeset));
+ if (topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM) {
+ assert(hwloc_bitmap_isincluded(obj->allowed_cpuset, obj->cpuset));
+ assert(hwloc_bitmap_isincluded(obj->allowed_nodeset, obj->nodeset));
+ } else {
+ assert(hwloc_bitmap_isequal(obj->allowed_cpuset, obj->cpuset));
+ assert(hwloc_bitmap_isequal(obj->allowed_nodeset, obj->nodeset));
+ }
+ }
+
+ /* check children */
+ hwloc__check_children(topology, obj);
+ hwloc__check_io_children(topology, obj);
+ hwloc__check_misc_children(topology, obj);
+}
+
+static void
+hwloc__check_level(struct hwloc_topology *topology, unsigned depth)
+{
+ unsigned width = hwloc_get_nbobjs_by_depth(topology, depth);
+ struct hwloc_obj *prev = NULL;
+ hwloc_obj_t obj;
+ unsigned j;
+
+ /* check each object of the level */
+ for(j=0; j<width; j++) {
+ obj = hwloc_get_obj_by_depth(topology, depth, j);
+ /* check that the object is correctly placed horizontally and vertically */
+ assert(obj);
+ assert(obj->depth == depth);
+ assert(obj->logical_index == j);
+ /* check that all objects in the level have the same type */
+ if (prev) {
+ assert(hwloc_type_cmp(obj, prev) == HWLOC_TYPE_EQUAL);
+ assert(prev->next_cousin == obj);
+ }
+ assert(obj->prev_cousin == prev);
+
+ /* check that PUs and NUMA nodes have correct cpuset/nodeset */
+ if (obj->type == HWLOC_OBJ_PU) {
+ assert(hwloc_bitmap_weight(obj->complete_cpuset) == 1);
+ assert(hwloc_bitmap_first(obj->complete_cpuset) == (int) obj->os_index);
+ }
+ if (obj->type == HWLOC_OBJ_NUMANODE) {
+ assert(hwloc_bitmap_weight(obj->complete_nodeset) == 1);
+ assert(hwloc_bitmap_first(obj->complete_nodeset) == (int) obj->os_index);
+ }
+ prev = obj;
+ }
+ if (prev)
+ assert(prev->next_cousin == NULL);
+
+ if (width) {
+ /* check first object of the level */
+ obj = hwloc_get_obj_by_depth(topology, depth, 0);
+ assert(obj);
+ assert(!obj->prev_cousin);
+ /* check type */
+ assert(hwloc_get_depth_type(topology, depth) == obj->type);
+ assert(depth == (unsigned) hwloc_get_type_depth(topology, obj->type)
+ || HWLOC_TYPE_DEPTH_MULTIPLE == hwloc_get_type_depth(topology, obj->type));
+ /* check last object of the level */
+ obj = hwloc_get_obj_by_depth(topology, depth, width-1);
+ assert(obj);
+ assert(!obj->next_cousin);
+ }
+
+ /* check last+1 object of the level */
+ obj = hwloc_get_obj_by_depth(topology, depth, width);
+ assert(!obj);
+}
+
+/* check a whole topology structure */
+void
+hwloc_topology_check(struct hwloc_topology *topology)
+{
+ struct hwloc_obj *obj;
+ hwloc_obj_type_t type;
+ unsigned i, j, depth;
+
+ depth = hwloc_topology_get_depth(topology);
+
+ assert(!topology->modified);
+
+ /* check type orders */
+ for (type = HWLOC_OBJ_SYSTEM; type < HWLOC_OBJ_TYPE_MAX; type++) {
+ assert(hwloc_get_order_type(hwloc_get_type_order(type)) == type);
+ }
+ for (i = hwloc_get_type_order(HWLOC_OBJ_SYSTEM);
+ i <= hwloc_get_type_order(HWLOC_OBJ_CORE); i++) {
+ assert(i == hwloc_get_type_order(hwloc_get_order_type(i)));
+ }
+
+ /* check that last level is PU */
+ assert(hwloc_get_depth_type(topology, depth-1) == HWLOC_OBJ_PU);
+ assert(hwloc_get_nbobjs_by_depth(topology, depth-1) > 0);
+ for(j=0; j<hwloc_get_nbobjs_by_depth(topology, depth-1); j++) {
+ obj = hwloc_get_obj_by_depth(topology, depth-1, j);
+ assert(obj);
+ assert(obj->type == HWLOC_OBJ_PU);
+ }
+ /* check that other levels are not PU */
+ for(i=1; i<depth-1; i++)
+ assert(hwloc_get_depth_type(topology, i) != HWLOC_OBJ_PU);
+
+ /* check that we have a NUMA level */
+ j = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+ assert(j < hwloc_topology_get_depth(topology));
+ assert(hwloc_get_depth_type(topology, j) == HWLOC_OBJ_NUMANODE);
+ /* check that other levels are not NUMA */
+ for(i=0; i<depth-1; i++)
+ if (i != j)
+ assert(hwloc_get_depth_type(topology, i) != HWLOC_OBJ_NUMANODE);
+
+ /* top-level specific checks */
+ assert(hwloc_get_nbobjs_by_depth(topology, 0) == 1);
+ obj = hwloc_get_root_obj(topology);
+ assert(obj);
+ assert(!obj->parent);
+ assert(obj->cpuset);
+ assert(!obj->depth);
+
+ /* check each level */
+ for(i=0; i<depth; i++)
+ hwloc__check_level(topology, i);
+ hwloc__check_level(topology, HWLOC_OBJ_BRIDGE);
+ hwloc__check_level(topology, HWLOC_OBJ_PCI_DEVICE);
+ hwloc__check_level(topology, HWLOC_OBJ_OS_DEVICE);
+ hwloc__check_level(topology, HWLOC_OBJ_MISC);
+
+ /* recurse and check the tree of children, and type-specific checks */
+ hwloc__check_object(topology, obj);
+}
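+
+/*
+ * hwloc_topology_check() is also run by hwloc_topology_load() itself in
+ * debug builds or when HWLOC_DEBUG_CHECK is set.  A sketch of calling it
+ * explicitly; the helper name is made up:
+ */
+#if 0 /* usage sketch, not compiled */
+static int example_paranoid_load(hwloc_topology_t topology) /* hypothetical name */
+{
+ if (hwloc_topology_load(topology) < 0)
+ return -1;
+ hwloc_topology_check(topology); /* asserts if levels, children or cousins are inconsistent */
+ return 0;
+}
+#endif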
diff --git a/ext/hwloc/hwloc/traversal.c b/ext/hwloc/hwloc/traversal.c
new file mode 100644
index 0000000..f1e9ba7
--- /dev/null
+++ b/ext/hwloc/hwloc/traversal.c
@@ -0,0 +1,701 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria. All rights reserved.
+ * Copyright © 2009-2010 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/misc.h>
+#include <private/debug.h>
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif /* HAVE_STRINGS_H */
+
+int
+hwloc_get_type_depth (struct hwloc_topology *topology, hwloc_obj_type_t type)
+{
+ return topology->type_depth[type];
+}
+
+hwloc_obj_type_t
+hwloc_get_depth_type (hwloc_topology_t topology, unsigned depth)
+{
+ if (depth >= topology->nb_levels)
+ switch (depth) {
+ case HWLOC_TYPE_DEPTH_BRIDGE:
+ return HWLOC_OBJ_BRIDGE;
+ case HWLOC_TYPE_DEPTH_PCI_DEVICE:
+ return HWLOC_OBJ_PCI_DEVICE;
+ case HWLOC_TYPE_DEPTH_OS_DEVICE:
+ return HWLOC_OBJ_OS_DEVICE;
+ case HWLOC_TYPE_DEPTH_MISC:
+ return HWLOC_OBJ_MISC;
+ default:
+ return (hwloc_obj_type_t) -1;
+ }
+ return topology->levels[depth][0]->type;
+}
+
+unsigned
+hwloc_get_nbobjs_by_depth (struct hwloc_topology *topology, unsigned depth)
+{
+ if (depth >= topology->nb_levels)
+ switch (depth) {
+ case HWLOC_TYPE_DEPTH_BRIDGE:
+ return topology->bridge_nbobjects;
+ case HWLOC_TYPE_DEPTH_PCI_DEVICE:
+ return topology->pcidev_nbobjects;
+ case HWLOC_TYPE_DEPTH_OS_DEVICE:
+ return topology->osdev_nbobjects;
+ case HWLOC_TYPE_DEPTH_MISC:
+ return topology->misc_nbobjects;
+ default:
+ return 0;
+ }
+ return topology->level_nbobjects[depth];
+}
+
+struct hwloc_obj *
+hwloc_get_obj_by_depth (struct hwloc_topology *topology, unsigned depth, unsigned idx)
+{
+ if (depth >= topology->nb_levels)
+ switch (depth) {
+ case HWLOC_TYPE_DEPTH_BRIDGE:
+ return idx < topology->bridge_nbobjects ? topology->bridge_level[idx] : NULL;
+ case HWLOC_TYPE_DEPTH_PCI_DEVICE:
+ return idx < topology->pcidev_nbobjects ? topology->pcidev_level[idx] : NULL;
+ case HWLOC_TYPE_DEPTH_OS_DEVICE:
+ return idx < topology->osdev_nbobjects ? topology->osdev_level[idx] : NULL;
+ case HWLOC_TYPE_DEPTH_MISC:
+ return idx < topology->misc_nbobjects ? topology->misc_level[idx] : NULL;
+ default:
+ return NULL;
+ }
+ if (idx >= topology->level_nbobjects[depth])
+ return NULL;
+ return topology->levels[depth][idx];
+}
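+
+/*
+ * The virtual negative depths handled above let callers enumerate I/O and
+ * Misc objects with the same accessors as regular levels.  A sketch listing
+ * PCI devices; the helper name is made up and I/O discovery is assumed to
+ * have been enabled at load time:
+ */
+#if 0 /* usage sketch, not compiled */
+static void example_list_pcidevs(hwloc_topology_t topology) /* hypothetical name */
+{
+ int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PCI_DEVICE); /* HWLOC_TYPE_DEPTH_PCI_DEVICE */
+ unsigned i, n = hwloc_get_nbobjs_by_depth(topology, depth);
+ for (i = 0; i < n; i++) {
+ hwloc_obj_t dev = hwloc_get_obj_by_depth(topology, depth, i);
+ printf("PCI %04x:%04x (%s)\n",
+ dev->attr->pcidev.vendor_id, dev->attr->pcidev.device_id,
+ dev->name ? dev->name : "unnamed");
+ }
+}
+#endif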
+
+unsigned hwloc_get_closest_objs (struct hwloc_topology *topology, struct hwloc_obj *src, struct hwloc_obj **objs, unsigned max)
+{
+ struct hwloc_obj *parent, *nextparent, **src_objs;
+ int i,src_nbobjects;
+ unsigned stored = 0;
+
+ if (!src->cpuset)
+ return 0;
+
+ src_nbobjects = topology->level_nbobjects[src->depth];
+ src_objs = topology->levels[src->depth];
+
+ parent = src;
+ while (stored < max) {
+ while (1) {
+ nextparent = parent->parent;
+ if (!nextparent)
+ goto out;
+ if (!hwloc_bitmap_isequal(parent->cpuset, nextparent->cpuset))
+ break;
+ parent = nextparent;
+ }
+
+ /* traverse src's objects and find those that are in nextparent and were not in parent */
+ for(i=0; i<src_nbobjects; i++) {
+ if (hwloc_bitmap_isincluded(src_objs[i]->cpuset, nextparent->cpuset)
+ && !hwloc_bitmap_isincluded(src_objs[i]->cpuset, parent->cpuset)) {
+ objs[stored++] = src_objs[i];
+ if (stored == max)
+ goto out;
+ }
+ }
+ parent = nextparent;
+ }
+
+ out:
+ return stored;
+}
+
+static int
+hwloc__get_largest_objs_inside_cpuset (struct hwloc_obj *current, hwloc_const_bitmap_t set,
+ struct hwloc_obj ***res, int *max)
+{
+ int gotten = 0;
+ unsigned i;
+
+ /* the caller must ensure this */
+ if (*max <= 0)
+ return 0;
+
+ if (hwloc_bitmap_isequal(current->cpuset, set)) {
+ **res = current;
+ (*res)++;
+ (*max)--;
+ return 1;
+ }
+
+ for (i=0; i<current->arity; i++) {
+ hwloc_bitmap_t subset = hwloc_bitmap_dup(set);
+ int ret;
+
+ /* split out the cpuset part corresponding to this child and see if there's anything to do */
+ hwloc_bitmap_and(subset, subset, current->children[i]->cpuset);
+ if (hwloc_bitmap_iszero(subset)) {
+ hwloc_bitmap_free(subset);
+ continue;
+ }
+
+ ret = hwloc__get_largest_objs_inside_cpuset (current->children[i], subset, res, max);
+ gotten += ret;
+ hwloc_bitmap_free(subset);
+
+ /* if no more room to store remaining objects, return what we got so far */
+ if (!*max)
+ break;
+ }
+
+ return gotten;
+}
+
+int
+hwloc_get_largest_objs_inside_cpuset (struct hwloc_topology *topology, hwloc_const_bitmap_t set,
+ struct hwloc_obj **objs, int max)
+{
+ struct hwloc_obj *current = topology->levels[0][0];
+
+ if (!hwloc_bitmap_isincluded(set, current->cpuset))
+ return -1;
+
+ if (max <= 0)
+ return 0;
+
+ return hwloc__get_largest_objs_inside_cpuset (current, set, &objs, &max);
+}
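+
+/*
+ * Sketch of covering an arbitrary cpuset with the fewest, largest objects;
+ * the helper name is made up and PUs 0-5 are assumed to exist:
+ */
+#if 0 /* usage sketch, not compiled */
+static void example_cover_cpuset(hwloc_topology_t topology) /* hypothetical name */
+{
+ hwloc_obj_t objs[16];
+ hwloc_bitmap_t set = hwloc_bitmap_alloc();
+ int n, i;
+ for (i = 0; i < 6; i++)
+ hwloc_bitmap_set(set, i); /* PUs 0-5, an arbitrary example */
+ n = hwloc_get_largest_objs_inside_cpuset(topology, set, objs, 16);
+ for (i = 0; i < n; i++)
+ printf("%s L#%u\n", hwloc_obj_type_string(objs[i]->type), objs[i]->logical_index);
+ hwloc_bitmap_free(set);
+}
+#endif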
+
+const char *
+hwloc_obj_type_string (hwloc_obj_type_t obj)
+{
+ switch (obj)
+ {
+ case HWLOC_OBJ_SYSTEM: return "System";
+ case HWLOC_OBJ_MACHINE: return "Machine";
+ case HWLOC_OBJ_MISC: return "Misc";
+ case HWLOC_OBJ_GROUP: return "Group";
+ case HWLOC_OBJ_NUMANODE: return "NUMANode";
+ case HWLOC_OBJ_PACKAGE: return "Package";
+ case HWLOC_OBJ_CACHE: return "Cache";
+ case HWLOC_OBJ_CORE: return "Core";
+ case HWLOC_OBJ_BRIDGE: return "Bridge";
+ case HWLOC_OBJ_PCI_DEVICE: return "PCIDev";
+ case HWLOC_OBJ_OS_DEVICE: return "OSDev";
+ case HWLOC_OBJ_PU: return "PU";
+ default: return "Unknown";
+ }
+}
+
+hwloc_obj_type_t
+hwloc_obj_type_of_string (const char * string)
+{
+ if (!strcasecmp(string, "System")) return HWLOC_OBJ_SYSTEM;
+ if (!strcasecmp(string, "Machine")) return HWLOC_OBJ_MACHINE;
+ if (!strcasecmp(string, "Misc")) return HWLOC_OBJ_MISC;
+ if (!strcasecmp(string, "Group")) return HWLOC_OBJ_GROUP;
+ if (!strcasecmp(string, "NUMANode") || !strcasecmp(string, "Node")) return HWLOC_OBJ_NUMANODE;
+ if (!strcasecmp(string, "Package") || !strcasecmp(string, "Socket") /* backward compat with v1.10 */) return HWLOC_OBJ_PACKAGE;
+ if (!strcasecmp(string, "Cache")) return HWLOC_OBJ_CACHE;
+ if (!strcasecmp(string, "Core")) return HWLOC_OBJ_CORE;
+ if (!strcasecmp(string, "PU")) return HWLOC_OBJ_PU;
+ if (!strcasecmp(string, "Bridge")) return HWLOC_OBJ_BRIDGE;
+ if (!strcasecmp(string, "PCIDev")) return HWLOC_OBJ_PCI_DEVICE;
+ if (!strcasecmp(string, "OSDev")) return HWLOC_OBJ_OS_DEVICE;
+ return (hwloc_obj_type_t) -1;
+}
+
+int
+hwloc_obj_type_sscanf(const char *string, hwloc_obj_type_t *typep, int *depthattrp, void *typeattrp, size_t typeattrsize)
+{
+ hwloc_obj_type_t type = (hwloc_obj_type_t) -1;
+ int depthattr = -1;
+ hwloc_obj_cache_type_t cachetypeattr = (hwloc_obj_cache_type_t) -1; /* unspecified */
+ char *end;
+
+ /* types without depthattr */
+ if (!hwloc_strncasecmp(string, "system", 2)) {
+ type = HWLOC_OBJ_SYSTEM;
+ } else if (!hwloc_strncasecmp(string, "machine", 2)) {
+ type = HWLOC_OBJ_MACHINE;
+ } else if (!hwloc_strncasecmp(string, "node", 1)
+ || !hwloc_strncasecmp(string, "numa", 1)) { /* matches node and numanode */
+ type = HWLOC_OBJ_NUMANODE;
+ } else if (!hwloc_strncasecmp(string, "package", 2)
+ || !hwloc_strncasecmp(string, "socket", 2)) { /* backward compat with v1.10 */
+ type = HWLOC_OBJ_PACKAGE;
+ } else if (!hwloc_strncasecmp(string, "core", 2)) {
+ type = HWLOC_OBJ_CORE;
+ } else if (!hwloc_strncasecmp(string, "pu", 2)) {
+ type = HWLOC_OBJ_PU;
+ } else if (!hwloc_strncasecmp(string, "misc", 2)) {
+ type = HWLOC_OBJ_MISC;
+ } else if (!hwloc_strncasecmp(string, "bridge", 2)) {
+ type = HWLOC_OBJ_BRIDGE;
+ } else if (!hwloc_strncasecmp(string, "pci", 2)) {
+ type = HWLOC_OBJ_PCI_DEVICE;
+ } else if (!hwloc_strncasecmp(string, "os", 2)) {
+ type = HWLOC_OBJ_OS_DEVICE;
+
+ /* types with depthattr */
+ } else if (!hwloc_strncasecmp(string, "cache", 2)) {
+ type = HWLOC_OBJ_CACHE;
+
+ } else if ((string[0] == 'l' || string[0] == 'L') && string[1] >= '0' && string[1] <= '9') {
+ type = HWLOC_OBJ_CACHE;
+ depthattr = strtol(string+1, &end, 10);
+ if (*end == 'd') {
+ cachetypeattr = HWLOC_OBJ_CACHE_DATA;
+ } else if (*end == 'i') {
+ cachetypeattr = HWLOC_OBJ_CACHE_INSTRUCTION;
+ } else if (*end == 'u') {
+ cachetypeattr = HWLOC_OBJ_CACHE_UNIFIED;
+ }
+
+ } else if (!hwloc_strncasecmp(string, "group", 2)) {
+ int length;
+ type = HWLOC_OBJ_GROUP;
+ length = strcspn(string, "0123456789");
+ if (length <= 5 && !hwloc_strncasecmp(string, "group", length)
+ && string[length] >= '0' && string[length] <= '9') {
+ depthattr = strtol(string+length, &end, 10);
+ }
+ } else
+ return -1;
+
+ *typep = type;
+ if (depthattrp)
+ *depthattrp = depthattr;
+ if (typeattrp) {
+ if (type == HWLOC_OBJ_CACHE && sizeof(hwloc_obj_cache_type_t) <= typeattrsize)
+ memcpy(typeattrp, &cachetypeattr, sizeof(hwloc_obj_cache_type_t));
+ }
+
+ return 0;
+}
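+
+/*
+ * Sketch of parsing a user-supplied type string such as "core" or "L2d",
+ * including the cache depth/type side-attributes; the helper and variable
+ * names are arbitrary:
+ */
+#if 0 /* usage sketch, not compiled */
+static void example_parse_type(const char *s) /* hypothetical name */
+{
+ hwloc_obj_type_t type;
+ int depthattr;
+ hwloc_obj_cache_type_t cachetype;
+ if (hwloc_obj_type_sscanf(s, &type, &depthattr, &cachetype, sizeof(cachetype)) == 0)
+ /* for "L2d": type == HWLOC_OBJ_CACHE, depthattr == 2, cachetype == HWLOC_OBJ_CACHE_DATA */
+ printf("parsed %s (depth attribute %d)\n", hwloc_obj_type_string(type), depthattr);
+}
+#endif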
+
+static const char *
+hwloc_pci_class_string(unsigned short class_id)
+{
+ switch ((class_id & 0xff00) >> 8) {
+ case 0x00:
+ switch (class_id) {
+ case 0x0001: return "VGA";
+ }
+ return "PCI";
+ case 0x01:
+ switch (class_id) {
+ case 0x0100: return "SCSI";
+ case 0x0101: return "IDE";
+ case 0x0102: return "Flop";
+ case 0x0103: return "IPI";
+ case 0x0104: return "RAID";
+ case 0x0105: return "ATA";
+ case 0x0106: return "SATA";
+ case 0x0107: return "SAS";
+ case 0x0108: return "NVMExp";
+ }
+ return "Stor";
+ case 0x02:
+ switch (class_id) {
+ case 0x0200: return "Ether";
+ case 0x0201: return "TokRn";
+ case 0x0202: return "FDDI";
+ case 0x0203: return "ATM";
+ case 0x0204: return "ISDN";
+ case 0x0205: return "WrdFip";
+ case 0x0206: return "PICMG";
+ case 0x0207: return "IB";
+ }
+ return "Net";
+ case 0x03:
+ switch (class_id) {
+ case 0x0300: return "VGA";
+ case 0x0301: return "XGA";
+ case 0x0302: return "3D";
+ }
+ return "Disp";
+ case 0x04:
+ switch (class_id) {
+ case 0x0400: return "Video";
+ case 0x0401: return "Audio";
+ case 0x0402: return "Phone";
+ case 0x0403: return "Auddv";
+ }
+ return "MM";
+ case 0x05:
+ switch (class_id) {
+ case 0x0500: return "RAM";
+ case 0x0501: return "Flash";
+ }
+ return "Mem";
+ case 0x06:
+ switch (class_id) {
+ case 0x0600: return "Host";
+ case 0x0601: return "ISA";
+ case 0x0602: return "EISA";
+ case 0x0603: return "MC";
+ case 0x0604: return "PCI_B";
+ case 0x0605: return "PCMCIA";
+ case 0x0606: return "Nubus";
+ case 0x0607: return "CardBus";
+ case 0x0608: return "RACEway";
+ case 0x0609: return "PCI_SB";
+ case 0x060a: return "IB_B";
+ }
+ return "Bridg";
+ case 0x07:
+ switch (class_id) {
+ case 0x0700: return "Ser";
+ case 0x0701: return "Para";
+ case 0x0702: return "MSer";
+ case 0x0703: return "Modm";
+ case 0x0704: return "GPIB";
+ case 0x0705: return "SmrtCrd";
+ }
+ return "Comm";
+ case 0x08:
+ switch (class_id) {
+ case 0x0800: return "PIC";
+ case 0x0801: return "DMA";
+ case 0x0802: return "Time";
+ case 0x0803: return "RTC";
+ case 0x0804: return "HtPl";
+ case 0x0805: return "SD-HtPl";
+ case 0x0806: return "IOMMU";
+ }
+ return "Syst";
+ case 0x09:
+ switch (class_id) {
+ case 0x0900: return "Kbd";
+ case 0x0901: return "Pen";
+ case 0x0902: return "Mouse";
+ case 0x0903: return "Scan";
+ case 0x0904: return "Game";
+ }
+ return "In";
+ case 0x0a:
+ return "Dock";
+ case 0x0b:
+ switch (class_id) {
+ case 0x0b00: return "386";
+ case 0x0b01: return "486";
+ case 0x0b02: return "Pent";
+ case 0x0b10: return "Alpha";
+ case 0x0b20: return "PPC";
+ case 0x0b30: return "MIPS";
+ case 0x0b40: return "CoProc";
+ }
+ return "Proc";
+ case 0x0c:
+ switch (class_id) {
+ case 0x0c00: return "Firw";
+ case 0x0c01: return "ACCES";
+ case 0x0c02: return "SSA";
+ case 0x0c03: return "USB";
+ case 0x0c04: return "Fiber";
+ case 0x0c05: return "SMBus";
+ case 0x0c06: return "IB";
+ case 0x0c07: return "IPMI";
+ case 0x0c08: return "SERCOS";
+ case 0x0c09: return "CANBUS";
+ }
+ return "Ser";
+ case 0x0d:
+ switch (class_id) {
+ case 0x0d00: return "IRDA";
+ case 0x0d01: return "IR";
+ case 0x0d10: return "RF";
+ case 0x0d11: return "Blueth";
+ case 0x0d12: return "BroadB";
+ case 0x0d20: return "802.1a";
+ case 0x0d21: return "802.1b";
+ }
+ return "Wifi";
+ case 0x0e:
+ switch (class_id) {
+ case 0x0e00: return "I2O";
+ }
+ return "Intll";
+ case 0x0f:
+ switch (class_id) {
+ case 0x0f00: return "S-TV";
+ case 0x0f01: return "S-Aud";
+ case 0x0f02: return "S-Voice";
+ case 0x0f03: return "S-Data";
+ }
+ return "Satel";
+ case 0x10:
+ return "Crypt";
+ case 0x11:
+ return "Signl";
+ case 0x12:
+ return "Accel";
+ case 0x13:
+ return "Instr";
+ case 0xff:
+ return "Oth";
+ }
+ return "PCI";
+}
+
+static const char* hwloc_obj_cache_type_letter(hwloc_obj_cache_type_t type)
+{
+ switch (type) {
+ case HWLOC_OBJ_CACHE_UNIFIED: return "";
+ case HWLOC_OBJ_CACHE_DATA: return "d";
+ case HWLOC_OBJ_CACHE_INSTRUCTION: return "i";
+ default: return "unknown";
+ }
+}
+
+int
+hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t obj, int verbose)
+{
+ hwloc_obj_type_t type = obj->type;
+ switch (type) {
+ case HWLOC_OBJ_MISC:
+ case HWLOC_OBJ_SYSTEM:
+ case HWLOC_OBJ_MACHINE:
+ case HWLOC_OBJ_NUMANODE:
+ case HWLOC_OBJ_PACKAGE:
+ case HWLOC_OBJ_CORE:
+ case HWLOC_OBJ_PU:
+ return hwloc_snprintf(string, size, "%s", hwloc_obj_type_string(type));
+ case HWLOC_OBJ_CACHE:
+ return hwloc_snprintf(string, size, "L%u%s%s", obj->attr->cache.depth,
+ hwloc_obj_cache_type_letter(obj->attr->cache.type),
+ verbose ? hwloc_obj_type_string(type): "");
+ case HWLOC_OBJ_GROUP:
+ /* TODO: more pretty presentation? */
+ if (obj->attr->group.depth != (unsigned) -1)
+ return hwloc_snprintf(string, size, "%s%u", hwloc_obj_type_string(type), obj->attr->group.depth);
+ else
+ return hwloc_snprintf(string, size, "%s", hwloc_obj_type_string(type));
+ case HWLOC_OBJ_BRIDGE:
+ if (verbose)
+ return snprintf(string, size, "Bridge %s->%s",
+ obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI ? "PCI" : "Host",
+ "PCI");
+ else
+ return snprintf(string, size, obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI ? "PCIBridge" : "HostBridge");
+ case HWLOC_OBJ_PCI_DEVICE:
+ return snprintf(string, size, "PCI %04x:%04x",
+ obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id);
+ case HWLOC_OBJ_OS_DEVICE:
+ switch (obj->attr->osdev.type) {
+ case HWLOC_OBJ_OSDEV_BLOCK: return hwloc_snprintf(string, size, "Block");
+ case HWLOC_OBJ_OSDEV_NETWORK: return hwloc_snprintf(string, size, verbose ? "Network" : "Net");
+ case HWLOC_OBJ_OSDEV_OPENFABRICS: return hwloc_snprintf(string, size, "OpenFabrics");
+ case HWLOC_OBJ_OSDEV_DMA: return hwloc_snprintf(string, size, "DMA");
+ case HWLOC_OBJ_OSDEV_GPU: return hwloc_snprintf(string, size, "GPU");
+ case HWLOC_OBJ_OSDEV_COPROC: return hwloc_snprintf(string, size, verbose ? "Co-Processor" : "CoProc");
+ default:
+ *string = '\0';
+ return 0;
+ }
+ break;
+ default:
+ if (size > 0)
+ *string = '\0';
+ return 0;
+ }
+}
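+
+/*
+ * Sketch pairing hwloc_obj_type_snprintf() with hwloc_obj_attr_snprintf()
+ * below to render a single object; buffer sizes are arbitrary and the
+ * helper name is made up:
+ */
+#if 0 /* usage sketch, not compiled */
+static void example_print_obj(hwloc_obj_t obj) /* hypothetical name */
+{
+ char type[64], attr[256];
+ hwloc_obj_type_snprintf(type, sizeof(type), obj, 1 /* verbose */);
+ hwloc_obj_attr_snprintf(attr, sizeof(attr), obj, " ", 1 /* verbose */);
+ printf("%s (%s)\n", type, attr);
+}
+#endif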
+
+int
+hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t obj, const char * separator, int verbose)
+{
+ const char *prefix = "";
+ char *tmp = string;
+ ssize_t tmplen = size;
+ int ret = 0;
+ int res;
+
+ /* make sure we output at least an empty string */
+ if (size)
+ *string = '\0';
+
+ /* print memory attributes */
+ res = 0;
+ if (verbose) {
+ if (obj->memory.local_memory)
+ res = hwloc_snprintf(tmp, tmplen, "%slocal=%lu%s%stotal=%lu%s",
+ prefix,
+ (unsigned long) hwloc_memory_size_printf_value(obj->memory.local_memory, verbose),
+ hwloc_memory_size_printf_unit(obj->memory.local_memory, verbose),
+ separator,
+ (unsigned long) hwloc_memory_size_printf_value(obj->memory.total_memory, verbose),
+ hwloc_memory_size_printf_unit(obj->memory.total_memory, verbose));
+ else if (obj->memory.total_memory)
+ res = hwloc_snprintf(tmp, tmplen, "%stotal=%lu%s",
+ prefix,
+ (unsigned long) hwloc_memory_size_printf_value(obj->memory.total_memory, verbose),
+ hwloc_memory_size_printf_unit(obj->memory.total_memory, verbose));
+ } else {
+ if (obj->memory.local_memory)
+ res = hwloc_snprintf(tmp, tmplen, "%s%lu%s",
+ prefix,
+ (unsigned long) hwloc_memory_size_printf_value(obj->memory.local_memory, verbose),
+ hwloc_memory_size_printf_unit(obj->memory.local_memory, verbose));
+ }
+ if (res < 0)
+ return -1;
+ ret += res;
+ if (ret > 0)
+ prefix = separator;
+ if (res >= tmplen)
+ res = tmplen>0 ? tmplen - 1 : 0;
+ tmp += res;
+ tmplen -= res;
+
+ /* printf type-specific attributes */
+ res = 0;
+ switch (obj->type) {
+ case HWLOC_OBJ_CACHE:
+ if (verbose) {
+ char assoc[32];
+ if (obj->attr->cache.associativity == -1)
+ snprintf(assoc, sizeof(assoc), "%sfully-associative", separator);
+ else if (obj->attr->cache.associativity == 0)
+ *assoc = '\0';
+ else
+ snprintf(assoc, sizeof(assoc), "%sways=%d", separator, obj->attr->cache.associativity);
+ res = hwloc_snprintf(tmp, tmplen, "%ssize=%lu%s%slinesize=%u%s",
+ prefix,
+ (unsigned long) hwloc_memory_size_printf_value(obj->attr->cache.size, verbose),
+ hwloc_memory_size_printf_unit(obj->attr->cache.size, verbose),
+ separator, obj->attr->cache.linesize,
+ assoc);
+ } else
+ res = hwloc_snprintf(tmp, tmplen, "%s%lu%s",
+ prefix,
+ (unsigned long) hwloc_memory_size_printf_value(obj->attr->cache.size, verbose),
+ hwloc_memory_size_printf_unit(obj->attr->cache.size, verbose));
+ break;
+ case HWLOC_OBJ_BRIDGE:
+ if (verbose) {
+ char up[128], down[64];
+ /* upstream is PCI or HOST */
+ if (obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI) {
+ char linkspeed[64]= "";
+ if (obj->attr->pcidev.linkspeed)
+ snprintf(linkspeed, sizeof(linkspeed), "%slink=%.2fGB/s", separator, obj->attr->pcidev.linkspeed);
+ snprintf(up, sizeof(up), "busid=%04x:%02x:%02x.%01x%sid=%04x:%04x%sclass=%04x(%s)%s",
+ obj->attr->pcidev.domain, obj->attr->pcidev.bus, obj->attr->pcidev.dev, obj->attr->pcidev.func, separator,
+ obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id, separator,
+ obj->attr->pcidev.class_id, hwloc_pci_class_string(obj->attr->pcidev.class_id), linkspeed);
+ } else
+ *up = '\0';
+ /* downstream is_PCI */
+ snprintf(down, sizeof(down), "buses=%04x:[%02x-%02x]",
+ obj->attr->bridge.downstream.pci.domain, obj->attr->bridge.downstream.pci.secondary_bus, obj->attr->bridge.downstream.pci.subordinate_bus);
+ if (*up)
+ res = snprintf(string, size, "%s%s%s", up, separator, down);
+ else
+ res = snprintf(string, size, "%s", down);
+ }
+ break;
+ case HWLOC_OBJ_PCI_DEVICE:
+ if (verbose) {
+ char linkspeed[64]= "";
+ char busid[16] = "[collapsed]";
+ if (obj->attr->pcidev.linkspeed)
+ snprintf(linkspeed, sizeof(linkspeed), "%slink=%.2fGB/s", separator, obj->attr->pcidev.linkspeed);
+ if (!hwloc_obj_get_info_by_name(obj, "lstopoCollapse"))
+ snprintf(busid, sizeof(busid), "%04x:%02x:%02x.%01x",
+ obj->attr->pcidev.domain, obj->attr->pcidev.bus, obj->attr->pcidev.dev, obj->attr->pcidev.func);
+ res = snprintf(string, size, "busid=%s%sclass=%04x(%s)%s",
+ busid, separator,
+ obj->attr->pcidev.class_id, hwloc_pci_class_string(obj->attr->pcidev.class_id), linkspeed);
+ }
+ break;
+ default:
+ break;
+ }
+ if (res < 0)
+ return -1;
+ ret += res;
+ if (ret > 0)
+ prefix = separator;
+ if (res >= tmplen)
+ res = tmplen>0 ? tmplen - 1 : 0;
+ tmp += res;
+ tmplen -= res;
+
+ /* printf infos */
+ if (verbose) {
+ unsigned i;
+ for(i=0; i<obj->infos_count; i++) {
+ if (!strcmp(obj->infos[i].name, "lstopoCollapse"))
+ continue;
+ if (strchr(obj->infos[i].value, ' '))
+ res = hwloc_snprintf(tmp, tmplen, "%s%s=\"%s\"",
+ prefix,
+ obj->infos[i].name, obj->infos[i].value);
+ else
+ res = hwloc_snprintf(tmp, tmplen, "%s%s=%s",
+ prefix,
+ obj->infos[i].name, obj->infos[i].value);
+ if (res < 0)
+ return -1;
+ ret += res;
+ if (res >= tmplen)
+ res = tmplen>0 ? tmplen - 1 : 0;
+ tmp += res;
+ tmplen -= res;
+ if (ret > 0)
+ prefix = separator;
+ }
+ }
+
+ return ret;
+}
+
+
+int
+hwloc_obj_snprintf(char *string, size_t size,
+ struct hwloc_topology *topology __hwloc_attribute_unused, struct hwloc_obj *l, const char *_indexprefix, int verbose)
+{
+ const char *indexprefix = _indexprefix ? _indexprefix : "#";
+ char os_index[12] = "";
+ char type[64];
+ char attr[128];
+ int attrlen;
+
+ if (l->os_index != (unsigned) -1) {
+ hwloc_snprintf(os_index, 12, "%s%u", indexprefix, l->os_index);
+ }
+
+ hwloc_obj_type_snprintf(type, sizeof(type), l, verbose);
+ attrlen = hwloc_obj_attr_snprintf(attr, sizeof(attr), l, " ", verbose);
+
+ if (attrlen > 0)
+ return hwloc_snprintf(string, size, "%s%s(%s)", type, os_index, attr);
+ else
+ return hwloc_snprintf(string, size, "%s%s", type, os_index);
+}
+
+int hwloc_obj_cpuset_snprintf(char *str, size_t size, size_t nobj, struct hwloc_obj * const *objs)
+{
+ hwloc_bitmap_t set = hwloc_bitmap_alloc();
+ int res;
+ unsigned i;
+
+ hwloc_bitmap_zero(set);
+ for(i=0; i<nobj; i++)
+ if (objs[i]->cpuset)
+ hwloc_bitmap_or(set, set, objs[i]->cpuset);
+
+ res = hwloc_bitmap_snprintf(str, size, set);
+ hwloc_bitmap_free(set);
+ return res;
+}
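+
+/*
+ * A minimal usage sketch for the formatting helpers above, assuming "obj"
+ * is a valid hwloc_obj_t taken from an already loaded topology (the buffer
+ * sizes are illustrative only):
+ *
+ *   char type[64], attr[256];
+ *   hwloc_obj_type_snprintf(type, sizeof(type), obj, 0);
+ *   hwloc_obj_attr_snprintf(attr, sizeof(attr), obj, ", ", 0);
+ *   if (*attr)
+ *     printf("%s (%s)\n", type, attr);
+ *   else
+ *     printf("%s\n", type);
+ */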
diff --git a/ext/hwloc/include/hwloc.h b/ext/hwloc/include/hwloc.h
new file mode 100644
index 0000000..6c8d203
--- /dev/null
+++ b/ext/hwloc/include/hwloc.h
@@ -0,0 +1,2206 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria. All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/*=====================================================================
+ * PLEASE GO READ THE DOCUMENTATION!
+ * ------------------------------------------------
+ * $tarball_directory/doc/doxygen-doc/
+ * or
+ * http://www.open-mpi.org/projects/hwloc/doc/
+ *=====================================================================
+ *
+ * FAIR WARNING: Do NOT expect to be able to figure out all the
+ * subtleties of hwloc by simply reading function prototypes and
+ * constant descriptions here in this file.
+ *
+ * Hwloc has wonderful documentation in both PDF and HTML formats for
+ * your reading pleasure. The formal documentation explains a LOT of
+ * hwloc-specific concepts, provides definitions, and discusses the
+ * "big picture" for many of the things that you'll find here in this
+ * header file.
+ *
+ * The PDF/HTML documentation was generated via Doxygen; much of what
+ * you'll see in there is also here in this file. BUT THERE IS A LOT
+ * THAT IS IN THE PDF/HTML THAT IS ***NOT*** IN hwloc.h!
+ *
+ * There are entire paragraph-length descriptions, discussions, and
+ * pretty pictures to explain subtle corner cases, provide concrete
+ * examples, etc.
+ *
+ * Please, go read the documentation. :-)
+ *
+ * Moreover there are several examples of hwloc use under doc/examples
+ * in the source tree.
+ *
+ *=====================================================================*/
+
+/** \file
+ * \brief The hwloc API.
+ *
+ * See hwloc/bitmap.h for bitmap specific macros.
+ * See hwloc/helper.h for high-level topology traversal helpers.
+ * See hwloc/inlines.h for the actual inline code of some functions below.
+ */
+
+#ifndef HWLOC_H
+#define HWLOC_H
+
+#include <hwloc/autogen/config.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+
+/*
+ * Symbol transforms
+ */
+#include <hwloc/rename.h>
+
+/*
+ * Bitmap definitions
+ */
+
+#include <hwloc/bitmap.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_api_version API version
+ * @{
+ */
+
+/** \brief Indicate at build time which hwloc API version is being used. */
+#define HWLOC_API_VERSION 0x00020000
+
+/** \brief Indicate at runtime which hwloc API version was used at build time. */
+HWLOC_DECLSPEC unsigned hwloc_get_api_version(void);
+
+/** \brief Current component and plugin ABI version (see hwloc/plugins.h) */
+#define HWLOC_COMPONENT_ABI 5
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_object_sets Object Sets (hwloc_cpuset_t and hwloc_nodeset_t)
+ *
+ * Hwloc uses bitmaps to represent two distinct kinds of object sets:
+ * CPU sets (::hwloc_cpuset_t) and NUMA node sets (::hwloc_nodeset_t).
+ * These types are both typedefs to a common back end type
+ * (::hwloc_bitmap_t), and therefore all the hwloc bitmap functions
+ * are applicable to both ::hwloc_cpuset_t and ::hwloc_nodeset_t (see
+ * \ref hwlocality_bitmap).
+ *
+ * The rationale for having two different types is that even though
+ * the actions one wants to perform on these types are the same (e.g.,
+ * enable and disable individual items in the set/mask), they're used
+ * in very different contexts: one for specifying which processors to
+ * use and one for specifying which NUMA nodes to use. Hence, the
+ * name difference is really just to reflect the intent of where the
+ * type is used.
+ *
+ * @{
+ */
+
+/** \brief A CPU set is a bitmap whose bits are set according to CPU
+ * physical OS indexes.
+ *
+ * It may be consulted and modified with the bitmap API as any
+ * ::hwloc_bitmap_t (see hwloc/bitmap.h).
+ *
+ * Each bit may be converted into a PU object using
+ * hwloc_get_pu_obj_by_os_index().
+ */
+typedef hwloc_bitmap_t hwloc_cpuset_t;
+/** \brief A non-modifiable ::hwloc_cpuset_t. */
+typedef hwloc_const_bitmap_t hwloc_const_cpuset_t;
+
+/** \brief A node set is a bitmap whose bits are set according to NUMA
+ * memory node physical OS indexes.
+ *
+ * It may be consulted and modified with the bitmap API as any
+ * ::hwloc_bitmap_t (see hwloc/bitmap.h).
+ * Each bit may be converted into a NUMA node object using
+ * hwloc_get_numanode_obj_by_os_index().
+ *
+ * When binding memory on a system without any NUMA node,
+ * the single main memory bank is considered as NUMA node #0.
+ *
+ * See also \ref hwlocality_helper_nodeset_convert.
+ */
+typedef hwloc_bitmap_t hwloc_nodeset_t;
+/** \brief A non-modifiable ::hwloc_nodeset_t.
+ */
+typedef hwloc_const_bitmap_t hwloc_const_nodeset_t;
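+
+/* A minimal sketch of building a CPU set by hand with the bitmap API,
+ * assuming the machine actually has PUs with physical OS indexes 0, 2 and 3:
+ *
+ * \code
+ * hwloc_cpuset_t set = hwloc_bitmap_alloc();
+ * hwloc_bitmap_set(set, 0);
+ * hwloc_bitmap_set_range(set, 2, 3);
+ * // ... pass "set" to a binding or query function ...
+ * hwloc_bitmap_free(set);
+ * \endcode
+ */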
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_object_types Object Types
+ * @{
+ */
+
+/** \brief Type of topology object.
+ *
+ * \note Do not rely on the ordering or completeness of the values as new ones
+ * may be defined in the future! If you need to compare types, use
+ * hwloc_compare_types() instead.
+ */
+typedef enum {
+ /* ***************************************************************
+ WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+
+ If new enum values are added here, you MUST also go update the
+ obj_type_order[] and obj_order_type[] arrays in src/topology.c.
+
+ WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+ *************************************************************** */
+
+ HWLOC_OBJ_SYSTEM, /**< \brief Whole system (may be a cluster of machines).
+ * The whole system that is accessible to hwloc.
+ * That may comprise several machines in SSI systems.
+ */
+ HWLOC_OBJ_MACHINE, /**< \brief Machine.
+ * The typical root object type.
+ * A set of processors and memory with cache
+ * coherency.
+ */
+ HWLOC_OBJ_NUMANODE, /**< \brief NUMA node.
+ * A set of processors around memory which the
+ * processors can directly access.
+ *
+ * There is always at least one such object in the topology
+ * even if the machine is not NUMA.
+ */
+ HWLOC_OBJ_PACKAGE, /**< \brief Physical package, what goes into a socket.
+ * In the physical meaning, i.e. that you can add
+ * or remove physically.
+ */
+ HWLOC_OBJ_CACHE, /**< \brief Cache.
+ * Can be L1i, L1d, L2, L3, ...
+ */
+ HWLOC_OBJ_CORE, /**< \brief Core.
+ * A computation unit (may be shared by several
+ * logical processors).
+ */
+ HWLOC_OBJ_PU, /**< \brief Processing Unit, or (Logical) Processor.
+ * An execution unit (may share a core with some
+ * other logical processors, e.g. in the case of
+ * an SMT core).
+ *
+ * Objects of this kind are always reported and can
+ * thus be used as fallback when others are not.
+ */
+
+ HWLOC_OBJ_GROUP, /**< \brief Group objects.
+ * Objects which do not fit in the above but are
+ * detected by hwloc and are useful to take into
+ * account for affinity. For instance, some operating systems
+ * expose their arbitrary processors aggregation this
+ * way. And hwloc may insert such objects to group
+ * NUMA nodes according to their distances.
+ *
+ * These objects are ignored when they do not bring
+ * any structure.
+ */
+
+ HWLOC_OBJ_MISC, /**< \brief Miscellaneous objects.
+ * Objects without particular meaning, that can e.g. be
+ * added by the application for its own use, or by hwloc
+ * for miscellaneous objects such as MemoryDevice.
+ * These objects are not listed in the main children list,
+ * but rather in the dedicated misc children list.
+ * Misc objects may only have Misc objects as children,
+ * and those are in the dedicated misc children list as well.
+ * Misc objects have NULL CPU and node sets.
+ */
+
+ HWLOC_OBJ_BRIDGE, /**< \brief Bridge.
+ * Any bridge that connects the host or an I/O bus
+ * to another I/O bus.
+ * They are not added to the topology unless I/O discovery
+ * is enabled with hwloc_topology_set_flags().
+ * I/O objects are not listed in the main children list,
+ * but rather in the dedicated io children list.
+ * I/O objects have NULL CPU and node sets.
+ */
+ HWLOC_OBJ_PCI_DEVICE, /**< \brief PCI device.
+ * They are not added to the topology unless I/O discovery
+ * is enabled with hwloc_topology_set_flags().
+ * I/O objects are not listed in the main children list,
+ * but rather in the dedicated io children list.
+ * I/O objects have NULL CPU and node sets.
+ */
+ HWLOC_OBJ_OS_DEVICE, /**< \brief Operating system device.
+ * They are not added to the topology unless I/O discovery
+ * is enabled with hwloc_topology_set_flags().
+ * I/O objects are not listed in the main children list,
+ * but rather in the dedicated io children list.
+ * I/O objects have NULL CPU and node sets.
+ */
+
+ HWLOC_OBJ_TYPE_MAX /**< \private Sentinel value */
+
+ /* ***************************************************************
+ WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+
+ If new enum values are added here, you MUST also go update the
+ obj_type_order[] and obj_order_type[] arrays in src/topology.c.
+
+ WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+ *************************************************************** */
+} hwloc_obj_type_t;
+
+/** \brief Cache type. */
+typedef enum hwloc_obj_cache_type_e {
+ HWLOC_OBJ_CACHE_UNIFIED, /**< \brief Unified cache. */
+ HWLOC_OBJ_CACHE_DATA, /**< \brief Data cache. */
+ HWLOC_OBJ_CACHE_INSTRUCTION /**< \brief Instruction cache.
+ * Only used when the HWLOC_TOPOLOGY_FLAG_ICACHES topology flag is set. */
+} hwloc_obj_cache_type_t;
+
+/** \brief Type of one side (upstream or downstream) of an I/O bridge. */
+typedef enum hwloc_obj_bridge_type_e {
+ HWLOC_OBJ_BRIDGE_HOST, /**< \brief Host-side of a bridge, only possible upstream. */
+ HWLOC_OBJ_BRIDGE_PCI /**< \brief PCI-side of a bridge. */
+} hwloc_obj_bridge_type_t;
+
+/** \brief Type of an OS device. */
+typedef enum hwloc_obj_osdev_type_e {
+ HWLOC_OBJ_OSDEV_BLOCK, /**< \brief Operating system block device.
+ * For instance "sda" on Linux. */
+ HWLOC_OBJ_OSDEV_GPU, /**< \brief Operating system GPU device.
+ * For instance ":0.0" for a GL display,
+ * "card0" for a Linux DRM device. */
+ HWLOC_OBJ_OSDEV_NETWORK, /**< \brief Operating system network device.
+ * For instance the "eth0" interface on Linux. */
+ HWLOC_OBJ_OSDEV_OPENFABRICS, /**< \brief Operating system openfabrics device.
+ * For instance the "mlx4_0" InfiniBand HCA device on Linux. */
+ HWLOC_OBJ_OSDEV_DMA, /**< \brief Operating system dma engine device.
+ * For instance the "dma0chan0" DMA channel on Linux. */
+ HWLOC_OBJ_OSDEV_COPROC /**< \brief Operating system co-processor device.
+ * For instance "mic0" for a Xeon Phi (MIC) on Linux,
+ * "opencl0d0" for a OpenCL device,
+ * "cuda0" for a CUDA device. */
+} hwloc_obj_osdev_type_t;
+
+/** \brief Compare the depth of two object types
+ *
+ * Types shouldn't be compared as they are, since newer ones may be added in
+ * the future. This function returns less than, equal to, or greater than zero
+ * respectively if \p type1 objects usually include \p type2 objects, are the
+ * same as \p type2 objects, or are included in \p type2 objects. If the types
+ * can not be compared (because neither is usually contained in the other),
+ * HWLOC_TYPE_UNORDERED is returned. Object types containing CPUs can always
+ * be compared (usually, a system contains machines which contain nodes which
+ * contain packages which contain caches, which contain cores, which contain
+ * processors).
+ *
+ * \note HWLOC_OBJ_PU will always be the deepest.
+ * \note This does not mean that the actual topology will respect that order:
+ * e.g. as of today cores may also contain caches, and packages may also contain
+ * nodes. This is thus just to be seen as a fallback comparison method.
+ */
+HWLOC_DECLSPEC int hwloc_compare_types (hwloc_obj_type_t type1, hwloc_obj_type_t type2) __hwloc_attribute_const;
+
+enum hwloc_compare_types_e {
+ HWLOC_TYPE_UNORDERED = INT_MAX /**< \brief Value returned by hwloc_compare_types when types can not be compared. \hideinitializer */
+};
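+
+/* A short sketch of the intended use of hwloc_compare_types(); type1 and
+ * type2 stand for arbitrary hwloc_obj_type_t values:
+ *
+ * \code
+ * if (hwloc_compare_types(HWLOC_OBJ_PACKAGE, HWLOC_OBJ_CORE) < 0)
+ *   printf("packages usually contain cores\n");
+ * if (hwloc_compare_types(type1, type2) == HWLOC_TYPE_UNORDERED)
+ *   printf("these two types cannot be ordered\n");
+ * \endcode
+ */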
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_objects Object Structure and Attributes
+ * @{
+ */
+
+union hwloc_obj_attr_u;
+
+/** \brief Object memory */
+struct hwloc_obj_memory_s {
+ hwloc_uint64_t total_memory; /**< \brief Total memory (in bytes) in this object and its children */
+ hwloc_uint64_t local_memory; /**< \brief Local memory (in bytes) */
+
+ /** \brief Size of array \p page_types */
+ unsigned page_types_len;
+ /** \brief Array of local memory page types, \c NULL if no local memory and \p page_types is 0.
+ *
+ * The array is sorted by increasing \p size fields.
+ * It contains \p page_types_len slots.
+ */
+ struct hwloc_obj_memory_page_type_s {
+ hwloc_uint64_t size; /**< \brief Size of pages */
+ hwloc_uint64_t count; /**< \brief Number of pages of this size */
+ } * page_types;
+};
+
+/** \brief Structure of a topology object
+ *
+ * Applications must not modify any field except hwloc_obj.userdata.
+ */
+struct hwloc_obj {
+ /* physical information */
+ hwloc_obj_type_t type; /**< \brief Type of object */
+ unsigned os_index; /**< \brief OS-provided physical index number.
+ * It is not guaranteed unique across the entire machine,
+ * except for PUs and NUMA nodes.
+ */
+ char *name; /**< \brief Object description if any */
+
+ struct hwloc_obj_memory_s memory; /**< \brief Memory attributes */
+
+ union hwloc_obj_attr_u *attr; /**< \brief Object type-specific Attributes,
+ * may be \c NULL if no attribute value was found */
+
+ /* global position */
+ unsigned depth; /**< \brief Vertical index in the hierarchy.
+ * If the topology is symmetric, this is equal to the
+ * parent depth plus one, and also equal to the number
+ * of parent/child links from the root object to here.
+ */
+ unsigned logical_index; /**< \brief Horizontal index in the whole list of similar objects,
+ * hence guaranteed unique across the entire machine.
+ * Could be a "cousin_rank" since it's the rank within the "cousin" list below
+ */
+
+ /* cousins are all objects of the same type (and depth) across the entire topology */
+ struct hwloc_obj *next_cousin; /**< \brief Next object of same type and depth */
+ struct hwloc_obj *prev_cousin; /**< \brief Previous object of same type and depth */
+
+ /* children of the same parent are siblings, even if they may have different type and depth */
+ struct hwloc_obj *parent; /**< \brief Parent, \c NULL if root (system object) */
+ unsigned sibling_rank; /**< \brief Index in parent's \c children[] array. Or the index in parent's I/O or Misc children list. */
+ struct hwloc_obj *next_sibling; /**< \brief Next object below the same parent */
+ struct hwloc_obj *prev_sibling; /**< \brief Previous object below the same parent */
+
+ /* children array below this object (except I/O and Misc children) */
+ unsigned arity; /**< \brief Number of children */
+ struct hwloc_obj **children; /**< \brief Children, \c children[0 .. arity -1] */
+ struct hwloc_obj *first_child; /**< \brief First child */
+ struct hwloc_obj *last_child; /**< \brief Last child */
+
+ int symmetric_subtree; /**< \brief Set if the subtree of normal objects below this object is symmetric,
+ * which means all children and their children have identical subtrees.
+ * I/O and Misc children are ignored.
+ *
+ * If set in the topology root object, lstopo may export the topology
+ * as a synthetic string.
+ */
+
+ /* specific list of I/O children */
+ unsigned io_arity; /**< \brief Number of I/O children */
+ struct hwloc_obj *io_first_child; /**< \brief First I/O child */
+
+ /* specific list of Misc children */
+ unsigned misc_arity; /**< \brief Number of Misc children */
+ struct hwloc_obj *misc_first_child; /**< \brief First Misc child */
+
+ /* cpusets and nodesets */
+ hwloc_cpuset_t cpuset; /**< \brief CPUs covered by this object
+ *
+ * This is the set of CPUs for which there are PU objects in the topology
+ * under this object, i.e. which are known to be physically contained in this
+ * object and known how (the children path between this object and the PU
+ * objects).
+ *
+ * If the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
+ * some of these CPUs may not be allowed for binding, see allowed_cpuset.
+ *
+ * \note All objects have non-NULL CPU and node sets except Misc and I/O objects.
+ *
+ * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+ */
+ hwloc_cpuset_t complete_cpuset; /**< \brief The complete CPU set of logical processors of this object,
+ *
+ * This may include not only the same as the cpuset field, but also the CPUs for
+ * which topology information is unknown or incomplete, the offline CPUs, and
+ * the CPUs that are ignored when the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM flag
+ * is not set.
+ * Thus no corresponding PU object may be found in the topology, because the
+ * precise position is undefined. It is however known that it would be somewhere
+ * under this object.
+ *
+ * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+ */
+ hwloc_cpuset_t allowed_cpuset; /**< \brief The CPU set of allowed logical processors
+ *
+ * This includes the CPUs contained in this object which are allowed for
+ * binding, i.e. passing them to the hwloc binding functions should not return
+ * permission errors. This is usually restricted by administration rules.
+ *
+ * If the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
+ * allowed_cpuset may be smaller than cpuset. Otherwise they are identical.
+ *
+ * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+ */
+
+ hwloc_nodeset_t nodeset; /**< \brief NUMA nodes covered by this object or containing this object
+ *
+ * This is the set of NUMA nodes for which there are NODE objects in the
+ * topology under or above this object, i.e. which are known to be physically
+ * contained in this object or containing it and known how (the children path
+ * between this object and the NODE objects).
+ *
+ * In the end, these nodes are those that are close to the current object.
+ *
+ * If the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
+ * some of these nodes may not be allowed for allocation, see allowed_nodeset.
+ *
+ * If there are no NUMA nodes in the machine, all the memory is close to this
+ * object, so only the first bit may be set in \p nodeset.
+ *
+ * \note All objects have non-NULL CPU and node sets except Misc and I/O objects.
+ *
+ * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+ */
+ hwloc_nodeset_t complete_nodeset; /**< \brief The complete NUMA node set of this object,
+ *
+ * This may include not only the same as the nodeset field, but also the NUMA
+ * nodes for which topology information is unknown or incomplete, the offline
+ * nodes, and the nodes that are ignored when the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM
+ * flag is not set.
+ * Thus no corresponding NODE object may be found in the topology, because the
+ * precise position is undefined. It is however known that it would be
+ * somewhere under this object.
+ *
+ * If there are no NUMA nodes in the machine, all the memory is close to this
+ * object, so only the first bit is set in \p complete_nodeset.
+ *
+ * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+ */
+ hwloc_nodeset_t allowed_nodeset; /**< \brief The set of allowed NUMA memory nodes
+ *
+ * This includes the NUMA memory nodes contained in this object which are
+ * allowed for memory allocation, i.e. passing them to NUMA node-directed
+ * memory allocation should not return permission errors. This is usually
+ * restricted by administration rules.
+ *
+ * If the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
+ * allowed_nodeset may be smaller than nodeset. Otherwise they are identical.
+ *
+ * If there are no NUMA nodes in the machine, all the memory is close to this
+ * object, so only the first bit may be set in \p allowed_nodeset.
+ *
+ * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+ */
+
+ struct hwloc_distances_s **distances; /**< \brief Distances between all objects at same depth below this object */
+ unsigned distances_count;
+
+ struct hwloc_obj_info_s *infos; /**< \brief Array of stringified info type=name. */
+ unsigned infos_count; /**< \brief Size of infos array. */
+
+ /* misc */
+ void *userdata; /**< \brief Application-given private data pointer,
+ * initialized to \c NULL, use it as you wish.
+ * See hwloc_topology_set_userdata_export_callback() in hwloc/export.h
+ * if you wish to export this field to XML. */
+};
+/**
+ * \brief Convenience typedef; a pointer to a struct hwloc_obj.
+ */
+typedef struct hwloc_obj * hwloc_obj_t;
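+
+/* A minimal sketch of walking the parent links of an object, assuming
+ * "obj" is any object from a loaded topology (for instance a PU):
+ *
+ * \code
+ * hwloc_obj_t o;
+ * for (o = obj; o != NULL; o = o->parent)
+ *   printf("depth %u: %s, arity %u\n",
+ *          o->depth, hwloc_obj_type_string(o->type), o->arity);
+ * \endcode
+ */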
+
+/** \brief Object type-specific Attributes */
+union hwloc_obj_attr_u {
+ /** \brief Cache-specific Object Attributes */
+ struct hwloc_cache_attr_s {
+ hwloc_uint64_t size; /**< \brief Size of cache in bytes */
+ unsigned depth; /**< \brief Depth of cache (e.g., L1, L2, ...etc.) */
+ unsigned linesize; /**< \brief Cache-line size in bytes. 0 if unknown */
+ int associativity; /**< \brief Ways of associativity,
+ * -1 if fully associative, 0 if unknown */
+ hwloc_obj_cache_type_t type; /**< \brief Cache type */
+ } cache;
+ /** \brief Group-specific Object Attributes */
+ struct hwloc_group_attr_s {
+ unsigned depth; /**< \brief Depth of group object */
+ } group;
+ /** \brief PCI Device specific Object Attributes */
+ struct hwloc_pcidev_attr_s {
+ unsigned short domain;
+ unsigned char bus, dev, func;
+ unsigned short class_id;
+ unsigned short vendor_id, device_id, subvendor_id, subdevice_id;
+ unsigned char revision;
+ float linkspeed; /* in GB/s */
+ } pcidev;
+ /** \brief Bridge specific Object Attributes */
+ struct hwloc_bridge_attr_s {
+ union {
+ struct hwloc_pcidev_attr_s pci;
+ } upstream;
+ hwloc_obj_bridge_type_t upstream_type;
+ union {
+ struct {
+ unsigned short domain;
+ unsigned char secondary_bus, subordinate_bus;
+ } pci;
+ } downstream;
+ hwloc_obj_bridge_type_t downstream_type;
+ unsigned depth;
+ } bridge;
+ /** \brief OS Device specific Object Attributes */
+ struct hwloc_osdev_attr_s {
+ hwloc_obj_osdev_type_t type;
+ } osdev;
+};
+
+/** \brief Distances between objects
+ *
+ * One object may contain a distance structure describing distances
+ * between all its descendants at a given relative depth. If the
+ * containing object is the root object of the topology, then the
+ * distances are available for all objects in the machine.
+ *
+ * If the \p latency pointer is not \c NULL, the pointed array contains
+ * memory latencies (non-zero values), see below.
+ *
+ * In the future, some other types of distances may be considered.
+ * In these cases, \p latency may be \c NULL.
+ */
+struct hwloc_distances_s {
+ unsigned relative_depth; /**< \brief Relative depth of the considered objects
+ * below the object containing this distance information. */
+ unsigned nbobjs; /**< \brief Number of objects considered in the matrix.
+ * It is the number of descendant objects at \p relative_depth
+ * below the containing object.
+ * It corresponds to the result of hwloc_get_nbobjs_inside_cpuset_by_depth(). */
+
+ float *latency; /**< \brief Matrix of latencies between objects, stored as a one-dimension array.
+ * May be \c NULL if the distances considered here are not latencies.
+ *
+ * Unless defined by the user, this currently contains latencies
+ * between NUMA nodes (as reported in the System Locality Distance Information Table
+ * (SLIT) in the ACPI specification), which may or may not be accurate.
+ * It corresponds to the latency for accessing the memory of one node
+ * from a core in another node.
+ *
+ * Values are normalized to get 1.0 as the minimal value in the matrix.
+ * Latency from i-th to j-th object is stored in slot i*nbobjs+j.
+ */
+ float latency_max; /**< \brief The maximal value in the latency matrix. */
+ float latency_base; /**< \brief The multiplier that should be applied to latency matrix
+ * to retrieve the original OS-provided latencies.
+ * Usually 10 on Linux since ACPI SLIT uses 10 for local latency.
+ */
+};
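+
+/* A sketch of reading one latency out of a distance matrix, assuming the
+ * root object carries such a matrix and that i and j are valid logical
+ * indexes at the corresponding relative depth:
+ *
+ * \code
+ * hwloc_obj_t root = hwloc_get_root_obj(topology);
+ * if (root->distances_count > 0 && root->distances[0]->latency) {
+ *   struct hwloc_distances_s *d = root->distances[0];
+ *   float lat = d->latency[i * d->nbobjs + j];
+ *   // latency from the i-th to the j-th object at that depth
+ * }
+ * \endcode
+ */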
+
+/** \brief Object info */
+struct hwloc_obj_info_s {
+ char *name; /**< \brief Info name */
+ char *value; /**< \brief Info value */
+};
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_creation Topology Creation and Destruction
+ * @{
+ */
+
+struct hwloc_topology;
+/** \brief Topology context
+ *
+ * To be initialized with hwloc_topology_init() and built with hwloc_topology_load().
+ */
+typedef struct hwloc_topology * hwloc_topology_t;
+
+/** \brief Allocate a topology context.
+ *
+ * \param[out] topologyp is assigned a pointer to the new allocated context.
+ *
+ * \return 0 on success, -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_topology_init (hwloc_topology_t *topologyp);
+
+/** \brief Build the actual topology
+ *
+ * Build the actual topology once initialized with hwloc_topology_init() and
+ * tuned with \ref hwlocality_configuration and \ref hwlocality_setsource routines.
+ * No other routine may be called earlier using this topology context.
+ *
+ * \param topology is the topology to be loaded with objects.
+ *
+ * \return 0 on success, -1 on error.
+ *
+ * \note On failure, the topology is reinitialized. It should be either
+ * destroyed with hwloc_topology_destroy() or configured and loaded again.
+ *
+ * \note This function may be called only once per topology.
+ *
+ * \sa hwlocality_configuration and hwlocality_setsource
+ */
+HWLOC_DECLSPEC int hwloc_topology_load(hwloc_topology_t topology);
+
+/** \brief Terminate and free a topology context
+ *
+ * \param topology is the topology to be freed
+ */
+HWLOC_DECLSPEC void hwloc_topology_destroy (hwloc_topology_t topology);
+
+/** \brief Duplicate a topology.
+ *
+ * The entire topology structure as well as its objects
+ * are duplicated into a new one.
+ *
+ * This is useful for keeping a backup while modifying a topology.
+ */
+HWLOC_DECLSPEC int hwloc_topology_dup(hwloc_topology_t *newtopology, hwloc_topology_t oldtopology);
+
+/** \brief Run internal checks on a topology structure
+ *
+ * The program aborts if an inconsistency is detected in the given topology.
+ *
+ * \param topology is the topology to be checked
+ *
+ * \note This routine is only useful to developers.
+ *
+ * \note The input topology should have been previously loaded with
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC void hwloc_topology_check(hwloc_topology_t topology);
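+
+/* The usual lifecycle of a topology context, as a minimal sketch with
+ * error handling reduced to early returns:
+ *
+ * \code
+ * hwloc_topology_t topology;
+ * if (hwloc_topology_init(&topology) < 0)
+ *   return -1;
+ * if (hwloc_topology_load(topology) < 0) {
+ *   hwloc_topology_destroy(topology);
+ *   return -1;
+ * }
+ * // ... query and bind using "topology" ...
+ * hwloc_topology_destroy(topology);
+ * \endcode
+ */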
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_levels Object levels, depths and types
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Get the depth of the hierarchical tree of objects.
+ *
+ * This is the depth of HWLOC_OBJ_PU objects plus one.
+ */
+HWLOC_DECLSPEC unsigned hwloc_topology_get_depth(hwloc_topology_t __hwloc_restrict topology) __hwloc_attribute_pure;
+
+/** \brief Returns the depth of objects of type \p type.
+ *
+ * If no object of this type is present on the underlying architecture, or if
+ * the OS doesn't provide this kind of information, the function returns
+ * HWLOC_TYPE_DEPTH_UNKNOWN.
+ *
+ * If type is absent but a similar type is acceptable, see also
+ * hwloc_get_type_or_below_depth() and hwloc_get_type_or_above_depth().
+ *
+ * If some objects of the given type exist in different levels,
+ * for instance L1 and L2 caches, or L1i and L1d caches,
+ * the function returns HWLOC_TYPE_DEPTH_MULTIPLE.
+ * See hwloc_get_cache_type_depth() in hwloc/helper.h to better handle this
+ * case.
+ *
+ * If an I/O object type is given, the function returns a virtual value
+ * because I/O objects are stored in special levels that are not CPU-related.
+ * This virtual depth may be passed to other hwloc functions such as
+ * hwloc_get_obj_by_depth() but it should not be considered as an actual
+ * depth by the application. In particular, it should not be compared with
+ * any other object depth or with the entire topology depth.
+ */
+HWLOC_DECLSPEC int hwloc_get_type_depth (hwloc_topology_t topology, hwloc_obj_type_t type);
+
+enum hwloc_get_type_depth_e {
+ HWLOC_TYPE_DEPTH_UNKNOWN = -1, /**< \brief No object of given type exists in the topology. \hideinitializer */
+ HWLOC_TYPE_DEPTH_MULTIPLE = -2, /**< \brief Objects of given type exist at different depth in the topology. \hideinitializer */
+ HWLOC_TYPE_DEPTH_BRIDGE = -3, /**< \brief Virtual depth for bridge object level. \hideinitializer */
+ HWLOC_TYPE_DEPTH_PCI_DEVICE = -4, /**< \brief Virtual depth for PCI device object level. \hideinitializer */
+ HWLOC_TYPE_DEPTH_OS_DEVICE = -5, /**< \brief Virtual depth for software device object level. \hideinitializer */
+ HWLOC_TYPE_DEPTH_MISC = -6 /**< \brief Virtual depth for Misc object. \hideinitializer */
+};
+
+/** \brief Returns the depth of objects of type \p type or below
+ *
+ * If no object of this type is present on the underlying architecture, the
+ * function returns the depth of the first "present" object typically found
+ * inside \p type.
+ *
+ * If some objects of the given type exist in different levels, for instance
+ * L1 and L2 caches, the function returns HWLOC_TYPE_DEPTH_MULTIPLE.
+ */
+static __hwloc_inline int
+hwloc_get_type_or_below_depth (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the depth of objects of type \p type or above
+ *
+ * If no object of this type is present on the underlying architecture, the
+ * function returns the depth of the first "present" object typically
+ * containing \p type.
+ *
+ * If some objects of the given type exist in different levels, for instance
+ * L1 and L2 caches, the function returns HWLOC_TYPE_DEPTH_MULTIPLE.
+ */
+static __hwloc_inline int
+hwloc_get_type_or_above_depth (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the type of objects at depth \p depth.
+ *
+ * \return -1 if depth \p depth does not exist.
+ */
+HWLOC_DECLSPEC hwloc_obj_type_t hwloc_get_depth_type (hwloc_topology_t topology, unsigned depth) __hwloc_attribute_pure;
+
+/** \brief Returns the width of level at depth \p depth.
+ */
+HWLOC_DECLSPEC unsigned hwloc_get_nbobjs_by_depth (hwloc_topology_t topology, unsigned depth) __hwloc_attribute_pure;
+
+/** \brief Returns the width of level type \p type
+ *
+ * If no object for that type exists, 0 is returned.
+ * If there are several levels with objects of that type, -1 is returned.
+ */
+static __hwloc_inline int
+hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the top-object of the topology-tree.
+ *
+ * Its type is typically ::HWLOC_OBJ_MACHINE but it could be different
+ * for complex topologies.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_root_obj (hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** \brief Returns the topology object at logical index \p idx from depth \p depth */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_get_obj_by_depth (hwloc_topology_t topology, unsigned depth, unsigned idx) __hwloc_attribute_pure;
+
+/** \brief Returns the topology object at logical index \p idx with type \p type
+ *
+ * If no object for that type exists, \c NULL is returned.
+ * If there are several levels with objects of that type, \c NULL is returned
+ * and the caller may fall back to hwloc_get_obj_by_depth().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigned idx) __hwloc_attribute_pure;
+
+/** \brief Returns the next object at depth \p depth.
+ *
+ * If \p prev is \c NULL, return the first object at depth \p depth.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_depth (hwloc_topology_t topology, unsigned depth, hwloc_obj_t prev);
+
+/** \brief Returns the next object of type \p type.
+ *
+ * If \p prev is \c NULL, return the first object of type \p type. If
+ * there are multiple or no depths for the given type, return \c NULL and
+ * let the caller fall back to hwloc_get_next_obj_by_depth().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type,
+ hwloc_obj_t prev);
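+
+/* A minimal sketch of iterating over all PUs, assuming a loaded topology;
+ * PU objects always exist in a single level, so the by-type helpers are
+ * safe to use here:
+ *
+ * \code
+ * int i, n = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
+ * for (i = 0; i < n; i++) {
+ *   hwloc_obj_t pu = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, i);
+ *   printf("PU L#%u P#%u\n", pu->logical_index, pu->os_index);
+ * }
+ * \endcode
+ */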
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_object_strings Manipulating Object Type, Sets and Attributes as Strings
+ * @{
+ */
+
+/** \brief Return a stringified topology object type */
+HWLOC_DECLSPEC const char * hwloc_obj_type_string (hwloc_obj_type_t type) __hwloc_attribute_const;
+
+/** \brief Return an object type and attributes from a type string.
+ *
+ * Convert strings such as "Package" or "Cache" into the corresponding types.
+ * Matching is case-insensitive, and only the first letters are actually
+ * required to match.
+ *
+ * Types that have specific attributes, for instance caches and groups,
+ * may be returned in \p depthattrp and \p typeattrp. They are ignored
+ * when these pointers are \c NULL.
+ *
+ * For instance "L2i" or "L2iCache" would return
+ * type HWLOC_OBJ_CACHE in \p typep, 2 in \p depthattrp,
+ * and HWLOC_OBJ_CACHE_INSTRUCTION in \p typeattrp
+ * (this last pointer should point to a hwloc_obj_cache_type_t).
+ * "Group3" would return type HWLOC_OBJ_GROUP type and 3 in \p depthattrp.
+ * Attributes that are not specified in the string (for instance "Group"
+ * without a depth, or "L2Cache" without a cache type) are set to -1.
+ *
+ * \p typeattrp is only filled if the size specified in \p typeattrsize
+ * is large enough. It is currently only used for caches, and the required
+ * size is at least the size of hwloc_obj_cache_type_t.
+ *
+ * \return 0 if a type was correctly identified, otherwise -1.
+ *
+ * \note This is an extended version of the now deprecated hwloc_obj_type_of_string()
+ */
+HWLOC_DECLSPEC int hwloc_obj_type_sscanf(const char *string,
+ hwloc_obj_type_t *typep,
+ int *depthattrp,
+ void *typeattrp, size_t typeattrsize);
+
+/** \brief Stringify the type of a given topology object into a human-readable form.
+ *
+ * It differs from hwloc_obj_type_string() because it prints type attributes such
+ * as cache depth and type.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t obj,
+ int verbose);
+
+/** \brief Stringify the attributes of a given topology object into a human-readable form.
+ *
+ * Attribute values are separated by \p separator.
+ *
+ * Only the major attributes are printed in non-verbose mode.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t obj, const char * __hwloc_restrict separator,
+ int verbose);
+
+/** \brief Stringify the cpuset containing a set of objects.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_cpuset_snprintf(char * __hwloc_restrict str, size_t size, size_t nobj, const hwloc_obj_t * __hwloc_restrict objs);
+
+/** \brief Search the given key name in object infos and return the corresponding value.
+ *
+ * If multiple keys match the given name, only the first one is returned.
+ *
+ * \return \c NULL if no such key exists.
+ */
+static __hwloc_inline const char *
+hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name) __hwloc_attribute_pure;
+
+/** \brief Add the given info name and value pair to the given object.
+ *
+ * The info is appended to the existing info array even if another key
+ * with the same name already exists.
+ *
+ * The input strings are copied before being added in the object infos.
+ *
+ * \note This function may be used to enforce object colors in the lstopo
+ * graphical output by using "lstopoStyle" as a name and "Background=#rrggbb"
+ * as a value. See CUSTOM COLORS in the lstopo(1) manpage for details.
+ *
+ * \note If \p value contains some non-printable characters, they will
+ * be dropped when exporting to XML, see hwloc_topology_export_xml() in hwloc/export.h.
+ */
+HWLOC_DECLSPEC void hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const char *value);
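+
+/* A short sketch combining the string helpers with the info lookup;
+ * "obj" is assumed to be a valid object and the "CPUModel" key is only an
+ * example that may be absent on a given object:
+ *
+ * \code
+ * char type[64];
+ * const char *model;
+ * hwloc_obj_type_snprintf(type, sizeof(type), obj, 1);
+ * model = hwloc_obj_get_info_by_name(obj, "CPUModel");
+ * printf("%s%s%s\n", type, model ? " " : "", model ? model : "");
+ * \endcode
+ */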
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_cpubinding CPU binding
+ *
+ * It is often useful to call hwloc_bitmap_singlify() first so that a single CPU
+ * remains in the set. This way, the process will not even migrate between
+ * different CPUs inside the given set.
+ * Some operating systems also only support that kind of binding.
+ *
+ * Some operating systems do not provide all hwloc-supported
+ * mechanisms to bind processes, threads, etc.
+ * hwloc_topology_get_support() may be used to query about the actual CPU
+ * binding support in the currently used operating system.
+ *
+ * When the requested binding operation is not available and the
+ * ::HWLOC_CPUBIND_STRICT flag was passed, the function returns -1.
+ * \p errno is set to \c ENOSYS when it is not possible to bind the requested kind of
+ * objects (processes/threads). \p errno is set to \c EXDEV when the requested cpuset
+ * can not be enforced (e.g. some systems only allow one CPU, and some
+ * other systems only allow one NUMA node).
+ *
+ * If ::HWLOC_CPUBIND_STRICT was not passed, the function may fail as well,
+ * or the operating system may use a slightly different operation
+ * (with side-effects, smaller binding set, etc.)
+ * when the requested operation is not exactly supported.
+ *
+ * The most portable version that should be preferred over the others,
+ * whenever possible, is the following one which just binds the current program,
+ * assuming it is single-threaded:
+ *
+ * \code
+ * hwloc_set_cpubind(topology, set, 0);
+ * \endcode
+ *
+ * If the program may be multithreaded, the following one should be preferred
+ * to only bind the current thread:
+ *
+ * \code
+ * hwloc_set_cpubind(topology, set, HWLOC_CPUBIND_THREAD);
+ * \endcode
+ *
+ * \sa Some example codes are available under doc/examples/ in the source tree.
+ *
+ * \note To unbind, just call the binding function with either a full cpuset or
+ * a cpuset equal to the system cpuset.
+ *
+ * \note On some operating systems, CPU binding may have effects on memory binding, see
+ * ::HWLOC_CPUBIND_NOMEMBIND
+ *
+ * \note Running lstopo --top or hwloc-ps can be a very convenient tool to check
+ * how binding actually happened.
+ * @{
+ */
+
+/** \brief Process/Thread binding flags.
+ *
+ * These bit flags can be used to refine the binding policy.
+ *
+ * The default (0) is to bind the current process, assumed to be
+ * single-threaded, in a non-strict way. This is the most portable
+ * way to bind as all operating systems usually provide it.
+ *
+ * \note Not all systems support all kinds of binding. See the
+ * "Detailed Description" section of \ref hwlocality_cpubinding for a
+ * description of errors that can occur.
+ */
+typedef enum {
+ /** \brief Bind all threads of the current (possibly) multithreaded process.
+ * \hideinitializer */
+ HWLOC_CPUBIND_PROCESS = (1<<0),
+
+ /** \brief Bind current thread of current process.
+ * \hideinitializer */
+ HWLOC_CPUBIND_THREAD = (1<<1),
+
+ /** \brief Request for strict binding from the OS.
+ *
+ * By default, when the designated CPUs are all busy while other
+ * CPUs are idle, operating systems may execute the thread/process
+ * on those other CPUs instead of the designated CPUs, to let them
+ * progress anyway. Strict binding means that the thread/process
+ * will _never_ execute on other cpus than the designated CPUs, even
+ * when those are busy with other tasks and other CPUs are idle.
+ *
+ * \note Depending on the operating system, strict binding may not
+ * be possible (e.g., the OS does not implement it) or not allowed
+ * (e.g., for administrative reasons), and the function will fail
+ * in that case.
+ *
+ * When retrieving the binding of a process, this flag checks
+ * whether all its threads actually have the same binding. If the
+ * flag is not given, the binding of each thread will be
+ * accumulated.
+ *
+ * \note This flag is meaningless when retrieving the binding of a
+ * thread.
+ * \hideinitializer
+ */
+ HWLOC_CPUBIND_STRICT = (1<<2),
+
+ /** \brief Avoid any effect on memory binding
+ *
+ * On some operating systems, some CPU binding function would also
+ * bind the memory on the corresponding NUMA node. It is often not
+ * a problem for the application, but if it is, setting this flag
+ * will make hwloc avoid using OS functions that would also bind
+ * memory. This will however reduce the support of CPU bindings,
+ * i.e. potentially return -1 with errno set to ENOSYS in some
+ * cases.
+ *
+ * This flag is only meaningful when used with functions that set
+ * the CPU binding. It is ignored when used with functions that get
+ * CPU binding information.
+ * \hideinitializer
+ */
+ HWLOC_CPUBIND_NOMEMBIND = (1<<3)
+} hwloc_cpubind_flags_t;
+
+/** \brief Bind current process or thread on cpus given in physical bitmap \p set.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags);
+
+/** \brief Get current process or thread binding.
+ *
+ * Writes into \p set the physical cpuset which the process or thread (according to \e
+ * flags) was last bound to.
+ */
+HWLOC_DECLSPEC int hwloc_get_cpubind(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+
+/** \brief Bind a process \p pid on cpus given in physical bitmap \p set.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and HWLOC_CPUBIND_THREAD is passed in flags,
+ * the binding is applied to that specific thread.
+ *
+ * \note On non-Linux systems, HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, int flags);
+
+/** \brief Get the current physical binding of process \p pid.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and HWLOC_CPUBIND_THREAD is passed in flags,
+ * the binding for that specific thread is returned.
+ *
+ * \note On non-Linux systems, HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+
+#ifdef hwloc_thread_t
+/** \brief Bind a thread \p thread on cpus given in physical bitmap \p set.
+ *
+ * \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note HWLOC_CPUBIND_PROCESS can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_const_cpuset_t set, int flags);
+#endif
+
+#ifdef hwloc_thread_t
+/** \brief Get the current physical binding of thread \p tid.
+ *
+ * \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note HWLOC_CPUBIND_PROCESS can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_cpuset_t set, int flags);
+#endif
+
+/** \brief Get the last physical CPU where the current process or thread ran.
+ *
+ * The operating system may move some tasks from one processor
+ * to another at any time according to their binding,
+ * so this function may return something that is already
+ * outdated.
+ *
+ * \p flags can include either HWLOC_CPUBIND_PROCESS or HWLOC_CPUBIND_THREAD to
+ * specify whether the query should be for the whole process (union of all CPUs
+ * on which all threads are running), or only the current thread. If the
+ * process is single-threaded, flags can be set to zero to let hwloc use
+ * whichever method is available on the underlying OS.
+ */
+HWLOC_DECLSPEC int hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+
+/** \brief Get the last physical CPU where a process ran.
+ *
+ * The operating system may move some tasks from one processor
+ * to another at any time according to their binding,
+ * so this function may return something that is already
+ * outdated.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and HWLOC_CPUBIND_THREAD is passed in flags,
+ * the last CPU location of that specific thread is returned.
+ *
+ * \note On non-Linux systems, HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
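+
+/* A minimal sketch of binding the current thread to the first core,
+ * following the singlify advice above; it assumes a loaded topology and
+ * that at least one Core object was detected:
+ *
+ * \code
+ * hwloc_obj_t core = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, 0);
+ * if (core) {
+ *   hwloc_cpuset_t set = hwloc_bitmap_dup(core->cpuset);
+ *   hwloc_bitmap_singlify(set);
+ *   if (hwloc_set_cpubind(topology, set, HWLOC_CPUBIND_THREAD) < 0)
+ *     perror("hwloc_set_cpubind");
+ *   hwloc_bitmap_free(set);
+ * }
+ * \endcode
+ */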
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_membinding Memory binding
+ *
+ * Memory binding can be done three ways:
+ *
+ * - explicit memory allocation thanks to hwloc_alloc_membind() and friends:
+ * the binding will have effect on the memory allocated by these functions.
+ * - implicit memory binding through binding policy: hwloc_set_membind() and
+ * friends only define the current policy of the process, which will be
+ * applied to the subsequent calls to malloc() and friends.
+ * - migration of existing memory ranges, thanks to hwloc_set_area_membind()
+ * and friends, which move already-allocated data.
+ *
+ * Not all operating systems support all three ways.
+ * hwloc_topology_get_support() may be used to query about the actual memory
+ * binding support in the currently used operating system.
+ *
+ * When the requested binding operation is not available and the
+ * ::HWLOC_MEMBIND_STRICT flag was passed, the function returns -1.
+ * \p errno will be set to \c ENOSYS when the system does not support
+ * the specified action or policy
+ * (e.g., some systems only allow binding memory on a per-thread
+ * basis, whereas other systems only allow binding memory for all
+ * threads in a process).
+ * \p errno will be set to EXDEV when the requested cpuset can not be enforced
+ * (e.g., some systems only allow binding memory to a single NUMA node).
+ *
+ * If ::HWLOC_MEMBIND_STRICT was not passed, the function may fail as well,
+ * or the operating system may use a slightly different operation
+ * (with side-effects, smaller binding set, etc.)
+ * when the requested operation is not exactly supported.
+ *
+ * The most portable form that should be preferred over the others
+ * whenever possible is as follows.
+ * It allocates some memory hopefully bound to the specified set.
+ * To do so, hwloc will possibly have to change the current memory
+ * binding policy in order to actually get the memory bound, if the OS
+ * does not provide any other way to simply allocate bound memory
+ * without changing the policy for all allocations. That is the
+ * difference with hwloc_alloc_membind(), which will never change the
+ * current memory binding policy.
+ *
+ * \code
+ * hwloc_alloc_membind_policy(topology, size, set,
+ * HWLOC_MEMBIND_BIND, 0);
+ * \endcode
+ *
+ * Each hwloc memory binding function is available in two forms: one
+ * that takes a CPU set argument and another that takes a NUMA memory
+ * node set argument (see \ref hwlocality_object_sets and \ref
+ * hwlocality_bitmap for a discussion of CPU sets and NUMA memory node
+ * sets). The names of the latter form end with _nodeset. It is also
+ * possible to convert between CPU set and node set using
+ * hwloc_cpuset_to_nodeset() or hwloc_cpuset_from_nodeset().
+ *
+ * \sa Some example codes are available under doc/examples/ in the source tree.
+ *
+ * \note On some operating systems, memory binding affects the CPU
+ * binding; see ::HWLOC_MEMBIND_NOCPUBIND
+ * @{
+ */
+
+/** \brief Memory binding policy.
+ *
+ * These constants can be used to choose the binding policy. Only one policy can
+ * be used at a time (i.e., the values cannot be OR'ed together).
+ *
+ * Not all systems support all kinds of binding.
+ * hwloc_topology_get_support() may be used to query about the actual memory
+ * binding policy support in the currently used operating system.
+ * See the "Detailed Description" section of \ref hwlocality_membinding
+ * for a description of errors that can occur.
+ */
+typedef enum {
+ /** \brief Reset the memory allocation policy to the system default.
+ * Depending on the operating system, this may correspond to
+ * HWLOC_MEMBIND_FIRSTTOUCH (Linux),
+ * or HWLOC_MEMBIND_BIND (AIX, HP-UX, OSF, Solaris, Windows).
+ * \hideinitializer */
+ HWLOC_MEMBIND_DEFAULT = 0,
+
+ /** \brief Allocate memory
+ * but do not immediately bind it to a specific locality. Instead,
+ * each page in the allocation is bound only when it is first
+ * touched. Pages are individually bound to the local NUMA node of
+ * the first thread that touches it. If there is not enough memory
+ * on the node, allocation may be done in the specified cpuset
+ * before allocating on other nodes.
+ * \hideinitializer */
+ HWLOC_MEMBIND_FIRSTTOUCH = 1,
+
+ /** \brief Allocate memory on the specified nodes.
+ * \hideinitializer */
+ HWLOC_MEMBIND_BIND = 2,
+
+ /** \brief Allocate memory on the given nodes in an interleaved
+ * / round-robin manner. The precise layout of the memory across
+ * multiple NUMA nodes is OS/system specific. Interleaving can be
+ * useful when threads distributed across the specified NUMA nodes
+ * will all be accessing the whole memory range concurrently, since
+ * the interleave will then balance the memory references.
+ * \hideinitializer */
+ HWLOC_MEMBIND_INTERLEAVE = 3,
+
+ /** \brief Replicate memory on the given nodes; reads from this
+ * memory will attempt to be serviced from the NUMA node local to
+ * the reading thread. Replicating can be useful when multiple
+ * threads from the specified NUMA nodes will be sharing the same
+ * read-only data.
+ *
+ * This policy can only be used with existing memory allocations
+ * (i.e., the hwloc_set_*membind*() functions); it cannot be used
+ * with functions that allocate new memory (i.e., the hwloc_alloc*()
+ * functions).
+ * \hideinitializer */
+ HWLOC_MEMBIND_REPLICATE = 4,
+
+ /** \brief For each page bound with this policy, by next time
+ * it is touched (and next time only), it is moved from its current
+ * location to the local NUMA node of the thread where the memory
+ * reference occurred (if it needs to be moved at all).
+ * \hideinitializer */
+ HWLOC_MEMBIND_NEXTTOUCH = 5,
+
+ /** \brief Returned by get_membind() functions when multiple
+ * threads or parts of a memory area have differing memory binding
+ * policies.
+ * \hideinitializer */
+ HWLOC_MEMBIND_MIXED = -1
+} hwloc_membind_policy_t;
+
+/** \brief Memory binding flags.
+ *
+ * These flags can be used to refine the binding policy.
+ * All flags can be logically OR'ed together with the exception of
+ * ::HWLOC_MEMBIND_PROCESS and ::HWLOC_MEMBIND_THREAD;
+ * these two flags are mutually exclusive.
+ *
+ * Not all systems support all kinds of binding.
+ * hwloc_topology_get_support() may be used to query the actual memory
+ * binding support of the currently used operating system.
+ * See the "Detailed Description" section of \ref hwlocality_membinding
+ * for a description of errors that can occur.
+ */
+typedef enum {
+ /** \brief Set policy for all threads of the specified (possibly
+ * multithreaded) process. This flag is mutually exclusive with
+ * ::HWLOC_MEMBIND_THREAD.
+ * \hideinitializer */
+ HWLOC_MEMBIND_PROCESS = (1<<0),
+
+ /** \brief Set policy for a specific thread of the current process.
+ * This flag is mutually exclusive with ::HWLOC_MEMBIND_PROCESS.
+ * \hideinitializer */
+ HWLOC_MEMBIND_THREAD = (1<<1),
+
+ /** \brief Request strict binding from the OS. The function will fail if
+ * the binding cannot be guaranteed / completely enforced.
+ *
+ * This flag has slightly different meanings depending on which
+ * function it is used with.
+ * \hideinitializer */
+ HWLOC_MEMBIND_STRICT = (1<<2),
+
+ /** \brief Migrate existing allocated memory. If the memory cannot
+ * be migrated and the ::HWLOC_MEMBIND_STRICT flag is passed, an error
+ * will be returned.
+ * \hideinitializer */
+ HWLOC_MEMBIND_MIGRATE = (1<<3),
+
+ /** \brief Avoid any effect on CPU binding.
+ *
+ * On some operating systems, some underlying memory binding
+ * functions also bind the application to the corresponding CPU(s).
+ * Using this flag will cause hwloc to avoid using OS functions that
+ * could potentially affect CPU bindings. Note, however, that using
+ * NOCPUBIND may reduce hwloc's overall memory binding
+ * support. Specifically: some of hwloc's memory binding functions
+ * may fail with errno set to ENOSYS when used with NOCPUBIND.
+ * \hideinitializer
+ */
+ HWLOC_MEMBIND_NOCPUBIND = (1<<4)
+} hwloc_membind_flags_t;
+
+/** \brief Set the default memory binding policy of the current
+ * process or thread to prefer the NUMA node(s) specified by physical \p nodeset
+ *
+ * If neither ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is
+ * specified, the current process is assumed to be single-threaded.
+ * This is the most portable form as it permits hwloc to use either
+ * process-based OS functions or thread-based OS functions, depending
+ * on which are available.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
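+
+/*
+ * Usage sketch for hwloc_set_membind_nodeset() (illustrative only; assumes a
+ * topology that was already initialized and loaded, a machine that has a
+ * physical NUMA node 0, and <stdio.h> for perror(); hwloc_bitmap_set() is
+ * declared in hwloc/bitmap.h): bind all future allocations of the calling
+ * thread to that node.
+ *
+ *   hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+ *   hwloc_bitmap_set(nodeset, 0);             // physical NUMA node 0
+ *   if (hwloc_set_membind_nodeset(topology, nodeset, HWLOC_MEMBIND_BIND,
+ *                                 HWLOC_MEMBIND_THREAD) < 0)
+ *     perror("hwloc_set_membind_nodeset");    // e.g. ENOSYS or EXDEV
+ *   hwloc_bitmap_free(nodeset);
+ */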
+
+/** \brief Set the default memory binding policy of the current
+ * process or thread to prefer the NUMA node(s) near the specified physical \p
+ * cpuset
+ *
+ * If neither ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is
+ * specified, the current process is assumed to be single-threaded.
+ * This is the most portable form as it permits hwloc to use either
+ * process-based OS functions or thread-based OS functions, depending
+ * on which are available.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_membind(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * current process or thread.
+ *
+ * This function has two output parameters: \p nodeset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the current process. Passing ::HWLOC_MEMBIND_THREAD specifies that
+ * the query target is the current policy and nodeset for only the
+ * thread invoking this function.
+ *
+ * If neither of these flags is passed (which is the most portable
+ * method), the process is assumed to be single threaded. This allows
+ * hwloc to use either process-based OS functions or thread-based OS
+ * functions, depending on which are available.
+ *
+ * ::HWLOC_MEMBIND_STRICT is only meaningful when ::HWLOC_MEMBIND_PROCESS
+ * is also specified. In this case, hwloc will check the default
+ * memory policies and nodesets for all threads in the process. If
+ * they are not identical, -1 is returned and errno is set to EXDEV.
+ * If they are identical, the values are returned in \p nodeset and \p
+ * policy.
+ *
+ * Otherwise, if ::HWLOC_MEMBIND_PROCESS is specified (and
+ * ::HWLOC_MEMBIND_STRICT is \em not specified), \p nodeset is set to
+ * the logical OR of all threads' default nodeset. If all threads'
+ * default policies are the same, \p policy is set to that policy. If
+ * they are different, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * In the ::HWLOC_MEMBIND_THREAD case (or when neither
+ * ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is specified), there
+ * is only one nodeset and policy; they are returned in \p nodeset and
+ * \p policy, respectively.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
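+
+/*
+ * Usage sketch for hwloc_get_membind_nodeset() (illustrative only; assumes an
+ * already-loaded topology and <stdio.h>; hwloc_bitmap_snprintf() is declared
+ * in hwloc/bitmap.h): query the calling thread's default policy and nodes.
+ *
+ *   hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+ *   hwloc_membind_policy_t policy;
+ *   char buf[128];
+ *   if (hwloc_get_membind_nodeset(topology, nodeset, &policy,
+ *                                 HWLOC_MEMBIND_THREAD) == 0) {
+ *     hwloc_bitmap_snprintf(buf, sizeof(buf), nodeset);
+ *     printf("policy=%d nodes=%s\n", (int)policy, buf);
+ *   }
+ *   hwloc_bitmap_free(nodeset);
+ */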
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * current process or thread (the locality is returned in \p cpuset as
+ * CPUs near the locality's actual NUMA node(s)).
+ *
+ * This function has two output parameters: \p cpuset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the current process. Passing ::HWLOC_MEMBIND_THREAD specifies that
+ * the query target is the current policy and nodeset for only the
+ * thread invoking this function.
+ *
+ * If neither of these flags is passed (which is the most portable
+ * method), the process is assumed to be single threaded. This allows
+ * hwloc to use either process-based OS functions or thread-based OS
+ * functions, depending on which are available.
+ *
+ * ::HWLOC_MEMBIND_STRICT is only meaningful when ::HWLOC_MEMBIND_PROCESS
+ * is also specified. In this case, hwloc will check the default
+ * memory policies and nodesets for all threads in the process. If
+ * they are not identical, -1 is returned and errno is set to EXDEV.
+ * If they are identical, the policy is returned in \p policy. \p
+ * cpuset is set to the union of CPUs near the NUMA node(s) in the
+ * nodeset.
+ *
+ * Otherwise, if ::HWLOC_MEMBIND_PROCESS is specified (and
+ * ::HWLOC_MEMBIND_STRICT is \em not specified), the default nodeset
+ * from each thread is logically OR'ed together. \p cpuset is set to
+ * the union of CPUs near the NUMA node(s) in the resulting nodeset.
+ * If all threads' default policies are the same, \p policy is set to
+ * that policy. If they are different, \p policy is set to
+ * ::HWLOC_MEMBIND_MIXED.
+ *
+ * In the ::HWLOC_MEMBIND_THREAD case (or when neither
+ * ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is specified), there
+ * is only one nodeset and policy. The policy is returned in \p
+ * policy; \p cpuset is set to the union of CPUs near the NUMA node(s)
+ * in the \p nodeset.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_membind(hwloc_topology_t topology, hwloc_cpuset_t cpuset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Set the default memory binding policy of the specified
+ * process to prefer the NUMA node(s) specified by physical \p nodeset
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Set the default memory binding policy of the specified
+ * process to prefer the NUMA node(s) near the specified physical \p cpuset
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t cpuset, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * specified process.
+ *
+ * This function has two output parameters: \p nodeset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the specified process. If ::HWLOC_MEMBIND_PROCESS is not specified
+ * (which is the most portable method), the process is assumed to be
+ * single threaded. This allows hwloc to use either process-based OS
+ * functions or thread-based OS functions, depending on which are
+ * available.
+ *
+ * Note that it does not make sense to pass ::HWLOC_MEMBIND_THREAD to
+ * this function.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, hwloc will check the default
+ * memory policies and nodesets for all threads in the specified
+ * process. If they are not identical, -1 is returned and errno is
+ * set to EXDEV. If they are identical, the values are returned in \p
+ * nodeset and \p policy.
+ *
+ * Otherwise, \p nodeset is set to the logical OR of all threads'
+ * default nodeset. If all threads' default policies are the same, \p
+ * policy is set to that policy. If they are different, \p policy is
+ * set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * specified process (the locality is returned in \p cpuset as CPUs
+ * near the locality's actual NUMA node(s)).
+ *
+ * This function has two output parameters: \p cpuset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the specified process. If ::HWLOC_MEMBIND_PROCESS is not specified
+ * (which is the most portable method), the process is assumed to be
+ * single threaded. This allows hwloc to use either process-based OS
+ * functions or thread-based OS functions, depending on which are
+ * available.
+ *
+ * Note that it does not make sense to pass ::HWLOC_MEMBIND_THREAD to
+ * this function.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, hwloc will check the default
+ * memory policies and nodesets for all threads in the specified
+ * process. If they are not identical, -1 is returned and errno is
+ * set to EXDEV. If they are identical, the policy is returned in \p
+ * policy. \p cpuset is set to the union of CPUs near the NUMA
+ * node(s) in the nodeset.
+ *
+ * Otherwise, the default nodeset from each thread is logically OR'ed
+ * together. \p cpuset is set to the union of CPUs near the NUMA
+ * node(s) in the resulting nodeset. If all threads' default policies
+ * are the same, \p policy is set to that policy. If they are
+ * different, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t cpuset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Bind the already-allocated memory identified by (addr, len)
+ * to the NUMA node(s) in physical \p nodeset.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Bind the already-allocated memory identified by (addr, len)
+ * to the NUMA node(s) near physical \p cpuset.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_cpuset_t cpuset, hwloc_membind_policy_t policy, int flags);
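+
+/*
+ * Usage sketch for the two area-binding calls above (illustrative only;
+ * assumes an already-loaded topology, a machine with physical NUMA nodes 0
+ * and 1, and <stdlib.h>/<stdio.h>): interleave an existing buffer across
+ * both nodes and migrate any pages that were already touched.
+ *
+ *   size_t len = 1 << 20;
+ *   void *buf = malloc(len);
+ *   hwloc_nodeset_t nodes = hwloc_bitmap_alloc();
+ *   hwloc_bitmap_set(nodes, 0);
+ *   hwloc_bitmap_set(nodes, 1);
+ *   if (hwloc_set_area_membind_nodeset(topology, buf, len, nodes,
+ *                                      HWLOC_MEMBIND_INTERLEAVE,
+ *                                      HWLOC_MEMBIND_MIGRATE) < 0)
+ *     perror("hwloc_set_area_membind_nodeset");
+ *   hwloc_bitmap_free(nodes);
+ *   // ... use buf, then free(buf) ...
+ */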
+
+/** \brief Query the physical NUMA node(s) and binding policy of the memory
+ * identified by (\p addr, \p len ).
+ *
+ * This function has two output parameters: \p nodeset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the memory binding policies and nodesets of the pages
+ * in the address range.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, the target pages are first
+ * checked to see if they all have the same memory binding policy and
+ * nodeset. If they do not, -1 is returned and errno is set to EXDEV.
+ * If they are identical across all pages, the nodeset and policy are
+ * returned in \p nodeset and \p policy, respectively.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is not specified, \p nodeset is set to the
+ * union of all NUMA node(s) containing pages in the address range.
+ * If all pages in the target have the same policy, it is returned in
+ * \p policy. Otherwise, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Query the CPUs near the physical NUMA node(s) and binding policy of
+ * the memory identified by (\p addr, \p len ).
+ *
+ * This function has two output parameters: \p cpuset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the memory binding policies and nodesets of the pages
+ * in the address range.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, the target pages are first
+ * checked to see if they all have the same memory binding policy and
+ * nodeset. If they do not, -1 is returned and errno is set to EXDEV.
+ * If they are identical across all pages, the policy is returned in
+ * \p policy. \p cpuset is set to the union of CPUs near the NUMA
+ * node(s) in the nodeset.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is not specified, the union of all NUMA
+ * node(s) containing pages in the address range is calculated. \p
+ * cpuset is then set to the CPUs near the NUMA node(s) in this union.
+ * If all pages in the target have the same policy, it is returned in
+ * \p policy. Otherwise, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_cpuset_t cpuset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Allocate some memory
+ *
+ * This is equivalent to malloc(), except that it tries to allocate
+ * page-aligned memory from the OS.
+ *
+ * \note The allocated memory should be freed with hwloc_free().
+ */
+HWLOC_DECLSPEC void *hwloc_alloc(hwloc_topology_t topology, size_t len);
+
+/** \brief Allocate some memory on the given physical nodeset \p nodeset
+ *
+ * \return NULL with errno set to ENOSYS if the action is not supported
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to EXDEV if the binding cannot be enforced
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to ENOMEM if the memory allocation failed
+ * even before trying to bind.
+ *
+ * \note The allocated memory should be freed with hwloc_free().
+ */
+HWLOC_DECLSPEC void *hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Allocate some memory on memory nodes near the given physical cpuset \p cpuset
+ *
+ * \return NULL with errno set to ENOSYS if the action is not supported
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to EXDEV if the binding cannot be enforced
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to ENOMEM if the memory allocation failed
+ * even before trying to bind.
+ *
+ * \note The allocated memory should be freed with hwloc_free().
+ */
+HWLOC_DECLSPEC void *hwloc_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t cpuset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Allocate some memory on the given nodeset \p nodeset
+ *
+ * This is similar to hwloc_alloc_membind except that it is allowed to change
+ * the current memory binding policy, thus providing more binding support, at
+ * the expense of changing the current state.
+ */
+static __hwloc_inline void *
+hwloc_alloc_membind_policy_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Allocate some memory on the memory nodes near the given cpuset \p cpuset
+ *
+ * This is similar to hwloc_alloc_membind_policy_nodeset, but for a given cpuset.
+ */
+static __hwloc_inline void *
+hwloc_alloc_membind_policy(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Free memory that was previously allocated by hwloc_alloc()
+ * or hwloc_alloc_membind().
+ */
+HWLOC_DECLSPEC int hwloc_free(hwloc_topology_t topology, void *addr, size_t len);
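+
+/*
+ * Usage sketch combining hwloc_alloc_membind_nodeset() and hwloc_free()
+ * (illustrative only; assumes an already-loaded topology with a physical
+ * NUMA node 0): allocate a page-aligned, node-local buffer and release it.
+ *
+ *   hwloc_nodeset_t nodes = hwloc_bitmap_alloc();
+ *   hwloc_bitmap_set(nodes, 0);
+ *   void *data = hwloc_alloc_membind_nodeset(topology, 4096, nodes,
+ *                                            HWLOC_MEMBIND_BIND,
+ *                                            HWLOC_MEMBIND_STRICT);
+ *   hwloc_bitmap_free(nodes);
+ *   if (data) {
+ *     // ... use the node-local buffer ...
+ *     hwloc_free(topology, data, 4096);
+ *   }
+ */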
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_setsource Changing the Source of Topology Discovery
+ *
+ * If none of the functions below is called, the default is to detect all the objects
+ * of the machine that the caller is allowed to access.
+ *
+ * This default behavior may also be modified through environment variables
+ * if the application did not modify it already.
+ * Setting HWLOC_XMLFILE in the environment enforces the discovery from an XML
+ * file as if hwloc_topology_set_xml() had been called.
+ * Setting HWLOC_SYNTHETIC enforces a synthetic topology as if
+ * hwloc_topology_set_synthetic() had been called.
+ * Setting HWLOC_FSROOT switches to reading the topology from the specified Linux
+ * filesystem root.
+ *
+ * Finally, HWLOC_THISSYSTEM enforces the return value of
+ * hwloc_topology_is_thissystem().
+ *
+ * @{
+ */
+
+/** \brief Change which pid the topology is viewed from
+ *
+ * On some systems, processes may have different views of the machine, for
+ * instance the set of allowed CPUs. By default, hwloc exposes the view from
+ * the current process. Calling hwloc_topology_set_pid() makes hwloc
+ * expose the topology of the machine from the point of view of another
+ * process.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note -1 is returned and errno is set to ENOSYS on platforms that do not
+ * support this feature.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_pid(hwloc_topology_t __hwloc_restrict topology, hwloc_pid_t pid);
+
+/** \brief Enable synthetic topology.
+ *
+ * Gather topology information from the given \p description,
+ * a space-separated string of numbers describing
+ * the arity of each level.
+ * Each number may be prefixed with a type and a colon to enforce the type
+ * of a level. If only some level types are enforced, hwloc will try to
+ * choose the other types according to usual topologies, but it may fail
+ * and you may have to specify more level types manually.
+ * See also the \ref synthetic.
+ *
+ * Setting the environment variable HWLOC_SYNTHETIC
+ * may also result in this behavior.
+ *
+ * If \p description was properly parsed and describes a valid topology
+ * configuration, this function returns 0.
+ * Otherwise -1 is returned and errno is set to EINVAL.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from. You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success.
+ *
+ * \note On success, the synthetic component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_synthetic(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict description);
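+
+/*
+ * Usage sketch for hwloc_topology_set_synthetic() (illustrative only; the
+ * description string is a made-up example of the format documented above):
+ * build a fake machine with 2 NUMA nodes of 4 cores with 2 PUs each.
+ *
+ *   hwloc_topology_t topology;
+ *   hwloc_topology_init(&topology);
+ *   if (hwloc_topology_set_synthetic(topology, "node:2 core:4 pu:2") < 0)
+ *     perror("hwloc_topology_set_synthetic");   // EINVAL: bad description
+ *   hwloc_topology_load(topology);
+ *   // ... inspect the synthetic topology ...
+ *   hwloc_topology_destroy(topology);
+ */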
+
+/** \brief Enable XML-file based topology.
+ *
+ * Gather topology information from the XML file given at \p xmlpath.
+ * Setting the environment variable HWLOC_XMLFILE may also result in this behavior.
+ * This file may have been generated earlier with hwloc_topology_export_xml() in hwloc/export.h,
+ * or lstopo file.xml.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from. You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \return -1 with errno set to EINVAL on failure to read the XML file.
+ *
+ * \note See also hwloc_topology_set_userdata_import_callback()
+ * for importing application-specific object userdata.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success. To have hwloc still actually call OS-specific hooks, the
+ * HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM flag has to be set to assert that the loaded
+ * file is really the underlying system.
+ *
+ * \note On success, the XML component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_xml(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict xmlpath);
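+
+/*
+ * Usage sketch for hwloc_topology_set_xml() (illustrative only; the file name
+ * is a placeholder): load a previously exported topology, asserting that it
+ * really describes the machine we run on so that binding calls stay active.
+ *
+ *   hwloc_topology_t topology;
+ *   hwloc_topology_init(&topology);
+ *   hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM);
+ *   if (hwloc_topology_set_xml(topology, "machine.xml") < 0)
+ *     perror("hwloc_topology_set_xml");          // EINVAL: unreadable XML
+ *   hwloc_topology_load(topology);
+ *   // ...
+ *   hwloc_topology_destroy(topology);
+ */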
+
+/** \brief Enable XML based topology using a memory buffer (instead of
+ * a file, as with hwloc_topology_set_xml()).
+ *
+ * Gather topology information from the XML memory buffer given at \p
+ * buffer and of length \p size. This buffer may have been filled
+ * earlier with hwloc_topology_export_xmlbuffer() in hwloc/export.h.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from. You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \return -1 with errno set to EINVAL on failure to read the XML buffer.
+ *
+ * \note See also hwloc_topology_set_userdata_import_callback()
+ * for importing application-specific object userdata.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success. To have hwloc still actually call OS-specific hooks, the
+ * HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM flag has to be set to assert that the loaded
+ * file is really the underlying system.
+ *
+ * \note On success, the XML component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_xmlbuffer(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict buffer, int size);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_configuration Topology Detection Configuration and Query
+ *
+ * Several functions can optionally be called between hwloc_topology_init() and
+ * hwloc_topology_load() to configure how the detection should be performed,
+ * e.g. to ignore some objects types, define a synthetic topology, etc.
+ *
+ * @{
+ */
+
+/** \brief Flags to be set onto a topology context before load.
+ *
+ * Flags should be given to hwloc_topology_set_flags().
+ * They may also be returned by hwloc_topology_get_flags().
+ */
+enum hwloc_topology_flags_e {
+ /** \brief Detect the whole system, ignore reservations.
+ *
+ * Gather all resources, even if some were disabled by the administrator.
+ * For instance, ignore Linux Cgroup/Cpusets and gather all processors and memory nodes.
+ *
+ * When this flag is set, each object has allowed_cpuset <= cpuset <= complete_cpuset.
+ * Otherwise allowed_cpuset = cpuset <= complete_cpuset.
+ * The same applies to nodesets.
+ * \hideinitializer
+ */
+ HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM = (1UL<<0),
+
+ /** \brief Assume that the selected backend provides the topology for the
+ * system on which we are running.
+ *
+ * This forces hwloc_topology_is_thissystem to return 1, i.e. makes hwloc assume that
+ * the selected backend provides the topology for the system on which we are running,
+ * even if it is not the OS-specific backend but the XML backend for instance.
+ * This means making the binding functions actually call the OS-specific
+ * system calls and really do binding, while the XML backend would otherwise
+ * provide empty hooks just returning success.
+ *
+ * Setting the environment variable HWLOC_THISSYSTEM may also result in the
+ * same behavior.
+ *
+ * This can be used for efficiency reasons to first detect the topology once,
+ * save it to an XML file, and quickly reload it later through the XML
+ * backend, but still having binding functions actually do bind.
+ * \hideinitializer
+ */
+ HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM = (1UL<<1),
+
+ /** \brief Detect PCI devices.
+ *
+ * By default, I/O devices are ignored. This flag enables I/O device
+ * detection using the pci backend. Only the common PCI devices (GPUs,
+ * NICs, block devices, ...) and host bridges (objects that connect the host
+ * objects to an I/O subsystem) will be added to the topology.
+ * Additionally, it enables MemoryDevice misc objects.
+ * Uncommon devices and other bridges (such as PCI-to-PCI bridges) will be
+ * ignored.
+ * \hideinitializer
+ */
+ HWLOC_TOPOLOGY_FLAG_IO_DEVICES = (1UL<<2),
+
+ /** \brief Detect PCI bridges.
+ *
+ * This flag should be combined with HWLOC_TOPOLOGY_FLAG_IO_DEVICES to enable
+ * the detection of both common devices and of all useful bridges (bridges that
+ * have at least one device behind them).
+ * \hideinitializer
+ */
+ HWLOC_TOPOLOGY_FLAG_IO_BRIDGES = (1UL<<3),
+
+ /** \brief Detect the whole PCI hierarchy.
+ *
+ * This flag enables detection of all I/O devices (even the uncommon ones)
+ * and bridges (even those that have no device behind them) using the pci
+ * backend.
+ * This implies HWLOC_TOPOLOGY_FLAG_IO_DEVICES.
+ * \hideinitializer
+ */
+ HWLOC_TOPOLOGY_FLAG_WHOLE_IO = (1UL<<4),
+
+ /** \brief Detect instruction caches.
+ *
+ * This flag enables detection of Instruction caches,
+ * instead of only Data and Unified caches.
+ * \hideinitializer
+ */
+ HWLOC_TOPOLOGY_FLAG_ICACHES = (1UL<<5)
+};
+
+/** \brief Set OR'ed flags on a not-yet-loaded topology.
+ *
+ * Set an OR'ed set of ::hwloc_topology_flags_e onto a topology that was not yet loaded.
+ *
+ * If this function is called multiple times, the last invocation will erase
+ * and replace the set of flags that was previously set.
+ *
+ * The flags set in a topology may be retrieved with hwloc_topology_get_flags().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_flags (hwloc_topology_t topology, unsigned long flags);
+
+/** \brief Get OR'ed flags of a topology.
+ *
+ * Get the OR'ed set of ::hwloc_topology_flags_e of a topology.
+ *
+ * \return the flags previously set with hwloc_topology_set_flags().
+ */
+HWLOC_DECLSPEC unsigned long hwloc_topology_get_flags (hwloc_topology_t topology);
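+
+/*
+ * Usage sketch for hwloc_topology_set_flags()/hwloc_topology_get_flags()
+ * (illustrative only): enable I/O device and instruction-cache detection
+ * before loading, then read the flags back.
+ *
+ *   hwloc_topology_t topology;
+ *   hwloc_topology_init(&topology);
+ *   hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IO_DEVICES
+ *                                      | HWLOC_TOPOLOGY_FLAG_ICACHES);
+ *   hwloc_topology_load(topology);
+ *   unsigned long flags = hwloc_topology_get_flags(topology);
+ *   // flags now contains the two bits that were set above
+ */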
+
+/** \brief Does the topology context come from this system?
+ *
+ * \return 1 if this topology context was built using the system
+ * running this program.
+ * \return 0 instead (for instance if using another file-system root,
+ * an XML topology file, or a synthetic topology).
+ */
+HWLOC_DECLSPEC int hwloc_topology_is_thissystem(hwloc_topology_t __hwloc_restrict topology) __hwloc_attribute_pure;
+
+/** \brief Flags describing actual discovery support for this topology. */
+struct hwloc_topology_discovery_support {
+ /** \brief Detecting the number of PU objects is supported. */
+ unsigned char pu;
+};
+
+/** \brief Flags describing actual PU binding support for this topology. */
+struct hwloc_topology_cpubind_support {
+ /** Binding the whole current process is supported. */
+ unsigned char set_thisproc_cpubind;
+ /** Getting the binding of the whole current process is supported. */
+ unsigned char get_thisproc_cpubind;
+ /** Binding a whole given process is supported. */
+ unsigned char set_proc_cpubind;
+ /** Getting the binding of a whole given process is supported. */
+ unsigned char get_proc_cpubind;
+ /** Binding the current thread only is supported. */
+ unsigned char set_thisthread_cpubind;
+ /** Getting the binding of the current thread only is supported. */
+ unsigned char get_thisthread_cpubind;
+ /** Binding a given thread only is supported. */
+ unsigned char set_thread_cpubind;
+ /** Getting the binding of a given thread only is supported. */
+ unsigned char get_thread_cpubind;
+ /** Getting the last processors where the whole current process ran is supported */
+ unsigned char get_thisproc_last_cpu_location;
+ /** Getting the last processors where a whole process ran is supported */
+ unsigned char get_proc_last_cpu_location;
+ /** Getting the last processors where the current thread ran is supported */
+ unsigned char get_thisthread_last_cpu_location;
+};
+
+/** \brief Flags describing actual memory binding support for this topology. */
+struct hwloc_topology_membind_support {
+ /** Binding the whole current process is supported. */
+ unsigned char set_thisproc_membind;
+ /** Getting the binding of the whole current process is supported. */
+ unsigned char get_thisproc_membind;
+ /** Binding a whole given process is supported. */
+ unsigned char set_proc_membind;
+ /** Getting the binding of a whole given process is supported. */
+ unsigned char get_proc_membind;
+ /** Binding the current thread only is supported. */
+ unsigned char set_thisthread_membind;
+ /** Getting the binding of the current thread only is supported. */
+ unsigned char get_thisthread_membind;
+ /** Binding a given memory area is supported. */
+ unsigned char set_area_membind;
+ /** Getting the binding of a given memory area is supported. */
+ unsigned char get_area_membind;
+ /** Allocating a bound memory area is supported. */
+ unsigned char alloc_membind;
+ /** First-touch policy is supported. */
+ unsigned char firsttouch_membind;
+ /** Bind policy is supported. */
+ unsigned char bind_membind;
+ /** Interleave policy is supported. */
+ unsigned char interleave_membind;
+ /** Replication policy is supported. */
+ unsigned char replicate_membind;
+ /** Next-touch migration policy is supported. */
+ unsigned char nexttouch_membind;
+
+ /** The migration flag is supported. */
+ unsigned char migrate_membind;
+};
+
+/** \brief Set of flags describing actual support for this topology.
+ *
+ * This is retrieved with hwloc_topology_get_support() and will be valid until
+ * the topology object is destroyed. Note: the values are correct only after
+ * discovery.
+ */
+struct hwloc_topology_support {
+ struct hwloc_topology_discovery_support *discovery;
+ struct hwloc_topology_cpubind_support *cpubind;
+ struct hwloc_topology_membind_support *membind;
+};
+
+/** \brief Retrieve the topology support. */
+HWLOC_DECLSPEC const struct hwloc_topology_support *hwloc_topology_get_support(hwloc_topology_t __hwloc_restrict topology);
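+
+/*
+ * Usage sketch for hwloc_topology_get_support() (illustrative only; assumes
+ * an already-loaded topology): check a feature before relying on it.
+ *
+ *   const struct hwloc_topology_support *support =
+ *       hwloc_topology_get_support(topology);
+ *   if (support->membind->set_area_membind) {
+ *     // hwloc_set_area_membind*() is expected to work on this system
+ *   }
+ */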
+
+/** \brief Ignore an object type.
+ *
+ * Ignore all objects from the given type.
+ * The bottom-level type HWLOC_OBJ_PU and the HWLOC_OBJ_NUMANODE level may not be ignored.
+ * The top-level object of the hierarchy will never be ignored, even if this function
+ * succeeds.
+ * I/O objects may not be ignored; topology flags should be used to configure
+ * their discovery instead.
+ */
+HWLOC_DECLSPEC int hwloc_topology_ignore_type(hwloc_topology_t topology, hwloc_obj_type_t type);
+
+/** \brief Ignore an object type if it does not bring any structure.
+ *
+ * Ignore all objects from the given type as long as they do not bring any structure:
+ * Each ignored object should have a single child or be the only child of its parent.
+ * The bottom-level type HWLOC_OBJ_PU and the HWLOC_OBJ_NUMANODE level may not be ignored.
+ * I/O objects may not be ignored; topology flags should be used to configure
+ * their discovery instead.
+ * Group objects are always ignored if they do not bring any structure
+ * since they are designed to add structure to the topology.
+ * Misc objects cannot be ignored based on the structure since they are only annotations
+ * outside of the main topology structure.
+ */
+HWLOC_DECLSPEC int hwloc_topology_ignore_type_keep_structure(hwloc_topology_t topology, hwloc_obj_type_t type);
+
+/** \brief Ignore all objects that do not bring any structure.
+ *
+ * Ignore all objects that do not bring any structure:
+ * Each ignored object should have a single child or be the only child of its parent.
+ * I/O objects may not be ignored; topology flags should be used to configure
+ * their discovery instead.
+ */
+HWLOC_DECLSPEC int hwloc_topology_ignore_all_keep_structure(hwloc_topology_t topology);
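+
+/*
+ * Usage sketch for the ignore functions above (illustrative only;
+ * HWLOC_OBJ_CACHE is the hwloc 1.x cache object type): drop cache levels
+ * that do not add structure before loading the topology.
+ *
+ *   hwloc_topology_t topology;
+ *   hwloc_topology_init(&topology);
+ *   hwloc_topology_ignore_type_keep_structure(topology, HWLOC_OBJ_CACHE);
+ *   hwloc_topology_load(topology);
+ */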
+
+/** \brief Provide a distance matrix.
+ *
+ * Provide the matrix of distances between a set of objects of the given type.
+ * The set may or may not contain all the existing objects of this type.
+ * The objects are specified by their OS/physical index in the \p os_index
+ * array. The \p distances matrix follows the same order.
+ * The distance from object i to object j is stored in \p distances[i*nbobjs+j].
+ *
+ * A single latency matrix may be defined for each type.
+ * If another distance matrix already exists for the given type,
+ * either because the user specified it or because the OS offers it,
+ * it will be replaced by the given one.
+ * If \p nbobjs is \c 0, \p os_index is \c NULL and \p distances is \c NULL,
+ * the existing distance matrix for the given type is removed.
+ *
+ * \note Distance matrices are ignored in multi-node topologies.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_distance_matrix(hwloc_topology_t __hwloc_restrict topology,
+ hwloc_obj_type_t type, unsigned nbobjs,
+ unsigned *os_index, float *distances);
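+
+/*
+ * Usage sketch for hwloc_topology_set_distance_matrix() (illustrative only;
+ * the latency values are made up): declare two NUMA nodes, physical indexes
+ * 0 and 1, with remote accesses twice as expensive as local ones. Like the
+ * other configuration functions in this group, this is typically called
+ * between hwloc_topology_init() and hwloc_topology_load().
+ *
+ *   unsigned os_index[2] = { 0, 1 };
+ *   float distances[4] = { 1.0f, 2.0f,
+ *                          2.0f, 1.0f };   // slot i*nbobjs+j = distance i->j
+ *   hwloc_topology_set_distance_matrix(topology, HWLOC_OBJ_NUMANODE,
+ *                                      2, os_index, distances);
+ */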
+
+/** \brief Set the topology-specific userdata pointer.
+ *
+ * Each topology may store one application-given private data pointer.
+ * It is initialized to \c NULL.
+ * hwloc will never modify it.
+ *
+ * Use it as you wish, after hwloc_topology_init() and until hwloc_topology_destroy().
+ *
+ * This pointer is not exported to XML.
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata(hwloc_topology_t topology, const void *userdata);
+
+/** \brief Retrieve the topology-specific userdata pointer.
+ *
+ * Retrieve the application-given private data pointer that was
+ * previously set with hwloc_topology_set_userdata().
+ */
+HWLOC_DECLSPEC void * hwloc_topology_get_userdata(hwloc_topology_t topology);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_tinker Modifying a loaded Topology
+ * @{
+ */
+
+/** \brief Flags to be given to hwloc_topology_restrict(). */
+enum hwloc_restrict_flags_e {
+ /** \brief Adapt distance matrices according to objects being removed during restriction.
+ * If this flag is not set, distance matrices are removed.
+ * \hideinitializer
+ */
+ HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES = (1<<0),
+
+ /** \brief Move Misc objects to ancestors if their parents are removed during restriction.
+ * If this flag is not set, Misc objects are removed when their parents are removed.
+ * \hideinitializer
+ */
+ HWLOC_RESTRICT_FLAG_ADAPT_MISC = (1<<1),
+
+ /** \brief Move I/O objects to ancestors if their parents are removed during restriction.
+ * If this flag is not set, I/O devices and bridges are removed when their parents are removed.
+ * \hideinitializer
+ */
+ HWLOC_RESTRICT_FLAG_ADAPT_IO = (1<<2)
+};
+
+/** \brief Restrict the topology to the given CPU set.
+ *
+ * Topology \p topology is modified so as to remove all objects that
+ * are not included (or partially included) in the CPU set \p cpuset.
+ * All objects' CPU and node sets are restricted accordingly.
+ *
+ * \p flags is an OR'ed set of ::hwloc_restrict_flags_e.
+ *
+ * \note This call may not be reverted by restricting back to a larger
+ * cpuset. Once dropped during restriction, objects may not be brought
+ * back, except by loading another topology with hwloc_topology_load().
+ *
+ * \return 0 on success.
+ *
+ * \return -1 with errno set to EINVAL if the input cpuset is invalid.
+ * The topology is not modified in this case.
+ *
+ * \return -1 with errno set to ENOMEM on failure to allocate internal data.
+ * The topology is reinitialized in this case. It should be either
+ * destroyed with hwloc_topology_destroy() or configured and loaded again.
+ */
+HWLOC_DECLSPEC int hwloc_topology_restrict(hwloc_topology_t __hwloc_restrict topology, hwloc_const_cpuset_t cpuset, unsigned long flags);
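+
+/*
+ * Usage sketch for hwloc_topology_restrict() (illustrative only;
+ * hwloc_get_cpubind() and HWLOC_CPUBIND_PROCESS come from the CPU-binding
+ * section of this header): restrict an already-loaded topology to the CPUs
+ * the current process is bound to.
+ *
+ *   hwloc_cpuset_t set = hwloc_bitmap_alloc();
+ *   if (hwloc_get_cpubind(topology, set, HWLOC_CPUBIND_PROCESS) == 0)
+ *     hwloc_topology_restrict(topology, set,
+ *                             HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES);
+ *   hwloc_bitmap_free(set);
+ */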
+
+/** \brief Add a MISC object as a leaf of the topology
+ *
+ * A new MISC object will be created and inserted into the topology at the
+ * position given by parent. It is appended to the list of existing Misc children,
+ * without ever adding any intermediate hierarchy level. This is useful for
+ * annotating the topology without actually changing the hierarchy.
+ *
+ * \p name will be copied to set up the new object's attributes.
+ * However, the new leaf object will not have any \p cpuset.
+ *
+ * \return the newly-created object
+ *
+ * \note If \p name contains some non-printable characters, they will
+ * be dropped when exporting to XML, see hwloc_topology_export_xml() in hwloc/export.h.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_insert_misc_object(hwloc_topology_t topology, hwloc_obj_t parent, const char *name);
+
+/** \brief Allocate a Group object to insert later with hwloc_topology_insert_group_object().
+ *
+ * This function returns a new Group object.
+ * The caller should (at least) initialize its sets before inserting the object.
+ * See hwloc_topology_insert_group_object().
+ *
+ * Custom name/value info pairs may be added with hwloc_obj_add_info() after
+ * insertion. For instance the Type info key allows displaying something other
+ * than "Group" as the type name for this object in lstopo.
+ *
+ * It is recommended not to set any other object attribute before insertion,
+ * since the Group may get discarded during insertion.
+ *
+ * The object will be destroyed if passed to hwloc_topology_insert_group_object()
+ * without any set defined.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_alloc_group_object(hwloc_topology_t topology);
+
+/** \brief Add more structure to the topology by adding an intermediate Group
+ *
+ * The caller should first allocate a new Group object with hwloc_topology_alloc_group_object().
+ * Then it must initialize some of its sets to specify the final location of the Group
+ * in the topology.
+ * Then the object can be passed to this function for actual insertion in the topology.
+ *
+ * Either the cpuset or nodeset field (or both, if compatible) may be used to do so.
+ * If inserting with respect to the complete topology (including disallowed, offline
+ * or unknown objects), complete_cpuset and/or complete_nodeset may be used instead.
+ * If grouping several objects, hwloc_obj_add_other_obj_sets() is an easy way to
+ * build the Group sets iteratively.
+ *
+ * \return The inserted object if it was properly inserted.
+ *
+ * \return An existing object if the Group was discarded because the topology already
+ * contained an object at the same location (the Group did not add any locality information).
+ * Any name/info key pair set before inserting is appended to the existing object.
+ *
+ * \return \c NULL if the insertion failed because of conflicting sets in the topology tree.
+ *
+ * \return \c NULL if Group objects are always ignored in the topology.
+ *
+ * \return \c NULL if the object was discarded because no set was initialized in the Group
+ * before insert, or all of them were empty.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_insert_group_object(hwloc_topology_t topology, hwloc_obj_t group);
+
+/** \brief Setup object cpusets/nodesets by OR'ing another object's sets.
+ *
+ * For each defined cpuset or nodeset in \p src, allocate the corresponding set
+ * in \p dst and add \p src to it by OR'ing sets.
+ *
+ * This function is convenient between hwloc_topology_alloc_group_object()
+ * and hwloc_topology_insert_group_object(). It builds the sets of the new Group
+ * that will be inserted as a new intermediate parent of several objects.
+ */
+HWLOC_DECLSPEC int hwloc_obj_add_other_obj_sets(hwloc_obj_t dst, hwloc_obj_t src);
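+
+/*
+ * Usage sketch for the Group insertion API above (illustrative only;
+ * hwloc_get_obj_by_type() comes from hwloc/helper.h and the machine is
+ * assumed to have at least two HWLOC_OBJ_CORE objects): group the first two
+ * cores under a new intermediate Group object.
+ *
+ *   hwloc_obj_t group = hwloc_topology_alloc_group_object(topology);
+ *   hwloc_obj_add_other_obj_sets(group,
+ *       hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, 0));
+ *   hwloc_obj_add_other_obj_sets(group,
+ *       hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, 1));
+ *   group = hwloc_topology_insert_group_object(topology, group);
+ *   if (!group) {
+ *     // insertion failed or Groups are ignored in this topology
+ *   }
+ */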
+
+/** @} */
+
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+/* high-level helpers */
+#include <hwloc/helper.h>
+
+/* inline code of some functions above */
+#include <hwloc/inlines.h>
+
+/* exporting to XML or synthetic */
+#include <hwloc/export.h>
+
+/* topology diffs */
+#include <hwloc/diff.h>
+
+/* deprecated headers */
+#include <hwloc/deprecated.h>
+
+#endif /* HWLOC_H */
diff --git a/ext/hwloc/include/hwloc/autogen/config.h b/ext/hwloc/include/hwloc/autogen/config.h
new file mode 100644
index 0000000..3c243ed
--- /dev/null
+++ b/ext/hwloc/include/hwloc/autogen/config.h
@@ -0,0 +1,202 @@
+/* include/hwloc/autogen/config.h. Generated from config.h.in by configure. */
+/* -*- c -*-
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria. All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* The configuration file */
+
+#ifndef HWLOC_CONFIG_H
+#define HWLOC_CONFIG_H
+
+#if (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
+# define __hwloc_restrict __restrict
+#else
+# if __STDC_VERSION__ >= 199901L
+# define __hwloc_restrict restrict
+# else
+# define __hwloc_restrict
+# endif
+#endif
+
+/* Note that if we're compiling C++, then just use the "inline"
+ keyword, since it's part of C++ */
+#if defined(c_plusplus) || defined(__cplusplus)
+# define __hwloc_inline inline
+#elif defined(_MSC_VER) || defined(__HP_cc)
+# define __hwloc_inline __inline
+#else
+# define __hwloc_inline __inline__
+#endif
+
+/*
+ * Note: this is public. We can not assume anything from the compiler used
+ * by the application and thus the HWLOC_HAVE_* macros below are not
+ * fetched from the autoconf result here. We only automatically use a few
+ * well-known easy cases.
+ */
+
+/* Some handy constants to make the logic below a little more readable */
+#if defined(__cplusplus) && \
+ (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#define GXX_ABOVE_3_4 1
+#else
+#define GXX_ABOVE_3_4 0
+#endif
+
+#if !defined(__cplusplus) && \
+ (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
+#define GCC_ABOVE_2_95 1
+#else
+#define GCC_ABOVE_2_95 0
+#endif
+
+#if !defined(__cplusplus) && \
+ (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96))
+#define GCC_ABOVE_2_96 1
+#else
+#define GCC_ABOVE_2_96 0
+#endif
+
+#if !defined(__cplusplus) && \
+ (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
+#define GCC_ABOVE_3_3 1
+#else
+#define GCC_ABOVE_3_3 0
+#endif
+
+/* Maybe before gcc 2.95 too */
+#ifdef HWLOC_HAVE_ATTRIBUTE_UNUSED
+#define __HWLOC_HAVE_ATTRIBUTE_UNUSED HWLOC_HAVE_ATTRIBUTE_UNUSED
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_UNUSED (GXX_ABOVE_3_4 || GCC_ABOVE_2_95)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_UNUSED 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_UNUSED
+# define __hwloc_attribute_unused __attribute__((__unused__))
+#else
+# define __hwloc_attribute_unused
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_MALLOC
+#define __HWLOC_HAVE_ATTRIBUTE_MALLOC HWLOC_HAVE_ATTRIBUTE_MALLOC
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_MALLOC (GXX_ABOVE_3_4 || GCC_ABOVE_2_96)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_MALLOC 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_MALLOC
+# define __hwloc_attribute_malloc __attribute__((__malloc__))
+#else
+# define __hwloc_attribute_malloc
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_CONST
+#define __HWLOC_HAVE_ATTRIBUTE_CONST HWLOC_HAVE_ATTRIBUTE_CONST
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_CONST (GXX_ABOVE_3_4 || GCC_ABOVE_2_95)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_CONST 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_CONST
+# define __hwloc_attribute_const __attribute__((__const__))
+#else
+# define __hwloc_attribute_const
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_PURE
+#define __HWLOC_HAVE_ATTRIBUTE_PURE HWLOC_HAVE_ATTRIBUTE_PURE
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_PURE (GXX_ABOVE_3_4 || GCC_ABOVE_2_96)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_PURE 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_PURE
+# define __hwloc_attribute_pure __attribute__((__pure__))
+#else
+# define __hwloc_attribute_pure
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+#define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED (GXX_ABOVE_3_4 || GCC_ABOVE_3_3)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+# define __hwloc_attribute_deprecated __attribute__((__deprecated__))
+#else
+# define __hwloc_attribute_deprecated
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+#define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS (GXX_ABOVE_3_4 || GCC_ABOVE_3_3)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+# define __hwloc_attribute_may_alias __attribute__((__may_alias__))
+#else
+# define __hwloc_attribute_may_alias
+#endif
+
+#ifdef HWLOC_C_HAVE_VISIBILITY
+# if HWLOC_C_HAVE_VISIBILITY
+# define HWLOC_DECLSPEC __attribute__((__visibility__("default")))
+# else
+# define HWLOC_DECLSPEC
+# endif
+#else
+# define HWLOC_DECLSPEC
+#endif
+
+/* Defined to 1 on Linux */
+#define HWLOC_LINUX_SYS 1
+
+/* Defined to 1 if the CPU_SET macro works */
+#define HWLOC_HAVE_CPU_SET 1
+
+/* Defined to 1 if you have the `windows.h' header. */
+/* #undef HWLOC_HAVE_WINDOWS_H */
+#define hwloc_pid_t pid_t
+#define hwloc_thread_t pthread_t
+
+#ifdef HWLOC_HAVE_WINDOWS_H
+
+# include <windows.h>
+typedef DWORDLONG hwloc_uint64_t;
+
+#else /* HWLOC_HAVE_WINDOWS_H */
+
+# ifdef hwloc_thread_t
+# include <pthread.h>
+# endif /* hwloc_thread_t */
+
+/* Defined to 1 if you have the <stdint.h> header file. */
+# define HWLOC_HAVE_STDINT_H 1
+
+# include <unistd.h>
+# ifdef HWLOC_HAVE_STDINT_H
+# include <stdint.h>
+# endif
+typedef uint64_t hwloc_uint64_t;
+
+#endif /* HWLOC_HAVE_WINDOWS_H */
+
+/* Whether we need to re-define all the hwloc public symbols or not */
+#define HWLOC_SYM_TRANSFORM 1
+
+/* The hwloc symbol prefix */
+#define HWLOC_SYM_PREFIX likwid_
+
+/* The hwloc symbol prefix in all caps */
+#define HWLOC_SYM_PREFIX_CAPS LIKWID_
+
+#endif /* HWLOC_CONFIG_H */
diff --git a/ext/hwloc/include/hwloc/autogen/config.h.in b/ext/hwloc/include/hwloc/autogen/config.h.in
new file mode 100644
index 0000000..e101b0a
--- /dev/null
+++ b/ext/hwloc/include/hwloc/autogen/config.h.in
@@ -0,0 +1,201 @@
+/* -*- c -*-
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria. All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* The configuration file */
+
+#ifndef HWLOC_CONFIG_H
+#define HWLOC_CONFIG_H
+
+#if (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
+# define __hwloc_restrict __restrict
+#else
+# if __STDC_VERSION__ >= 199901L
+# define __hwloc_restrict restrict
+# else
+# define __hwloc_restrict
+# endif
+#endif
+
+/* Note that if we're compiling C++, then just use the "inline"
+ keyword, since it's part of C++ */
+#if defined(c_plusplus) || defined(__cplusplus)
+# define __hwloc_inline inline
+#elif defined(_MSC_VER) || defined(__HP_cc)
+# define __hwloc_inline __inline
+#else
+# define __hwloc_inline __inline__
+#endif
+
+/*
+ * Note: this is public. We can not assume anything from the compiler used
+ * by the application and thus the HWLOC_HAVE_* macros below are not
+ * fetched from the autoconf result here. We only automatically use a few
+ * well-known easy cases.
+ */
+
+/* Some handy constants to make the logic below a little more readable */
+#if defined(__cplusplus) && \
+ (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#define GXX_ABOVE_3_4 1
+#else
+#define GXX_ABOVE_3_4 0
+#endif
+
+#if !defined(__cplusplus) && \
+ (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
+#define GCC_ABOVE_2_95 1
+#else
+#define GCC_ABOVE_2_95 0
+#endif
+
+#if !defined(__cplusplus) && \
+ (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96))
+#define GCC_ABOVE_2_96 1
+#else
+#define GCC_ABOVE_2_96 0
+#endif
+
+#if !defined(__cplusplus) && \
+ (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
+#define GCC_ABOVE_3_3 1
+#else
+#define GCC_ABOVE_3_3 0
+#endif
+
+/* Maybe before gcc 2.95 too */
+#ifdef HWLOC_HAVE_ATTRIBUTE_UNUSED
+#define __HWLOC_HAVE_ATTRIBUTE_UNUSED HWLOC_HAVE_ATTRIBUTE_UNUSED
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_UNUSED (GXX_ABOVE_3_4 || GCC_ABOVE_2_95)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_UNUSED 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_UNUSED
+# define __hwloc_attribute_unused __attribute__((__unused__))
+#else
+# define __hwloc_attribute_unused
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_MALLOC
+#define __HWLOC_HAVE_ATTRIBUTE_MALLOC HWLOC_HAVE_ATTRIBUTE_MALLOC
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_MALLOC (GXX_ABOVE_3_4 || GCC_ABOVE_2_96)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_MALLOC 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_MALLOC
+# define __hwloc_attribute_malloc __attribute__((__malloc__))
+#else
+# define __hwloc_attribute_malloc
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_CONST
+#define __HWLOC_HAVE_ATTRIBUTE_CONST HWLOC_HAVE_ATTRIBUTE_CONST
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_CONST (GXX_ABOVE_3_4 || GCC_ABOVE_2_95)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_CONST 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_CONST
+# define __hwloc_attribute_const __attribute__((__const__))
+#else
+# define __hwloc_attribute_const
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_PURE
+#define __HWLOC_HAVE_ATTRIBUTE_PURE HWLOC_HAVE_ATTRIBUTE_PURE
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_PURE (GXX_ABOVE_3_4 || GCC_ABOVE_2_96)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_PURE 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_PURE
+# define __hwloc_attribute_pure __attribute__((__pure__))
+#else
+# define __hwloc_attribute_pure
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+#define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED (GXX_ABOVE_3_4 || GCC_ABOVE_3_3)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+# define __hwloc_attribute_deprecated __attribute__((__deprecated__))
+#else
+# define __hwloc_attribute_deprecated
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+#define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS (GXX_ABOVE_3_4 || GCC_ABOVE_3_3)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+# define __hwloc_attribute_may_alias __attribute__((__may_alias__))
+#else
+# define __hwloc_attribute_may_alias
+#endif
+
+#ifdef HWLOC_C_HAVE_VISIBILITY
+# if HWLOC_C_HAVE_VISIBILITY
+# define HWLOC_DECLSPEC __attribute__((__visibility__("default")))
+# else
+# define HWLOC_DECLSPEC
+# endif
+#else
+# define HWLOC_DECLSPEC
+#endif
+
+/* Defined to 1 on Linux */
+#undef HWLOC_LINUX_SYS
+
+/* Defined to 1 if the CPU_SET macro works */
+#undef HWLOC_HAVE_CPU_SET
+
+/* Defined to 1 if you have the `windows.h' header. */
+#undef HWLOC_HAVE_WINDOWS_H
+#undef hwloc_pid_t
+#undef hwloc_thread_t
+
+#ifdef HWLOC_HAVE_WINDOWS_H
+
+# include <windows.h>
+typedef DWORDLONG hwloc_uint64_t;
+
+#else /* HWLOC_HAVE_WINDOWS_H */
+
+# ifdef hwloc_thread_t
+# include <pthread.h>
+# endif /* hwloc_thread_t */
+
+/* Defined to 1 if you have the <stdint.h> header file. */
+# undef HWLOC_HAVE_STDINT_H
+
+# include <unistd.h>
+# ifdef HWLOC_HAVE_STDINT_H
+# include <stdint.h>
+# endif
+typedef uint64_t hwloc_uint64_t;
+
+#endif /* HWLOC_HAVE_WINDOWS_H */
+
+/* Whether we need to re-define all the hwloc public symbols or not */
+#undef HWLOC_SYM_TRANSFORM
+
+/* The hwloc symbol prefix */
+#undef HWLOC_SYM_PREFIX
+
+/* The hwloc symbol prefix in all caps */
+#undef HWLOC_SYM_PREFIX_CAPS
+
+#endif /* HWLOC_CONFIG_H */
diff --git a/ext/hwloc/include/hwloc/autogen/stamp-h2 b/ext/hwloc/include/hwloc/autogen/stamp-h2
new file mode 100644
index 0000000..804e0ac
--- /dev/null
+++ b/ext/hwloc/include/hwloc/autogen/stamp-h2
@@ -0,0 +1 @@
+timestamp for include/hwloc/autogen/config.h
diff --git a/ext/hwloc/include/hwloc/bitmap.h b/ext/hwloc/include/hwloc/bitmap.h
new file mode 100644
index 0000000..bb18f65
--- /dev/null
+++ b/ext/hwloc/include/hwloc/bitmap.h
@@ -0,0 +1,359 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria. All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief The bitmap API, for use in hwloc itself.
+ */
+
+#ifndef HWLOC_BITMAP_H
+#define HWLOC_BITMAP_H
+
+#include <hwloc/autogen/config.h>
+#include <assert.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_bitmap The bitmap API
+ *
+ * The ::hwloc_bitmap_t type represents a set of objects, typically OS
+ * processors -- which may actually be hardware threads (represented
+ * by ::hwloc_cpuset_t, which is a typedef for ::hwloc_bitmap_t) -- or
+ * memory nodes (represented by ::hwloc_nodeset_t, which is also a
+ * typedef for ::hwloc_bitmap_t).
+ *
+ * <em>Both CPU and node sets are always indexed by OS physical number.</em>
+ *
+ * \note CPU sets and nodesets are described in \ref hwlocality_object_sets.
+ *
+ * A bitmap may be of infinite size.
+ *
+ * \note Several examples of using the bitmap API are available under the
+ * doc/examples/ directory in the source tree.
+ * Regression tests such as tests/hwloc_bitmap*.c also make intensive use
+ * of this API.
+ * @{
+ */
+
+
+/** \brief
+ * Set of bits represented as an opaque pointer to an internal bitmap.
+ */
+typedef struct hwloc_bitmap_s * hwloc_bitmap_t;
+/** \brief a non-modifiable ::hwloc_bitmap_t */
+typedef const struct hwloc_bitmap_s * hwloc_const_bitmap_t;
+
+
+/*
+ * Bitmap allocation, freeing and copying.
+ */
+
+/** \brief Allocate a new empty bitmap.
+ *
+ * \returns A valid bitmap or \c NULL.
+ *
+ * The bitmap should be freed by a corresponding call to
+ * hwloc_bitmap_free().
+ */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_alloc(void) __hwloc_attribute_malloc;
+
+/** \brief Allocate a new full bitmap. */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_alloc_full(void) __hwloc_attribute_malloc;
+
+/** \brief Free bitmap \p bitmap.
+ *
+ * If \p bitmap is \c NULL, no operation is performed.
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_free(hwloc_bitmap_t bitmap);
+
+/** \brief Duplicate bitmap \p bitmap by allocating a new bitmap and copying \p bitmap contents.
+ *
+ * If \p bitmap is \c NULL, \c NULL is returned.
+ */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_dup(hwloc_const_bitmap_t bitmap) __hwloc_attribute_malloc;
+
+/** \brief Copy the contents of bitmap \p src into the already allocated bitmap \p dst */
+HWLOC_DECLSPEC void hwloc_bitmap_copy(hwloc_bitmap_t dst, hwloc_const_bitmap_t src);
+
+
+/*
+ * Bitmap/String Conversion
+ */
+
+/** \brief Stringify a bitmap.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated string.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a bitmap string and store it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+/** \brief Stringify a bitmap in the list format.
+ *
+ * Lists are comma-separated indexes or ranges.
+ * Ranges are dash-separated indexes.
+ * The last range may not have an ending index if the bitmap is infinite.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated list string.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a list string and store it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+/** \brief Stringify a bitmap in the taskset-specific format.
+ *
+ * The taskset command manipulates bitmap strings that contain a single
+ * (possibly very long) hexadecimal number starting with 0x.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated taskset-specific string.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a taskset-specific bitmap string and store it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+
+/*
+ * Building bitmaps.
+ */
+
+/** \brief Empty the bitmap \p bitmap */
+HWLOC_DECLSPEC void hwloc_bitmap_zero(hwloc_bitmap_t bitmap);
+
+/** \brief Fill bitmap \p bitmap with all possible indexes (even if those objects don't exist or are otherwise unavailable) */
+HWLOC_DECLSPEC void hwloc_bitmap_fill(hwloc_bitmap_t bitmap);
+
+/** \brief Empty the bitmap \p bitmap and add bit \p id */
+HWLOC_DECLSPEC void hwloc_bitmap_only(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Fill the bitmap \p bitmap and clear the index \p id */
+HWLOC_DECLSPEC void hwloc_bitmap_allbut(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Setup bitmap \p bitmap from unsigned long \p mask */
+HWLOC_DECLSPEC void hwloc_bitmap_from_ulong(hwloc_bitmap_t bitmap, unsigned long mask);
+
+/** \brief Setup bitmap \p bitmap from unsigned long \p mask used as \p i -th subset */
+HWLOC_DECLSPEC void hwloc_bitmap_from_ith_ulong(hwloc_bitmap_t bitmap, unsigned i, unsigned long mask);
+
+
+/*
+ * Modifying bitmaps.
+ */
+
+/** \brief Add index \p id in bitmap \p bitmap */
+HWLOC_DECLSPEC void hwloc_bitmap_set(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Add indexes from \p begin to \p end in bitmap \p bitmap.
+ *
+ * If \p end is \c -1, the range is infinite.
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_set_range(hwloc_bitmap_t bitmap, unsigned begin, int end);
+
+/** \brief Replace \p i -th subset of bitmap \p bitmap with unsigned long \p mask */
+HWLOC_DECLSPEC void hwloc_bitmap_set_ith_ulong(hwloc_bitmap_t bitmap, unsigned i, unsigned long mask);
+
+/** \brief Remove index \p id from bitmap \p bitmap */
+HWLOC_DECLSPEC void hwloc_bitmap_clr(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Remove indexes from \p begin to \p end in bitmap \p bitmap.
+ *
+ * If \p end is \c -1, the range is infinite.
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_clr_range(hwloc_bitmap_t bitmap, unsigned begin, int end);
+
+/** \brief Keep a single index among those set in bitmap \p bitmap
+ *
+ * May be useful before binding so that the process does not
+ * have a chance of migrating between multiple logical CPUs
+ * in the original mask.
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_singlify(hwloc_bitmap_t bitmap);
+
+
+/*
+ * Consulting bitmaps.
+ */
+
+/** \brief Convert the beginning part of bitmap \p bitmap into an unsigned long mask */
+HWLOC_DECLSPEC unsigned long hwloc_bitmap_to_ulong(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Convert the \p i -th subset of bitmap \p bitmap into unsigned long mask */
+HWLOC_DECLSPEC unsigned long hwloc_bitmap_to_ith_ulong(hwloc_const_bitmap_t bitmap, unsigned i) __hwloc_attribute_pure;
+
+/** \brief Test whether index \p id is part of bitmap \p bitmap */
+HWLOC_DECLSPEC int hwloc_bitmap_isset(hwloc_const_bitmap_t bitmap, unsigned id) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap is empty */
+HWLOC_DECLSPEC int hwloc_bitmap_iszero(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap is completely full */
+HWLOC_DECLSPEC int hwloc_bitmap_isfull(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the first index (least significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_first(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the next index in bitmap \p bitmap which is after index \p prev
+ *
+ * If \p prev is -1, the first index is returned.
+ *
+ * \return -1 if no index with a higher value is set in the bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_next(hwloc_const_bitmap_t bitmap, int prev) __hwloc_attribute_pure;
+
+/** \brief Compute the last index (most significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is set in the bitmap, or if the set of indexes is infinite.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_last(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the "weight" of bitmap \p bitmap (i.e., the number of
+ * indexes that are set in the bitmap).
+ *
+ * \return the number of indexes that are set in the bitmap,
+ * or -1 if the bitmap is infinitely set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_weight(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Loop macro iterating on bitmap \p bitmap
+ * \hideinitializer
+ *
+ * \p index is the loop variable; it should be an unsigned int. The
+ * first iteration will set \p index to the lowest index in the bitmap.
+ * Successive iterations will iterate through, in order, all remaining
+ * indexes that are set in the bitmap. To be specific: each iteration will return a
+ * value for \p index such that hwloc_bitmap_isset(bitmap, index) is true.
+ *
+ * The assert prevents the loop from being infinite if the bitmap is infinite.
+ */
+#define hwloc_bitmap_foreach_begin(id, bitmap) \
+do { \
+ assert(hwloc_bitmap_weight(bitmap) != -1); \
+ for (id = hwloc_bitmap_first(bitmap); \
+ (unsigned) id != (unsigned) -1; \
+ id = hwloc_bitmap_next(bitmap, id)) { \
+/** \brief End of loop. Needs a terminating ';'.
+ * \hideinitializer
+ *
+ * \sa hwloc_bitmap_foreach_begin */
+#define hwloc_bitmap_foreach_end() \
+ } \
+} while (0)
+
+
+/*
+ * Combining bitmaps.
+ */
+
+/** \brief Or bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_or (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief And bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_and (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief And bitmap \p bitmap1 and the negation of \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_andnot (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief Xor bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_xor (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief Negate bitmap \p bitmap and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_not (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap);
+
+
+/*
+ * Comparing bitmaps.
+ */
+
+/** \brief Test whether bitmaps \p bitmap1 and \p bitmap2 intersect */
+HWLOC_DECLSPEC int hwloc_bitmap_intersects (hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p sub_bitmap is part of bitmap \p super_bitmap */
+HWLOC_DECLSPEC int hwloc_bitmap_isincluded (hwloc_const_bitmap_t sub_bitmap, hwloc_const_bitmap_t super_bitmap) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap1 is equal to bitmap \p bitmap2 */
+HWLOC_DECLSPEC int hwloc_bitmap_isequal (hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Compare bitmaps \p bitmap1 and \p bitmap2 using their lowest index.
+ *
+ * The bitmap with the smaller least significant bit is considered smaller.
+ * The empty bitmap is considered higher than anything.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_compare_first(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Compare bitmaps \p bitmap1 and \p bitmap2 in lexicographic order.
+ *
+ * Lexicographic comparison of bitmaps, starting from their highest indexes.
+ * Compare the last indexes first, then the second to last, and so on.
+ * The empty bitmap is considered lower than anything.
+ *
+ * \note This is different from the non-existing hwloc_bitmap_compare_last()
+ * which would only compare the highest index of each bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_compare(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_BITMAP_H */
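
The pieces above combine in a straightforward way; a minimal sketch, assuming a program compiled and linked against hwloc, that builds a small bitmap, iterates over it and prints its list form:

    #include <hwloc.h>
    #include <stdio.h>

    int main(void)
    {
        hwloc_bitmap_t set = hwloc_bitmap_alloc();   /* empty bitmap */
        unsigned id;
        char buf[64];

        hwloc_bitmap_set(set, 0);                    /* add a single index */
        hwloc_bitmap_set_range(set, 4, 7);           /* add the range 4-7 */
        hwloc_bitmap_clr(set, 5);                    /* remove index 5 again */

        hwloc_bitmap_foreach_begin(id, set)
            printf("index %u is set\n", id);
        hwloc_bitmap_foreach_end();

        hwloc_bitmap_list_snprintf(buf, sizeof(buf), set);
        printf("list form: %s\n", buf);              /* prints "0,4,6-7" */

        hwloc_bitmap_free(set);
        return 0;
    }

The foreach pair expands to an ordinary for loop guarded by the assert mentioned above, so any statement block can be placed between the two macros.
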
diff --git a/ext/hwloc/include/hwloc/cuda.h b/ext/hwloc/include/hwloc/cuda.h
new file mode 100644
index 0000000..a02d677
--- /dev/null
+++ b/ext/hwloc/include/hwloc/cuda.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright © 2010-2015 Inria. All rights reserved.
+ * Copyright © 2010-2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and the CUDA Driver API.
+ *
+ * Applications that use both hwloc and the CUDA Driver API may want to
+ * include this file so as to get topology information for CUDA devices.
+ *
+ */
+
+#ifndef HWLOC_CUDA_H
+#define HWLOC_CUDA_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <cuda.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_cuda Interoperability with the CUDA Driver API
+ *
+ * This interface offers ways to retrieve topology information about
+ * CUDA devices when using the CUDA Driver API.
+ *
+ * @{
+ */
+
+/** \brief Return the domain, bus and device IDs of the CUDA device \p cudevice.
+ *
+ * Device \p cudevice must match the local machine.
+ */
+static __hwloc_inline int
+hwloc_cuda_get_device_pci_ids(hwloc_topology_t topology __hwloc_attribute_unused,
+ CUdevice cudevice, int *domain, int *bus, int *dev)
+{
+ CUresult cres;
+
+#if CUDA_VERSION >= 4000
+ cres = cuDeviceGetAttribute(domain, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, cudevice);
+ if (cres != CUDA_SUCCESS) {
+ errno = ENOSYS;
+ return -1;
+ }
+#else
+ *domain = 0;
+#endif
+ cres = cuDeviceGetAttribute(bus, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, cudevice);
+ if (cres != CUDA_SUCCESS) {
+ errno = ENOSYS;
+ return -1;
+ }
+ cres = cuDeviceGetAttribute(dev, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cudevice);
+ if (cres != CUDA_SUCCESS) {
+ errno = ENOSYS;
+ return -1;
+ }
+
+ return 0;
+}
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to device \p cudevice.
+ *
+ * Return the CPU set describing the locality of the CUDA device \p cudevice.
+ *
+ * Topology \p topology and device \p cudevice must match the local machine.
+ * I/O devices detection and the CUDA component are not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_cuda_get_device_osdev()
+ * and hwloc_cuda_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_cuda_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+ CUdevice cudevice, hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+ /* If we're on Linux, use the sysfs mechanism to get the local cpus */
+#define HWLOC_CUDA_DEVICE_SYSFS_PATH_MAX 128
+ char path[HWLOC_CUDA_DEVICE_SYSFS_PATH_MAX];
+ FILE *sysfile = NULL;
+ int domainid, busid, deviceid;
+
+ if (hwloc_cuda_get_device_pci_ids(topology, cudevice, &domainid, &busid, &deviceid))
+ return -1;
+
+ if (!hwloc_topology_is_thissystem(topology)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus", domainid, busid, deviceid);
+ sysfile = fopen(path, "r");
+ if (!sysfile)
+ return -1;
+
+ hwloc_linux_parse_cpumap_file(sysfile, set);
+ if (hwloc_bitmap_iszero(set))
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+
+ fclose(sysfile);
+#else
+ /* Non-Linux systems simply get a full cpuset */
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+ return 0;
+}
+
+/** \brief Get the hwloc PCI device object corresponding to the
+ * CUDA device \p cudevice.
+ *
+ * Return the PCI device object describing the CUDA device \p cudevice.
+ * Return NULL if there is none.
+ *
+ * Topology \p topology and device \p cudevice must match the local machine.
+ * I/O devices detection must be enabled in topology \p topology.
+ * The CUDA component is not needed in the topology.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cuda_get_device_pcidev(hwloc_topology_t topology, CUdevice cudevice)
+{
+ int domain, bus, dev;
+
+ if (hwloc_cuda_get_device_pci_ids(topology, cudevice, &domain, &bus, &dev))
+ return NULL;
+
+ return hwloc_get_pcidev_by_busid(topology, domain, bus, dev, 0);
+}
+
+/** \brief Get the hwloc OS device object corresponding to CUDA device \p cudevice.
+ *
+ * Return the hwloc OS device object that describes the given
+ * CUDA device \p cudevice. Return NULL if there is none.
+ *
+ * Topology \p topology and device \p cudevice must match the local machine.
+ * I/O devices detection and the CUDA component must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_cuda_get_device_cpuset().
+ *
+ * \note The corresponding hwloc PCI device may be found by looking
+ * at the result parent pointer.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cuda_get_device_osdev(hwloc_topology_t topology, CUdevice cudevice)
+{
+ hwloc_obj_t osdev = NULL;
+ int domain, bus, dev;
+
+ if (hwloc_cuda_get_device_pci_ids(topology, cudevice, &domain, &bus, &dev))
+ return NULL;
+
+ osdev = NULL;
+ while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+ hwloc_obj_t pcidev = osdev->parent;
+ if (strncmp(osdev->name, "cuda", 4))
+ continue;
+ if (pcidev
+ && pcidev->type == HWLOC_OBJ_PCI_DEVICE
+ && (int) pcidev->attr->pcidev.domain == domain
+ && (int) pcidev->attr->pcidev.bus == bus
+ && (int) pcidev->attr->pcidev.dev == dev
+ && pcidev->attr->pcidev.func == 0)
+ return osdev;
+ }
+
+ return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * CUDA device whose index is \p idx.
+ *
+ * Return the OS device object describing the CUDA device whose
+ * index is \p idx. Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the CUDA component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ *
+ * \note This function is identical to hwloc_cudart_get_device_osdev_by_index().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cuda_get_device_osdev_by_index(hwloc_topology_t topology, unsigned idx)
+{
+ hwloc_obj_t osdev = NULL;
+ while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+ if (HWLOC_OBJ_OSDEV_COPROC == osdev->attr->osdev.type
+ && osdev->name
+ && !strncmp("cuda", osdev->name, 4)
+ && atoi(osdev->name + 4) == (int) idx)
+ return osdev;
+ }
+ return NULL;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_CUDA_H */
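
A minimal sketch of how these Driver-API helpers are typically used, assuming the CUDA driver library is available and that the goal is to bind the current thread near device 0:

    #include <hwloc.h>
    #include <hwloc/cuda.h>
    #include <cuda.h>
    #include <stdio.h>

    int main(void)
    {
        hwloc_topology_t topology;
        hwloc_cpuset_t set;
        CUdevice dev;
        char buf[128];

        cuInit(0);
        cuDeviceGet(&dev, 0);                     /* first CUDA device */

        hwloc_topology_init(&topology);
        hwloc_topology_load(topology);

        set = hwloc_bitmap_alloc();
        if (!hwloc_cuda_get_device_cpuset(topology, dev, set)) {
            hwloc_bitmap_snprintf(buf, sizeof(buf), set);
            printf("CUDA device 0 is close to PUs %s\n", buf);
            /* bind the calling thread near the GPU */
            hwloc_set_cpubind(topology, set, HWLOC_CPUBIND_THREAD);
        }

        hwloc_bitmap_free(set);
        hwloc_topology_destroy(topology);
        return 0;
    }
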
diff --git a/ext/hwloc/include/hwloc/cudart.h b/ext/hwloc/include/hwloc/cudart.h
new file mode 100644
index 0000000..759c3cf
--- /dev/null
+++ b/ext/hwloc/include/hwloc/cudart.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright © 2010-2015 Inria. All rights reserved.
+ * Copyright © 2010-2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and the CUDA Runtime API.
+ *
+ * Applications that use both hwloc and the CUDA Runtime API may want to
+ * include this file so as to get topology information for CUDA devices.
+ *
+ */
+
+#ifndef HWLOC_CUDART_H
+#define HWLOC_CUDART_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <cuda.h> /* for CUDA_VERSION */
+#include <cuda_runtime_api.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_cudart Interoperability with the CUDA Runtime API
+ *
+ * This interface offers ways to retrieve topology information about
+ * CUDA devices when using the CUDA Runtime API.
+ *
+ * @{
+ */
+
+/** \brief Return the domain, bus and device IDs of the CUDA device whose index is \p idx.
+ *
+ * Device index \p idx must match the local machine.
+ */
+static __hwloc_inline int
+hwloc_cudart_get_device_pci_ids(hwloc_topology_t topology __hwloc_attribute_unused,
+ int idx, int *domain, int *bus, int *dev)
+{
+ cudaError_t cerr;
+ struct cudaDeviceProp prop;
+
+ cerr = cudaGetDeviceProperties(&prop, idx);
+ if (cerr) {
+ errno = ENOSYS;
+ return -1;
+ }
+
+#if CUDA_VERSION >= 4000
+ *domain = prop.pciDomainID;
+#else
+ *domain = 0;
+#endif
+
+ *bus = prop.pciBusID;
+ *dev = prop.pciDeviceID;
+
+ return 0;
+}
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to device \p idx.
+ *
+ * Return the CPU set describing the locality of the CUDA device
+ * whose index is \p idx.
+ *
+ * Topology \p topology and device \p idx must match the local machine.
+ * I/O devices detection and the CUDA component are not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_cudart_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_cudart_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+ int idx, hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+ /* If we're on Linux, use the sysfs mechanism to get the local cpus */
+#define HWLOC_CUDART_DEVICE_SYSFS_PATH_MAX 128
+ char path[HWLOC_CUDART_DEVICE_SYSFS_PATH_MAX];
+ FILE *sysfile = NULL;
+ int domain, bus, dev;
+
+ if (hwloc_cudart_get_device_pci_ids(topology, idx, &domain, &bus, &dev))
+ return -1;
+
+ if (!hwloc_topology_is_thissystem(topology)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus", domain, bus, dev);
+ sysfile = fopen(path, "r");
+ if (!sysfile)
+ return -1;
+
+ hwloc_linux_parse_cpumap_file(sysfile, set);
+ if (hwloc_bitmap_iszero(set))
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+
+ fclose(sysfile);
+#else
+ /* Non-Linux systems simply get a full cpuset */
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+ return 0;
+}
+
+/** \brief Get the hwloc PCI device object corresponding to the
+ * CUDA device whose index is \p idx.
+ *
+ * Return the PCI device object describing the CUDA device whose
+ * index is \p idx. Return NULL if there is none.
+ *
+ * Topology \p topology and device \p idx must match the local machine.
+ * I/O devices detection must be enabled in topology \p topology.
+ * The CUDA component is not needed in the topology.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cudart_get_device_pcidev(hwloc_topology_t topology, int idx)
+{
+ int domain, bus, dev;
+
+ if (hwloc_cudart_get_device_pci_ids(topology, idx, &domain, &bus, &dev))
+ return NULL;
+
+ return hwloc_get_pcidev_by_busid(topology, domain, bus, dev, 0);
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * CUDA device whose index is \p idx.
+ *
+ * Return the OS device object describing the CUDA device whose
+ * index is \p idx. Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the CUDA component must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_cudart_get_device_cpuset().
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ *
+ * \note This function is identical to hwloc_cuda_get_device_osdev_by_index().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cudart_get_device_osdev_by_index(hwloc_topology_t topology, unsigned idx)
+{
+ hwloc_obj_t osdev = NULL;
+ while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+ if (HWLOC_OBJ_OSDEV_COPROC == osdev->attr->osdev.type
+ && osdev->name
+ && !strncmp("cuda", osdev->name, 4)
+ && atoi(osdev->name + 4) == (int) idx)
+ return osdev;
+ }
+ return NULL;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_CUDART_H */
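
A brief sketch for the Runtime-API variant, assuming I/O object discovery is enabled via the hwloc 1.x flag HWLOC_TOPOLOGY_FLAG_IO_DEVICES; it looks up the OS device for runtime index 0 and prints its PCI location:

    #include <hwloc.h>
    #include <hwloc/cudart.h>
    #include <stdio.h>

    int main(void)
    {
        hwloc_topology_t topology;
        hwloc_obj_t osdev, pcidev;

        hwloc_topology_init(&topology);
        /* hwloc 1.x flag that adds PCI and OS device objects such as "cuda0" */
        hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IO_DEVICES);
        hwloc_topology_load(topology);

        osdev = hwloc_cudart_get_device_osdev_by_index(topology, 0);
        if (osdev) {
            pcidev = osdev->parent;               /* PCI device, if any */
            printf("runtime device 0 is %s", osdev->name);
            if (pcidev && pcidev->type == HWLOC_OBJ_PCI_DEVICE)
                printf(" at %04x:%02x:%02x.0", pcidev->attr->pcidev.domain,
                       pcidev->attr->pcidev.bus, pcidev->attr->pcidev.dev);
            printf("\n");
        }

        hwloc_topology_destroy(topology);
        return 0;
    }
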
diff --git a/ext/hwloc/include/hwloc/deprecated.h b/ext/hwloc/include/hwloc/deprecated.h
new file mode 100644
index 0000000..c4370b6
--- /dev/null
+++ b/ext/hwloc/include/hwloc/deprecated.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria. All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2010 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Deprecated interfaces kept for backward compatibility with older hwloc releases.
+ */
+
+#ifndef HWLOC_DEPRECATED_H
+#define HWLOC_DEPRECATED_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* backward compat with v1.10 before Socket->Package renaming */
+#define HWLOC_OBJ_SOCKET HWLOC_OBJ_PACKAGE
+/* backward compat with v1.10 before Node->NUMANode clarification */
+#define HWLOC_OBJ_NODE HWLOC_OBJ_NUMANODE
+
+/** \brief Return an object type from the string
+ *
+ * \return -1 if unrecognized.
+ */
+HWLOC_DECLSPEC hwloc_obj_type_t hwloc_obj_type_of_string (const char * string) __hwloc_attribute_pure __hwloc_attribute_deprecated;
+
+/** \brief Stringify a given topology object into a human-readable form.
+ *
+ * \note This function is deprecated in favor of hwloc_obj_type_snprintf()
+ * and hwloc_obj_attr_snprintf() since it is not very flexible and
+ * only prints physical/OS indexes.
+ *
+ * Fill string \p string up to \p size characters with the description
+ * of topology object \p obj in topology \p topology.
+ *
+ * If \p verbose is set, a longer description is used. Otherwise a
+ * short description is used.
+ *
+ * \p indexprefix is used to prefix the \p os_index attribute number of
+ * the object in the description. If \c NULL, the \c # character is used.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_snprintf(char * __hwloc_restrict string, size_t size,
+ hwloc_topology_t topology, hwloc_obj_t obj,
+ const char * __hwloc_restrict indexprefix, int verbose) __hwloc_attribute_deprecated;
+
+/** \brief Distribute \p n items over the topology under \p root
+ *
+ * Array \p cpuset will be filled with \p n cpusets recursively distributed
+ * linearly over the topology under \p root, down to depth \p until (which can
+ * be INT_MAX to distribute down to the finest level).
+ *
+ * This is typically useful when an application wants to distribute \p n
+ * threads over a machine, giving each of them as much private cache as
+ * possible and keeping them locally in number order.
+ *
+ * The caller may typically want to also call hwloc_bitmap_singlify()
+ * before binding a thread so that it does not move at all.
+ *
+ * \note This function requires the \p root object to have a CPU set.
+ */
+static __hwloc_inline void
+hwloc_distribute(hwloc_topology_t topology, hwloc_obj_t root, hwloc_cpuset_t *set, unsigned n, unsigned until) __hwloc_attribute_deprecated;
+static __hwloc_inline void
+hwloc_distribute(hwloc_topology_t topology, hwloc_obj_t root, hwloc_cpuset_t *set, unsigned n, unsigned until)
+{
+ hwloc_distrib(topology, &root, 1, set, n, until, 0);
+}
+
+/** \brief Distribute \p n items over the topology under \p roots
+ *
+ * This is the same as hwloc_distribute, but takes an array of roots instead of
+ * just one root.
+ *
+ * \note This function requires the \p roots objects to have a CPU set.
+ */
+static __hwloc_inline void
+hwloc_distributev(hwloc_topology_t topology, hwloc_obj_t *roots, unsigned n_roots, hwloc_cpuset_t *set, unsigned n, unsigned until) __hwloc_attribute_deprecated;
+static __hwloc_inline void
+hwloc_distributev(hwloc_topology_t topology, hwloc_obj_t *roots, unsigned n_roots, hwloc_cpuset_t *set, unsigned n, unsigned until)
+{
+ hwloc_distrib(topology, roots, n_roots, set, n, until, 0);
+}
+
+/** \brief Insert a misc object by parent.
+ *
+ * Identical to hwloc_topology_insert_misc_object().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj_t parent, const char *name) __hwloc_attribute_deprecated;
+static __hwloc_inline hwloc_obj_t
+hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj_t parent, const char *name)
+{
+ return hwloc_topology_insert_misc_object(topology, parent, name);
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_DEPRECATED_H */
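
Both wrappers above merely forward to hwloc_distrib(), so new code can call it directly; a sketch that distributes four cpusets over the whole machine (thread creation and binding are only hinted at in comments):

    #include <hwloc.h>
    #include <limits.h>

    #define NTHREADS 4

    int main(void)
    {
        hwloc_topology_t topology;
        hwloc_obj_t root;
        hwloc_cpuset_t sets[NTHREADS];
        unsigned i;

        hwloc_topology_init(&topology);
        hwloc_topology_load(topology);

        root = hwloc_get_root_obj(topology);
        for (i = 0; i < NTHREADS; i++)
            sets[i] = hwloc_bitmap_alloc();

        /* equivalent to the deprecated
           hwloc_distribute(topology, root, sets, NTHREADS, INT_MAX) */
        hwloc_distrib(topology, &root, 1, sets, NTHREADS, INT_MAX, 0);

        for (i = 0; i < NTHREADS; i++) {
            hwloc_bitmap_singlify(sets[i]);       /* avoid intra-set migration */
            /* thread i would then bind itself with
               hwloc_set_cpubind(topology, sets[i], HWLOC_CPUBIND_THREAD); */
            hwloc_bitmap_free(sets[i]);
        }

        hwloc_topology_destroy(topology);
        return 0;
    }
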
diff --git a/ext/hwloc/include/hwloc/diff.h b/ext/hwloc/include/hwloc/diff.h
new file mode 100644
index 0000000..3f1beb1
--- /dev/null
+++ b/ext/hwloc/include/hwloc/diff.h
@@ -0,0 +1,299 @@
+/*
+ * Copyright © 2013-2014 Inria. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Topology differences.
+ */
+
+#ifndef HWLOC_DIFF_H
+#define HWLOC_DIFF_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_diff Topology differences
+ *
+ * Applications that manipulate many similar topologies, for instance
+ * one for each node of a homogeneous cluster, may want to compress
+ * topologies to reduce the memory footprint.
+ *
+ * This file offers a way to manipulate the difference between topologies
+ * and export/import it to/from XML.
+ * Compression may therefore be achieved by storing one topology
+ * entirely while the others are only described by their differences
+ * with the former.
+ * The actual topology can be reconstructed when actually needed by
+ * applying the precomputed difference to the reference topology.
+ *
+ * This interface targets very similar nodes.
+ * Only very simple differences between topologies are actually
+ * supported, for instance a change in the memory size, the name
+ * of the object, or some info attribute.
+ * More complex differences such as adding or removing objects cannot
+ * be represented in the difference structures and therefore return
+ * errors.
+ *
+ * This means that there is no need to apply the difference when
+ * looking at the tree organization (how many levels, how many
+ * objects per level, what kind of objects, CPU and node sets, etc.)
+ * or when binding to objects.
+ * However, the difference must be applied when looking at object
+ * attributes such as the name, the memory size or info attributes.
+ *
+ * @{
+ */
+
+
+/** \brief Type of one object attribute difference.
+ */
+typedef enum hwloc_topology_diff_obj_attr_type_e {
+ /** \brief The object local memory is modified.
+ * The union is a hwloc_topology_diff_obj_attr_uint64_s
+ * (and the index field is ignored).
+ */
+ HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE,
+
+ /** \brief The object name is modified.
+ * The union is a hwloc_topology_diff_obj_attr_string_s
+ * (and the name field is ignored).
+ */
+ HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME,
+
+ /** \brief The value of an info attribute is modified.
+ * The union is a hwloc_topology_diff_obj_attr_string_s.
+ */
+ HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO
+} hwloc_topology_diff_obj_attr_type_t;
+
+/** \brief One object attribute difference.
+ */
+union hwloc_topology_diff_obj_attr_u {
+ struct hwloc_topology_diff_obj_attr_generic_s {
+ /* each part of the union must start with these */
+ hwloc_topology_diff_obj_attr_type_t type;
+ } generic;
+
+ /** \brief Integer attribute modification with an optional index. */
+ struct hwloc_topology_diff_obj_attr_uint64_s {
+ /* used for storing integer attributes */
+ hwloc_topology_diff_obj_attr_type_t type;
+ hwloc_uint64_t index; /* not used for SIZE */
+ hwloc_uint64_t oldvalue;
+ hwloc_uint64_t newvalue;
+ } uint64;
+
+ /** \brief String attribute modification with an optional name */
+ struct hwloc_topology_diff_obj_attr_string_s {
+ /* used for storing name and info pairs */
+ hwloc_topology_diff_obj_attr_type_t type;
+ char *name; /* not used for NAME */
+ char *oldvalue;
+ char *newvalue;
+ } string;
+};
+
+
+/** \brief Type of one element of a difference list.
+ */
+typedef enum hwloc_topology_diff_type_e {
+ /** \brief An object attribute was changed.
+ * The union is a hwloc_topology_diff_obj_attr_s.
+ */
+ HWLOC_TOPOLOGY_DIFF_OBJ_ATTR,
+
+ /** \brief The difference is too complex,
+ * it cannot be represented. The difference below
+ * this object has not been checked.
+ * hwloc_topology_diff_build() will return 1.
+ *
+ * The union is a hwloc_topology_diff_too_complex_s.
+ */
+ HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX
+} hwloc_topology_diff_type_t;
+
+/** \brief One element of a difference list between two topologies.
+ */
+typedef union hwloc_topology_diff_u {
+ struct hwloc_topology_diff_generic_s {
+ /* each part of the union must start with these */
+ hwloc_topology_diff_type_t type;
+ union hwloc_topology_diff_u * next; /* pointer to the next element of the list, or NULL */
+ } generic;
+
+ /* A difference in an object attribute. */
+ struct hwloc_topology_diff_obj_attr_s {
+ hwloc_topology_diff_type_t type; /* must be HWLOC_TOPOLOGY_DIFF_OBJ_ATTR */
+ union hwloc_topology_diff_u * next;
+ /* List of attribute differences for a single object */
+ unsigned obj_depth;
+ unsigned obj_index;
+ union hwloc_topology_diff_obj_attr_u diff;
+ } obj_attr;
+
+ /* A difference that is too complex. */
+ struct hwloc_topology_diff_too_complex_s {
+ hwloc_topology_diff_type_t type; /* must be HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX */
+ union hwloc_topology_diff_u * next;
+ /* Where we had to stop computing the diff in the first topology */
+ unsigned obj_depth;
+ unsigned obj_index;
+ } too_complex;
+} * hwloc_topology_diff_t;
+
+
+/** \brief Compute the difference between 2 topologies.
+ *
+ * The difference is stored as a list of hwloc_topology_diff_t entries
+ * starting at \p diff.
+ * It is computed by doing a depth-first traversal of both topology trees
+ * simultaneously.
+ *
+ * If the difference between 2 objects is too complex to be represented
+ * (for instance if some objects have different types, or different numbers
+ * of children), a special diff entry of type HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX
+ * is queued.
+ * The computation of the diff does not continue below these objects.
+ * So each such diff entry means that the difference between two subtrees
+ * could not be computed.
+ *
+ * \return 0 if the difference can be represented properly.
+ *
+ * \return 0 with \p diff pointing to NULL if there is no difference
+ * between the topologies.
+ *
+ * \return 1 if the difference is too complex (see above). Some entries in
+ * the list will be of type HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX.
+ *
+ * \return -1 on any other error.
+ *
+ * \note \p flags is currently not used. It should be 0.
+ *
+ * \note The output diff has to be freed with hwloc_topology_diff_destroy().
+ *
+ * \note The output diff can only be exported to XML or passed to
+ * hwloc_topology_diff_apply() if 0 was returned, i.e. if no entry of type
+ * HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX is listed.
+ *
+ * \note The output diff may be modified by removing some entries from
+ * the list. The removed entries should be freed by passing them
+ * to hwloc_topology_diff_destroy() (possibly as another list).
+*/
+HWLOC_DECLSPEC int hwloc_topology_diff_build(hwloc_topology_t topology, hwloc_topology_t newtopology, unsigned long flags, hwloc_topology_diff_t *diff);
+
+/** \brief Flags to be given to hwloc_topology_diff_apply().
+ */
+enum hwloc_topology_diff_apply_flags_e {
+ /** \brief Apply topology diff in reverse direction.
+ * \hideinitializer
+ */
+ HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE = (1UL<<0)
+};
+
+/** \brief Apply a topology diff to an existing topology.
+ *
+ * \p flags is an OR'ed set of hwloc_topology_diff_apply_flags_e.
+ *
+ * The new topology is modified in place. hwloc_topology_dup()
+ * may be used to duplicate it before patching.
+ *
+ * If the difference cannot be applied entirely, all previous applied
+ * elements are unapplied before returning.
+ *
+ * \return 0 on success.
+ *
+ * \return -N if applying the difference failed while trying
+ * to apply the N-th part of the difference. For instance -1
+ * is returned if the very first difference element could not
+ * be applied.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_apply(hwloc_topology_t topology, hwloc_topology_diff_t diff, unsigned long flags);
+
+/** \brief Destroy a list of topology differences.
+ *
+ * \note The \p topology parameter must be a valid topology
+ * but it is not required that it is related to \p diff.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_destroy(hwloc_topology_t topology, hwloc_topology_diff_t diff);
+
+/** \brief Load a list of topology differences from a XML file.
+ *
+ * If not \c NULL, \p refname will be filled with the identifier
+ * string of the reference topology for the difference file,
+ * if any was specified in the XML file.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ *
+ * \note The \p topology parameter must be a valid topology
+ * but it is not required that it is related to \p diff.
+ *
+ * \note the pointer returned in refname should later be freed
+ * by the caller.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_load_xml(hwloc_topology_t topology, const char *xmlpath, hwloc_topology_diff_t *diff, char **refname);
+
+/** \brief Export a list of topology differences to a XML file.
+ *
+ * If not \c NULL, \p refname defines an identifier string
+ * for the reference topology which was used as a base when
+ * computing this difference.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ * This attribute is given back when reading the diff from XML.
+ *
+ * \note The \p topology parameter must be a valid topology
+ * but it is not required that it is related to \p diff.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_export_xml(hwloc_topology_t topology, hwloc_topology_diff_t diff, const char *refname, const char *xmlpath);
+
+/** \brief Load a list of topology differences from a XML buffer.
+ *
+ * If not \c NULL, \p refname will be filled with the identifier
+ * string of the reference topology for the difference file,
+ * if any was specified in the XML file.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ *
+ * \note The \p topology parameter must be a valid topology
+ * but it is not required that it is related to \p diff.
+ *
+ * \note the pointer returned in refname should later be freed
+ * by the caller.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_load_xmlbuffer(hwloc_topology_t topology, const char *xmlbuffer, int buflen, hwloc_topology_diff_t *diff, char **refname);
+
+/** \brief Export a list of topology differences to a XML buffer.
+ *
+ * If not \c NULL, \p refname defines an identifier string
+ * for the reference topology which was used as a base when
+ * computing this difference.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ * This attribute is given back when reading the diff from XML.
+ *
+ * \note The XML buffer should later be freed with hwloc_free_xmlbuffer().
+ *
+ * \note The \p topology parameter must be a valid topology
+ * but it is not required that it is related to \p diff.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_export_xmlbuffer(hwloc_topology_t topology, hwloc_topology_diff_t diff, const char *refname, char **xmlbuffer, int *buflen);
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_DIFF_H */
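
The intended workflow of this interface, sketched with two XML files whose names (node1.xml, node2.xml) are purely illustrative:

    #include <hwloc.h>
    #include <stdio.h>

    int main(void)
    {
        hwloc_topology_t ref, other;
        hwloc_topology_diff_t diff = NULL;
        int err;

        hwloc_topology_init(&ref);
        hwloc_topology_set_xml(ref, "node1.xml");     /* reference topology */
        hwloc_topology_load(ref);

        hwloc_topology_init(&other);
        hwloc_topology_set_xml(other, "node2.xml");   /* similar topology */
        hwloc_topology_load(other);

        err = hwloc_topology_diff_build(ref, other, 0, &diff);
        if (err == 0 && diff) {
            /* store the reference topology plus this small diff ... */
            hwloc_topology_diff_export_xml(ref, diff, "node1.xml", "node2.diff.xml");
            /* ... and rebuild "other" later by patching the reference in place
               (hwloc_topology_dup() may be used first to keep the original) */
            hwloc_topology_diff_apply(ref, diff, 0);
        } else if (err == 1) {
            fprintf(stderr, "topologies differ too much to be diffed\n");
        }

        if (diff)
            hwloc_topology_diff_destroy(ref, diff);
        hwloc_topology_destroy(other);
        hwloc_topology_destroy(ref);
        return 0;
    }
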
diff --git a/ext/hwloc/include/hwloc/export.h b/ext/hwloc/include/hwloc/export.h
new file mode 100644
index 0000000..194ee6c
--- /dev/null
+++ b/ext/hwloc/include/hwloc/export.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright © 2009-2015 Inria. All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Exporting Topologies to XML or to Synthetic strings.
+ */
+
+#ifndef HWLOC_EXPORT_H
+#define HWLOC_EXPORT_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_xmlexport Exporting Topologies to XML
+ * @{
+ */
+
+/** \brief Export the topology into an XML file.
+ *
+ * This file may be loaded later through hwloc_topology_set_xml().
+ *
+ * \return -1 if a failure occurred.
+ *
+ * \note See also hwloc_topology_set_userdata_export_callback()
+ * for exporting application-specific object userdata.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ *
+ * \note Only printable characters may be exported to XML string attributes.
+ * Any other character, especially any non-ASCII character, will be silently
+ * dropped.
+ *
+ * \note If \p xmlpath is "-", the XML output is sent to the standard output.
+ */
+HWLOC_DECLSPEC int hwloc_topology_export_xml(hwloc_topology_t topology, const char *xmlpath);
+
+/** \brief Export the topology into a newly-allocated XML memory buffer.
+ *
+ * \p xmlbuffer is allocated by the callee and should be freed with
+ * hwloc_free_xmlbuffer() later in the caller.
+ *
+ * This memory buffer may be loaded later through hwloc_topology_set_xmlbuffer().
+ *
+ * \return -1 if a failure occurred.
+ *
+ * \note See also hwloc_topology_set_userdata_export_callback()
+ * for exporting application-specific object userdata.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ *
+ * \note Only printable characters may be exported to XML string attributes.
+ * Any other character, especially any non-ASCII character, will be silently
+ * dropped.
+ */
+HWLOC_DECLSPEC int hwloc_topology_export_xmlbuffer(hwloc_topology_t topology, char **xmlbuffer, int *buflen);
+
+/** \brief Free a buffer allocated by hwloc_topology_export_xmlbuffer() */
+HWLOC_DECLSPEC void hwloc_free_xmlbuffer(hwloc_topology_t topology, char *xmlbuffer);
+
+/** \brief Set the application-specific callback for exporting object userdata
+ *
+ * The object userdata pointer is not exported to XML by default because hwloc
+ * does not know what it contains.
+ *
+ * This function lets applications set \p export_cb to a callback function
+ * that converts this opaque userdata into an exportable string.
+ *
+ * \p export_cb is invoked during XML export for each object whose
+ * \p userdata pointer is not \c NULL.
+ * The callback should use hwloc_export_obj_userdata() or
+ * hwloc_export_obj_userdata_base64() to actually export
+ * something to XML (possibly multiple times per object).
+ *
+ * \p export_cb may be set to \c NULL if userdata should not be exported to XML.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata_export_callback(hwloc_topology_t topology,
+ void (*export_cb)(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj));
+
+/** \brief Export some object userdata to XML
+ *
+ * This function may only be called from within the export() callback passed
+ * to hwloc_topology_set_userdata_export_callback().
+ * It may be invoked one or multiple times to export some userdata to XML.
+ * The \p buffer content of length \p length is stored with optional name
+ * \p name.
+ *
+ * When importing this XML file, the import() callback (if set) will be
+ * called exactly as many times as hwloc_export_obj_userdata() was called
+ * during export(). It will receive the corresponding \p name, \p buffer
+ * and \p length arguments.
+ *
+ * \p reserved, \p topology and \p obj must be the first three parameters
+ * that were given to the export callback.
+ *
+ * Only printable characters may be exported to XML string attributes.
+ * If a non-printable character is passed in \p name or \p buffer,
+ * the function returns -1 with errno set to EINVAL.
+ *
+ * If exporting binary data, the application should first encode into
+ * printable characters only (or use hwloc_export_obj_userdata_base64()).
+ * It should also take care of portability issues if the export may
+ * be reimported on a different architecture.
+ */
+HWLOC_DECLSPEC int hwloc_export_obj_userdata(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length);
+
+/** \brief Encode and export some object userdata to XML
+ *
+ * This function is similar to hwloc_export_obj_userdata() but it encodes
+ * the input buffer into printable characters before exporting.
+ * On import, decoding is automatically performed before the data is given
+ * to the import() callback if any.
+ *
+ * This function may only be called from within the export() callback passed
+ * to hwloc_topology_set_userdata_export_callback().
+ *
+ * The function does not take care of portability issues if the export
+ * may be reimported on a different architecture.
+ */
+HWLOC_DECLSPEC int hwloc_export_obj_userdata_base64(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length);
+
+/** \brief Set the application-specific callback for importing userdata
+ *
+ * On XML import, userdata is ignored by default because hwloc does not know
+ * how to store it in memory.
+ *
+ * This function lets applications set \p import_cb to a callback function
+ * that will get the XML-stored userdata and store it in the object as expected
+ * by the application.
+ *
+ * \p import_cb is called during hwloc_topology_load() as many times as
+ * hwloc_export_obj_userdata() was called during export. The topology
+ * is not entirely set up yet. Object attributes are ready to consult,
+ * but links between objects are not.
+ *
+ * \p import_cb may be \c NULL if userdata should be ignored during import.
+ *
+ * \note \p buffer contains \p length characters followed by a null byte ('\0').
+ *
+ * \note This function should be called before hwloc_topology_load().
+ *
+ * \note The topology-specific userdata pointer is ignored when importing from XML.
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata_import_callback(hwloc_topology_t topology,
+ void (*import_cb)(hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length));
+
+/** @} */
+
+
+/** \defgroup hwlocality_syntheticexport Exporting Topologies to Synthetic
+ * @{
+ */
+
+/** \brief Flags for exporting synthetic topologies.
+ *
+ * Flags to be given as an OR'ed set to hwloc_topology_export_synthetic().
+ */
+enum hwloc_topology_export_synthetic_flags_e {
+ /** \brief Export extended types such as L2dcache as basic types such as Cache.
+ *
+ * This is required if loading the synthetic description with hwloc < 1.9.
+ * \hideinitializer
+ */
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES = (1UL<<0),
+
+ /** \brief Do not export level attributes.
+ *
+ * Ignore level attributes such as memory/cache sizes or PU indexes.
+ * This is required if loading the synthetic description with hwloc < 1.10.
+ * \hideinitializer
+ */
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS = (1UL<<1)
+};
+
+/** \brief Export the topology as a synthetic string.
+ *
+ * At most \p buflen characters will be written in \p buffer,
+ * including the terminating \0.
+ *
+ * This exported string may be given back to hwloc_topology_set_synthetic().
+ *
+ * \p flags is an OR'ed set of hwloc_topology_export_synthetic_flags_e.
+ *
+ * \return The number of characters that were written,
+ * not including the terminating \0.
+ *
+ * \return -1 if the topology could not be exported,
+ * for instance if it is not symmetric.
+ *
+ * \note I/O and Misc children are ignored; the synthetic string only
+ * describes normal children.
+ *
+ * \note A 1024-byte buffer should be large enough for exporting
+ * topologies in the vast majority of cases.
+ */
+ HWLOC_DECLSPEC int hwloc_topology_export_synthetic(hwloc_topology_t topology, char *buffer, size_t buflen, unsigned long flags);
+
+/** @} */
+
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_EXPORT_H */
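
A small sketch of the two XML export paths declared above; the file name topology.xml is arbitrary:

    #include <hwloc.h>
    #include <stdio.h>

    int main(void)
    {
        hwloc_topology_t topology;
        char *xmlbuffer = NULL;
        int buflen = 0;

        hwloc_topology_init(&topology);
        hwloc_topology_load(topology);

        /* export to a file that hwloc_topology_set_xml() can reload later */
        if (hwloc_topology_export_xml(topology, "topology.xml") < 0)
            perror("hwloc_topology_export_xml");

        /* or export to an in-memory buffer owned by hwloc */
        if (hwloc_topology_export_xmlbuffer(topology, &xmlbuffer, &buflen) == 0) {
            printf("XML export is %d bytes long\n", buflen);
            hwloc_free_xmlbuffer(topology, xmlbuffer);
        }

        hwloc_topology_destroy(topology);
        return 0;
    }
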
diff --git a/ext/hwloc/include/hwloc/gl.h b/ext/hwloc/include/hwloc/gl.h
new file mode 100644
index 0000000..4b8b3f2
--- /dev/null
+++ b/ext/hwloc/include/hwloc/gl.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright © 2012 Blue Brain Project, EPFL. All rights reserved.
+ * Copyright © 2012-2013 Inria. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and OpenGL displays.
+ *
+ * Applications that use both hwloc and OpenGL may want to include
+ * this file so as to get topology information for OpenGL displays.
+ */
+
+#ifndef HWLOC_GL_H
+#define HWLOC_GL_H
+
+#include <hwloc.h>
+
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_gl Interoperability with OpenGL displays
+ *
+ * This interface offers ways to retrieve topology information about
+ * OpenGL displays.
+ *
+ * Only the NVIDIA display locality information is currently available,
+ * using the NV-CONTROL X11 extension and the NVCtrl library.
+ *
+ * @{
+ */
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * OpenGL display given by port and device index.
+ *
+ * Return the OS device object describing the OpenGL display
+ * whose port (server) is \p port and device (screen) is \p device.
+ * Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the GL component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_gl_get_display_osdev_by_port_device(hwloc_topology_t topology,
+ unsigned port, unsigned device)
+{
+ unsigned x = (unsigned) -1, y = (unsigned) -1;
+ hwloc_obj_t osdev = NULL;
+ while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+ if (HWLOC_OBJ_OSDEV_GPU == osdev->attr->osdev.type
+ && osdev->name
+ && sscanf(osdev->name, ":%u.%u", &x, &y) == 2
+ && port == x && device == y)
+ return osdev;
+ }
+ errno = EINVAL;
+ return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * OpenGL display given by name.
+ *
+ * Return the OS device object describing the OpenGL display
+ * whose name is \p name, built as ":port.device" such as ":0.0" .
+ * Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the GL component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_gl_get_display_osdev_by_name(hwloc_topology_t topology,
+ const char *name)
+{
+ hwloc_obj_t osdev = NULL;
+ while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+ if (HWLOC_OBJ_OSDEV_GPU == osdev->attr->osdev.type
+ && osdev->name
+ && !strcmp(name, osdev->name))
+ return osdev;
+ }
+ errno = EINVAL;
+ return NULL;
+}
+
+/** \brief Get the OpenGL display port and device corresponding
+ * to the given hwloc OS object.
+ *
+ * Return the OpenGL display port (server) in \p port and device (screen)
+ * in \p device that correspond to the given hwloc OS device object.
+ * Return \c -1 if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the GL component must be enabled in the topology.
+ */
+static __hwloc_inline int
+hwloc_gl_get_display_by_osdev(hwloc_topology_t topology __hwloc_attribute_unused,
+ hwloc_obj_t osdev,
+ unsigned *port, unsigned *device)
+{
+ unsigned x = -1, y = -1;
+ if (HWLOC_OBJ_OSDEV_GPU == osdev->attr->osdev.type
+ && sscanf(osdev->name, ":%u.%u", &x, &y) == 2) {
+ *port = x;
+ *device = y;
+ return 0;
+ }
+ errno = EINVAL;
+ return -1;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_GL_H */
+
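
A short sketch of the GL helpers, assuming the display name ":0.0" exists and the topology was loaded with I/O devices and the GL component enabled:

    #include <hwloc.h>
    #include <hwloc/gl.h>
    #include <stdio.h>

    int main(void)
    {
        hwloc_topology_t topology;
        hwloc_obj_t osdev;
        unsigned port, device;

        hwloc_topology_init(&topology);
        hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IO_DEVICES);
        hwloc_topology_load(topology);

        osdev = hwloc_gl_get_display_osdev_by_name(topology, ":0.0");
        if (osdev && !hwloc_gl_get_display_by_osdev(topology, osdev, &port, &device))
            printf("display %s is X server %u, screen %u\n",
                   osdev->name, port, device);

        hwloc_topology_destroy(topology);
        return 0;
    }
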
diff --git a/ext/hwloc/include/hwloc/glibc-sched.h b/ext/hwloc/include/hwloc/glibc-sched.h
new file mode 100644
index 0000000..1f9ba7c
--- /dev/null
+++ b/ext/hwloc/include/hwloc/glibc-sched.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2013 inria. All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and glibc scheduling routines.
+ *
+ * Applications that use both hwloc and glibc scheduling routines such as
+ * sched_getaffinity() or pthread_attr_setaffinity_np() may want to include
+ * this file so as to ease conversion between their respective types.
+ */
+
+#ifndef HWLOC_GLIBC_SCHED_H
+#define HWLOC_GLIBC_SCHED_H
+
+#include <hwloc.h>
+#include <hwloc/helper.h>
+#include <assert.h>
+
+#if !defined _GNU_SOURCE || !defined _SCHED_H || (!defined CPU_SETSIZE && !defined sched_priority)
+#error Please make sure to include sched.h before including glibc-sched.h, and define _GNU_SOURCE before any inclusion of sched.h
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifdef HWLOC_HAVE_CPU_SET
+
+
+/** \defgroup hwlocality_glibc_sched Interoperability with glibc sched affinity
+ *
+ * This interface offers ways to convert between hwloc cpusets and glibc cpusets
+ * such as those manipulated by sched_getaffinity() or pthread_attr_setaffinity_np().
+ *
+ * \note Topology \p topology must match the current machine.
+ *
+ * @{
+ */
+
+
+/** \brief Convert hwloc CPU set \p toposet into glibc sched affinity CPU set \p schedset
+ *
+ * This function may be used before calling sched_setaffinity or any other function
+ * that takes a cpu_set_t as input parameter.
+ *
+ * \p schedsetsize should be sizeof(cpu_set_t) unless \p schedset was dynamically allocated with CPU_ALLOC
+ */
+static __hwloc_inline int
+hwloc_cpuset_to_glibc_sched_affinity(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t hwlocset,
+ cpu_set_t *schedset, size_t schedsetsize)
+{
+#ifdef CPU_ZERO_S
+ unsigned cpu;
+ CPU_ZERO_S(schedsetsize, schedset);
+ hwloc_bitmap_foreach_begin(cpu, hwlocset)
+ CPU_SET_S(cpu, schedsetsize, schedset);
+ hwloc_bitmap_foreach_end();
+#else /* !CPU_ZERO_S */
+ unsigned cpu;
+ CPU_ZERO(schedset);
+ assert(schedsetsize == sizeof(cpu_set_t));
+ hwloc_bitmap_foreach_begin(cpu, hwlocset)
+ CPU_SET(cpu, schedset);
+ hwloc_bitmap_foreach_end();
+#endif /* !CPU_ZERO_S */
+ return 0;
+}
+
+/** \brief Convert glibc sched affinity CPU set \p schedset into hwloc CPU set \p hwlocset
+ *
+ * This function may be used after calling sched_getaffinity() or any other function
+ * that fills a cpu_set_t output parameter.
+ *
+ * \p schedsetsize should be sizeof(cpu_set_t) unless \p schedset was dynamically allocated with CPU_ALLOC
+ */
+static __hwloc_inline int
+hwloc_cpuset_from_glibc_sched_affinity(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_cpuset_t hwlocset,
+ const cpu_set_t *schedset, size_t schedsetsize)
+{
+ int cpu;
+#ifdef CPU_ZERO_S
+ int count;
+#endif
+ hwloc_bitmap_zero(hwlocset);
+#ifdef CPU_ZERO_S
+ count = CPU_COUNT_S(schedsetsize, schedset);
+ cpu = 0;
+ while (count) {
+ if (CPU_ISSET_S(cpu, schedsetsize, schedset)) {
+ hwloc_bitmap_set(hwlocset, cpu);
+ count--;
+ }
+ cpu++;
+ }
+#else /* !CPU_ZERO_S */
+ /* sched.h does not support dynamic cpu_set_t (introduced in glibc 2.7),
+ * assume we have a very old interface without CPU_COUNT (added in 2.6)
+ */
+ assert(schedsetsize == sizeof(cpu_set_t));
+ for(cpu=0; cpu<CPU_SETSIZE; cpu++)
+ if (CPU_ISSET(cpu, schedset))
+ hwloc_bitmap_set(hwlocset, cpu);
+#endif /* !CPU_ZERO_S */
+ return 0;
+}
+
+/** @} */
+
+
+#endif /* HWLOC_HAVE_CPU_SET */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_GLIBC_SCHED_H */
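
To illustrate the conversion helpers, a sketch that pins the calling process to the PUs of the machine's first core; it assumes glibc with CPU_SET support, and _GNU_SOURCE is defined before sched.h as the #error check above requires:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <hwloc.h>
    #include <hwloc/glibc-sched.h>
    #include <stdio.h>

    int main(void)
    {
        hwloc_topology_t topology;
        hwloc_obj_t core;
        cpu_set_t schedset;

        hwloc_topology_init(&topology);
        hwloc_topology_load(topology);

        core = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, 0);
        if (core) {
            /* convert the core's hwloc cpuset into a glibc cpu_set_t */
            hwloc_cpuset_to_glibc_sched_affinity(topology, core->cpuset,
                                                 &schedset, sizeof(schedset));
            if (sched_setaffinity(0, sizeof(schedset), &schedset))
                perror("sched_setaffinity");
        }

        hwloc_topology_destroy(topology);
        return 0;
    }
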
diff --git a/ext/hwloc/include/hwloc/helper.h b/ext/hwloc/include/hwloc/helper.h
new file mode 100644
index 0000000..883b87d
--- /dev/null
+++ b/ext/hwloc/include/hwloc/helper.h
@@ -0,0 +1,1249 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria. All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2010 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief High-level hwloc traversal helpers.
+ */
+
+#ifndef HWLOC_HELPER_H
+#define HWLOC_HELPER_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+#include <stdlib.h>
+#include <errno.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_helper_find_inside Finding Objects inside a CPU set
+ * @{
+ */
+
+/** \brief Get the first largest object included in the given cpuset \p set.
+ *
+ * \return the first object that is included in \p set and whose parent is not.
+ *
+ * This is convenient for iterating over all largest objects within a CPU set
+ * by doing a loop getting the first largest object and clearing its CPU set
+ * from the remaining CPU set.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_first_largest_obj_inside_cpuset(hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+ hwloc_obj_t obj = hwloc_get_root_obj(topology);
+ if (!hwloc_bitmap_intersects(obj->cpuset, set))
+ return NULL;
+ while (!hwloc_bitmap_isincluded(obj->cpuset, set)) {
+ /* while the object intersects without being included, look at its children */
+ hwloc_obj_t child = obj->first_child;
+ while (child) {
+ if (hwloc_bitmap_intersects(child->cpuset, set))
+ break;
+ child = child->next_sibling;
+ }
+ if (!child)
+ /* no child intersects, return their parent */
+ return obj;
+ /* found one intersecting child, look at its children */
+ obj = child;
+ }
+ /* obj is included, return it */
+ return obj;
+}
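
A sketch of the loop described in the comment above (illustrative only; assumes set is a valid cpuset of the topology): iterate over the largest objects inside a cpuset by clearing each one from a working copy.

    #include <stdio.h>
    #include <hwloc.h>

    /* Print the largest objects covering the given cpuset. */
    static void print_largest_objs(hwloc_topology_t topology, hwloc_const_cpuset_t set)
    {
        hwloc_bitmap_t remaining = hwloc_bitmap_dup(set);
        hwloc_obj_t obj;
        while ((obj = hwloc_get_first_largest_obj_inside_cpuset(topology, remaining)) != NULL) {
            printf("%s L#%u\n", hwloc_obj_type_string(obj->type), obj->logical_index);
            /* remove the part we just covered and continue with the rest */
            hwloc_bitmap_andnot(remaining, remaining, obj->cpuset);
        }
        hwloc_bitmap_free(remaining);
    }
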
+
+/** \brief Get the set of largest objects covering exactly a given cpuset \p set
+ *
+ * \return the number of objects returned in \p objs.
+ */
+HWLOC_DECLSPEC int hwloc_get_largest_objs_inside_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+ hwloc_obj_t * __hwloc_restrict objs, int max);
+
+/** \brief Return the next object at depth \p depth included in CPU set \p set.
+ *
+ * If \p prev is \c NULL, return the first object at depth \p depth
+ * included in \p set. The next invocation should pass the previous
+ * return value in \p prev so as to obtain the next object in \p set.
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+ unsigned depth, hwloc_obj_t prev)
+{
+ hwloc_obj_t next = hwloc_get_next_obj_by_depth(topology, depth, prev);
+ if (!next)
+ return NULL;
+ while (next && !hwloc_bitmap_isincluded(next->cpuset, set))
+ next = next->next_cousin;
+ return next;
+}
+
+/** \brief Return the next object of type \p type included in CPU set \p set.
+ *
+ * If there are no or multiple depths for the given type, return \c NULL
+ * and let the caller fallback to
+ * hwloc_get_next_obj_inside_cpuset_by_depth().
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+ hwloc_obj_type_t type, hwloc_obj_t prev)
+{
+ int depth = hwloc_get_type_depth(topology, type);
+ if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+ return NULL;
+ return hwloc_get_next_obj_inside_cpuset_by_depth(topology, set, depth, prev);
+}
+
+/** \brief Return the (logically) \p idx -th object at depth \p depth included in CPU set \p set.
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+ unsigned depth, unsigned idx) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+ unsigned depth, unsigned idx)
+{
+ hwloc_obj_t obj = hwloc_get_obj_by_depth (topology, depth, 0);
+ unsigned count = 0;
+ if (!obj)
+ return NULL;
+ while (obj) {
+ if (hwloc_bitmap_isincluded(obj->cpuset, set)) {
+ if (count == idx)
+ return obj;
+ count++;
+ }
+ obj = obj->next_cousin;
+ }
+ return NULL;
+}
+
+/** \brief Return the \p idx -th object of type \p type included in CPU set \p set.
+ *
+ * If there are no or multiple depths for the given type, return \c NULL
+ * and let the caller fallback to
+ * hwloc_get_obj_inside_cpuset_by_depth().
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+ hwloc_obj_type_t type, unsigned idx) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+ hwloc_obj_type_t type, unsigned idx)
+{
+ int depth = hwloc_get_type_depth(topology, type);
+ if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+ return NULL;
+ return hwloc_get_obj_inside_cpuset_by_depth(topology, set, depth, idx);
+}
+
+/** \brief Return the number of objects at depth \p depth included in CPU set \p set.
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline unsigned
+hwloc_get_nbobjs_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+ unsigned depth) __hwloc_attribute_pure;
+static __hwloc_inline unsigned
+hwloc_get_nbobjs_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+ unsigned depth)
+{
+ hwloc_obj_t obj = hwloc_get_obj_by_depth (topology, depth, 0);
+ unsigned count = 0;
+ if (!obj)
+ return 0;
+ while (obj) {
+ if (hwloc_bitmap_isincluded(obj->cpuset, set))
+ count++;
+ obj = obj->next_cousin;
+ }
+ return count;
+}
+
+/** \brief Return the number of objects of type \p type included in CPU set \p set.
+ *
+ * If no object for that type exists inside CPU set \p set, 0 is
+ * returned. If there are several levels with objects of that type
+ * inside CPU set \p set, -1 is returned.
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline int
+hwloc_get_nbobjs_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+ hwloc_obj_type_t type) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_get_nbobjs_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+ hwloc_obj_type_t type)
+{
+ int depth = hwloc_get_type_depth(topology, type);
+ if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+ return 0;
+ if (depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+ return -1; /* FIXME: aggregate nbobjs from different levels? */
+ return hwloc_get_nbobjs_inside_cpuset_by_depth(topology, set, depth);
+}
+
+/** \brief Return the logical index among the objects included in CPU set \p set.
+ *
+ * Consult all objects in the same level as \p obj and inside CPU set \p set
+ * in the logical order, and return the index of \p obj within them.
+ * If \p set covers the entire topology, this is the logical index of \p obj.
+ * Otherwise, this is similar to a logical index within the part of the topology
+ * defined by CPU set \p set.
+ *
+ * \note This function cannot work if obj does not have CPU sets (I/O objects).
+ */
+static __hwloc_inline int
+hwloc_get_obj_index_inside_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+ hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_get_obj_index_inside_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+ hwloc_obj_t obj)
+{
+ int idx = 0;
+ if (!hwloc_bitmap_isincluded(obj->cpuset, set))
+ return -1;
+ /* count how many objects are inside the cpuset on the way from us to the beginning of the level */
+ while ((obj = obj->prev_cousin) != NULL)
+ if (hwloc_bitmap_isincluded(obj->cpuset, set))
+ idx++;
+ return idx;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_covering Finding Objects covering at least CPU set
+ * @{
+ */
+
+/** \brief Get the child covering at least CPU set \p set.
+ *
+ * \return \c NULL if no child matches or if \p set is empty.
+ *
+ * \note This function cannot work if parent does not have a CPU set (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_child_covering_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+ hwloc_obj_t parent) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_child_covering_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+ hwloc_obj_t parent)
+{
+ hwloc_obj_t child;
+ if (hwloc_bitmap_iszero(set))
+ return NULL;
+ child = parent->first_child;
+ while (child) {
+ if (child->cpuset && hwloc_bitmap_isincluded(set, child->cpuset))
+ return child;
+ child = child->next_sibling;
+ }
+ return NULL;
+}
+
+/** \brief Get the lowest object covering at least CPU set \p set
+ *
+ * \return \c NULL if no object matches or if \p set is empty.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+ struct hwloc_obj *current = hwloc_get_root_obj(topology);
+ if (hwloc_bitmap_iszero(set) || !hwloc_bitmap_isincluded(set, current->cpuset))
+ return NULL;
+ while (1) {
+ hwloc_obj_t child = hwloc_get_child_covering_cpuset(topology, set, current);
+ if (!child)
+ return current;
+ current = child;
+ }
+}
+
+/** \brief Iterate through same-depth objects covering at least CPU set \p set
+ *
+ * If object \p prev is \c NULL, return the first object at depth \p
+ * depth covering at least part of CPU set \p set. The next
+ * invokation should pass the previous return value in \p prev so as
+ * to obtain the next object covering at least another part of \p set.
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_covering_cpuset_by_depth(hwloc_topology_t topology, hwloc_const_cpuset_t set,
+ unsigned depth, hwloc_obj_t prev)
+{
+ hwloc_obj_t next = hwloc_get_next_obj_by_depth(topology, depth, prev);
+ if (!next)
+ return NULL;
+ while (next && !hwloc_bitmap_intersects(set, next->cpuset))
+ next = next->next_cousin;
+ return next;
+}
+
+/** \brief Iterate through same-type objects covering at least CPU set \p set
+ *
+ * If object \p prev is \c NULL, return the first object of type \p
+ * type covering at least part of CPU set \p set. The next invocation
+ * should pass the previous return value in \p prev so as to obtain
+ * the next object of type \p type covering at least another part of
+ * \p set.
+ *
+ * If there are no or multiple depths for type \p type, \c NULL is returned.
+ * The caller may fallback to hwloc_get_next_obj_covering_cpuset_by_depth()
+ * for each depth.
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_covering_cpuset_by_type(hwloc_topology_t topology, hwloc_const_cpuset_t set,
+ hwloc_obj_type_t type, hwloc_obj_t prev)
+{
+ int depth = hwloc_get_type_depth(topology, type);
+ if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+ return NULL;
+ return hwloc_get_next_obj_covering_cpuset_by_depth(topology, set, depth, prev);
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_ancestors Looking at Ancestor and Child Objects
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Returns the ancestor object of \p obj at depth \p depth. */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unused, unsigned depth, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unused, unsigned depth, hwloc_obj_t obj)
+{
+ hwloc_obj_t ancestor = obj;
+ if (obj->depth < depth)
+ return NULL;
+ while (ancestor && ancestor->depth > depth)
+ ancestor = ancestor->parent;
+ return ancestor;
+}
+
+/** \brief Returns the ancestor object of \p obj with type \p type. */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_type (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_type_t type, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_type (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_type_t type, hwloc_obj_t obj)
+{
+ hwloc_obj_t ancestor = obj->parent;
+ while (ancestor && ancestor->type != type)
+ ancestor = ancestor->parent;
+ return ancestor;
+}
+
+/** \brief Returns the common ancestor object of objects \p obj1 and \p obj2 */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_common_ancestor_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj1, hwloc_obj_t obj2) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_common_ancestor_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+ /* the loop isn't so easy since intermediate ancestors may have
+ * different depth, causing us to alternate between using obj1->parent
+ * and obj2->parent. Also, even if at some point we find ancestors of
+ * the same depth, their ancestors may have different depths again.
+ */
+ while (obj1 != obj2) {
+ while (obj1->depth > obj2->depth)
+ obj1 = obj1->parent;
+ while (obj2->depth > obj1->depth)
+ obj2 = obj2->parent;
+ if (obj1 != obj2 && obj1->depth == obj2->depth) {
+ obj1 = obj1->parent;
+ obj2 = obj2->parent;
+ }
+ }
+ return obj1;
+}
+
+/** \brief Returns true if \p obj is inside the subtree beginning with ancestor object \p subtree_root.
+ *
+ * \note This function cannot work if \p obj and \p subtree_root objects do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline int
+hwloc_obj_is_in_subtree (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj, hwloc_obj_t subtree_root) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_obj_is_in_subtree (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj, hwloc_obj_t subtree_root)
+{
+ return obj->cpuset && subtree_root->cpuset && hwloc_bitmap_isincluded(obj->cpuset, subtree_root->cpuset);
+}
+
+/** \brief Return the next child.
+ *
+ * Return the next child among the normal children list, then among the I/O
+ * children list, then among the Misc children list.
+ *
+ * If \p prev is \c NULL, return the first child.
+ *
+ * Return \c NULL when there is no next child.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_child (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t parent, hwloc_obj_t prev)
+{
+ hwloc_obj_t obj;
+ int state = 0;
+ if (prev) {
+ if (prev->type == HWLOC_OBJ_MISC)
+ state = 2;
+ else if (prev->type == HWLOC_OBJ_BRIDGE || prev->type == HWLOC_OBJ_PCI_DEVICE || prev->type == HWLOC_OBJ_OS_DEVICE)
+ state = 1;
+ obj = prev->next_sibling;
+ } else {
+ obj = parent->first_child;
+ }
+ if (!obj && state == 0) {
+ obj = parent->io_first_child;
+ state = 1;
+ }
+ if (!obj && state == 1) {
+ obj = parent->misc_first_child;
+ state = 2;
+ }
+ return obj;
+}
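
A short sketch (illustration, not upstream code) of the intended iteration pattern over all children, including I/O and Misc ones:

    #include <stdio.h>
    #include <hwloc.h>

    /* List every child of an object: normal, then I/O, then Misc children. */
    static void list_children(hwloc_topology_t topology, hwloc_obj_t parent)
    {
        hwloc_obj_t child = NULL;
        while ((child = hwloc_get_next_child(topology, parent, child)) != NULL)
            printf("child of type %s\n", hwloc_obj_type_string(child->type));
    }
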
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_cache Looking at Cache Objects
+ * @{
+ */
+
+/** \brief Find the depth of cache objects matching cache depth and type.
+ *
+ * Return the depth of the topology level that contains cache objects
+ * whose attributes match \p cachedepth and \p cachetype. This function
+ * intends to disambiguate the case where hwloc_get_type_depth() returns
+ * \p HWLOC_TYPE_DEPTH_MULTIPLE.
+ *
+ * If no cache level matches, \p HWLOC_TYPE_DEPTH_UNKNOWN is returned.
+ *
+ * If \p cachetype is \p HWLOC_OBJ_CACHE_UNIFIED, the depth of the
+ * unique matching unified cache level is returned.
+ *
+ * If \p cachetype is \p HWLOC_OBJ_CACHE_DATA or \p HWLOC_OBJ_CACHE_INSTRUCTION,
+ * either a matching cache, or a unified cache is returned.
+ *
+ * If \p cachetype is \c -1, it is ignored and multiple levels may
+ * match. The function returns either the depth of a uniquely matching
+ * level or \p HWLOC_TYPE_DEPTH_MULTIPLE.
+ */
+static __hwloc_inline int
+hwloc_get_cache_type_depth (hwloc_topology_t topology,
+ unsigned cachelevel, hwloc_obj_cache_type_t cachetype)
+{
+ int depth;
+ int found = HWLOC_TYPE_DEPTH_UNKNOWN;
+ for (depth=0; ; depth++) {
+ hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, 0);
+ if (!obj)
+ break;
+ if (obj->type != HWLOC_OBJ_CACHE || obj->attr->cache.depth != cachelevel)
+ /* doesn't match, try next depth */
+ continue;
+ if (cachetype == (hwloc_obj_cache_type_t) -1) {
+ if (found != HWLOC_TYPE_DEPTH_UNKNOWN) {
+ /* second match, return MULTIPLE */
+ return HWLOC_TYPE_DEPTH_MULTIPLE;
+ }
+ /* first match, mark it as found */
+ found = depth;
+ continue;
+ }
+ if (obj->attr->cache.type == cachetype || obj->attr->cache.type == HWLOC_OBJ_CACHE_UNIFIED)
+ /* exact match (either unified is alone, or we match instruction or data), return immediately */
+ return depth;
+ }
+ /* went to the bottom, return what we found */
+ return found;
+}
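
For illustration (assuming a loaded topology), this is how the helper can disambiguate split L1 caches that make hwloc_get_type_depth() return HWLOC_TYPE_DEPTH_MULTIPLE:

    #include <stdio.h>
    #include <hwloc.h>

    /* Report how many L1 data caches the machine has. */
    static void report_l1d(hwloc_topology_t topology)
    {
        int depth = hwloc_get_cache_type_depth(topology, 1, HWLOC_OBJ_CACHE_DATA);
        if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
            printf("no L1 data cache reported\n");
        else
            printf("%u L1d caches\n", hwloc_get_nbobjs_by_depth(topology, (unsigned) depth));
    }
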
+
+/** \brief Get the first cache covering a cpuset \p set
+ *
+ * \return \c NULL if no cache matches.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_cache_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_cache_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+ hwloc_obj_t current = hwloc_get_obj_covering_cpuset(topology, set);
+ while (current) {
+ if (current->type == HWLOC_OBJ_CACHE)
+ return current;
+ current = current->parent;
+ }
+ return NULL;
+}
+
+/** \brief Get the first cache shared between an object and somebody else.
+ *
+ * \return \c NULL if no cache matches or if an invalid object is given.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_shared_cache_covering_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_shared_cache_covering_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj)
+{
+ hwloc_obj_t current = obj->parent;
+ if (!obj->cpuset)
+ return NULL;
+ while (current) {
+ if (!hwloc_bitmap_isequal(current->cpuset, obj->cpuset)
+ && current->type == HWLOC_OBJ_CACHE)
+ return current;
+ current = current->parent;
+ }
+ return NULL;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_misc Finding objects, miscellaneous helpers
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Returns the object of type ::HWLOC_OBJ_PU with \p os_index.
+ *
+ * This function is useful for converting a CPU set into the PU
+ * objects it contains.
+ * When retrieving the current binding (e.g. with hwloc_get_cpubind()),
+ * one may iterate over the bits of the resulting CPU set with
+ * hwloc_bitmap_foreach_begin(), and find the corresponding PUs
+ * with this function.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pu_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pu_obj_by_os_index(hwloc_topology_t topology, unsigned os_index)
+{
+ hwloc_obj_t obj = NULL;
+ while ((obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PU, obj)) != NULL)
+ if (obj->os_index == os_index)
+ return obj;
+ return NULL;
+}
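
A sketch of the pattern described in the comment above (assumes the current binding only contains PUs known to the topology): retrieve the process binding and print each PU.

    #include <assert.h>
    #include <stdio.h>
    #include <hwloc.h>

    /* Print the physical and logical index of every PU in the process binding. */
    static void print_binding(hwloc_topology_t topology)
    {
        hwloc_bitmap_t set = hwloc_bitmap_alloc();
        unsigned i;
        if (!hwloc_get_cpubind(topology, set, HWLOC_CPUBIND_PROCESS)) {
            hwloc_bitmap_foreach_begin(i, set) {
                hwloc_obj_t pu = hwloc_get_pu_obj_by_os_index(topology, i);
                if (pu)
                    printf("PU P#%u is L#%u\n", i, pu->logical_index);
            } hwloc_bitmap_foreach_end();
        }
        hwloc_bitmap_free(set);
    }
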
+
+/** \brief Returns the object of type ::HWLOC_OBJ_NUMANODE with \p os_index.
+ *
+ * This function is useful for converting a nodeset into the NUMA node
+ * objects it contains.
+ * When retrieving the current binding (e.g. with hwloc_get_membind_nodeset()),
+ * one may iterate over the bits of the resulting nodeset with
+ * hwloc_bitmap_foreach_begin(), and find the corresponding NUMA nodes
+ * with this function.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_numanode_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_numanode_obj_by_os_index(hwloc_topology_t topology, unsigned os_index)
+{
+ hwloc_obj_t obj = NULL;
+ while ((obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, obj)) != NULL)
+ if (obj->os_index == os_index)
+ return obj;
+ return NULL;
+}
+
+/** \brief Do a depth-first traversal of the topology to find and sort
+ * all objects that are at the same depth as \p src.
+ *
+ * Report in \p objs up to \p max of the physically closest ones to \p src.
+ *
+ * \return the number of objects returned in \p objs.
+ *
+ * \return 0 if \p src is an I/O object.
+ *
+ * \note This function requires the \p src object to have a CPU set.
+ */
+/* TODO: rather provide an iterator? Provide a way to know how much should be allocated? By returning the total number of objects instead? */
+HWLOC_DECLSPEC unsigned hwloc_get_closest_objs (hwloc_topology_t topology, hwloc_obj_t src, hwloc_obj_t * __hwloc_restrict objs, unsigned max);
+
+/** \brief Find an object below another object, both specified by types and indexes.
+ *
+ * Start from the top system object and find object of type \p type1
+ * and logical index \p idx1. Then look below this object and find another
+ * object of type \p type2 and logical index \p idx2. Indexes are specified
+ * within the parent, not within the entire system.
+ *
+ * For instance, if type1 is PACKAGE, idx1 is 2, type2 is CORE and idx2
+ * is 3, return the fourth core object below the third package.
+ *
+ * \note This function requires these objects to have a CPU set.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_by_type (hwloc_topology_t topology,
+ hwloc_obj_type_t type1, unsigned idx1,
+ hwloc_obj_type_t type2, unsigned idx2) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_by_type (hwloc_topology_t topology,
+ hwloc_obj_type_t type1, unsigned idx1,
+ hwloc_obj_type_t type2, unsigned idx2)
+{
+ hwloc_obj_t obj;
+ obj = hwloc_get_obj_by_type (topology, type1, idx1);
+ if (!obj)
+ return NULL;
+ return hwloc_get_obj_inside_cpuset_by_type(topology, obj->cpuset, type2, idx2);
+}
+
+/** \brief Find an object below a chain of objects specified by types and indexes.
+ *
+ * This is a generalized version of hwloc_get_obj_below_by_type().
+ *
+ * Arrays \p typev and \p idxv must contain \p nr types and indexes.
+ *
+ * Start from the top system object and walk the arrays \p typev and \p idxv.
+ * For each type and logical index couple in the arrays, look under the previously found
+ * object to find the index-th object of the given type.
+ * Indexes are specified within the parent, not within the entire system.
+ *
+ * For instance, if nr is 3, typev contains NODE, PACKAGE and CORE,
+ * and idxv contains 0, 1 and 2, return the third core object below
+ * the second package below the first NUMA node.
+ *
+ * \note This function requires all these objects and the root object
+ * to have a CPU set.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_array_by_type (hwloc_topology_t topology, int nr, hwloc_obj_type_t *typev, unsigned *idxv) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_array_by_type (hwloc_topology_t topology, int nr, hwloc_obj_type_t *typev, unsigned *idxv)
+{
+ hwloc_obj_t obj = hwloc_get_root_obj(topology);
+ int i;
+ for(i=0; i<nr; i++) {
+ if (!obj)
+ return NULL;
+ obj = hwloc_get_obj_inside_cpuset_by_type(topology, obj->cpuset, typev[i], idxv[i]);
+ }
+ return obj;
+}
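
A sketch of the example given in the comment above (illustrative; assumes such objects exist in the topology):

    #include <hwloc.h>

    /* Third core below the second package below the first NUMA node. */
    static hwloc_obj_t example_lookup(hwloc_topology_t topology)
    {
        hwloc_obj_type_t typev[3] = { HWLOC_OBJ_NUMANODE, HWLOC_OBJ_PACKAGE, HWLOC_OBJ_CORE };
        unsigned idxv[3] = { 0, 1, 2 };
        return hwloc_get_obj_below_array_by_type(topology, 3, typev, idxv);
    }
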
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_distribute Distributing items over a topology
+ * @{
+ */
+
+/** \brief Flags to be given to hwloc_distrib().
+ */
+enum hwloc_distrib_flags_e {
+ /** \brief Distribute in reverse order, starting from the last objects.
+ * \hideinitializer
+ */
+ HWLOC_DISTRIB_FLAG_REVERSE = (1UL<<0)
+};
+
+/** \brief Distribute \p n items over the topology under \p roots
+ *
+ * Array \p set will be filled with \p n cpusets recursively distributed
+ * linearly over the topology under objects \p roots, down to depth \p until
+ * (which can be INT_MAX to distribute down to the finest level).
+ *
+ * \p n_roots is usually 1 and \p roots only contains the topology root object
+ * so as to distribute over the entire topology.
+ *
+ * This is typically useful when an application wants to distribute \p n
+ * threads over a machine, giving each of them as much private cache as
+ * possible and keeping them locally in number order.
+ *
+ * The caller may typically want to also call hwloc_bitmap_singlify()
+ * before binding a thread so that it does not move at all.
+ *
+ * \p flags should be 0 or an OR'ed set of ::hwloc_distrib_flags_e.
+ *
+ * \note This function requires the \p roots objects to have a CPU set.
+ *
+ * \note This function replaces the now deprecated hwloc_distribute()
+ * and hwloc_distributev() functions.
+ */
+static __hwloc_inline int
+hwloc_distrib(hwloc_topology_t topology,
+ hwloc_obj_t *roots, unsigned n_roots,
+ hwloc_cpuset_t *set,
+ unsigned n,
+ unsigned until, unsigned long flags)
+{
+ unsigned i;
+ unsigned tot_weight;
+ unsigned given, givenweight;
+ hwloc_cpuset_t *cpusetp = set;
+
+ if (flags & ~HWLOC_DISTRIB_FLAG_REVERSE) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ tot_weight = 0;
+ for (i = 0; i < n_roots; i++)
+ tot_weight += hwloc_bitmap_weight(roots[i]->cpuset);
+
+ for (i = 0, given = 0, givenweight = 0; i < n_roots; i++) {
+ unsigned chunk, weight;
+ hwloc_obj_t root = roots[flags & HWLOC_DISTRIB_FLAG_REVERSE ? n_roots-1-i : i];
+ hwloc_cpuset_t cpuset = root->cpuset;
+ weight = hwloc_bitmap_weight(cpuset);
+ if (!weight)
+ continue;
+ /* Give to root a chunk proportional to its weight.
+ * If previous chunks got rounded-up, we may get a bit less. */
+ chunk = (( (givenweight+weight) * n + tot_weight-1) / tot_weight)
+ - (( givenweight * n + tot_weight-1) / tot_weight);
+ if (!root->arity || chunk <= 1 || root->depth >= until) {
+ /* We can't split any more, put everything there. */
+ if (chunk) {
+ /* Fill cpusets with ours */
+ unsigned j;
+ for (j=0; j < chunk; j++)
+ cpusetp[j] = hwloc_bitmap_dup(cpuset);
+ } else {
+ /* We got no chunk, just merge our cpuset to a previous one
+ * (the first chunk cannot be empty)
+ * so that this root doesn't get ignored.
+ */
+ assert(given);
+ hwloc_bitmap_or(cpusetp[-1], cpusetp[-1], cpuset);
+ }
+ } else {
+ /* Still more to distribute, recurse into children */
+ hwloc_distrib(topology, root->children, root->arity, cpusetp, chunk, until, flags);
+ }
+ cpusetp += chunk;
+ given += chunk;
+ givenweight += weight;
+ }
+
+ return 0;
+}
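
A sketch of the typical use case mentioned above (assumptions: sets is a caller-provided array of nthreads entries; each resulting cpuset is allocated by hwloc_distrib() and must later be freed with hwloc_bitmap_free()):

    #include <limits.h>
    #include <hwloc.h>

    /* Compute one cpuset per thread, spread over the whole machine,
     * then shrink each one to a single PU to avoid intra-set migration. */
    static void distribute_threads(hwloc_topology_t topology,
                                   hwloc_cpuset_t *sets, unsigned nthreads)
    {
        hwloc_obj_t root = hwloc_get_root_obj(topology);
        unsigned i;
        hwloc_distrib(topology, &root, 1, sets, nthreads, INT_MAX, 0);
        for (i = 0; i < nthreads; i++)
            hwloc_bitmap_singlify(sets[i]);
    }
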
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_topology_sets CPU and node sets of entire topologies
+ * @{
+ */
+/** \brief Get complete CPU set
+ *
+ * \return the complete CPU set of logical processors of the system.
+ *
+ * \note The returned cpuset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_complete_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_complete_cpuset(hwloc_topology_t topology)
+{
+ return hwloc_get_root_obj(topology)->complete_cpuset;
+}
+
+/** \brief Get topology CPU set
+ *
+ * \return the CPU set of logical processors of the system for which hwloc
+ * provides topology information. This is equivalent to the cpuset of the
+ * system object.
+ *
+ * \note The returned cpuset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_topology_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_topology_cpuset(hwloc_topology_t topology)
+{
+ return hwloc_get_root_obj(topology)->cpuset;
+}
+
+/** \brief Get allowed CPU set
+ *
+ * \return the CPU set of allowed logical processors of the system.
+ *
+ * \note The returned cpuset is not newly allocated and should thus not be
+ * changed or freed, hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_allowed_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_allowed_cpuset(hwloc_topology_t topology)
+{
+ return hwloc_get_root_obj(topology)->allowed_cpuset;
+}
+
+/** \brief Get complete node set
+ *
+ * \return the complete node set of memory of the system.
+ *
+ * \note The returned nodeset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_complete_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_complete_nodeset(hwloc_topology_t topology)
+{
+ return hwloc_get_root_obj(topology)->complete_nodeset;
+}
+
+/** \brief Get topology node set
+ *
+ * \return the node set of memory of the system for which hwloc
+ * provides topology information. This is equivalent to the nodeset of the
+ * system object.
+ *
+ * \note The returned nodeset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_topology_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_topology_nodeset(hwloc_topology_t topology)
+{
+ return hwloc_get_root_obj(topology)->nodeset;
+}
+
+/** \brief Get allowed node set
+ *
+ * \return the node set of allowed memory of the system.
+ *
+ * \note The returned nodeset is not newly allocated and should thus not be
+ * changed or freed, hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_allowed_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_allowed_nodeset(hwloc_topology_t topology)
+{
+ return hwloc_get_root_obj(topology)->allowed_nodeset;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_nodeset_convert Converting between CPU sets and node sets
+ *
+ * There are two semantics for converting cpusets to nodesets depending on how
+ * non-NUMA machines are handled.
+ *
+ * When manipulating nodesets for memory binding, non-NUMA machines should be
+ * considered as having a single NUMA node. The standard conversion routines
+ * below should be used so that marking the first bit of the nodeset means
+ * that memory should be bound to a non-NUMA whole machine.
+ *
+ * When manipulating nodesets as an actual list of NUMA nodes without any
+ * need to handle memory binding on non-NUMA machines, the strict conversion
+ * routines may be used instead.
+ * @{
+ */
+
+/** \brief Convert a CPU set into a NUMA node set and handle non-NUMA cases
+ *
+ * If some NUMA nodes have no CPUs at all, this function never sets their
+ * indexes in the output node set, even if a full CPU set is given in input.
+ *
+ * If the topology contains no NUMA nodes, the machine is considered
+ * as a single memory node, and the following behavior is used:
+ * If \p cpuset is empty, \p nodeset will be emptied as well.
+ * Otherwise \p nodeset will be entirely filled.
+ */
+static __hwloc_inline void
+hwloc_cpuset_to_nodeset(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset)
+{
+ int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+ hwloc_obj_t obj = NULL;
+ assert(depth != HWLOC_TYPE_DEPTH_UNKNOWN);
+ hwloc_bitmap_zero(nodeset);
+ while ((obj = hwloc_get_next_obj_covering_cpuset_by_depth(topology, _cpuset, depth, obj)) != NULL)
+ hwloc_bitmap_set(nodeset, obj->os_index);
+}
+
+/** \brief Convert a CPU set into a NUMA node set without handling non-NUMA cases
+ *
+ * This is the strict variant of ::hwloc_cpuset_to_nodeset. It does not fix
+ * non-NUMA cases. If the topology contains some NUMA nodes, behave exactly
+ * the same. However, if the topology contains no NUMA nodes, return an empty
+ * nodeset.
+ */
+static __hwloc_inline void
+hwloc_cpuset_to_nodeset_strict(struct hwloc_topology *topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset)
+{
+ int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+ hwloc_obj_t obj = NULL;
+ assert(depth != HWLOC_TYPE_DEPTH_UNKNOWN);
+ hwloc_bitmap_zero(nodeset);
+ while ((obj = hwloc_get_next_obj_covering_cpuset_by_depth(topology, _cpuset, depth, obj)) != NULL)
+ hwloc_bitmap_set(nodeset, obj->os_index);
+}
+
+/** \brief Convert a NUMA node set into a CPU set and handle non-NUMA cases
+ *
+ * If the topology contains no NUMA nodes, the machine is considered
+ * as a single memory node, and the following behavior is used:
+ * If \p nodeset is empty, \p cpuset will be emptied as well.
+ * Otherwise \p cpuset will be entirely filled.
+ * This is useful for manipulating memory binding sets.
+ */
+static __hwloc_inline void
+hwloc_cpuset_from_nodeset(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset)
+{
+ int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+ hwloc_obj_t obj = NULL;
+ assert(depth != HWLOC_TYPE_DEPTH_UNKNOWN);
+ hwloc_bitmap_zero(_cpuset);
+ while ((obj = hwloc_get_next_obj_by_depth(topology, depth, obj)) != NULL) {
+ if (hwloc_bitmap_isset(nodeset, obj->os_index))
+ /* no need to check obj->cpuset because objects in levels always have a cpuset */
+ hwloc_bitmap_or(_cpuset, _cpuset, obj->cpuset);
+ }
+}
+
+/** \brief Convert a NUMA node set into a CPU set without handling non-NUMA cases
+ *
+ * This is the strict variant of ::hwloc_cpuset_from_nodeset. It does not fix
+ * non-NUMA cases. If the topology contains some NUMA nodes, behave exactly
+ * the same. However, if the topology contains no NUMA nodes, return an empty
+ * cpuset.
+ */
+static __hwloc_inline void
+hwloc_cpuset_from_nodeset_strict(struct hwloc_topology *topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset)
+{
+ int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+ hwloc_obj_t obj = NULL;
+ assert(depth != HWLOC_TYPE_DEPTH_UNKNOWN);
+ hwloc_bitmap_zero(_cpuset);
+ while ((obj = hwloc_get_next_obj_by_depth(topology, depth, obj)) != NULL)
+ if (hwloc_bitmap_isset(nodeset, obj->os_index))
+ /* no need to check obj->cpuset because objects in levels always have a cpuset */
+ hwloc_bitmap_or(_cpuset, _cpuset, obj->cpuset);
+}
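
An illustrative sketch (assuming a loaded topology and that memory binding is supported on the system): convert the current CPU binding into a nodeset and bind future allocations near it.

    #include <hwloc.h>

    /* Bind future memory allocations close to where the process runs. */
    static int bind_memory_near_cpus(hwloc_topology_t topology)
    {
        hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
        hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
        int err = hwloc_get_cpubind(topology, cpuset, HWLOC_CPUBIND_PROCESS);
        if (!err) {
            hwloc_cpuset_to_nodeset(topology, cpuset, nodeset);
            err = hwloc_set_membind_nodeset(topology, nodeset, HWLOC_MEMBIND_BIND, 0);
        }
        hwloc_bitmap_free(nodeset);
        hwloc_bitmap_free(cpuset);
        return err;
    }
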
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_distances Manipulating Distances
+ * @{
+ */
+
+/** \brief Get the distances between all objects at the given depth.
+ *
+ * \return a distances structure containing a matrix with all distances
+ * between all objects at the given depth.
+ *
+ * Slot i+nbobjs*j contains the distance from the object of logical index i
+ * to the object of logical index j.
+ *
+ * \note This function only returns matrices covering the whole topology,
+ * without any unknown distance value. Those matrices are available in the
+ * top-level object of the hierarchy. Matrices of lower objects are not
+ * reported here since they cover only part of the machine.
+ *
+ * The returned structure belongs to the hwloc library. The caller should
+ * not modify or free it.
+ *
+ * \return \c NULL if no such distance matrix exists.
+ */
+
+static __hwloc_inline const struct hwloc_distances_s *
+hwloc_get_whole_distance_matrix_by_depth(hwloc_topology_t topology, unsigned depth)
+{
+ hwloc_obj_t root = hwloc_get_root_obj(topology);
+ unsigned i;
+ for(i=0; i<root->distances_count; i++)
+ if (root->distances[i]->relative_depth == depth)
+ return root->distances[i];
+ return NULL;
+}
+
+/** \brief Get the distances between all objects of a given type.
+ *
+ * \return a distances structure containing a matrix with all distances
+ * between all objects of the given type.
+ *
+ * Slot i+nbobjs*j contains the distance from the object of logical index i
+ * to the object of logical index j.
+ *
+ * \note This function only returns matrices covering the whole topology,
+ * without any unknown distance value. Those matrices are available in the
+ * top-level object of the hierarchy. Matrices of lower objects are not
+ * reported here since they cover only part of the machine.
+ *
+ * The returned structure belongs to the hwloc library. The caller should
+ * not modify or free it.
+ *
+ * \return \c NULL if no such distance matrix exists.
+ */
+
+static __hwloc_inline const struct hwloc_distances_s *
+hwloc_get_whole_distance_matrix_by_type(hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+ int depth = hwloc_get_type_depth(topology, type);
+ if (depth < 0)
+ return NULL;
+ return hwloc_get_whole_distance_matrix_by_depth(topology, depth);
+}
+
+/** \brief Get distances for the given depth and covering some objects
+ *
+ * Return a distance matrix that describes depth \p depth and covers at
+ * least object \p obj and all its children.
+ *
+ * When looking for the distance between some objects, a common ancestor should
+ * be passed in \p obj.
+ *
+ * \p firstp is set to logical index of the first object described by the matrix.
+ *
+ * The returned structure belongs to the hwloc library. The caller should
+ * not modify or free it.
+ */
+static __hwloc_inline const struct hwloc_distances_s *
+hwloc_get_distance_matrix_covering_obj_by_depth(hwloc_topology_t topology,
+ hwloc_obj_t obj, unsigned depth,
+ unsigned *firstp)
+{
+ if (!obj->cpuset)
+ return NULL;
+ while (obj) {
+ unsigned i;
+ for(i=0; i<obj->distances_count; i++)
+ if (obj->distances[i]->relative_depth == depth - obj->depth) {
+ if (!obj->distances[i]->nbobjs)
+ continue;
+ *firstp = hwloc_get_next_obj_inside_cpuset_by_depth(topology, obj->cpuset, depth, NULL)->logical_index;
+ return obj->distances[i];
+ }
+ obj = obj->parent;
+ }
+ return NULL;
+}
+
+/** \brief Get the latency in both directions between two objects.
+ *
+ * Look at ancestor objects from the bottom to the top until one of them
+ * contains a distance matrix that matches the objects exactly.
+ *
+ * \p latency gets the value from object \p obj1 to \p obj2, while
+ * \p reverse_latency gets the reverse-direction value, which
+ * may be different on some architectures.
+ *
+ * \return -1 if no ancestor contains a matching latency matrix.
+ */
+static __hwloc_inline int
+hwloc_get_latency(hwloc_topology_t topology,
+ hwloc_obj_t obj1, hwloc_obj_t obj2,
+ float *latency, float *reverse_latency)
+{
+ hwloc_obj_t ancestor;
+ const struct hwloc_distances_s * distances;
+ unsigned first_logical;
+
+ if (obj1->depth != obj2->depth) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ ancestor = hwloc_get_common_ancestor_obj(topology, obj1, obj2);
+ distances = hwloc_get_distance_matrix_covering_obj_by_depth(topology, ancestor, obj1->depth, &first_logical);
+ if (distances && distances->latency) {
+ const float * latency_matrix = distances->latency;
+ unsigned nbobjs = distances->nbobjs;
+ unsigned l1 = obj1->logical_index - first_logical;
+ unsigned l2 = obj2->logical_index - first_logical;
+ *latency = latency_matrix[l1*nbobjs+l2];
+ *reverse_latency = latency_matrix[l2*nbobjs+l1];
+ return 0;
+ }
+
+ errno = ENOSYS;
+ return -1;
+}
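
An illustrative sketch (it does nothing if the topology has fewer than two NUMA nodes or no matching latency matrix): query the latency between the first two NUMA nodes.

    #include <stdio.h>
    #include <hwloc.h>

    /* Print the reported latency between the first two NUMA nodes, if any. */
    static void print_node_latency(hwloc_topology_t topology)
    {
        hwloc_obj_t n0 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, 0);
        hwloc_obj_t n1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, 1);
        float lat, rlat;
        if (n0 && n1 && !hwloc_get_latency(topology, n0, n1, &lat, &rlat))
            printf("latency %f (reverse %f)\n", lat, rlat);
    }
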
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_advanced_io Finding I/O objects
+ * @{
+ */
+
+/** \brief Get the first non-I/O ancestor object.
+ *
+ * Given the I/O object \p ioobj, find the smallest non-I/O ancestor
+ * object. This regular object may then be used for binding because
+ * its locality is the same as \p ioobj.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_non_io_ancestor_obj(hwloc_topology_t topology __hwloc_attribute_unused,
+ hwloc_obj_t ioobj)
+{
+ hwloc_obj_t obj = ioobj;
+ while (obj && !obj->cpuset) {
+ obj = obj->parent;
+ }
+ return obj;
+}
+
+/** \brief Get the next PCI device in the system.
+ *
+ * \return the first PCI device if \p prev is \c NULL.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_pcidev(hwloc_topology_t topology, hwloc_obj_t prev)
+{
+ return hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PCI_DEVICE, prev);
+}
+
+/** \brief Find the PCI device object matching the PCI bus id
+ * given by its domain, bus, device and function numbers.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pcidev_by_busid(hwloc_topology_t topology,
+ unsigned domain, unsigned bus, unsigned dev, unsigned func)
+{
+ hwloc_obj_t obj = NULL;
+ while ((obj = hwloc_get_next_pcidev(topology, obj)) != NULL) {
+ if (obj->attr->pcidev.domain == domain
+ && obj->attr->pcidev.bus == bus
+ && obj->attr->pcidev.dev == dev
+ && obj->attr->pcidev.func == func)
+ return obj;
+ }
+ return NULL;
+}
+
+/** \brief Find the PCI device object matching the PCI bus id
+ * given as a string xxxx:yy:zz.t or yy:zz.t.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pcidev_by_busidstring(hwloc_topology_t topology, const char *busid)
+{
+ unsigned domain = 0; /* default */
+ unsigned bus, dev, func;
+
+ if (sscanf(busid, "%x:%x.%x", &bus, &dev, &func) != 3
+ && sscanf(busid, "%x:%x:%x.%x", &domain, &bus, &dev, &func) != 4) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ return hwloc_get_pcidev_by_busid(topology, domain, bus, dev, func);
+}
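
An illustrative sketch (the bus id "0000:02:00.0" in the usage comment is hypothetical; I/O device detection must be enabled in the topology): look up a PCI device by bus id string and report the cpuset it is close to.

    #include <stdio.h>
    #include <stdlib.h>
    #include <hwloc.h>

    /* Print the locality (cpuset) of a PCI device given by its bus id string. */
    static void print_pcidev_locality(hwloc_topology_t topology, const char *busid)
    {
        hwloc_obj_t pcidev = hwloc_get_pcidev_by_busidstring(topology, busid);
        if (pcidev) {
            hwloc_obj_t ancestor = hwloc_get_non_io_ancestor_obj(topology, pcidev);
            char *s;
            hwloc_bitmap_asprintf(&s, ancestor->cpuset);
            printf("%s is close to cpuset %s\n", busid, s);
            free(s);
        }
    }
    /* e.g. print_pcidev_locality(topology, "0000:02:00.0"); */
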
+
+/** \brief Get the next OS device in the system.
+ *
+ * \return the first OS device if \p prev is \c NULL.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_osdev(hwloc_topology_t topology, hwloc_obj_t prev)
+{
+ return hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_OS_DEVICE, prev);
+}
+
+/** \brief Get the next bridge in the system.
+ *
+ * \return the first bridge if \p prev is \c NULL.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_bridge(hwloc_topology_t topology, hwloc_obj_t prev)
+{
+ return hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_BRIDGE, prev);
+}
+
+/* \brief Checks whether a given bridge covers a given PCI bus.
+ */
+static __hwloc_inline int
+hwloc_bridge_covers_pcibus(hwloc_obj_t bridge,
+ unsigned domain, unsigned bus)
+{
+ return bridge->type == HWLOC_OBJ_BRIDGE
+ && bridge->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI
+ && bridge->attr->bridge.downstream.pci.domain == domain
+ && bridge->attr->bridge.downstream.pci.secondary_bus <= bus
+ && bridge->attr->bridge.downstream.pci.subordinate_bus >= bus;
+}
+
+/** \brief Find the hostbridge that covers the given PCI bus.
+ *
+ * This is useful for finding the locality of a bus because
+ * it is given by the cpuset of the hostbridge's parent.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_hostbridge_by_pcibus(hwloc_topology_t topology,
+ unsigned domain, unsigned bus)
+{
+ hwloc_obj_t obj = NULL;
+ while ((obj = hwloc_get_next_bridge(topology, obj)) != NULL) {
+ if (hwloc_bridge_covers_pcibus(obj, domain, bus)) {
+ /* found bridge covering this pcibus, make sure it's a hostbridge */
+ assert(obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST);
+ assert(obj->parent->type != HWLOC_OBJ_BRIDGE);
+ assert(obj->parent->cpuset);
+ return obj;
+ }
+ }
+ return NULL;
+}
+
+/** @} */
+
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_HELPER_H */
diff --git a/ext/hwloc/include/hwloc/inlines.h b/ext/hwloc/include/hwloc/inlines.h
new file mode 100644
index 0000000..7281750
--- /dev/null
+++ b/ext/hwloc/include/hwloc/inlines.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2013 Inria. All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2010 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/**
+ * This file contains the inline code of functions declared in hwloc.h
+ */
+
+#ifndef HWLOC_INLINES_H
+#define HWLOC_INLINES_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+#include <stdlib.h>
+#include <errno.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static __hwloc_inline int
+hwloc_get_type_or_below_depth (hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+ int depth = hwloc_get_type_depth(topology, type);
+
+ if (depth != HWLOC_TYPE_DEPTH_UNKNOWN)
+ return depth;
+
+ /* find the highest existing level with type order >= */
+ for(depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU); ; depth--)
+ if (hwloc_compare_types(hwloc_get_depth_type(topology, depth), type) < 0)
+ return depth+1;
+
+ /* Shouldn't ever happen, as there is always a SYSTEM level with lower order and known depth. */
+ /* abort(); */
+}
+
+static __hwloc_inline int
+hwloc_get_type_or_above_depth (hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+ int depth = hwloc_get_type_depth(topology, type);
+
+ if (depth != HWLOC_TYPE_DEPTH_UNKNOWN)
+ return depth;
+
+ /* find the lowest existing level with type order <= */
+ for(depth = 0; ; depth++)
+ if (hwloc_compare_types(hwloc_get_depth_type(topology, depth), type) > 0)
+ return depth-1;
+
+ /* Shouldn't ever happen, as there is always a PU level with higher order and known depth. */
+ /* abort(); */
+}
+
+static __hwloc_inline int
+hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+ int depth = hwloc_get_type_depth(topology, type);
+ if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+ return 0;
+ if (depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+ return -1; /* FIXME: aggregate nbobjs from different levels? */
+ return hwloc_get_nbobjs_by_depth(topology, depth);
+}
+
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigned idx)
+{
+ int depth = hwloc_get_type_depth(topology, type);
+ if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+ return NULL;
+ if (depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+ return NULL;
+ return hwloc_get_obj_by_depth(topology, depth, idx);
+}
+
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_depth (hwloc_topology_t topology, unsigned depth, hwloc_obj_t prev)
+{
+ if (!prev)
+ return hwloc_get_obj_by_depth (topology, depth, 0);
+ if (prev->depth != depth)
+ return NULL;
+ return prev->next_cousin;
+}
+
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type,
+ hwloc_obj_t prev)
+{
+ int depth = hwloc_get_type_depth(topology, type);
+ if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+ return NULL;
+ return hwloc_get_next_obj_by_depth (topology, depth, prev);
+}
+
+static __hwloc_inline hwloc_obj_t
+hwloc_get_root_obj (hwloc_topology_t topology)
+{
+ return hwloc_get_obj_by_depth (topology, 0, 0);
+}
+
+static __hwloc_inline const char *
+hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name)
+{
+ unsigned i;
+ for(i=0; i<obj->infos_count; i++)
+ if (!strcmp(obj->infos[i].name, name))
+ return obj->infos[i].value;
+ return NULL;
+}
+
+static __hwloc_inline void *
+hwloc_alloc_membind_policy_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+ void *p = hwloc_alloc_membind_nodeset(topology, len, nodeset, policy, flags);
+ if (p)
+ return p;
+ hwloc_set_membind_nodeset(topology, nodeset, policy, flags);
+ p = hwloc_alloc(topology, len);
+ if (p && policy != HWLOC_MEMBIND_FIRSTTOUCH)
+ /* Enforce the binding by touching the data */
+ memset(p, 0, len);
+ return p;
+}
+
+static __hwloc_inline void *
+hwloc_alloc_membind_policy(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+ void *p = hwloc_alloc_membind(topology, len, set, policy, flags);
+ if (p)
+ return p;
+ hwloc_set_membind(topology, set, policy, flags);
+ p = hwloc_alloc(topology, len);
+ if (p && policy != HWLOC_MEMBIND_FIRSTTOUCH)
+ /* Enforce the binding by touching the data */
+ memset(p, 0, len);
+ return p;
+}
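
An illustrative sketch of how these allocation helpers might be used (assumes the topology describes the current machine): allocate a buffer bound near the first NUMA node, falling back to changing the binding policy if needed.

    #include <stddef.h>
    #include <hwloc.h>

    /* Allocate len bytes close to the first NUMA node; free with hwloc_free(). */
    static void *alloc_on_first_node(hwloc_topology_t topology, size_t len)
    {
        hwloc_obj_t node = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, 0);
        if (!node)
            return hwloc_alloc(topology, len);
        return hwloc_alloc_membind_policy(topology, len, node->cpuset,
                                          HWLOC_MEMBIND_BIND, 0);
    }
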
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_INLINES_H */
diff --git a/ext/hwloc/include/hwloc/intel-mic.h b/ext/hwloc/include/hwloc/intel-mic.h
new file mode 100644
index 0000000..d58237b
--- /dev/null
+++ b/ext/hwloc/include/hwloc/intel-mic.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright © 2013 Inria. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and Intel Xeon Phi (MIC).
+ *
+ * Applications that use both hwloc and Intel Xeon Phi (MIC) may want to
+ * include this file so as to get topology information for MIC devices.
+ */
+
+#ifndef HWLOC_INTEL_MIC_H
+#define HWLOC_INTEL_MIC_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#include <dirent.h>
+#include <string.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_intel_mic Interoperability with Intel Xeon Phi (MIC)
+ *
+ * This interface offers ways to retrieve topology information about
+ * Intel Xeon Phi (MIC) devices.
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to MIC device whose index is \p idx.
+ *
+ * Return the CPU set describing the locality of the MIC device whose index is \p idx.
+ *
+ * Topology \p topology and device index \p idx must match the local machine.
+ * I/O devices detection is not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_intel_mic_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_intel_mic_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+ int idx __hwloc_attribute_unused,
+ hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+ /* If we're on Linux, use the sysfs mechanism to get the local cpus */
+#define HWLOC_INTEL_MIC_DEVICE_SYSFS_PATH_MAX 128
+ char path[HWLOC_INTEL_MIC_DEVICE_SYSFS_PATH_MAX];
+ DIR *sysdir = NULL;
+ FILE *sysfile = NULL;
+ struct dirent *dirent;
+ unsigned pcibus, pcidev, pcifunc;
+
+ if (!hwloc_topology_is_thissystem(topology)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ sprintf(path, "/sys/class/mic/mic%d", idx);
+ sysdir = opendir(path);
+ if (!sysdir)
+ return -1;
+
+ while ((dirent = readdir(sysdir)) != NULL) {
+ if (sscanf(dirent->d_name, "pci_%02x:%02x.%02x", &pcibus, &pcidev, &pcifunc) == 3) {
+ sprintf(path, "/sys/class/mic/mic%d/pci_%02x:%02x.%02x/local_cpus", idx, pcibus, pcidev, pcifunc);
+ sysfile = fopen(path, "r");
+ if (!sysfile) {
+ closedir(sysdir);
+ return -1;
+ }
+
+ hwloc_linux_parse_cpumap_file(sysfile, set);
+ if (hwloc_bitmap_iszero(set))
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+
+ fclose(sysfile);
+ break;
+ }
+ }
+
+ closedir(sysdir);
+#else
+ /* Non-Linux systems simply get a full cpuset */
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+ return 0;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * MIC device for the given index.
+ *
+ * Return the OS device object describing the MIC device whose index is \p idx.
+ * Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_intel_mic_get_device_osdev_by_index(hwloc_topology_t topology,
+ unsigned idx)
+{
+ hwloc_obj_t osdev = NULL;
+ while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+ if (HWLOC_OBJ_OSDEV_COPROC == osdev->attr->osdev.type
+ && osdev->name
+ && !strncmp("mic", osdev->name, 3)
+ && atoi(osdev->name + 3) == (int) idx)
+ return osdev;
+ }
+ return NULL;
+}
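
An illustrative sketch (requires I/O device detection to have been enabled when the topology was built): locate the OS device for MIC index 0 and print its PCI parent.

    #include <stdio.h>
    #include <hwloc.h>
    #include <hwloc/intel-mic.h>

    /* Print the PCI location of the first MIC coprocessor, if present. */
    static void print_mic0_location(hwloc_topology_t topology)
    {
        hwloc_obj_t osdev = hwloc_intel_mic_get_device_osdev_by_index(topology, 0);
        if (osdev && osdev->parent && osdev->parent->type == HWLOC_OBJ_PCI_DEVICE)
            printf("mic0 is behind PCI device %04x:%02x:%02x.%01x\n",
                   osdev->parent->attr->pcidev.domain,
                   osdev->parent->attr->pcidev.bus,
                   osdev->parent->attr->pcidev.dev,
                   osdev->parent->attr->pcidev.func);
    }
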
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_INTEL_MIC_H */
diff --git a/ext/hwloc/include/hwloc/linux-libnuma.h b/ext/hwloc/include/hwloc/linux-libnuma.h
new file mode 100644
index 0000000..0ce2591
--- /dev/null
+++ b/ext/hwloc/include/hwloc/linux-libnuma.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria. All rights reserved.
+ * Copyright © 2009-2010, 2012 Université Bordeaux
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and Linux libnuma.
+ *
+ * Applications that use both Linux libnuma and hwloc may want to
+ * include this file so as to ease conversion between their respective types.
+ */
+
+#ifndef HWLOC_LINUX_LIBNUMA_H
+#define HWLOC_LINUX_LIBNUMA_H
+
+#include <hwloc.h>
+#include <numa.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_linux_libnuma_ulongs Interoperability with Linux libnuma unsigned long masks
+ *
+ * This interface helps converting between Linux libnuma unsigned long masks
+ * and hwloc cpusets and nodesets.
+ *
+ * \note Topology \p topology must match the current machine.
+ *
+ * \note The behavior of libnuma is undefined if the kernel is not NUMA-aware
+ * (when CONFIG_NUMA is not set in the kernel configuration).
+ * This helper and libnuma may thus not be strictly compatible in this case,
+ * which may be detected by checking whether numa_available() returns -1.
+ *
+ * @{
+ */
+
+
+/** \brief Convert hwloc CPU set \p cpuset into the array of unsigned long \p mask
+ *
+ * \p mask is the array of unsigned long that will be filled.
+ * \p maxnode contains the maximal node number that may be stored in \p mask.
+ * \p maxnode will be set to the maximal node number that was found, plus one.
+ *
+ * This function may be used before calling set_mempolicy, mbind, migrate_pages
+ * or any other function that takes an array of unsigned long and a maximal
+ * node number as input parameter.
+ */
+static __hwloc_inline int
+hwloc_cpuset_to_linux_libnuma_ulongs(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset,
+ unsigned long *mask, unsigned long *maxnode)
+{
+ int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+ unsigned long outmaxnode = -1;
+ hwloc_obj_t node = NULL;
+
+ /* round-up to the next ulong and clear all bytes */
+ *maxnode = (*maxnode + 8*sizeof(*mask) - 1) & ~(8*sizeof(*mask) - 1);
+ memset(mask, 0, *maxnode/8);
+
+ while ((node = hwloc_get_next_obj_covering_cpuset_by_depth(topology, cpuset, depth, node)) != NULL) {
+ if (node->os_index >= *maxnode)
+ continue;
+ mask[node->os_index/sizeof(*mask)/8] |= 1UL << (node->os_index % (sizeof(*mask)*8));
+ if (outmaxnode == (unsigned long) -1 || outmaxnode < node->os_index)
+ outmaxnode = node->os_index;
+ }
+
+ *maxnode = outmaxnode+1;
+ return 0;
+}
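
An illustrative sketch (assumes <numaif.h> from libnuma and linking with -lnuma): bind the process memory policy to the nodes close to a given cpuset using set_mempolicy(2).

    #include <numaif.h>
    #include <hwloc.h>
    #include <hwloc/linux-libnuma.h>

    /* Restrict memory allocations to the NUMA nodes covering the cpuset. */
    static int membind_near_cpuset(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset)
    {
        unsigned long mask[64] = { 0 };
        unsigned long maxnode = sizeof(mask) * 8; /* capacity of mask, in bits */
        hwloc_cpuset_to_linux_libnuma_ulongs(topology, cpuset, mask, &maxnode);
        return set_mempolicy(MPOL_BIND, mask, maxnode) ? -1 : 0;
    }
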
+
+/** \brief Convert hwloc NUMA node set \p nodeset into the array of unsigned long \p mask
+ *
+ * \p mask is the array of unsigned long that will be filled.
+ * \p maxnode contains the maximal node number that may be stored in \p mask.
+ * \p maxnode will be set to the maximal node number that was found, plus one.
+ *
+ * This function may be used before calling set_mempolicy, mbind, migrate_pages
+ * or any other function that takes an array of unsigned long and a maximal
+ * node number as input parameter.
+ */
+static __hwloc_inline int
+hwloc_nodeset_to_linux_libnuma_ulongs(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset,
+ unsigned long *mask, unsigned long *maxnode)
+{
+ int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+ unsigned long outmaxnode = -1;
+ hwloc_obj_t node = NULL;
+
+ /* round-up to the next ulong and clear all bytes */
+ *maxnode = (*maxnode + 8*sizeof(*mask) - 1) & ~(8*sizeof(*mask) - 1);
+ memset(mask, 0, *maxnode/8);
+
+ while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL) {
+ if (node->os_index >= *maxnode)
+ continue;
+ if (!hwloc_bitmap_isset(nodeset, node->os_index))
+ continue;
+ mask[node->os_index/sizeof(*mask)/8] |= 1UL << (node->os_index % (sizeof(*mask)*8));
+ if (outmaxnode == (unsigned long) -1 || outmaxnode < node->os_index)
+ outmaxnode = node->os_index;
+ }
+
+ *maxnode = outmaxnode+1;
+ return 0;
+}
+
+/** \brief Convert the array of unsigned long \p mask into hwloc CPU set
+ *
+ * \p mask is an array of unsigned long that will be read.
+ * \p maxnode contains the maximal node number that may be read in \p mask.
+ *
+ * This function may be used after calling get_mempolicy or any other function
+ * that takes an array of unsigned long as output parameter (and possibly
+ * a maximal node number as input parameter).
+ */
+static __hwloc_inline int
+hwloc_cpuset_from_linux_libnuma_ulongs(hwloc_topology_t topology, hwloc_cpuset_t cpuset,
+ const unsigned long *mask, unsigned long maxnode)
+{
+ int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+ hwloc_obj_t node = NULL;
+ hwloc_bitmap_zero(cpuset);
+ while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL)
+ if (node->os_index < maxnode
+ && (mask[node->os_index/sizeof(*mask)/8] & (1UL << (node->os_index % (sizeof(*mask)*8)))))
+ hwloc_bitmap_or(cpuset, cpuset, node->cpuset);
+ return 0;
+}
+
+/** \brief Convert the array of unsigned long \p mask into hwloc NUMA node set
+ *
+ * \p mask is an array of unsigned long that will be read.
+ * \p maxnode contains the maximal node number that may be read in \p mask.
+ *
+ * This function may be used after calling get_mempolicy or any other function
+ * that takes an array of unsigned long as output parameter (and possibly
+ * a maximal node number as input parameter).
+ */
+static __hwloc_inline int
+hwloc_nodeset_from_linux_libnuma_ulongs(hwloc_topology_t topology, hwloc_nodeset_t nodeset,
+ const unsigned long *mask, unsigned long maxnode)
+{
+ int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+ hwloc_obj_t node = NULL;
+ hwloc_bitmap_zero(nodeset);
+ while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL)
+ if (node->os_index < maxnode
+ && (mask[node->os_index/sizeof(*mask)/8] & (1UL << (node->os_index % (sizeof(*mask)*8)))))
+ hwloc_bitmap_set(nodeset, node->os_index);
+ return 0;
+}
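Going the other way, a minimal sketch (again assuming <numaif.h>; the helper name is made up) that reads the calling thread's current policy with get_mempolicy() and converts its node mask back into a hwloc nodeset:

    #include <string.h>
    #include <hwloc.h>
    #include <hwloc/linux-libnuma.h>
    #include <numaif.h>   /* get_mempolicy() */

    /* Fill `nodeset` with the nodes of the calling thread's memory policy. */
    static int current_policy_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset)
    {
        unsigned long mask[64 / sizeof(unsigned long)];
        int mode;
        memset(mask, 0, sizeof(mask));
        if (get_mempolicy(&mode, mask, 8 * sizeof(mask), NULL, 0) < 0)
            return -1;
        return hwloc_nodeset_from_linux_libnuma_ulongs(topology, nodeset, mask, 8 * sizeof(mask));
    }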
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_linux_libnuma_bitmask Interoperability with Linux libnuma bitmask
+ *
+ * This interface helps converting between Linux libnuma bitmasks
+ * and hwloc cpusets and nodesets.
+ *
+ * \note Topology \p topology must match the current machine.
+ *
+ * \note The behavior of libnuma is undefined if the kernel is not NUMA-aware
+ * (i.e. when CONFIG_NUMA is not set in the kernel configuration).
+ * This helper and libnuma may thus not be strictly compatible in this case,
+ * which may be detected by checking whether numa_available() returns -1.
+ *
+ * @{
+ */
+
+
+/** \brief Convert hwloc CPU set \p cpuset into the returned libnuma bitmask
+ *
+ * The returned bitmask should later be freed with numa_bitmask_free.
+ *
+ * This function may be used before calling many numa_ functions
+ * that use a struct bitmask as an input parameter.
+ *
+ * \return newly allocated struct bitmask.
+ */
+static __hwloc_inline struct bitmask *
+hwloc_cpuset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset) __hwloc_attribute_malloc;
+static __hwloc_inline struct bitmask *
+hwloc_cpuset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset)
+{
+ int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+ hwloc_obj_t node = NULL;
+ struct bitmask *bitmask = numa_allocate_cpumask();
+ if (!bitmask)
+ return NULL;
+ while ((node = hwloc_get_next_obj_covering_cpuset_by_depth(topology, cpuset, depth, node)) != NULL)
+ if (node->memory.local_memory)
+ numa_bitmask_setbit(bitmask, node->os_index);
+ return bitmask;
+}
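For example (a sketch assuming libnuma's <numa.h>; the wrapper name is made up), the returned bitmask can be handed to numa_set_membind() and then freed:

    #include <hwloc.h>
    #include <hwloc/linux-libnuma.h>
    #include <numa.h>   /* numa_set_membind(), numa_bitmask_free() */

    /* Restrict memory allocations of the current task to the NUMA nodes
     * close to `cpuset` (a hypothetical, already-filled hwloc cpuset). */
    static int membind_near_cpuset(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset)
    {
        struct bitmask *nodes = hwloc_cpuset_to_linux_libnuma_bitmask(topology, cpuset);
        if (!nodes)
            return -1;
        numa_set_membind(nodes);
        numa_bitmask_free(nodes);
        return 0;
    }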
+
+/** \brief Convert hwloc NUMA node set \p nodeset into the returned libnuma bitmask
+ *
+ * The returned bitmask should later be freed with numa_bitmask_free.
+ *
+ * This function may be used before calling many numa_ functions
+ * that use a struct bitmask as an input parameter.
+ *
+ * \return newly allocated struct bitmask.
+ */
+static __hwloc_inline struct bitmask *
+hwloc_nodeset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset) __hwloc_attribute_malloc;
+static __hwloc_inline struct bitmask *
+hwloc_nodeset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset)
+{
+ int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+ hwloc_obj_t node = NULL;
+ struct bitmask *bitmask = numa_allocate_cpumask();
+ if (!bitmask)
+ return NULL;
+ while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL)
+ if (hwloc_bitmap_isset(nodeset, node->os_index) && node->memory.local_memory)
+ numa_bitmask_setbit(bitmask, node->os_index);
+ return bitmask;
+}
+
+/** \brief Convert libnuma bitmask \p bitmask into hwloc CPU set \p cpuset
+ *
+ * This function may be used after calling many numa_ functions
+ * that use a struct bitmask as an output parameter.
+ */
+static __hwloc_inline int
+hwloc_cpuset_from_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_cpuset_t cpuset,
+ const struct bitmask *bitmask)
+{
+ int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+ hwloc_obj_t node = NULL;
+ hwloc_bitmap_zero(cpuset);
+ while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL)
+ if (numa_bitmask_isbitset(bitmask, node->os_index))
+ hwloc_bitmap_or(cpuset, cpuset, node->cpuset);
+ return 0;
+}
+
+/** \brief Convert libnuma bitmask \p bitmask into hwloc NUMA node set \p nodeset
+ *
+ * This function may be used after calling many numa_ functions
+ * that use a struct bitmask as an output parameter.
+ */
+static __hwloc_inline int
+hwloc_nodeset_from_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_nodeset_t nodeset,
+ const struct bitmask *bitmask)
+{
+ int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+ hwloc_obj_t node = NULL;
+ hwloc_bitmap_zero(nodeset);
+ while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL)
+ if (numa_bitmask_isbitset(bitmask, node->os_index))
+ hwloc_bitmap_set(nodeset, node->os_index);
+ return 0;
+}
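And in the output direction, a sketch (assuming <numa.h>) that converts the current libnuma memory binding, as returned by numa_get_membind(), into a hwloc nodeset:

    #include <hwloc.h>
    #include <hwloc/linux-libnuma.h>
    #include <numa.h>   /* numa_get_membind(), numa_bitmask_free() */

    /* Fill `nodeset` with the current libnuma memory binding of the task. */
    static int membind_to_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset)
    {
        struct bitmask *nodes = numa_get_membind();
        if (!nodes)
            return -1;
        hwloc_nodeset_from_linux_libnuma_bitmask(topology, nodeset, nodes);
        numa_bitmask_free(nodes);
        return 0;
    }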
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_LINUX_NUMA_H */
diff --git a/ext/hwloc/include/hwloc/linux.h b/ext/hwloc/include/hwloc/linux.h
new file mode 100644
index 0000000..4ddc900
--- /dev/null
+++ b/ext/hwloc/include/hwloc/linux.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2013 Inria. All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and Linux.
+ *
+ * Applications that use hwloc on Linux may want to include this file
+ * if using some low-level Linux features.
+ */
+
+#ifndef HWLOC_LINUX_H
+#define HWLOC_LINUX_H
+
+#include <hwloc.h>
+#include <stdio.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_linux Linux-specific helpers
+ *
+ * This includes helpers for manipulating Linux kernel cpumap files, and hwloc
+ * equivalents of the Linux sched_setaffinity and sched_getaffinity system calls.
+ *
+ * @{
+ */
+
+/** \brief Convert a linux kernel cpumap file \p file into hwloc CPU set.
+ *
+ * Might be used when reading CPU set from sysfs attributes such as topology
+ * and caches for processors, or local_cpus for devices.
+ */
+HWLOC_DECLSPEC int hwloc_linux_parse_cpumap_file(FILE *file, hwloc_cpuset_t set);
+
+/** \brief Bind a thread \p tid on cpus given in cpuset \p set
+ *
+ * The behavior is exactly the same as the Linux sched_setaffinity system call,
+ * but uses a hwloc cpuset.
+ *
+ * \note This is equivalent to calling hwloc_set_proc_cpubind() with
+ * HWLOC_CPUBIND_THREAD as flags.
+ */
+HWLOC_DECLSPEC int hwloc_linux_set_tid_cpubind(hwloc_topology_t topology, pid_t tid, hwloc_const_cpuset_t set);
+
+/** \brief Get the current binding of thread \p tid
+ *
+ * The behavior is exactly the same as the Linux sched_getaffinity system call,
+ * but uses a hwloc cpuset.
+ *
+ * \note This is equivalent to calling hwloc_get_proc_cpubind() with
+ * HWLOC_CPUBIND_THREAD as flags.
+ */
+HWLOC_DECLSPEC int hwloc_linux_get_tid_cpubind(hwloc_topology_t topology, pid_t tid, hwloc_cpuset_t set);
+
+/** \brief Get the last physical CPU where thread \p tid ran.
+ *
+ * \note This is equivalent to calling hwloc_get_proc_last_cpu_location() with
+ * HWLOC_CPUBIND_THREAD as flags.
+ */
+HWLOC_DECLSPEC int hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology, pid_t tid, hwloc_bitmap_t set);
+
+/** @} */
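To illustrate the tid-based helpers above, a minimal sketch (assuming Linux and the SYS_gettid syscall; the helper name pin_self_to_first_pu is made up) that pins the calling thread to the first PU of the topology:

    #include <unistd.h>
    #include <sys/syscall.h>   /* SYS_gettid */
    #include <hwloc.h>
    #include <hwloc/linux.h>

    /* Pin the calling thread to the first PU reported by the topology. */
    static int pin_self_to_first_pu(hwloc_topology_t topology)
    {
        hwloc_obj_t pu = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, 0);
        pid_t tid = (pid_t) syscall(SYS_gettid);
        if (!pu)
            return -1;
        return hwloc_linux_set_tid_cpubind(topology, tid, pu->cpuset);
    }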
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_LINUX_H */
diff --git a/ext/hwloc/include/hwloc/myriexpress.h b/ext/hwloc/include/hwloc/myriexpress.h
new file mode 100644
index 0000000..68ff88f
--- /dev/null
+++ b/ext/hwloc/include/hwloc/myriexpress.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright © 2010-2014 Inria. All rights reserved.
+ * Copyright © 2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and Myrinet Express.
+ *
+ * Applications that use both hwloc and Myrinet Express verbs may want to
+ * include this file so as to get topology information for Myrinet hardware.
+ *
+ */
+
+#ifndef HWLOC_MYRIEXPRESS_H
+#define HWLOC_MYRIEXPRESS_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+
+#include <myriexpress.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_myriexpress Interoperability with Myrinet Express
+ *
+ * This interface offers ways to retrieve topology information about
+ * Myrinet Express hardware.
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to the MX board \p id.
+ *
+ * Return the CPU set describing the locality of the Myrinet Express
+ * board whose index is \p id.
+ *
+ * Topology \p topology and device \p id must match the local machine.
+ * I/O devices detection is not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * No additional information about the device is available.
+ */
+static __hwloc_inline int
+hwloc_mx_board_get_device_cpuset(hwloc_topology_t topology,
+ unsigned id, hwloc_cpuset_t set)
+{
+ uint32_t in, out;
+
+ if (!hwloc_topology_is_thissystem(topology)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ in = id;
+ if (mx_get_info(NULL, MX_NUMA_NODE, &in, sizeof(in), &out, sizeof(out)) != MX_SUCCESS) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (out != (uint32_t) -1) {
+ hwloc_obj_t obj = NULL;
+ while ((obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, obj)) != NULL)
+ if (obj->os_index == out) {
+ hwloc_bitmap_copy(set, obj->cpuset);
+ goto out;
+ }
+ }
+ /* fallback to the full topology cpuset */
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+
+ out:
+ return 0;
+}
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to the MX endpoint \p endpoint.
+ *
+ * Return the CPU set describing the locality of the Myrinet Express
+ * board that runs the MX endpoint \p endpoint.
+ *
+ * Topology \p topology and endpoint \p endpoint must match the local machine.
+ * I/O devices detection is not needed in the topology.
+ *
+ * The function only returns the locality of the endpoint.
+ * No additional information about the endpoint or device is available.
+ */
+static __hwloc_inline int
+hwloc_mx_endpoint_get_device_cpuset(hwloc_topology_t topology,
+ mx_endpoint_t endpoint, hwloc_cpuset_t set)
+{
+ uint64_t nid;
+ uint32_t nindex, eid;
+ mx_endpoint_addr_t eaddr;
+
+ if (mx_get_endpoint_addr(endpoint, &eaddr) != MX_SUCCESS) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (mx_decompose_endpoint_addr(eaddr, &nid, &eid) != MX_SUCCESS) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (mx_nic_id_to_board_number(nid, &nindex) != MX_SUCCESS) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ return hwloc_mx_board_get_device_cpuset(topology, nindex, set);
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_MYRIEXPRESS_H */
diff --git a/ext/hwloc/include/hwloc/nvml.h b/ext/hwloc/include/hwloc/nvml.h
new file mode 100644
index 0000000..462b332
--- /dev/null
+++ b/ext/hwloc/include/hwloc/nvml.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright © 2012-2013 Inria. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and the NVIDIA Management Library.
+ *
+ * Applications that use both hwloc and the NVIDIA Management Library may want to
+ * include this file so as to get topology information for NVML devices.
+ */
+
+#ifndef HWLOC_NVML_H
+#define HWLOC_NVML_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <nvml.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_nvml Interoperability with the NVIDIA Management Library
+ *
+ * This interface offers ways to retrieve topology information about
+ * devices managed by the NVIDIA Management Library (NVML).
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to NVML device \p device.
+ *
+ * Return the CPU set describing the locality of the NVML device \p device.
+ *
+ * Topology \p topology and device \p device must match the local machine.
+ * I/O devices detection and the NVML component are not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_nvml_get_device_osdev()
+ * and hwloc_nvml_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_nvml_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+ nvmlDevice_t device, hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+ /* If we're on Linux, use the sysfs mechanism to get the local cpus */
+#define HWLOC_NVML_DEVICE_SYSFS_PATH_MAX 128
+ char path[HWLOC_NVML_DEVICE_SYSFS_PATH_MAX];
+ FILE *sysfile = NULL;
+ nvmlReturn_t nvres;
+ nvmlPciInfo_t pci;
+
+ if (!hwloc_topology_is_thissystem(topology)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ nvres = nvmlDeviceGetPciInfo(device, &pci);
+ if (NVML_SUCCESS != nvres) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus", pci.domain, pci.bus, pci.device);
+ sysfile = fopen(path, "r");
+ if (!sysfile)
+ return -1;
+
+ hwloc_linux_parse_cpumap_file(sysfile, set);
+ if (hwloc_bitmap_iszero(set))
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+
+ fclose(sysfile);
+#else
+ /* Non-Linux systems simply get a full cpuset */
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+ return 0;
+}
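As a usage sketch (NVML is assumed available and is initialized inline here only for brevity; the function name and the device index 0 are arbitrary), binding the current process near the first NVML device could look like:

    #include <hwloc.h>
    #include <hwloc/nvml.h>
    #include <nvml.h>

    /* Bind the current process close to NVML device 0. */
    static int bind_near_nvml_device0(hwloc_topology_t topology)
    {
        nvmlDevice_t device;
        hwloc_cpuset_t set = hwloc_bitmap_alloc();
        int err = -1;

        if (!set)
            return -1;
        if (nvmlInit() != NVML_SUCCESS) {
            hwloc_bitmap_free(set);
            return -1;
        }
        if (nvmlDeviceGetHandleByIndex(0, &device) == NVML_SUCCESS
            && hwloc_nvml_get_device_cpuset(topology, device, set) == 0)
            err = hwloc_set_cpubind(topology, set, HWLOC_CPUBIND_PROCESS);
        nvmlShutdown();
        hwloc_bitmap_free(set);
        return err;
    }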
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * NVML device whose index is \p idx.
+ *
+ * Return the OS device object describing the NVML device whose
+ * index is \p idx. Returns NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the NVML component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_nvml_get_device_osdev_by_index(hwloc_topology_t topology, unsigned idx)
+{
+ hwloc_obj_t osdev = NULL;
+ while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+ if (HWLOC_OBJ_OSDEV_GPU == osdev->attr->osdev.type
+ && osdev->name
+ && !strncmp("nvml", osdev->name, 4)
+ && atoi(osdev->name + 4) == (int) idx)
+ return osdev;
+ }
+ return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to NVML device \p device.
+ *
+ * Return the hwloc OS device object that describes the given
+ * NVML device \p device. Return NULL if there is none.
+ *
+ * Topology \p topology and device \p device must match the local machine.
+ * I/O devices detection and the NVML component must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_nvml_get_device_cpuset().
+ *
+ * \note The corresponding hwloc PCI device may be found by looking
+ * at the result parent pointer.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_nvml_get_device_osdev(hwloc_topology_t topology, nvmlDevice_t device)
+{
+ hwloc_obj_t osdev;
+ nvmlReturn_t nvres;
+ nvmlPciInfo_t pci;
+
+ if (!hwloc_topology_is_thissystem(topology)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ nvres = nvmlDeviceGetPciInfo(device, &pci);
+ if (NVML_SUCCESS != nvres)
+ return NULL;
+
+ osdev = NULL;
+ while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+ hwloc_obj_t pcidev = osdev->parent;
+ if (strncmp(osdev->name, "nvml", 4))
+ continue;
+ if (pcidev
+ && pcidev->type == HWLOC_OBJ_PCI_DEVICE
+ && pcidev->attr->pcidev.domain == pci.domain
+ && pcidev->attr->pcidev.bus == pci.bus
+ && pcidev->attr->pcidev.dev == pci.device
+ && pcidev->attr->pcidev.func == 0)
+ return osdev;
+ }
+
+ return NULL;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_NVML_H */
diff --git a/ext/hwloc/include/hwloc/opencl.h b/ext/hwloc/include/hwloc/opencl.h
new file mode 100644
index 0000000..0301ad9
--- /dev/null
+++ b/ext/hwloc/include/hwloc/opencl.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright © 2012-2013 Inria. All rights reserved.
+ * Copyright © 2013 Université Bordeaux. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and the OpenCL interface.
+ *
+ * Applications that use both hwloc and OpenCL may want to
+ * include this file so as to get topology information for OpenCL devices.
+ */
+
+#ifndef HWLOC_OPENCL_H
+#define HWLOC_OPENCL_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+
+#include <stdio.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_opencl Interoperability with OpenCL
+ *
+ * This interface offers ways to retrieve topology information about
+ * OpenCL devices.
+ *
+ * Only the AMD OpenCL interface currently offers useful locality information
+ * about its devices.
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to OpenCL device \p device.
+ *
+ * Return the CPU set describing the locality of the OpenCL device \p device.
+ *
+ * Topology \p topology and device \p device must match the local machine.
+ * I/O devices detection and the OpenCL component are not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_opencl_get_device_osdev()
+ * and hwloc_opencl_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux with the AMD OpenCL implementation; other systems will simply
+ * get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_opencl_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+ cl_device_id device __hwloc_attribute_unused,
+ hwloc_cpuset_t set)
+{
+#if (defined HWLOC_LINUX_SYS) && (defined CL_DEVICE_TOPOLOGY_AMD)
+ /* If we're on Linux + AMD OpenCL, use the AMD extension + the sysfs mechanism to get the local cpus */
+#define HWLOC_OPENCL_DEVICE_SYSFS_PATH_MAX 128
+ char path[HWLOC_OPENCL_DEVICE_SYSFS_PATH_MAX];
+ FILE *sysfile = NULL;
+ cl_device_topology_amd amdtopo;
+ cl_int clret;
+
+ if (!hwloc_topology_is_thissystem(topology)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ clret = clGetDeviceInfo(device, CL_DEVICE_TOPOLOGY_AMD, sizeof(amdtopo), &amdtopo, NULL);
+ if (CL_SUCCESS != clret) {
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+ return 0;
+ }
+ if (CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD != amdtopo.raw.type) {
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+ return 0;
+ }
+
+ sprintf(path, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/local_cpus", amdtopo.pcie.bus, amdtopo.pcie.device, amdtopo.pcie.function);
+ sysfile = fopen(path, "r");
+ if (!sysfile)
+ return -1;
+
+ hwloc_linux_parse_cpumap_file(sysfile, set);
+ if (hwloc_bitmap_iszero(set))
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+
+ fclose(sysfile);
+#else
+ /* Systems without Linux and the AMD OpenCL extension simply get a full cpuset */
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+ return 0;
+}
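A similar sketch for OpenCL (assuming at least one platform with a GPU device; error handling is minimal and the function name is made up):

    #include <CL/cl.h>
    #include <hwloc.h>
    #include <hwloc/opencl.h>

    /* Bind the current process close to the first OpenCL GPU device. */
    static int bind_near_first_opencl_gpu(hwloc_topology_t topology)
    {
        cl_platform_id platform;
        cl_device_id device;
        hwloc_cpuset_t set;
        int err = -1;

        if (clGetPlatformIDs(1, &platform, NULL) != CL_SUCCESS
            || clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL) != CL_SUCCESS)
            return -1;

        set = hwloc_bitmap_alloc();
        if (!set)
            return -1;
        if (hwloc_opencl_get_device_cpuset(topology, device, set) == 0)
            err = hwloc_set_cpubind(topology, set, HWLOC_CPUBIND_PROCESS);
        hwloc_bitmap_free(set);
        return err;
    }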
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * OpenCL device for the given indexes.
+ *
+ * Return the OS device object describing the OpenCL device
+ * whose platform index is \p platform_index,
+ * and whose device index within this platform is \p device_index.
+ * Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the OpenCL component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_opencl_get_device_osdev_by_index(hwloc_topology_t topology,
+ unsigned platform_index, unsigned device_index)
+{
+ unsigned x = (unsigned) -1, y = (unsigned) -1;
+ hwloc_obj_t osdev = NULL;
+ while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+ if (HWLOC_OBJ_OSDEV_COPROC == osdev->attr->osdev.type
+ && osdev->name
+ && sscanf(osdev->name, "opencl%ud%u", &x, &y) == 2
+ && platform_index == x && device_index == y)
+ return osdev;
+ }
+ return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to OpenCL device \p device.
+ *
+ * Return the hwloc OS device object that describes the given
+ * OpenCL device \p device. Return NULL if there is none.
+ *
+ * Topology \p topology and device \p device must match the local machine.
+ * I/O devices detection and the OpenCL component must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_opencl_get_device_cpuset().
+ *
+ * \note The corresponding hwloc PCI device may be found by looking
+ * at the result parent pointer.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_opencl_get_device_osdev(hwloc_topology_t topology __hwloc_attribute_unused,
+ cl_device_id device __hwloc_attribute_unused)
+{
+#ifdef CL_DEVICE_TOPOLOGY_AMD
+ hwloc_obj_t osdev;
+ cl_device_topology_amd amdtopo;
+ cl_int clret;
+
+ clret = clGetDeviceInfo(device, CL_DEVICE_TOPOLOGY_AMD, sizeof(amdtopo), &amdtopo, NULL);
+ if (CL_SUCCESS != clret) {
+ errno = EINVAL;
+ return NULL;
+ }
+ if (CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD != amdtopo.raw.type) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ osdev = NULL;
+ while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+ hwloc_obj_t pcidev = osdev->parent;
+ if (strncmp(osdev->name, "opencl", 6))
+ continue;
+ if (pcidev
+ && pcidev->type == HWLOC_OBJ_PCI_DEVICE
+ && pcidev->attr->pcidev.domain == 0
+ && pcidev->attr->pcidev.bus == amdtopo.pcie.bus
+ && pcidev->attr->pcidev.dev == amdtopo.pcie.device
+ && pcidev->attr->pcidev.func == amdtopo.pcie.function)
+ return osdev;
+ }
+
+ return NULL;
+#else
+ return NULL;
+#endif
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_OPENCL_H */
diff --git a/ext/hwloc/include/hwloc/openfabrics-verbs.h b/ext/hwloc/include/hwloc/openfabrics-verbs.h
new file mode 100644
index 0000000..c6b8533
--- /dev/null
+++ b/ext/hwloc/include/hwloc/openfabrics-verbs.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2013 Inria. All rights reserved.
+ * Copyright © 2009-2010 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and OpenFabrics
+ * verbs.
+ *
+ * Applications that use both hwloc and OpenFabrics verbs may want to
+ * include this file so as to get topology information for OpenFabrics
+ * hardware.
+ *
+ */
+
+#ifndef HWLOC_OPENFABRICS_VERBS_H
+#define HWLOC_OPENFABRICS_VERBS_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <infiniband/verbs.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_openfabrics Interoperability with OpenFabrics
+ *
+ * This interface offers ways to retrieve topology information about
+ * OpenFabrics devices.
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to device \p ibdev.
+ *
+ * Return the CPU set describing the locality of the OpenFabrics
+ * device \p ibdev.
+ *
+ * Topology \p topology and device \p ibdev must match the local machine.
+ * I/O devices detection is not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_ibv_get_device_osdev()
+ * and hwloc_ibv_get_device_osdev_by_name().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_ibv_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+ struct ibv_device *ibdev, hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+ /* If we're on Linux, use the verbs-provided sysfs mechanism to
+ get the local cpus */
+#define HWLOC_OPENFABRICS_VERBS_SYSFS_PATH_MAX 128
+ char path[HWLOC_OPENFABRICS_VERBS_SYSFS_PATH_MAX];
+ FILE *sysfile = NULL;
+
+ if (!hwloc_topology_is_thissystem(topology)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ sprintf(path, "/sys/class/infiniband/%s/device/local_cpus",
+ ibv_get_device_name(ibdev));
+ sysfile = fopen(path, "r");
+ if (!sysfile)
+ return -1;
+
+ hwloc_linux_parse_cpumap_file(sysfile, set);
+ if (hwloc_bitmap_iszero(set))
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+
+ fclose(sysfile);
+#else
+ /* Non-Linux systems simply get a full cpuset */
+ hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+ return 0;
+}
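For illustration (a sketch assuming at least one OpenFabrics device is present; the wrapper name is made up), binding near the first device returned by ibv_get_device_list():

    #include <infiniband/verbs.h>
    #include <hwloc.h>
    #include <hwloc/openfabrics-verbs.h>

    /* Bind the current process close to the first OpenFabrics device. */
    static int bind_near_first_ibv_device(hwloc_topology_t topology)
    {
        int n, err = -1;
        struct ibv_device **devs = ibv_get_device_list(&n);
        hwloc_cpuset_t set = hwloc_bitmap_alloc();

        if (devs && n > 0 && set
            && hwloc_ibv_get_device_cpuset(topology, devs[0], set) == 0)
            err = hwloc_set_cpubind(topology, set, HWLOC_CPUBIND_PROCESS);

        if (set)
            hwloc_bitmap_free(set);
        if (devs)
            ibv_free_device_list(devs);
        return err;
    }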
+
+/** \brief Get the hwloc OS device object corresponding to the OpenFabrics
+ * device named \p ibname.
+ *
+ * Return the OS device object describing the OpenFabrics device whose
+ * name is \p ibname. Returns NULL if there is none.
+ * The name \p ibname is usually obtained from ibv_get_device_name().
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_ibv_get_device_osdev_by_name(hwloc_topology_t topology,
+ const char *ibname)
+{
+ hwloc_obj_t osdev = NULL;
+ while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+ if (HWLOC_OBJ_OSDEV_OPENFABRICS == osdev->attr->osdev.type
+ && osdev->name && !strcmp(ibname, osdev->name))
+ return osdev;
+ }
+ return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the OpenFabrics
+ * device \p ibdev.
+ *
+ * Return the OS device object describing the OpenFabrics device \p ibdev.
+ * Returns NULL if there is none.
+ *
+ * Topology \p topology and device \p ibdev must match the local machine.
+ * I/O devices detection must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_ibv_get_device_cpuset().
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_ibv_get_device_osdev(hwloc_topology_t topology,
+ struct ibv_device *ibdev)
+{
+ if (!hwloc_topology_is_thissystem(topology)) {
+ errno = EINVAL;
+ return NULL;
+ }
+ return hwloc_ibv_get_device_osdev_by_name(topology, ibv_get_device_name(ibdev));
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_OPENFABRICS_VERBS_H */
diff --git a/ext/hwloc/include/hwloc/plugins.h b/ext/hwloc/include/hwloc/plugins.h
new file mode 100644
index 0000000..7fc794d
--- /dev/null
+++ b/ext/hwloc/include/hwloc/plugins.h
@@ -0,0 +1,433 @@
+/*
+ * Copyright © 2013-2015 Inria. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#ifndef HWLOC_PLUGINS_H
+#define HWLOC_PLUGINS_H
+
+/** \file
+ * \brief Public interface for building hwloc plugins.
+ */
+
+struct hwloc_backend;
+
+#include <hwloc.h>
+#ifdef HWLOC_INSIDE_PLUGIN
+/* needed for hwloc_plugin_check_namespace() */
+#include <ltdl.h>
+#endif
+
+
+
+/** \defgroup hwlocality_disc_components Components and Plugins: Discovery components
+ * @{
+ */
+
+/** \brief Discovery component type */
+typedef enum hwloc_disc_component_type_e {
+ /** \brief CPU-only discovery through the OS, or generic no-OS support.
+ * \hideinitializer */
+ HWLOC_DISC_COMPONENT_TYPE_CPU = (1<<0),
+
+ /** \brief xml or synthetic,
+ * platform-specific components such as bgq.
+ * Anything that discovers CPUs and everything else.
+ * No misc backend is expected to complement a global component.
+ * \hideinitializer */
+ HWLOC_DISC_COMPONENT_TYPE_GLOBAL = (1<<1),
+
+ /** \brief OpenCL, Cuda, etc.
+ * \hideinitializer */
+ HWLOC_DISC_COMPONENT_TYPE_MISC = (1<<2)
+} hwloc_disc_component_type_t;
+
+/** \brief Discovery component structure
+ *
+ * This is the major kind of component, taking care of the discovery.
+ * They are registered by generic components, either statically-built or as plugins.
+ */
+struct hwloc_disc_component {
+ /** \brief Discovery component type */
+ hwloc_disc_component_type_t type;
+
+ /** \brief Name.
+ * If this component is built as a plugin, this name does not have to match the plugin filename.
+ */
+ const char *name;
+
+ /** \brief Component types to exclude, as an OR'ed set of HWLOC_DISC_COMPONENT_TYPE_*.
+ *
+ * For a GLOBAL component, this usually includes all other types (~0).
+ *
+ * Other components only exclude types that may bring conflicting
+ * topology information. MISC components should likely not be excluded
+ * since they usually bring non-primary additional information.
+ */
+ unsigned excludes;
+
+ /** \brief Instantiate callback to create a backend from the component.
+ * Parameters data1, data2, data3 are NULL except for components
+ * that have special enabling routines such as hwloc_topology_set_xml(). */
+ struct hwloc_backend * (*instantiate)(struct hwloc_disc_component *component, const void *data1, const void *data2, const void *data3);
+
+ /** \brief Component priority.
+ * Used to sort topology->components, higher priority first.
+ * Also used to decide between two components with the same name.
+ *
+ * Usual values are
+ * 50 for native OS (or platform) components,
+ * 45 for x86,
+ * 40 for no-OS fallback,
+ * 30 for global components (xml, synthetic),
+ * 20 for pci,
+ * 10 for other misc components (opencl etc.).
+ */
+ unsigned priority;
+
+ /** \private Used internally to list components by priority on topology->components
+ * (the component structure is usually read-only,
+ * the core copies it before using this field for queueing)
+ */
+ struct hwloc_disc_component * next;
+};
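As a hedged sketch of how a component might fill this structure (the mybackend_* names are hypothetical; the instantiate callback itself is sketched further below with the backend structure):

    /* Hypothetical discovery component; mybackend_instantiate allocates a
     * struct hwloc_backend and sets its callbacks (see the backend sketch below). */
    static struct hwloc_backend *
    mybackend_instantiate(struct hwloc_disc_component *component,
                          const void *data1, const void *data2, const void *data3);

    static struct hwloc_disc_component mybackend_disc_component = {
        HWLOC_DISC_COMPONENT_TYPE_MISC,   /* type: non-primary additional information */
        "mybackend",                      /* name */
        0,                                /* excludes: conflicts with no other type */
        mybackend_instantiate,            /* instantiate */
        10,                               /* priority: typical for misc components */
        NULL                              /* next: managed by the core */
    };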
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_disc_backends Components and Plugins: Discovery backends
+ * @{
+ */
+
+/** \brief Discovery backend structure
+ *
+ * A backend is the instantiation of a discovery component.
+ * When a component gets enabled for a topology,
+ * its instantiate() callback creates a backend.
+ *
+ * hwloc_backend_alloc() initializes all fields to default values
+ * that the component may change (except "component" and "next")
+ * before enabling the backend with hwloc_backend_enable().
+ */
+struct hwloc_backend {
+ /** \private Reserved for the core, set by hwloc_backend_alloc() */
+ struct hwloc_disc_component * component;
+ /** \private Reserved for the core, set by hwloc_backend_enable() */
+ struct hwloc_topology * topology;
+ /** \private Reserved for the core. Set to 1 if forced through envvar, 0 otherwise. */
+ int envvar_forced;
+ /** \private Reserved for the core. Used internally to list backends topology->backends. */
+ struct hwloc_backend * next;
+
+ /** \brief Backend flags, as an OR'ed set of HWLOC_BACKEND_FLAG_* */
+ unsigned long flags;
+
+ /** \brief Backend-specific 'is_thissystem' property.
+ * Set to 0 or 1 if the backend should enforce the thissystem flag when it gets enabled.
+ * Set to -1 if the backend doesn't care (default). */
+ int is_thissystem;
+
+ /** \brief Backend private data, or NULL if none. */
+ void * private_data;
+ /** \brief Callback for freeing the private_data.
+ * May be NULL.
+ */
+ void (*disable)(struct hwloc_backend *backend);
+
+ /** \brief Main discovery callback.
+ * returns > 0 if it modified the topology tree, -1 on error, 0 otherwise.
+ * May be NULL if type is HWLOC_DISC_COMPONENT_TYPE_MISC. */
+ int (*discover)(struct hwloc_backend *backend);
+
+ /** \brief Callback used by the PCI backend to retrieve the locality of a PCI object from the OS/cpu backend.
+ * May be NULL. */
+ int (*get_obj_cpuset)(struct hwloc_backend *backend, struct hwloc_backend *caller, struct hwloc_obj *obj, hwloc_bitmap_t cpuset);
+
+ /** \brief Callback called by backends to notify this backend that a new object was added.
+ * returns > 0 if it modified the topology tree, 0 otherwise.
+ * May be NULL. */
+ int (*notify_new_object)(struct hwloc_backend *backend, struct hwloc_backend *caller, struct hwloc_obj *obj);
+};
+
+/** \brief Backend flags */
+enum hwloc_backend_flag_e {
+ /** \brief Levels should be reconnected before this backend discover() is used.
+ * \hideinitializer */
+ HWLOC_BACKEND_FLAG_NEED_LEVELS = (1UL<<0)
+};
+
+/** \brief Allocate a backend structure, set good default values, initialize backend->component and topology, etc.
+ * The caller will then modify whatever needed, and call hwloc_backend_enable().
+ */
+HWLOC_DECLSPEC struct hwloc_backend * hwloc_backend_alloc(struct hwloc_disc_component *component);
+
+/** \brief Enable a previously allocated and setup backend. */
+HWLOC_DECLSPEC int hwloc_backend_enable(struct hwloc_topology *topology, struct hwloc_backend *backend);
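Continuing the hypothetical mybackend sketch from above, an instantiate() callback typically allocates the backend and fills in its discover() method; the names below are made up:

    /* Hypothetical discover() and instantiate() callbacks. */
    static int mybackend_discover(struct hwloc_backend *backend)
    {
        /* inspect backend->topology and insert objects here;
         * return > 0 if the topology tree was modified, 0 otherwise, -1 on error */
        return 0;
    }

    static struct hwloc_backend *
    mybackend_instantiate(struct hwloc_disc_component *component,
                          const void *data1 __hwloc_attribute_unused,
                          const void *data2 __hwloc_attribute_unused,
                          const void *data3 __hwloc_attribute_unused)
    {
        struct hwloc_backend *backend = hwloc_backend_alloc(component);
        if (!backend)
            return NULL;
        backend->discover = mybackend_discover;
        /* the backend is enabled afterwards with hwloc_backend_enable() */
        return backend;
    }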
+
+/** \brief Used by backends discovery callbacks to request locality information from others.
+ *
+ * Traverse the list of enabled backends until one has a
+ * get_obj_cpuset() method, and call it.
+ */
+HWLOC_DECLSPEC int hwloc_backends_get_obj_cpuset(struct hwloc_backend *caller, struct hwloc_obj *obj, hwloc_bitmap_t cpuset);
+
+/** \brief Used by backends discovery callbacks to notify other
+ * backends of new objects.
+ *
+ * Traverse the list of enabled backends (all but caller) and invoke
+ * their notify_new_object() method to notify them that a new object
+ * just got added to the topology.
+ *
+ * Currently only used for notifying of new PCI device objects.
+ */
+HWLOC_DECLSPEC int hwloc_backends_notify_new_object(struct hwloc_backend *caller, struct hwloc_obj *obj);
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_generic_components Components and Plugins: Generic components
+ * @{
+ */
+
+/** \brief Generic component type */
+typedef enum hwloc_component_type_e {
+ /** \brief The data field must point to a struct hwloc_disc_component. */
+ HWLOC_COMPONENT_TYPE_DISC,
+
+ /** \brief The data field must point to a struct hwloc_xml_component. */
+ HWLOC_COMPONENT_TYPE_XML
+} hwloc_component_type_t;
+
+/** \brief Generic component structure
+ *
+ * Generic components structure, either statically listed by configure in static-components.h
+ * or dynamically loaded as a plugin.
+ */
+struct hwloc_component {
+ /** \brief Component ABI version, set to HWLOC_COMPONENT_ABI */
+ unsigned abi;
+
+ /** \brief Process-wide component initialization callback.
+ *
+ * This optional callback is called when the component is registered
+ * to the hwloc core (after loading the plugin).
+ *
+ * When the component is built as a plugin, this callback
+ * should call hwloc_plugin_check_namespace()
+ * and return a negative error code on error.
+ *
+ * \p flags is always 0 for now.
+ *
+ * \return 0 on success, or a negative code on error.
+ *
+ * \note If the component uses ltdl for loading its own plugins,
+ * it should load/unload them only in init() and finalize(),
+ * to avoid race conditions with hwloc's use of ltdl.
+ */
+ int (*init)(unsigned long flags);
+
+ /** \brief Process-wide component termination callback.
+ *
+ * This optional callback is called after unregistering the component
+ * from the hwloc core (before unloading the plugin).
+ *
+ * \p flags is always 0 for now.
+ *
+ * \note If the component uses ltdl for loading its own plugins,
+ * it should load/unload them only in init() and finalize(),
+ * to avoid race conditions with hwloc's use of ltdl.
+ */
+ void (*finalize)(unsigned long flags);
+
+ /** \brief Component type */
+ hwloc_component_type_t type;
+
+ /** \brief Component flags, unused for now */
+ unsigned long flags;
+
+ /** \brief Component data, pointing to a struct hwloc_disc_component or struct hwloc_xml_component. */
+ void * data;
+};
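A sketch of the generic component tying the pieces together (the mybackend_* names are hypothetical, and the exact symbol name a plugin loader expects is not shown in this header, so this is illustrative only):

    /* Hypothetical generic component wrapping mybackend_disc_component. */
    static int mybackend_init(unsigned long flags __hwloc_attribute_unused)
    {
        /* when built as a plugin, verify that core symbols are visible;
         * "hwloc_backend_alloc" is just one core symbol to probe for */
        return hwloc_plugin_check_namespace("mybackend", "hwloc_backend_alloc");
    }

    struct hwloc_component mybackend_component = {
        HWLOC_COMPONENT_ABI,          /* abi */
        mybackend_init,               /* init */
        NULL,                         /* finalize */
        HWLOC_COMPONENT_TYPE_DISC,    /* type */
        0,                            /* flags, unused for now */
        &mybackend_disc_component     /* data */
    };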
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_components_core_funcs Components and Plugins: Core functions to be used by components
+ * @{
+ */
+
+/** \brief Add an object to the topology.
+ *
+ * It is sorted along the tree of other objects according to the inclusion of
+ * cpusets, to eventually be added as a child of the smallest object including
+ * this object.
+ *
+ * If the cpuset is empty, the type of the object (and maybe some attributes)
+ * must be enough to find where to insert the object. This is especially true
+ * for NUMA nodes with memory and no CPUs.
+ *
+ * The given object should not have children.
+ *
+ * This shall only be called before levels are built.
+ *
+ * In case of error, hwloc_report_os_error() is called.
+ *
+ * Returns the object on success.
+ * Returns NULL and frees obj on error.
+ * Returns another object and frees obj if it was merged with an identical pre-existing object.
+ */
+HWLOC_DECLSPEC struct hwloc_obj *hwloc_insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj);
+
+/** \brief Type of error callbacks during object insertion */
+typedef void (*hwloc_report_error_t)(const char * msg, int line);
+/** \brief Report an insertion error from a backend */
+HWLOC_DECLSPEC void hwloc_report_os_error(const char * msg, int line);
+/** \brief Check whether insertion errors are hidden */
+HWLOC_DECLSPEC int hwloc_hide_errors(void);
+
+/** \brief Add an object to the topology and specify which error callback to use.
+ *
+ * Aside from the error callback selection, this function is identical to hwloc_insert_object_by_cpuset()
+ */
+HWLOC_DECLSPEC struct hwloc_obj *hwloc__insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj, hwloc_report_error_t report_error);
+
+/** \brief Insert an object somewhere in the topology.
+ *
+ * It is added as the last child of the given parent.
+ * The cpuset is completely ignored, so strange objects such as I/O devices should
+ * preferably be inserted with this.
+ *
+ * When used for "normal" children with cpusets (when importing from XML
+ * or when duplicating a topology), the caller should make sure that:
+ * - children are inserted in order,
+ * - children cpusets do not intersect.
+ *
+ * The given object may have normal, I/O or Misc children, as long as they are in order as well.
+ * These children must have valid parent and next_sibling pointers.
+ */
+HWLOC_DECLSPEC void hwloc_insert_object_by_parent(struct hwloc_topology *topology, hwloc_obj_t parent, hwloc_obj_t obj);
+
+/** \brief Allocate and initialize an object of the given type and physical index */
+static __hwloc_inline struct hwloc_obj *
+hwloc_alloc_setup_object(hwloc_obj_type_t type, signed os_index)
+{
+ struct hwloc_obj *obj = malloc(sizeof(*obj));
+ memset(obj, 0, sizeof(*obj));
+ obj->type = type;
+ obj->os_index = os_index;
+ obj->attr = malloc(sizeof(*obj->attr));
+ memset(obj->attr, 0, sizeof(*obj->attr));
+ /* do not allocate the cpuset here, let the caller do it */
+ return obj;
+}
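Inside a discover() callback, a new object could then be created and inserted along the cpuset tree; a short sketch (the Group object and its PU range 0-3 are made up):

    /* Sketch: insert a Group object covering PUs 0-3 into the topology. */
    static int insert_example_group(struct hwloc_topology *topology)
    {
        hwloc_obj_t group = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
        group->cpuset = hwloc_bitmap_alloc();
        hwloc_bitmap_set_range(group->cpuset, 0, 3);
        /* returns NULL (and frees group) on error, or the inserted/merged object */
        return hwloc_insert_object_by_cpuset(topology, group) ? 0 : -1;
    }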
+
+/** \brief Setup object cpusets/nodesets by OR'ing its children.
+ *
+ * Used when adding an object late in the topology.
+ * Will update the new object by OR'ing all its new children sets.
+ *
+ * Used when PCI backend adds a hostbridge parent, when distances
+ * add a new Group, etc.
+ */
+HWLOC_DECLSPEC int hwloc_obj_add_children_sets(hwloc_obj_t obj);
+
+/** \brief Make sure that plugins can lookup core symbols.
+ *
+ * This is a sanity check to avoid lazy-lookup failures when libhwloc
+ * is loaded within a plugin, and later tries to load its own plugins.
+ * This may fail (and abort the program) if libhwloc symbols are in a
+ * private namespace.
+ *
+ * \return 0 on success.
+ * \return -1 if the plugin cannot be successfully loaded. The caller
+ * plugin init() callback should return a negative error code as well.
+ *
+ * Plugins should call this function in their init() callback to avoid
+ * later crashes if lazy symbol resolution is used by the upper layer that
+ * loaded hwloc (e.g. OpenCL implementations using dlopen with RTLD_LAZY).
+ *
+ * \note The build system must define HWLOC_INSIDE_PLUGIN if and only if
+ * building the caller as a plugin.
+ *
+ * \note This function should remain inline so plugins can call it even
+ * when they cannot find libhwloc symbols.
+ */
+static __hwloc_inline int
+hwloc_plugin_check_namespace(const char *pluginname __hwloc_attribute_unused, const char *symbol __hwloc_attribute_unused)
+{
+#ifdef HWLOC_INSIDE_PLUGIN
+ lt_dlhandle handle;
+ void *sym;
+ handle = lt_dlopen(NULL);
+ if (!handle)
+ /* cannot check, assume things will work */
+ return 0;
+ sym = lt_dlsym(handle, symbol);
+ lt_dlclose(handle);
+ if (!sym) {
+ static int verboseenv_checked = 0;
+ static int verboseenv_value = 0;
+ if (!verboseenv_checked) {
+ const char *verboseenv = getenv("HWLOC_PLUGINS_VERBOSE");
+ verboseenv_value = verboseenv ? atoi(verboseenv) : 0; /* getenv() may return NULL */
+ verboseenv_checked = 1;
+ }
+ if (verboseenv_value)
+ fprintf(stderr, "Plugin `%s' disabling itself because it cannot find the `%s' core symbol.\n",
+ pluginname, symbol);
+ return -1;
+ }
+#endif /* HWLOC_INSIDE_PLUGIN */
+ return 0;
+}
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_components_pci_funcs Components and Plugins: PCI functions to be used by components
+ * @{
+ */
+
+/** \brief Insert a list of PCI devices and bridges in the backend topology.
+ *
+ * Insert a list of objects (either PCI device or bridges) starting at first_obj
+ * (linked by next_sibling in the topology, and ending with NULL).
+ * Objects are placed under the right bridges, and the remaining upstream bridges
+ * are then inserted in the topology by calling the get_obj_cpuset() callback to
+ * find their locality.
+ */
+HWLOC_DECLSPEC int hwloc_insert_pci_device_list(struct hwloc_backend *backend, struct hwloc_obj *first_obj);
+
+/** \brief Return the offset of the given capability in the PCI config space buffer
+ *
+ * This function requires a 256-bytes config space. Unknown/unavailable bytes should be set to 0xff.
+ */
+HWLOC_DECLSPEC unsigned hwloc_pci_find_cap(const unsigned char *config, unsigned cap);
+
+/** \brief Fill linkspeed by reading the PCI config space where PCI_CAP_ID_EXP is at position offset.
+ *
+ * Needs 20 bytes of EXP capability block starting at offset in the config space
+ * for registers up to link status.
+ */
+HWLOC_DECLSPEC int hwloc_pci_find_linkspeed(const unsigned char *config, unsigned offset, float *linkspeed);
+
+/** \brief Modify the PCI device object into a bridge and fill its attribute if a bridge is found in the PCI config space.
+ *
+ * This function requires 64 bytes of common configuration header at the beginning of config.
+ */
+HWLOC_DECLSPEC int hwloc_pci_prepare_bridge(hwloc_obj_t obj, const unsigned char *config);
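A heavily hedged sketch of how these helpers might be fed (the sysfs path is illustrative, 0x10 is the standard PCI Express capability ID, and a zero return from hwloc_pci_find_cap() is assumed to mean the capability was not found):

    #include <stdio.h>
    #include <string.h>

    /* Sketch: read one device's config space from sysfs and query its link speed. */
    static int example_linkspeed(float *linkspeed)
    {
        unsigned char config[256];
        unsigned offset;
        FILE *f = fopen("/sys/bus/pci/devices/0000:00:00.0/config", "r");
        if (!f)
            return -1;
        memset(config, 0xff, sizeof(config));   /* unknown/unavailable bytes must be 0xff */
        (void) fread(config, 1, sizeof(config), f);
        fclose(f);
        offset = hwloc_pci_find_cap(config, 0x10 /* PCI_CAP_ID_EXP */);
        if (!offset)
            return -1;
        return hwloc_pci_find_linkspeed(config, offset, linkspeed);
    }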
+
+/** @} */
+
+
+
+
+#endif /* HWLOC_PLUGINS_H */
diff --git a/ext/hwloc/include/hwloc/rename.h b/ext/hwloc/include/hwloc/rename.h
new file mode 100644
index 0000000..2684e71
--- /dev/null
+++ b/ext/hwloc/include/hwloc/rename.h
@@ -0,0 +1,651 @@
+/*
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * Copyright © 2010-2015 Inria. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#ifndef HWLOC_RENAME_H
+#define HWLOC_RENAME_H
+
+#include <hwloc/autogen/config.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Only enact these defines if we're actually renaming the symbols
+ (i.e., avoid trying to have no-op defines if we're *not*
+ renaming). */
+
+#if HWLOC_SYM_TRANSFORM
+
+/* Use a preprocessor two-step in order to get the prefixing right.
+ Make 2 macros: HWLOC_NAME and HWLOC_NAME_CAPS for renaming
+ things. */
+
+#define HWLOC_MUNGE_NAME(a, b) HWLOC_MUNGE_NAME2(a, b)
+#define HWLOC_MUNGE_NAME2(a, b) a ## b
+#define HWLOC_NAME(name) HWLOC_MUNGE_NAME(HWLOC_SYM_PREFIX, hwloc_ ## name)
+#define HWLOC_NAME_CAPS(name) HWLOC_MUNGE_NAME(HWLOC_SYM_PREFIX_CAPS, hwloc_ ## name)
+
+/* Now define all the "real" names to be the prefixed names. This
+ allows us to use the real names throughout the code base (i.e.,
+ "hwloc_<foo>"); the preprocessor will adjust to have the prefixed
+ name under the covers. */
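For instance, assuming HWLOC_SYM_PREFIX were a hypothetical foo_ and HWLOC_SYM_PREFIX_CAPS the matching FOO_, the two-step expansion works out as follows:

    /* Illustration only:
     *   HWLOC_NAME(topology_init)  ->  foo_hwloc_topology_init
     *   HWLOC_NAME_CAPS(OBJ_PU)    ->  FOO_hwloc_OBJ_PU
     * so code written against the plain hwloc_ names transparently picks up
     * the prefix through the defines below. */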
+
+/* Names from hwloc.h */
+
+#define hwloc_get_api_version HWLOC_NAME(get_api_version)
+
+#define hwloc_topology HWLOC_NAME(topology)
+#define hwloc_topology_t HWLOC_NAME(topology_t)
+
+#define hwloc_cpuset_t HWLOC_NAME(cpuset_t)
+#define hwloc_const_cpuset_t HWLOC_NAME(const_cpuset_t)
+#define hwloc_nodeset_t HWLOC_NAME(nodeset_t)
+#define hwloc_const_nodeset_t HWLOC_NAME(const_nodeset_t)
+
+#define HWLOC_OBJ_SYSTEM HWLOC_NAME_CAPS(OBJ_SYSTEM)
+#define HWLOC_OBJ_MACHINE HWLOC_NAME_CAPS(OBJ_MACHINE)
+#define HWLOC_OBJ_NUMANODE HWLOC_NAME_CAPS(OBJ_NUMANODE)
+#define HWLOC_OBJ_PACKAGE HWLOC_NAME_CAPS(OBJ_PACKAGE)
+#define HWLOC_OBJ_CACHE HWLOC_NAME_CAPS(OBJ_CACHE)
+#define HWLOC_OBJ_CORE HWLOC_NAME_CAPS(OBJ_CORE)
+#define HWLOC_OBJ_PU HWLOC_NAME_CAPS(OBJ_PU)
+#define HWLOC_OBJ_MISC HWLOC_NAME_CAPS(OBJ_MISC)
+#define HWLOC_OBJ_GROUP HWLOC_NAME_CAPS(OBJ_GROUP)
+#define HWLOC_OBJ_BRIDGE HWLOC_NAME_CAPS(OBJ_BRIDGE)
+#define HWLOC_OBJ_PCI_DEVICE HWLOC_NAME_CAPS(OBJ_PCI_DEVICE)
+#define HWLOC_OBJ_OS_DEVICE HWLOC_NAME_CAPS(OBJ_OS_DEVICE)
+#define HWLOC_OBJ_TYPE_MAX HWLOC_NAME_CAPS(OBJ_TYPE_MAX)
+#define hwloc_obj_type_t HWLOC_NAME(obj_type_t)
+
+#define hwloc_obj_cache_type_e HWLOC_NAME(obj_cache_type_e)
+#define hwloc_obj_cache_type_t HWLOC_NAME(obj_cache_type_t)
+#define HWLOC_OBJ_CACHE_UNIFIED HWLOC_NAME_CAPS(OBJ_CACHE_UNIFIED)
+#define HWLOC_OBJ_CACHE_DATA HWLOC_NAME_CAPS(OBJ_CACHE_DATA)
+#define HWLOC_OBJ_CACHE_INSTRUCTION HWLOC_NAME_CAPS(OBJ_CACHE_INSTRUCTION)
+
+#define hwloc_obj_bridge_type_e HWLOC_NAME(obj_bridge_type_e)
+#define hwloc_obj_bridge_type_t HWLOC_NAME(obj_bridge_type_t)
+#define HWLOC_OBJ_BRIDGE_HOST HWLOC_NAME_CAPS(OBJ_BRIDGE_HOST)
+#define HWLOC_OBJ_BRIDGE_PCI HWLOC_NAME_CAPS(OBJ_BRIDGE_PCI)
+
+#define hwloc_obj_osdev_type_e HWLOC_NAME(obj_osdev_type_e)
+#define hwloc_obj_osdev_type_t HWLOC_NAME(obj_osdev_type_t)
+#define HWLOC_OBJ_OSDEV_BLOCK HWLOC_NAME_CAPS(OBJ_OSDEV_BLOCK)
+#define HWLOC_OBJ_OSDEV_GPU HWLOC_NAME_CAPS(OBJ_OSDEV_GPU)
+#define HWLOC_OBJ_OSDEV_NETWORK HWLOC_NAME_CAPS(OBJ_OSDEV_NETWORK)
+#define HWLOC_OBJ_OSDEV_OPENFABRICS HWLOC_NAME_CAPS(OBJ_OSDEV_OPENFABRICS)
+#define HWLOC_OBJ_OSDEV_DMA HWLOC_NAME_CAPS(OBJ_OSDEV_DMA)
+#define HWLOC_OBJ_OSDEV_COPROC HWLOC_NAME_CAPS(OBJ_OSDEV_COPROC)
+
+#define hwloc_compare_types HWLOC_NAME(compare_types)
+
+#define hwloc_compare_types_e HWLOC_NAME(compare_types_e)
+#define HWLOC_TYPE_UNORDERED HWLOC_NAME_CAPS(TYPE_UNORDERED)
+
+#define hwloc_obj_memory_s HWLOC_NAME(obj_memory_s)
+#define hwloc_obj_memory_page_type_s HWLOC_NAME(obj_memory_page_type_s)
+
+#define hwloc_obj HWLOC_NAME(obj)
+#define hwloc_obj_t HWLOC_NAME(obj_t)
+
+#define hwloc_distances_s HWLOC_NAME(distances_s)
+#define hwloc_obj_info_s HWLOC_NAME(obj_info_s)
+
+#define hwloc_obj_attr_u HWLOC_NAME(obj_attr_u)
+#define hwloc_cache_attr_s HWLOC_NAME(cache_attr_s)
+#define hwloc_group_attr_s HWLOC_NAME(group_attr_s)
+#define hwloc_pcidev_attr_s HWLOC_NAME(pcidev_attr_s)
+#define hwloc_bridge_attr_s HWLOC_NAME(bridge_attr_s)
+#define hwloc_osdev_attr_s HWLOC_NAME(osdev_attr_s)
+
+#define hwloc_topology_init HWLOC_NAME(topology_init)
+#define hwloc_topology_load HWLOC_NAME(topology_load)
+#define hwloc_topology_destroy HWLOC_NAME(topology_destroy)
+#define hwloc_topology_dup HWLOC_NAME(topology_dup)
+#define hwloc_topology_check HWLOC_NAME(topology_check)
+
+#define hwloc_topology_flags_e HWLOC_NAME(topology_flags_e)
+
+#define HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM HWLOC_NAME_CAPS(TOPOLOGY_FLAG_WHOLE_SYSTEM)
+#define HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IS_THISSYSTEM)
+#define HWLOC_TOPOLOGY_FLAG_IO_DEVICES HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IO_DEVICES)
+#define HWLOC_TOPOLOGY_FLAG_IO_BRIDGES HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IO_BRIDGES)
+#define HWLOC_TOPOLOGY_FLAG_WHOLE_IO HWLOC_NAME_CAPS(TOPOLOGY_FLAG_WHOLE_IO)
+#define HWLOC_TOPOLOGY_FLAG_ICACHES HWLOC_NAME_CAPS(TOPOLOGY_FLAG_ICACHES)
+
+#define hwloc_topology_set_pid HWLOC_NAME(topology_set_pid)
+#define hwloc_topology_set_synthetic HWLOC_NAME(topology_set_synthetic)
+#define hwloc_topology_set_xml HWLOC_NAME(topology_set_xml)
+#define hwloc_topology_set_xmlbuffer HWLOC_NAME(topology_set_xmlbuffer)
+
+#define hwloc_topology_set_flags HWLOC_NAME(topology_set_flags)
+#define hwloc_topology_is_thissystem HWLOC_NAME(topology_is_thissystem)
+#define hwloc_topology_get_flags HWLOC_NAME(topology_get_flags)
+#define hwloc_topology_discovery_support HWLOC_NAME(topology_discovery_support)
+#define hwloc_topology_cpubind_support HWLOC_NAME(topology_cpubind_support)
+#define hwloc_topology_membind_support HWLOC_NAME(topology_membind_support)
+#define hwloc_topology_support HWLOC_NAME(topology_support)
+#define hwloc_topology_get_support HWLOC_NAME(topology_get_support)
+#define hwloc_topology_ignore_type HWLOC_NAME(topology_ignore_type)
+#define hwloc_topology_ignore_type_keep_structure HWLOC_NAME(topology_ignore_type_keep_structure)
+#define hwloc_topology_ignore_all_keep_structure HWLOC_NAME(topology_ignore_all_keep_structure)
+#define hwloc_topology_set_distance_matrix HWLOC_NAME(topology_set_distance_matrix)
+#define hwloc_topology_set_userdata HWLOC_NAME(topology_set_userdata)
+#define hwloc_topology_get_userdata HWLOC_NAME(topology_get_userdata)
+
+#define hwloc_restrict_flags_e HWLOC_NAME(restrict_flags_e)
+#define HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_DISTANCES)
+#define HWLOC_RESTRICT_FLAG_ADAPT_MISC HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_MISC)
+#define HWLOC_RESTRICT_FLAG_ADAPT_IO HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_IO)
+#define hwloc_topology_restrict HWLOC_NAME(topology_restrict)
+
+#define hwloc_topology_insert_misc_object HWLOC_NAME(topology_insert_misc_object)
+#define hwloc_topology_alloc_group_object HWLOC_NAME(topology_alloc_group_object)
+#define hwloc_topology_insert_group_object HWLOC_NAME(topology_insert_group_object)
+#define hwloc_obj_add_other_obj_sets HWLOC_NAME(obj_add_other_obj_sets)
+
+#define hwloc_topology_get_depth HWLOC_NAME(topology_get_depth)
+#define hwloc_get_type_depth HWLOC_NAME(get_type_depth)
+
+#define hwloc_get_type_depth_e HWLOC_NAME(get_type_depth_e)
+#define HWLOC_TYPE_DEPTH_UNKNOWN HWLOC_NAME_CAPS(TYPE_DEPTH_UNKNOWN)
+#define HWLOC_TYPE_DEPTH_MULTIPLE HWLOC_NAME_CAPS(TYPE_DEPTH_MULTIPLE)
+#define HWLOC_TYPE_DEPTH_BRIDGE HWLOC_NAME_CAPS(TYPE_DEPTH_BRIDGE)
+#define HWLOC_TYPE_DEPTH_PCI_DEVICE HWLOC_NAME_CAPS(TYPE_DEPTH_PCI_DEVICE)
+#define HWLOC_TYPE_DEPTH_OS_DEVICE HWLOC_NAME_CAPS(TYPE_DEPTH_OS_DEVICE)
+#define HWLOC_TYPE_DEPTH_MISC HWLOC_NAME_CAPS(TYPE_DEPTH_MISC)
+
+#define hwloc_get_depth_type HWLOC_NAME(get_depth_type)
+#define hwloc_get_nbobjs_by_depth HWLOC_NAME(get_nbobjs_by_depth)
+#define hwloc_get_nbobjs_by_type HWLOC_NAME(get_nbobjs_by_type)
+
+#define hwloc_get_obj_by_depth HWLOC_NAME(get_obj_by_depth )
+#define hwloc_get_obj_by_type HWLOC_NAME(get_obj_by_type )
+
+#define hwloc_obj_type_string HWLOC_NAME(obj_type_string )
+#define hwloc_obj_type_sscanf HWLOC_NAME(obj_type_sscanf)
+#define hwloc_obj_type_snprintf HWLOC_NAME(obj_type_snprintf )
+#define hwloc_obj_attr_snprintf HWLOC_NAME(obj_attr_snprintf )
+#define hwloc_obj_cpuset_snprintf HWLOC_NAME(obj_cpuset_snprintf)
+#define hwloc_obj_get_info_by_name HWLOC_NAME(obj_get_info_by_name)
+#define hwloc_obj_add_info HWLOC_NAME(obj_add_info)
+
+#define HWLOC_CPUBIND_PROCESS HWLOC_NAME_CAPS(CPUBIND_PROCESS)
+#define HWLOC_CPUBIND_THREAD HWLOC_NAME_CAPS(CPUBIND_THREAD)
+#define HWLOC_CPUBIND_STRICT HWLOC_NAME_CAPS(CPUBIND_STRICT)
+#define HWLOC_CPUBIND_NOMEMBIND HWLOC_NAME_CAPS(CPUBIND_NOMEMBIND)
+
+#define hwloc_cpubind_flags_t HWLOC_NAME(cpubind_flags_t)
+
+#define hwloc_set_cpubind HWLOC_NAME(set_cpubind)
+#define hwloc_get_cpubind HWLOC_NAME(get_cpubind)
+#define hwloc_set_proc_cpubind HWLOC_NAME(set_proc_cpubind)
+#define hwloc_get_proc_cpubind HWLOC_NAME(get_proc_cpubind)
+#define hwloc_set_thread_cpubind HWLOC_NAME(set_thread_cpubind)
+#define hwloc_get_thread_cpubind HWLOC_NAME(get_thread_cpubind)
+
+#define hwloc_get_last_cpu_location HWLOC_NAME(get_last_cpu_location)
+#define hwloc_get_proc_last_cpu_location HWLOC_NAME(get_proc_last_cpu_location)
+
+#define HWLOC_MEMBIND_DEFAULT HWLOC_NAME_CAPS(MEMBIND_DEFAULT)
+#define HWLOC_MEMBIND_FIRSTTOUCH HWLOC_NAME_CAPS(MEMBIND_FIRSTTOUCH)
+#define HWLOC_MEMBIND_BIND HWLOC_NAME_CAPS(MEMBIND_BIND)
+#define HWLOC_MEMBIND_INTERLEAVE HWLOC_NAME_CAPS(MEMBIND_INTERLEAVE)
+#define HWLOC_MEMBIND_REPLICATE HWLOC_NAME_CAPS(MEMBIND_REPLICATE)
+#define HWLOC_MEMBIND_NEXTTOUCH HWLOC_NAME_CAPS(MEMBIND_NEXTTOUCH)
+#define HWLOC_MEMBIND_MIXED HWLOC_NAME_CAPS(MEMBIND_MIXED)
+
+#define hwloc_membind_policy_t HWLOC_NAME(membind_policy_t)
+
+#define HWLOC_MEMBIND_PROCESS HWLOC_NAME_CAPS(MEMBIND_PROCESS)
+#define HWLOC_MEMBIND_THREAD HWLOC_NAME_CAPS(MEMBIND_THREAD)
+#define HWLOC_MEMBIND_STRICT HWLOC_NAME_CAPS(MEMBIND_STRICT)
+#define HWLOC_MEMBIND_MIGRATE HWLOC_NAME_CAPS(MEMBIND_MIGRATE)
+#define HWLOC_MEMBIND_NOCPUBIND HWLOC_NAME_CAPS(MEMBIND_NOCPUBIND)
+
+#define hwloc_membind_flags_t HWLOC_NAME(membind_flags_t)
+
+#define hwloc_set_membind_nodeset HWLOC_NAME(set_membind_nodeset)
+#define hwloc_set_membind HWLOC_NAME(set_membind)
+#define hwloc_get_membind_nodeset HWLOC_NAME(get_membind_nodeset)
+#define hwloc_get_membind HWLOC_NAME(get_membind)
+#define hwloc_set_proc_membind_nodeset HWLOC_NAME(set_proc_membind_nodeset)
+#define hwloc_set_proc_membind HWLOC_NAME(set_proc_membind)
+#define hwloc_get_proc_membind_nodeset HWLOC_NAME(get_proc_membind_nodeset)
+#define hwloc_get_proc_membind HWLOC_NAME(get_proc_membind)
+#define hwloc_set_area_membind_nodeset HWLOC_NAME(set_area_membind_nodeset)
+#define hwloc_set_area_membind HWLOC_NAME(set_area_membind)
+#define hwloc_get_area_membind_nodeset HWLOC_NAME(get_area_membind_nodeset)
+#define hwloc_get_area_membind HWLOC_NAME(get_area_membind)
+#define hwloc_alloc_membind_nodeset HWLOC_NAME(alloc_membind_nodeset)
+#define hwloc_alloc_membind HWLOC_NAME(alloc_membind)
+#define hwloc_alloc HWLOC_NAME(alloc)
+#define hwloc_free HWLOC_NAME(free)
+
+#define hwloc_get_non_io_ancestor_obj HWLOC_NAME(get_non_io_ancestor_obj)
+#define hwloc_get_next_pcidev HWLOC_NAME(get_next_pcidev)
+#define hwloc_get_pcidev_by_busid HWLOC_NAME(get_pcidev_by_busid)
+#define hwloc_get_pcidev_by_busidstring HWLOC_NAME(get_pcidev_by_busidstring)
+#define hwloc_get_next_osdev HWLOC_NAME(get_next_osdev)
+#define hwloc_get_next_bridge HWLOC_NAME(get_next_bridge)
+#define hwloc_bridge_covers_pcibus HWLOC_NAME(bridge_covers_pcibus)
+#define hwloc_get_hostbridge_by_pcibus HWLOC_NAME(get_hostbridge_by_pcibus)
+
+/* hwloc/bitmap.h */
+
+#define hwloc_bitmap_s HWLOC_NAME(bitmap_s)
+#define hwloc_bitmap_t HWLOC_NAME(bitmap_t)
+#define hwloc_const_bitmap_t HWLOC_NAME(const_bitmap_t)
+
+#define hwloc_bitmap_alloc HWLOC_NAME(bitmap_alloc)
+#define hwloc_bitmap_alloc_full HWLOC_NAME(bitmap_alloc_full)
+#define hwloc_bitmap_free HWLOC_NAME(bitmap_free)
+#define hwloc_bitmap_dup HWLOC_NAME(bitmap_dup)
+#define hwloc_bitmap_copy HWLOC_NAME(bitmap_copy)
+#define hwloc_bitmap_snprintf HWLOC_NAME(bitmap_snprintf)
+#define hwloc_bitmap_asprintf HWLOC_NAME(bitmap_asprintf)
+#define hwloc_bitmap_sscanf HWLOC_NAME(bitmap_sscanf)
+#define hwloc_bitmap_list_snprintf HWLOC_NAME(bitmap_list_snprintf)
+#define hwloc_bitmap_list_asprintf HWLOC_NAME(bitmap_list_asprintf)
+#define hwloc_bitmap_list_sscanf HWLOC_NAME(bitmap_list_sscanf)
+#define hwloc_bitmap_taskset_snprintf HWLOC_NAME(bitmap_taskset_snprintf)
+#define hwloc_bitmap_taskset_asprintf HWLOC_NAME(bitmap_taskset_asprintf)
+#define hwloc_bitmap_taskset_sscanf HWLOC_NAME(bitmap_taskset_sscanf)
+#define hwloc_bitmap_zero HWLOC_NAME(bitmap_zero)
+#define hwloc_bitmap_fill HWLOC_NAME(bitmap_fill)
+#define hwloc_bitmap_from_ulong HWLOC_NAME(bitmap_from_ulong)
+
+#define hwloc_bitmap_from_ith_ulong HWLOC_NAME(bitmap_from_ith_ulong)
+#define hwloc_bitmap_to_ulong HWLOC_NAME(bitmap_to_ulong)
+#define hwloc_bitmap_to_ith_ulong HWLOC_NAME(bitmap_to_ith_ulong)
+#define hwloc_bitmap_only HWLOC_NAME(bitmap_only)
+#define hwloc_bitmap_allbut HWLOC_NAME(bitmap_allbut)
+#define hwloc_bitmap_set HWLOC_NAME(bitmap_set)
+#define hwloc_bitmap_set_range HWLOC_NAME(bitmap_set_range)
+#define hwloc_bitmap_set_ith_ulong HWLOC_NAME(bitmap_set_ith_ulong)
+#define hwloc_bitmap_clr HWLOC_NAME(bitmap_clr)
+#define hwloc_bitmap_clr_range HWLOC_NAME(bitmap_clr_range)
+#define hwloc_bitmap_isset HWLOC_NAME(bitmap_isset)
+#define hwloc_bitmap_iszero HWLOC_NAME(bitmap_iszero)
+#define hwloc_bitmap_isfull HWLOC_NAME(bitmap_isfull)
+#define hwloc_bitmap_isequal HWLOC_NAME(bitmap_isequal)
+#define hwloc_bitmap_intersects HWLOC_NAME(bitmap_intersects)
+#define hwloc_bitmap_isincluded HWLOC_NAME(bitmap_isincluded)
+#define hwloc_bitmap_or HWLOC_NAME(bitmap_or)
+#define hwloc_bitmap_and HWLOC_NAME(bitmap_and)
+#define hwloc_bitmap_andnot HWLOC_NAME(bitmap_andnot)
+#define hwloc_bitmap_xor HWLOC_NAME(bitmap_xor)
+#define hwloc_bitmap_not HWLOC_NAME(bitmap_not)
+#define hwloc_bitmap_first HWLOC_NAME(bitmap_first)
+#define hwloc_bitmap_last HWLOC_NAME(bitmap_last)
+#define hwloc_bitmap_next HWLOC_NAME(bitmap_next)
+#define hwloc_bitmap_singlify HWLOC_NAME(bitmap_singlify)
+#define hwloc_bitmap_compare_first HWLOC_NAME(bitmap_compare_first)
+#define hwloc_bitmap_compare HWLOC_NAME(bitmap_compare)
+#define hwloc_bitmap_weight HWLOC_NAME(bitmap_weight)
+
+/* hwloc/helper.h */
+
+#define hwloc_get_type_or_below_depth HWLOC_NAME(get_type_or_below_depth)
+#define hwloc_get_type_or_above_depth HWLOC_NAME(get_type_or_above_depth)
+#define hwloc_get_root_obj HWLOC_NAME(get_root_obj)
+#define hwloc_get_ancestor_obj_by_depth HWLOC_NAME(get_ancestor_obj_by_depth)
+#define hwloc_get_ancestor_obj_by_type HWLOC_NAME(get_ancestor_obj_by_type)
+#define hwloc_get_next_obj_by_depth HWLOC_NAME(get_next_obj_by_depth)
+#define hwloc_get_next_obj_by_type HWLOC_NAME(get_next_obj_by_type)
+#define hwloc_get_pu_obj_by_os_index HWLOC_NAME(get_pu_obj_by_os_index)
+#define hwloc_get_numanode_obj_by_os_index HWLOC_NAME(get_numanode_obj_by_os_index)
+#define hwloc_get_next_child HWLOC_NAME(get_next_child)
+#define hwloc_get_common_ancestor_obj HWLOC_NAME(get_common_ancestor_obj)
+#define hwloc_obj_is_in_subtree HWLOC_NAME(obj_is_in_subtree)
+#define hwloc_get_first_largest_obj_inside_cpuset HWLOC_NAME(get_first_largest_obj_inside_cpuset)
+#define hwloc_get_largest_objs_inside_cpuset HWLOC_NAME(get_largest_objs_inside_cpuset)
+#define hwloc_get_next_obj_inside_cpuset_by_depth HWLOC_NAME(get_next_obj_inside_cpuset_by_depth)
+#define hwloc_get_next_obj_inside_cpuset_by_type HWLOC_NAME(get_next_obj_inside_cpuset_by_type)
+#define hwloc_get_obj_inside_cpuset_by_depth HWLOC_NAME(get_obj_inside_cpuset_by_depth)
+#define hwloc_get_obj_inside_cpuset_by_type HWLOC_NAME(get_obj_inside_cpuset_by_type)
+#define hwloc_get_nbobjs_inside_cpuset_by_depth HWLOC_NAME(get_nbobjs_inside_cpuset_by_depth)
+#define hwloc_get_nbobjs_inside_cpuset_by_type HWLOC_NAME(get_nbobjs_inside_cpuset_by_type)
+#define hwloc_get_obj_index_inside_cpuset HWLOC_NAME(get_obj_index_inside_cpuset)
+#define hwloc_get_child_covering_cpuset HWLOC_NAME(get_child_covering_cpuset)
+#define hwloc_get_obj_covering_cpuset HWLOC_NAME(get_obj_covering_cpuset)
+#define hwloc_get_next_obj_covering_cpuset_by_depth HWLOC_NAME(get_next_obj_covering_cpuset_by_depth)
+#define hwloc_get_next_obj_covering_cpuset_by_type HWLOC_NAME(get_next_obj_covering_cpuset_by_type)
+#define hwloc_get_cache_type_depth HWLOC_NAME(get_cache_type_depth)
+#define hwloc_get_cache_covering_cpuset HWLOC_NAME(get_cache_covering_cpuset)
+#define hwloc_get_shared_cache_covering_obj HWLOC_NAME(get_shared_cache_covering_obj)
+#define hwloc_get_closest_objs HWLOC_NAME(get_closest_objs)
+#define hwloc_get_obj_below_by_type HWLOC_NAME(get_obj_below_by_type)
+#define hwloc_get_obj_below_array_by_type HWLOC_NAME(get_obj_below_array_by_type)
+#define hwloc_distrib_flags_e HWLOC_NAME(distrib_flags_e)
+#define HWLOC_DISTRIB_FLAG_REVERSE HWLOC_NAME_CAPS(DISTRIB_FLAG_REVERSE)
+#define hwloc_distrib HWLOC_NAME(distrib)
+#define hwloc_alloc_membind_policy HWLOC_NAME(alloc_membind_policy)
+#define hwloc_alloc_membind_policy_nodeset HWLOC_NAME(alloc_membind_policy_nodeset)
+#define hwloc_topology_get_complete_cpuset HWLOC_NAME(topology_get_complete_cpuset)
+#define hwloc_topology_get_topology_cpuset HWLOC_NAME(topology_get_topology_cpuset)
+#define hwloc_topology_get_allowed_cpuset HWLOC_NAME(topology_get_allowed_cpuset)
+#define hwloc_topology_get_complete_nodeset HWLOC_NAME(topology_get_complete_nodeset)
+#define hwloc_topology_get_topology_nodeset HWLOC_NAME(topology_get_topology_nodeset)
+#define hwloc_topology_get_allowed_nodeset HWLOC_NAME(topology_get_allowed_nodeset)
+#define hwloc_cpuset_to_nodeset HWLOC_NAME(cpuset_to_nodeset)
+#define hwloc_cpuset_to_nodeset_strict HWLOC_NAME(cpuset_to_nodeset_strict)
+#define hwloc_cpuset_from_nodeset HWLOC_NAME(cpuset_from_nodeset)
+#define hwloc_cpuset_from_nodeset_strict HWLOC_NAME(cpuset_from_nodeset_strict)
+#define hwloc_get_whole_distance_matrix_by_depth HWLOC_NAME(get_whole_distance_matrix_by_depth)
+#define hwloc_get_whole_distance_matrix_by_type HWLOC_NAME(get_whole_distance_matrix_by_type)
+#define hwloc_get_distance_matrix_covering_obj_by_depth HWLOC_NAME(get_distance_matrix_covering_obj_by_depth)
+#define hwloc_get_latency HWLOC_NAME(get_latency)
+
+/* export.h */
+
+#define hwloc_topology_export_xml HWLOC_NAME(topology_export_xml)
+#define hwloc_topology_export_xmlbuffer HWLOC_NAME(topology_export_xmlbuffer)
+#define hwloc_free_xmlbuffer HWLOC_NAME(free_xmlbuffer)
+#define hwloc_topology_set_userdata_export_callback HWLOC_NAME(topology_set_userdata_export_callback)
+#define hwloc_export_obj_userdata HWLOC_NAME(export_obj_userdata)
+#define hwloc_export_obj_userdata_base64 HWLOC_NAME(export_obj_userdata_base64)
+#define hwloc_topology_set_userdata_import_callback HWLOC_NAME(topology_set_userdata_import_callback)
+
+#define hwloc_topology_export_synthetic_flags_e HWLOC_NAME(topology_export_synthetic_flags_e)
+#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES)
+#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)
+#define hwloc_topology_export_synthetic HWLOC_NAME(topology_export_synthetic)
+
+/* diff.h */
+
+#define hwloc_topology_diff_obj_attr_type_e HWLOC_NAME(topology_diff_obj_attr_type_e)
+#define hwloc_topology_diff_obj_attr_type_t HWLOC_NAME(topology_diff_obj_attr_type_t)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_SIZE)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_NAME)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_INFO)
+#define hwloc_topology_diff_obj_attr_u HWLOC_NAME(topology_diff_obj_attr_u)
+#define hwloc_topology_diff_obj_attr_generic_s HWLOC_NAME(topology_diff_obj_attr_generic_s)
+#define hwloc_topology_diff_obj_attr_uint64_s HWLOC_NAME(topology_diff_obj_attr_uint64_s)
+#define hwloc_topology_diff_obj_attr_string_s HWLOC_NAME(topology_diff_obj_attr_string_s)
+#define hwloc_topology_diff_type_e HWLOC_NAME(topology_diff_type_e)
+#define hwloc_topology_diff_type_t HWLOC_NAME(topology_diff_type_t)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR)
+#define HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX HWLOC_NAME_CAPS(TOPOLOGY_DIFF_TOO_COMPLEX)
+#define hwloc_topology_diff_u HWLOC_NAME(topology_diff_u)
+#define hwloc_topology_diff_t HWLOC_NAME(topology_diff_t)
+#define hwloc_topology_diff_generic_s HWLOC_NAME(topology_diff_generic_s)
+#define hwloc_topology_diff_obj_attr_s HWLOC_NAME(topology_diff_obj_attr_s)
+#define hwloc_topology_diff_too_complex_s HWLOC_NAME(topology_diff_too_complex_s)
+#define hwloc_topology_diff_build HWLOC_NAME(topology_diff_build)
+#define hwloc_topology_diff_apply_flags_e HWLOC_NAME(topology_diff_apply_flags_e)
+#define HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE HWLOC_NAME_CAPS(TOPOLOGY_DIFF_APPLY_REVERSE)
+#define hwloc_topology_diff_apply HWLOC_NAME(topology_diff_apply)
+#define hwloc_topology_diff_destroy HWLOC_NAME(topology_diff_destroy)
+#define hwloc_topology_diff_load_xml HWLOC_NAME(topology_diff_load_xml)
+#define hwloc_topology_diff_export_xml HWLOC_NAME(topology_diff_export_xml)
+#define hwloc_topology_diff_load_xmlbuffer HWLOC_NAME(topology_diff_load_xmlbuffer)
+#define hwloc_topology_diff_export_xmlbuffer HWLOC_NAME(topology_diff_export_xmlbuffer)
+
+/* glibc-sched.h */
+
+#define hwloc_cpuset_to_glibc_sched_affinity HWLOC_NAME(cpuset_to_glibc_sched_affinity)
+#define hwloc_cpuset_from_glibc_sched_affinity HWLOC_NAME(cpuset_from_glibc_sched_affinity)
+
+/* linux-libnuma.h */
+
+#define hwloc_cpuset_to_linux_libnuma_ulongs HWLOC_NAME(cpuset_to_linux_libnuma_ulongs)
+#define hwloc_nodeset_to_linux_libnuma_ulongs HWLOC_NAME(nodeset_to_linux_libnuma_ulongs)
+#define hwloc_cpuset_from_linux_libnuma_ulongs HWLOC_NAME(cpuset_from_linux_libnuma_ulongs)
+#define hwloc_nodeset_from_linux_libnuma_ulongs HWLOC_NAME(nodeset_from_linux_libnuma_ulongs)
+#define hwloc_cpuset_to_linux_libnuma_bitmask HWLOC_NAME(cpuset_to_linux_libnuma_bitmask)
+#define hwloc_nodeset_to_linux_libnuma_bitmask HWLOC_NAME(nodeset_to_linux_libnuma_bitmask)
+#define hwloc_cpuset_from_linux_libnuma_bitmask HWLOC_NAME(cpuset_from_linux_libnuma_bitmask)
+#define hwloc_nodeset_from_linux_libnuma_bitmask HWLOC_NAME(nodeset_from_linux_libnuma_bitmask)
+
+/* linux.h */
+
+#define hwloc_linux_parse_cpumap_file HWLOC_NAME(linux_parse_cpumap_file)
+#define hwloc_linux_set_tid_cpubind HWLOC_NAME(linux_set_tid_cpubind)
+#define hwloc_linux_get_tid_cpubind HWLOC_NAME(linux_get_tid_cpubind)
+#define hwloc_linux_get_tid_last_cpu_location HWLOC_NAME(linux_get_tid_last_cpu_location)
+
+/* openfabrics-verbs.h */
+
+#define hwloc_ibv_get_device_cpuset HWLOC_NAME(ibv_get_device_cpuset)
+#define hwloc_ibv_get_device_osdev HWLOC_NAME(ibv_get_device_osdev)
+#define hwloc_ibv_get_device_osdev_by_name HWLOC_NAME(ibv_get_device_osdev_by_name)
+
+/* myriexpress.h */
+
+#define hwloc_mx_board_get_device_cpuset HWLOC_NAME(mx_board_get_device_cpuset)
+#define hwloc_mx_endpoint_get_device_cpuset HWLOC_NAME(mx_endpoint_get_device_cpuset)
+
+/* intel-mic.h */
+
+#define hwloc_intel_mic_get_device_cpuset HWLOC_NAME(intel_mic_get_device_cpuset)
+#define hwloc_intel_mic_get_device_osdev_by_index HWLOC_NAME(intel_mic_get_device_osdev_by_index)
+
+/* opencl.h */
+
+#define hwloc_opencl_get_device_cpuset HWLOC_NAME(opencl_get_device_cpuset)
+#define hwloc_opencl_get_device_osdev HWLOC_NAME(opencl_get_device_osdev)
+#define hwloc_opencl_get_device_osdev_by_index HWLOC_NAME(opencl_get_device_osdev_by_index)
+
+/* cuda.h */
+
+#define hwloc_cuda_get_device_pci_ids HWLOC_NAME(cuda_get_device_pci_ids)
+#define hwloc_cuda_get_device_cpuset HWLOC_NAME(cuda_get_device_cpuset)
+#define hwloc_cuda_get_device_pcidev HWLOC_NAME(cuda_get_device_pcidev)
+#define hwloc_cuda_get_device_osdev HWLOC_NAME(cuda_get_device_osdev)
+#define hwloc_cuda_get_device_osdev_by_index HWLOC_NAME(cuda_get_device_osdev_by_index)
+
+/* cudart.h */
+
+#define hwloc_cudart_get_device_pci_ids HWLOC_NAME(cudart_get_device_pci_ids)
+#define hwloc_cudart_get_device_cpuset HWLOC_NAME(cudart_get_device_cpuset)
+#define hwloc_cudart_get_device_pcidev HWLOC_NAME(cudart_get_device_pcidev)
+#define hwloc_cudart_get_device_osdev_by_index HWLOC_NAME(cudart_get_device_osdev_by_index)
+
+/* nvml.h */
+
+#define hwloc_nvml_get_device_cpuset HWLOC_NAME(nvml_get_device_cpuset)
+#define hwloc_nvml_get_device_osdev HWLOC_NAME(nvml_get_device_osdev)
+#define hwloc_nvml_get_device_osdev_by_index HWLOC_NAME(nvml_get_device_osdev_by_index)
+
+/* gl.h */
+
+#define hwloc_gl_get_display_osdev_by_port_device HWLOC_NAME(gl_get_display_osdev_by_port_device)
+#define hwloc_gl_get_display_osdev_by_name HWLOC_NAME(gl_get_display_osdev_by_name)
+#define hwloc_gl_get_display_by_osdev HWLOC_NAME(gl_get_display_by_osdev)
+
+/* hwloc/plugins.h */
+
+#define hwloc_disc_component_type_e HWLOC_NAME(disc_component_type_e)
+#define HWLOC_DISC_COMPONENT_TYPE_CPU HWLOC_NAME_CAPS(DISC_COMPONENT_TYPE_CPU)
+#define HWLOC_DISC_COMPONENT_TYPE_GLOBAL HWLOC_NAME_CAPS(DISC_COMPONENT_TYPE_GLOBAL)
+#define HWLOC_DISC_COMPONENT_TYPE_MISC HWLOC_NAME_CAPS(DISC_COMPONENT_TYPE_MISC)
+#define hwloc_disc_component_type_t HWLOC_NAME(disc_component_type_t)
+#define hwloc_disc_component HWLOC_NAME(disc_component)
+
+#define hwloc_backend HWLOC_NAME(backend)
+#define hwloc_backend_flag_e HWLOC_NAME(backend_flag_e)
+#define HWLOC_BACKEND_FLAG_NEED_LEVELS HWLOC_NAME_CAPS(BACKEND_FLAG_NEED_LEVELS)
+
+#define hwloc_backend_alloc HWLOC_NAME(backend_alloc)
+#define hwloc_backend_enable HWLOC_NAME(backend_enable)
+#define hwloc_backends_get_obj_cpuset HWLOC_NAME(backends_get_obj_cpuset)
+#define hwloc_backends_notify_new_object HWLOC_NAME(backends_notify_new_object)
+
+#define hwloc_component_type_e HWLOC_NAME(component_type_e)
+#define HWLOC_COMPONENT_TYPE_DISC HWLOC_NAME_CAPS(COMPONENT_TYPE_DISC)
+#define HWLOC_COMPONENT_TYPE_XML HWLOC_NAME_CAPS(COMPONENT_TYPE_XML)
+#define hwloc_component_type_t HWLOC_NAME(component_type_t)
+#define hwloc_component HWLOC_NAME(component)
+
+#define hwloc_plugin_check_namespace HWLOC_NAME(plugin_check_namespace)
+
+#define hwloc_insert_object_by_cpuset HWLOC_NAME(insert_object_by_cpuset)
+#define hwloc_report_error_t HWLOC_NAME(report_error_t)
+#define hwloc_report_os_error HWLOC_NAME(report_os_error)
+#define hwloc_hide_errors HWLOC_NAME(hide_errors)
+#define hwloc__insert_object_by_cpuset HWLOC_NAME(_insert_object_by_cpuset)
+#define hwloc_insert_object_by_parent HWLOC_NAME(insert_object_by_parent)
+#define hwloc_alloc_setup_object HWLOC_NAME(alloc_setup_object)
+#define hwloc_obj_add_children_sets HWLOC_NAME(add_children_sets)
+
+#define hwloc_insert_pci_device_list HWLOC_NAME(insert_pci_device_list)
+#define hwloc_pci_find_cap HWLOC_NAME(pci_find_cap)
+#define hwloc_pci_find_linkspeed HWLOC_NAME(pci_find_linkspeed)
+#define hwloc_pci_prepare_bridge HWLOC_NAME(pci_prepare_bridge)
+
+/* hwloc/deprecated.h */
+
+#define hwloc_obj_type_of_string HWLOC_NAME(obj_type_of_string)
+#define hwloc_obj_snprintf HWLOC_NAME(obj_snprintf)
+#define hwloc_distributev HWLOC_NAME(distributev)
+#define hwloc_distribute HWLOC_NAME(distribute)
+#define hwloc_topology_insert_misc_object_by_parent HWLOC_NAME(topology_insert_misc_object_by_parent)
+
+/* private/debug.h */
+
+#define hwloc_debug HWLOC_NAME(debug)
+
+/* private/misc.h */
+
+#define hwloc_snprintf HWLOC_NAME(snprintf)
+#define hwloc_namecoloncmp HWLOC_NAME(namecoloncmp)
+#define hwloc_ffsl_manual HWLOC_NAME(ffsl_manual)
+#define hwloc_ffs32 HWLOC_NAME(ffs32)
+#define hwloc_ffsl_from_ffs32 HWLOC_NAME(ffsl_from_ffs32)
+#define hwloc_flsl_manual HWLOC_NAME(flsl_manual)
+#define hwloc_fls32 HWLOC_NAME(fls32)
+#define hwloc_flsl_from_fls32 HWLOC_NAME(flsl_from_fls32)
+#define hwloc_weight_long HWLOC_NAME(weight_long)
+#define hwloc_strncasecmp HWLOC_NAME(strncasecmp)
+
+/* private/cpuid-x86.h */
+
+#define hwloc_have_x86_cpuid HWLOC_NAME(have_x86_cpuid)
+#define hwloc_x86_cpuid HWLOC_NAME(x86_cpuid)
+
+/* private/xml.h */
+
+#define hwloc__xml_verbose HWLOC_NAME(_xml_verbose)
+
+#define hwloc__xml_import_state_s HWLOC_NAME(_xml_import_state_s)
+#define hwloc__xml_import_state_t HWLOC_NAME(_xml_import_state_t)
+#define hwloc__xml_import_diff HWLOC_NAME(_xml_import_diff)
+#define hwloc_xml_backend_data_s HWLOC_NAME(xml_backend_data_s)
+#define hwloc__xml_export_state_s HWLOC_NAME(_xml_export_state_s)
+#define hwloc__xml_export_state_t HWLOC_NAME(_xml_export_state_t)
+#define hwloc__xml_export_object HWLOC_NAME(_xml_export_object)
+#define hwloc__xml_export_diff HWLOC_NAME(_xml_export_diff)
+
+#define hwloc_xml_callbacks HWLOC_NAME(xml_callbacks)
+#define hwloc_xml_component HWLOC_NAME(xml_component)
+#define hwloc_xml_callbacks_register HWLOC_NAME(xml_callbacks_register)
+#define hwloc_xml_callbacks_reset HWLOC_NAME(xml_callbacks_reset)
+
+/* private/components.h */
+
+#define hwloc_disc_component_force_enable HWLOC_NAME(disc_component_force_enable)
+#define hwloc_disc_components_enable_others HWLOC_NAME(disc_components_instantiate_others)
+
+#define hwloc_backends_disable_all HWLOC_NAME(backends_disable_all)
+#define hwloc_backends_is_thissystem HWLOC_NAME(backends_is_thissystem)
+
+#define hwloc_components_init HWLOC_NAME(components_init)
+#define hwloc_components_destroy_all HWLOC_NAME(components_destroy_all)
+
+/* private/private.h */
+
+#define hwloc_ignore_type_e HWLOC_NAME(ignore_type_e)
+
+#define HWLOC_IGNORE_TYPE_NEVER HWLOC_NAME_CAPS(IGNORE_TYPE_NEVER)
+#define HWLOC_IGNORE_TYPE_KEEP_STRUCTURE HWLOC_NAME_CAPS(IGNORE_TYPE_KEEP_STRUCTURE)
+#define HWLOC_IGNORE_TYPE_ALWAYS HWLOC_NAME_CAPS(IGNORE_TYPE_ALWAYS)
+
+#define hwloc_os_distances_s HWLOC_NAME(os_distances_s)
+
+#define hwloc_xml_imported_distances_s HWLOC_NAME(xml_imported_distances_s)
+
+#define hwloc_alloc_obj_cpusets HWLOC_NAME(alloc_obj_cpusets)
+#define hwloc_setup_pu_level HWLOC_NAME(setup_pu_level)
+#define hwloc_get_sysctlbyname HWLOC_NAME(get_sysctlbyname)
+#define hwloc_get_sysctl HWLOC_NAME(get_sysctl)
+#define hwloc_fallback_nbprocessors HWLOC_NAME(fallback_nbprocessors)
+#define hwloc_connect_children HWLOC_NAME(connect_children)
+#define hwloc_connect_levels HWLOC_NAME(connect_levels)
+
+#define hwloc__object_cpusets_compare_first HWLOC_NAME(_object_cpusets_compare_first)
+#define hwloc__reorder_children HWLOC_NAME(_reorder_children)
+
+#define hwloc_topology_setup_defaults HWLOC_NAME(topology_setup_defaults)
+#define hwloc_topology_clear HWLOC_NAME(topology_clear)
+
+#define hwloc__add_info HWLOC_NAME(_add_info)
+#define hwloc__find_info_slot HWLOC_NAME(_find_info_slot)
+#define hwloc__move_infos HWLOC_NAME(_move_infos)
+#define hwloc__free_infos HWLOC_NAME(_free_infos)
+
+#define hwloc_binding_hooks HWLOC_NAME(binding_hooks)
+#define hwloc_set_native_binding_hooks HWLOC_NAME(set_native_binding_hooks)
+#define hwloc_set_binding_hooks HWLOC_NAME(set_binding_hooks)
+
+#define hwloc_set_linuxfs_hooks HWLOC_NAME(set_linuxfs_hooks)
+#define hwloc_set_bgq_hooks HWLOC_NAME(set_bgq_hooks)
+#define hwloc_set_solaris_hooks HWLOC_NAME(set_solaris_hooks)
+#define hwloc_set_aix_hooks HWLOC_NAME(set_aix_hooks)
+#define hwloc_set_osf_hooks HWLOC_NAME(set_osf_hooks)
+#define hwloc_set_windows_hooks HWLOC_NAME(set_windows_hooks)
+#define hwloc_set_darwin_hooks HWLOC_NAME(set_darwin_hooks)
+#define hwloc_set_freebsd_hooks HWLOC_NAME(set_freebsd_hooks)
+#define hwloc_set_netbsd_hooks HWLOC_NAME(set_netbsd_hooks)
+#define hwloc_set_hpux_hooks HWLOC_NAME(set_hpux_hooks)
+
+#define hwloc_add_uname_info HWLOC_NAME(add_uname_info)
+#define hwloc_free_unlinked_object HWLOC_NAME(free_unlinked_object)
+#define hwloc__duplicate_objects HWLOC_NAME(_duplicate_objects)
+
+#define hwloc_alloc_heap HWLOC_NAME(alloc_heap)
+#define hwloc_alloc_mmap HWLOC_NAME(alloc_mmap)
+#define hwloc_free_heap HWLOC_NAME(free_heap)
+#define hwloc_free_mmap HWLOC_NAME(free_mmap)
+#define hwloc_alloc_or_fail HWLOC_NAME(alloc_or_fail)
+
+#define hwloc_distances_init HWLOC_NAME(distances_init)
+#define hwloc_distances_destroy HWLOC_NAME(distances_destroy)
+#define hwloc_distances_set HWLOC_NAME(distances_set)
+#define hwloc_distances_set_from_env HWLOC_NAME(distances_set_from_env)
+#define hwloc_distances_restrict_os HWLOC_NAME(distances_restrict_os)
+#define hwloc_distances_restrict HWLOC_NAME(distances_restrict)
+#define hwloc_distances_finalize_os HWLOC_NAME(distances_finalize_os)
+#define hwloc_distances_finalize_logical HWLOC_NAME(distances_finalize_logical)
+#define hwloc_clear_object_distances HWLOC_NAME(clear_object_distances)
+#define hwloc_clear_object_distances_one HWLOC_NAME(clear_object_distances_one)
+#define hwloc_group_by_distances HWLOC_NAME(group_by_distances)
+
+#define hwloc_encode_to_base64 HWLOC_NAME(encode_to_base64)
+#define hwloc_decode_from_base64 HWLOC_NAME(decode_from_base64)
+
+#define hwloc_obj_add_info_nodup HWLOC_NAME(obj_add_info_nodup)
+
+#define hwloc_progname HWLOC_NAME(progname)
+
+#define hwloc_bitmap_compare_inclusion HWLOC_NAME(bitmap_compare_inclusion)
+
+/* private/solaris-chiptype.h */
+
+#define hwloc_solaris_get_chip_type HWLOC_NAME(solaris_get_chip_type)
+#define hwloc_solaris_get_chip_model HWLOC_NAME(solaris_get_chip_model)
+
+#endif /* HWLOC_SYM_TRANSFORM */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_RENAME_H */
diff --git a/ext/hwloc/include/numa.h b/ext/hwloc/include/numa.h
new file mode 100644
index 0000000..1dbc137
--- /dev/null
+++ b/ext/hwloc/include/numa.h
@@ -0,0 +1,468 @@
+/* Copyright (C) 2003,2004 Andi Kleen, SuSE Labs.
+
+ libnuma is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; version
+ 2.1.
+
+ libnuma is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should find a copy of v2.1 of the GNU Lesser General Public License
+ somewhere on your Linux system; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _NUMA_H
+#define _NUMA_H 1
+
+/* allow an application to test for the current programming interface: */
+#define LIBNUMA_API_VERSION 2
+
+/* Simple NUMA policy library */
+
+#include <stddef.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdlib.h>
+
+#if defined(__x86_64__) || defined(__i386__)
+#define NUMA_NUM_NODES 128
+#else
+#define NUMA_NUM_NODES 2048
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ unsigned long n[NUMA_NUM_NODES/(sizeof(unsigned long)*8)];
+} nodemask_t;
+
+struct bitmask {
+ unsigned long size; /* number of bits in the map */
+ unsigned long *maskp;
+};
+
+/* operations on struct bitmask */
+int numa_bitmask_isbitset(const struct bitmask *, unsigned int);
+struct bitmask *numa_bitmask_setall(struct bitmask *);
+struct bitmask *numa_bitmask_clearall(struct bitmask *);
+struct bitmask *numa_bitmask_setbit(struct bitmask *, unsigned int);
+struct bitmask *numa_bitmask_clearbit(struct bitmask *, unsigned int);
+unsigned int numa_bitmask_nbytes(struct bitmask *);
+struct bitmask *numa_bitmask_alloc(unsigned int);
+void numa_bitmask_free(struct bitmask *);
+int numa_bitmask_equal(const struct bitmask *, const struct bitmask *);
+void copy_nodemask_to_bitmask(nodemask_t *, struct bitmask *);
+void copy_bitmask_to_nodemask(struct bitmask *, nodemask_t *);
+void copy_bitmask_to_bitmask(struct bitmask *, struct bitmask *);
+
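The bitmask helpers declared above form a small self-contained API. A minimal usage sketch (ordinary libnuma calls, compiled and linked with -lnuma; this sketch is not part of the imported header):

    #include <numa.h>
    #include <stdio.h>

    int main(void)
    {
        struct bitmask *bm = numa_bitmask_alloc(64);   /* 64-bit mask */
        numa_bitmask_clearall(bm);
        numa_bitmask_setbit(bm, 3);
        printf("bit 3 set: %d, storage: %u bytes\n",
               numa_bitmask_isbitset(bm, 3), numa_bitmask_nbytes(bm));
        numa_bitmask_free(bm);
        return 0;
    }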
+/* compatibility for code that used them: */
+
+static inline void nodemask_zero(nodemask_t *mask)
+{
+ struct bitmask tmp;
+
+ tmp.maskp = (unsigned long *)mask;
+ tmp.size = sizeof(nodemask_t) * 8;
+ numa_bitmask_clearall(&tmp);
+}
+
+static inline void nodemask_zero_compat(nodemask_t *mask)
+{
+ struct bitmask tmp;
+
+ tmp.maskp = (unsigned long *)mask;
+ tmp.size = sizeof(nodemask_t) * 8;
+ numa_bitmask_clearall(&tmp);
+}
+
+static inline void nodemask_set_compat(nodemask_t *mask, int node)
+{
+ mask->n[node / (8*sizeof(unsigned long))] |=
+ (1UL<<(node%(8*sizeof(unsigned long))));
+}
+
+static inline void nodemask_clr_compat(nodemask_t *mask, int node)
+{
+ mask->n[node / (8*sizeof(unsigned long))] &=
+ ~(1UL<<(node%(8*sizeof(unsigned long))));
+}
+
+static inline int nodemask_isset_compat(const nodemask_t *mask, int node)
+{
+ if ((unsigned)node >= NUMA_NUM_NODES)
+ return 0;
+ if (mask->n[node / (8*sizeof(unsigned long))] &
+ (1UL<<(node%(8*sizeof(unsigned long)))))
+ return 1;
+ return 0;
+}
+
+static inline int nodemask_equal(const nodemask_t *a, const nodemask_t *b)
+{
+ struct bitmask tmp_a, tmp_b;
+
+ tmp_a.maskp = (unsigned long *)a;
+ tmp_a.size = sizeof(nodemask_t) * 8;
+
+ tmp_b.maskp = (unsigned long *)b;
+ tmp_b.size = sizeof(nodemask_t) * 8;
+
+ return numa_bitmask_equal(&tmp_a, &tmp_b);
+}
+
+static inline int nodemask_equal_compat(const nodemask_t *a, const nodemask_t *b)
+{
+ struct bitmask tmp_a, tmp_b;
+
+ tmp_a.maskp = (unsigned long *)a;
+ tmp_a.size = sizeof(nodemask_t) * 8;
+
+ tmp_b.maskp = (unsigned long *)b;
+ tmp_b.size = sizeof(nodemask_t) * 8;
+
+ return numa_bitmask_equal(&tmp_a, &tmp_b);
+}
+
+/* NUMA support available. If this returns a negative value all other functions
+ in this library are undefined. */
+int numa_available(void);
+
+/* Basic NUMA state */
+
+/* Get max available node */
+int numa_max_node(void);
+int numa_max_possible_node(void);
+/* Return preferred node */
+int numa_preferred(void);
+
+/* Return node size and free memory */
+long long numa_node_size64(int node, long long *freep);
+long numa_node_size(int node, long *freep);
+
+int numa_pagesize(void);
+
+/* Set with all nodes from which the calling process may allocate memory.
+ Only valid after numa_available. */
+extern struct bitmask *numa_all_nodes_ptr;
+
+/* Set with all nodes the kernel has exposed to userspace */
+extern struct bitmask *numa_nodes_ptr;
+
+/* For source compatibility */
+extern nodemask_t numa_all_nodes;
+
+/* Set with all cpus. */
+extern struct bitmask *numa_all_cpus_ptr;
+
+/* Set with no nodes */
+extern struct bitmask *numa_no_nodes_ptr;
+
+/* Source compatibility */
+extern nodemask_t numa_no_nodes;
+
+/* Only run and allocate memory from a specific set of nodes. */
+void numa_bind(struct bitmask *nodes);
+
+/* Set the NUMA node interleaving mask. 0 to turn off interleaving */
+void numa_set_interleave_mask(struct bitmask *nodemask);
+
+/* Return the current interleaving mask */
+struct bitmask *numa_get_interleave_mask(void);
+
+/* allocate a bitmask big enough for all nodes */
+struct bitmask *numa_allocate_nodemask(void);
+
+static inline void numa_free_nodemask(struct bitmask *b)
+{
+ numa_bitmask_free(b);
+}
+
+/* Set the node to preferably allocate memory from for the current task. */
+void numa_set_preferred(int node);
+
+/* Set local memory allocation policy for task */
+void numa_set_localalloc(void);
+
+/* Only allocate memory from the nodes set in mask. 0 to turn off */
+void numa_set_membind(struct bitmask *nodemask);
+
+/* Return current membind */
+struct bitmask *numa_get_membind(void);
+
+/* Return allowed memories [nodes] */
+struct bitmask *numa_get_mems_allowed(void);
+
+int numa_get_interleave_node(void);
+
+/* NUMA memory allocation. These functions always round to page size
+ and are relatively slow. */
+
+/* Alloc memory page interleaved on nodes in mask */
+void *numa_alloc_interleaved_subset(size_t size, struct bitmask *nodemask);
+/* Alloc memory page interleaved on all nodes. */
+void *numa_alloc_interleaved(size_t size);
+/* Alloc memory located on node */
+void *numa_alloc_onnode(size_t size, int node);
+/* Alloc memory on local node */
+void *numa_alloc_local(size_t size);
+/* Allocation with current policy */
+void *numa_alloc(size_t size);
+/* Change the size of a memory area preserving the memory policy */
+void *numa_realloc(void *old_addr, size_t old_size, size_t new_size);
+/* Free memory allocated by the functions above */
+void numa_free(void *mem, size_t size);
+
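To make the allocation interface above concrete, here is a minimal sketch that places a buffer on node 0 and releases it again, guarding on numa_available() as required; it uses only functions declared in this header and is not part of the import:

    #include <numa.h>
    #include <stdio.h>

    int main(void)
    {
        if (numa_available() < 0) {
            fprintf(stderr, "NUMA is not supported here\n");
            return 1;
        }
        size_t size = 1 << 20;                   /* rounded up to whole pages internally */
        void *buf = numa_alloc_onnode(size, 0);  /* memory placed on node 0 */
        if (buf != NULL)
            numa_free(buf, size);                /* size must match the allocation */
        return 0;
    }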
+/* Low level functions, primarily for shared memory. All memory
+ processed by these functions must not have been touched yet. */
+
+/* Interleave a memory area. */
+void numa_interleave_memory(void *mem, size_t size, struct bitmask *mask);
+
+/* Allocate a memory area on a specific node. */
+void numa_tonode_memory(void *start, size_t size, int node);
+
+/* Allocate memory on a mask of nodes. */
+void numa_tonodemask_memory(void *mem, size_t size, struct bitmask *mask);
+
+/* Allocate a memory area on the current node. */
+void numa_setlocal_memory(void *start, size_t size);
+
+/* Allocate memory area with current memory policy */
+void numa_police_memory(void *start, size_t size);
+
+/* Run current task only on nodes in mask */
+int numa_run_on_node_mask(struct bitmask *mask);
+/* Run current task only on node */
+int numa_run_on_node(int node);
+/* Return current mask of nodes the task can run on */
+struct bitmask * numa_get_run_node_mask(void);
+
+/* When strict, fail the allocation when memory cannot be allocated on the target node(s). */
+void numa_set_bind_policy(int strict);
+
+/* Fail when existing memory has incompatible policy */
+void numa_set_strict(int flag);
+
+/* maximum nodes (size of kernel nodemask_t) */
+int numa_num_possible_nodes();
+
+/* maximum cpus (size of kernel cpumask_t) */
+int numa_num_possible_cpus();
+
+/* nodes in the system */
+int numa_num_configured_nodes();
+
+/* maximum cpus */
+int numa_num_configured_cpus();
+
+/* maximum cpus allowed to current task */
+int numa_num_task_cpus();
+int numa_num_thread_cpus(); /* backward compatibility */
+
+/* maximum nodes allowed to current task */
+int numa_num_task_nodes();
+int numa_num_thread_nodes(); /* backward compatibility */
+
+/* allocate a bitmask the size of the kernel cpumask_t */
+struct bitmask *numa_allocate_cpumask();
+
+static inline void numa_free_cpumask(struct bitmask *b)
+{
+ numa_bitmask_free(b);
+}
+
+/* Convert node to CPU mask. -1/errno on failure, otherwise 0. */
+int numa_node_to_cpus(int, struct bitmask *);
+
+/* report the node of the specified cpu. -1/errno on invalid cpu. */
+int numa_node_of_cpu(int cpu);
+
+/* Report distance of node1 from node2. 0 on error. */
+int numa_distance(int node1, int node2);
+
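The topology queries above combine naturally with the cpumask allocator declared a few lines earlier. A small sketch that lists the CPUs of node 0 (plain libnuma calls; not part of the import):

    #include <numa.h>
    #include <stdio.h>

    static void print_cpus_of_node0(void)
    {
        if (numa_available() < 0)
            return;
        struct bitmask *cpus = numa_allocate_cpumask();  /* sized like the kernel cpumask_t */
        if (numa_node_to_cpus(0, cpus) == 0) {           /* 0 means success */
            for (unsigned long i = 0; i < cpus->size; i++)
                if (numa_bitmask_isbitset(cpus, (unsigned)i))
                    printf("node 0 contains cpu %lu\n", i);
        }
        numa_bitmask_free(cpus);
    }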
+/* Error handling. */
+/* This is an internal function in libnuma that can be overridden by a user
+ program. Default is to print an error to stderr and exit if numa_exit_on_error
+ is true. */
+void numa_error(char *where);
+
+/* When true exit the program when a NUMA system call (except numa_available)
+ fails */
+extern int numa_exit_on_error;
+/* Warning function. Can also be overridden. Default is to print on stderr
+ once. */
+void numa_warn(int num, char *fmt, ...);
+
+/* When true exit the program on a numa_warn() call */
+extern int numa_exit_on_warn;
+
+int numa_migrate_pages(int pid, struct bitmask *from, struct bitmask *to);
+
+int numa_move_pages(int pid, unsigned long count, void **pages,
+ const int *nodes, int *status, int flags);
+
+int numa_sched_getaffinity(pid_t, struct bitmask *);
+int numa_sched_setaffinity(pid_t, struct bitmask *);
+
+/* Convert an ASCII list of nodes to a bitmask */
+struct bitmask *numa_parse_nodestring(char *);
+
+/* Convert an ASCII list of CPUs to a bitmask */
+struct bitmask *numa_parse_cpustring(char *);
+
+/*
+ * The following functions are for source code compatibility
+ * with releases prior to version 2.
+ * Such code should be compiled with NUMA_VERSION1_COMPATIBILITY defined.
+ */
+
+static inline void numa_set_interleave_mask_compat(nodemask_t *nodemask)
+{
+ struct bitmask tmp;
+
+ tmp.maskp = (unsigned long *)nodemask;
+ tmp.size = sizeof(nodemask_t) * 8;
+ numa_set_interleave_mask(&tmp);
+}
+
+static inline nodemask_t numa_get_interleave_mask_compat()
+{
+ struct bitmask *tp;
+ nodemask_t mask;
+
+ tp = numa_get_interleave_mask();
+ copy_bitmask_to_nodemask(tp, &mask);
+ numa_bitmask_free(tp);
+ return mask;
+}
+
+static inline void numa_bind_compat(nodemask_t *mask)
+{
+ struct bitmask *tp;
+
+ tp = numa_allocate_nodemask();
+ copy_nodemask_to_bitmask(mask, tp);
+ numa_bind(tp);
+ numa_bitmask_free(tp);
+}
+
+static inline void numa_set_membind_compat(nodemask_t *mask)
+{
+ struct bitmask tmp;
+
+ tmp.maskp = (unsigned long *)mask;
+ tmp.size = sizeof(nodemask_t) * 8;
+ numa_set_membind(&tmp);
+}
+
+static inline nodemask_t numa_get_membind_compat()
+{
+ struct bitmask *tp;
+ nodemask_t mask;
+
+ tp = numa_get_membind();
+ copy_bitmask_to_nodemask(tp, &mask);
+ numa_bitmask_free(tp);
+ return mask;
+}
+
+static inline void *numa_alloc_interleaved_subset_compat(size_t size,
+ const nodemask_t *mask)
+{
+ struct bitmask tmp;
+
+ tmp.maskp = (unsigned long *)mask;
+ tmp.size = sizeof(nodemask_t) * 8;
+ return numa_alloc_interleaved_subset(size, &tmp);
+}
+
+static inline int numa_run_on_node_mask_compat(const nodemask_t *mask)
+{
+ struct bitmask tmp;
+
+ tmp.maskp = (unsigned long *)mask;
+ tmp.size = sizeof(nodemask_t) * 8;
+ return numa_run_on_node_mask(&tmp);
+}
+
+static inline nodemask_t numa_get_run_node_mask_compat()
+{
+ struct bitmask *tp;
+ nodemask_t mask;
+
+ tp = numa_get_run_node_mask();
+ copy_bitmask_to_nodemask(tp, &mask);
+ numa_bitmask_free(tp);
+ return mask;
+}
+
+static inline void numa_interleave_memory_compat(void *mem, size_t size,
+ const nodemask_t *mask)
+{
+ struct bitmask tmp;
+
+ tmp.maskp = (unsigned long *)mask;
+ tmp.size = sizeof(nodemask_t) * 8;
+ numa_interleave_memory(mem, size, &tmp);
+}
+
+static inline void numa_tonodemask_memory_compat(void *mem, size_t size,
+ const nodemask_t *mask)
+{
+ struct bitmask tmp;
+
+ tmp.maskp = (unsigned long *)mask;
+ tmp.size = sizeof(nodemask_t) * 8;
+ numa_tonodemask_memory(mem, size, &tmp);
+}
+
+static inline int numa_sched_getaffinity_compat(pid_t pid, unsigned len,
+ unsigned long *mask)
+{
+ struct bitmask tmp;
+
+ tmp.maskp = (unsigned long *)mask;
+ tmp.size = len * 8;
+ return numa_sched_getaffinity(pid, &tmp);
+}
+
+static inline int numa_sched_setaffinity_compat(pid_t pid, unsigned len,
+ unsigned long *mask)
+{
+ struct bitmask tmp;
+
+ tmp.maskp = (unsigned long *)mask;
+ tmp.size = len * 8;
+ return numa_sched_setaffinity(pid, &tmp);
+}
+
+static inline int numa_node_to_cpus_compat(int node, unsigned long *buffer,
+ int buffer_len)
+{
+ struct bitmask tmp;
+
+ tmp.maskp = (unsigned long *)buffer;
+ tmp.size = buffer_len * 8;
+ return numa_node_to_cpus(node, &tmp);
+}
+
+/* end of version 1 compatibility functions */
+
+/*
+ * To compile an application that uses libnuma version 1:
+ * add -DNUMA_VERSION1_COMPATIBILITY to your Makefile's CFLAGS
+ */
+#ifdef NUMA_VERSION1_COMPATIBILITY
+#include <numacompat1.h>
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/ext/hwloc/include/pci/config.h b/ext/hwloc/include/pci/config.h
new file mode 100644
index 0000000..beecb1d
--- /dev/null
+++ b/ext/hwloc/include/pci/config.h
@@ -0,0 +1,16 @@
+#define PCI_CONFIG_H
+#define PCI_ARCH_X86_64
+#define PCI_OS_LINUX
+#define PCI_HAVE_PM_LINUX_SYSFS
+#define PCI_HAVE_PM_LINUX_PROC
+#define PCI_HAVE_LINUX_BYTEORDER_H
+#define PCI_PATH_PROC_BUS_PCI "/proc/bus/pci"
+#define PCI_PATH_SYS_BUS_PCI "/sys/bus/pci"
+#define PCI_HAVE_PM_INTEL_CONF
+#define PCI_HAVE_64BIT_ADDRESS
+#define PCI_HAVE_PM_DUMP
+#define PCI_COMPRESSED_IDS
+#define PCI_IDS "pci.ids.gz"
+#define PCI_PATH_IDS_DIR "/usr/share/misc"
+#define PCI_USE_DNS
+#define PCI_ID_DOMAIN "pci.id.ucw.cz"
diff --git a/ext/hwloc/include/pci/header.h b/ext/hwloc/include/pci/header.h
new file mode 100644
index 0000000..d481f27
--- /dev/null
+++ b/ext/hwloc/include/pci/header.h
@@ -0,0 +1,1195 @@
+/*
+ * The PCI Library -- PCI Header Structure (based on <linux/pci.h>)
+ *
+ * Copyright (c) 1997--2010 Martin Mares <mj at ucw.cz>
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+/*
+ * Under PCI, each device has 256 bytes of configuration address space,
+ * of which the first 64 bytes are standardized as follows:
+ */
+#define PCI_VENDOR_ID 0x00 /* 16 bits */
+#define PCI_DEVICE_ID 0x02 /* 16 bits */
+#define PCI_COMMAND 0x04 /* 16 bits */
+#define PCI_COMMAND_IO 0x1 /* Enable response in I/O space */
+#define PCI_COMMAND_MEMORY 0x2 /* Enable response in Memory space */
+#define PCI_COMMAND_MASTER 0x4 /* Enable bus mastering */
+#define PCI_COMMAND_SPECIAL 0x8 /* Enable response to special cycles */
+#define PCI_COMMAND_INVALIDATE 0x10 /* Use memory write and invalidate */
+#define PCI_COMMAND_VGA_PALETTE 0x20 /* Enable palette snooping */
+#define PCI_COMMAND_PARITY 0x40 /* Enable parity checking */
+#define PCI_COMMAND_WAIT 0x80 /* Enable address/data stepping */
+#define PCI_COMMAND_SERR 0x100 /* Enable SERR */
+#define PCI_COMMAND_FAST_BACK 0x200 /* Enable back-to-back writes */
+#define PCI_COMMAND_DISABLE_INTx 0x400 /* PCIE: Disable INTx interrupts */
+
+#define PCI_STATUS 0x06 /* 16 bits */
+#define PCI_STATUS_INTx 0x08 /* PCIE: INTx interrupt pending */
+#define PCI_STATUS_CAP_LIST 0x10 /* Support Capability List */
+#define PCI_STATUS_66MHZ 0x20 /* Support 66 MHz PCI 2.1 bus */
+#define PCI_STATUS_UDF 0x40 /* Support User Definable Features [obsolete] */
+#define PCI_STATUS_FAST_BACK 0x80 /* Accept fast-back to back */
+#define PCI_STATUS_PARITY 0x100 /* Detected parity error */
+#define PCI_STATUS_DEVSEL_MASK 0x600 /* DEVSEL timing */
+#define PCI_STATUS_DEVSEL_FAST 0x000
+#define PCI_STATUS_DEVSEL_MEDIUM 0x200
+#define PCI_STATUS_DEVSEL_SLOW 0x400
+#define PCI_STATUS_SIG_TARGET_ABORT 0x800 /* Set on target abort */
+#define PCI_STATUS_REC_TARGET_ABORT 0x1000 /* Master ack of target abort */
+#define PCI_STATUS_REC_MASTER_ABORT 0x2000 /* Set on master abort */
+#define PCI_STATUS_SIG_SYSTEM_ERROR 0x4000 /* Set when we drive SERR */
+#define PCI_STATUS_DETECTED_PARITY 0x8000 /* Set on parity error */
+
+#define PCI_CLASS_REVISION 0x08 /* High 24 bits are class, low 8
+ revision */
+#define PCI_REVISION_ID 0x08 /* Revision ID */
+#define PCI_CLASS_PROG 0x09 /* Reg. Level Programming Interface */
+#define PCI_CLASS_DEVICE 0x0a /* Device class */
+
+#define PCI_CACHE_LINE_SIZE 0x0c /* 8 bits */
+#define PCI_LATENCY_TIMER 0x0d /* 8 bits */
+#define PCI_HEADER_TYPE 0x0e /* 8 bits */
+#define PCI_HEADER_TYPE_NORMAL 0
+#define PCI_HEADER_TYPE_BRIDGE 1
+#define PCI_HEADER_TYPE_CARDBUS 2
+
+#define PCI_BIST 0x0f /* 8 bits */
+#define PCI_BIST_CODE_MASK 0x0f /* Return result */
+#define PCI_BIST_START 0x40 /* 1 to start BIST, 2 secs or less */
+#define PCI_BIST_CAPABLE 0x80 /* 1 if BIST capable */
+
+/*
+ * Base addresses specify locations in memory or I/O space.
+ * Decoded size can be determined by writing a value of
+ * 0xffffffff to the register, and reading it back. Only
+ * 1 bits are decoded.
+ */
+#define PCI_BASE_ADDRESS_0 0x10 /* 32 bits */
+#define PCI_BASE_ADDRESS_1 0x14 /* 32 bits [htype 0,1 only] */
+#define PCI_BASE_ADDRESS_2 0x18 /* 32 bits [htype 0 only] */
+#define PCI_BASE_ADDRESS_3 0x1c /* 32 bits */
+#define PCI_BASE_ADDRESS_4 0x20 /* 32 bits */
+#define PCI_BASE_ADDRESS_5 0x24 /* 32 bits */
+#define PCI_BASE_ADDRESS_SPACE 0x01 /* 0 = memory, 1 = I/O */
+#define PCI_BASE_ADDRESS_SPACE_IO 0x01
+#define PCI_BASE_ADDRESS_SPACE_MEMORY 0x00
+#define PCI_BASE_ADDRESS_MEM_TYPE_MASK 0x06
+#define PCI_BASE_ADDRESS_MEM_TYPE_32 0x00 /* 32 bit address */
+#define PCI_BASE_ADDRESS_MEM_TYPE_1M 0x02 /* Below 1M [obsolete] */
+#define PCI_BASE_ADDRESS_MEM_TYPE_64 0x04 /* 64 bit address */
+#define PCI_BASE_ADDRESS_MEM_PREFETCH 0x08 /* prefetchable? */
+#define PCI_BASE_ADDRESS_MEM_MASK (~(pciaddr_t)0x0f)
+#define PCI_BASE_ADDRESS_IO_MASK (~(pciaddr_t)0x03)
+/* bit 1 is reserved if address_space = 1 */
+
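The sizing rule described in the comment above (write 0xffffffff, read back, only the 1 bits are decoded) reduces to a single expression. A sketch, assuming `readback` already holds the value read back from a 32-bit memory BAR; the constant matches PCI_BASE_ADDRESS_MEM_MASK for a 32-bit address:

    #include <stdint.h>

    /* Decoded size of a 32-bit memory BAR, given the value read back after
       writing 0xffffffff to it; returns 0 for an unimplemented BAR. */
    static uint32_t pci_mem_bar_size(uint32_t readback)
    {
        uint32_t masked = readback & 0xfffffff0u;  /* strip type/prefetch flag bits */
        if (masked == 0)
            return 0;
        return ~masked + 1;                        /* lowest decoded bit equals the size */
    }

For an I/O BAR the same computation applies with PCI_BASE_ADDRESS_IO_MASK (~0x03) in place of the memory mask.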
+/* Header type 0 (normal devices) */
+#define PCI_CARDBUS_CIS 0x28
+#define PCI_SUBSYSTEM_VENDOR_ID 0x2c
+#define PCI_SUBSYSTEM_ID 0x2e
+#define PCI_ROM_ADDRESS 0x30 /* Bits 31..11 are address, 10..1 reserved */
+#define PCI_ROM_ADDRESS_ENABLE 0x01
+#define PCI_ROM_ADDRESS_MASK (~(pciaddr_t)0x7ff)
+
+#define PCI_CAPABILITY_LIST 0x34 /* Offset of first capability list entry */
+
+/* 0x35-0x3b are reserved */
+#define PCI_INTERRUPT_LINE 0x3c /* 8 bits */
+#define PCI_INTERRUPT_PIN 0x3d /* 8 bits */
+#define PCI_MIN_GNT 0x3e /* 8 bits */
+#define PCI_MAX_LAT 0x3f /* 8 bits */
+
+/* Header type 1 (PCI-to-PCI bridges) */
+#define PCI_PRIMARY_BUS 0x18 /* Primary bus number */
+#define PCI_SECONDARY_BUS 0x19 /* Secondary bus number */
+#define PCI_SUBORDINATE_BUS 0x1a /* Highest bus number behind the bridge */
+#define PCI_SEC_LATENCY_TIMER 0x1b /* Latency timer for secondary interface */
+#define PCI_IO_BASE 0x1c /* I/O range behind the bridge */
+#define PCI_IO_LIMIT 0x1d
+#define PCI_IO_RANGE_TYPE_MASK 0x0f /* I/O bridging type */
+#define PCI_IO_RANGE_TYPE_16 0x00
+#define PCI_IO_RANGE_TYPE_32 0x01
+#define PCI_IO_RANGE_MASK ~0x0f
+#define PCI_SEC_STATUS 0x1e /* Secondary status register */
+#define PCI_MEMORY_BASE 0x20 /* Memory range behind */
+#define PCI_MEMORY_LIMIT 0x22
+#define PCI_MEMORY_RANGE_TYPE_MASK 0x0f
+#define PCI_MEMORY_RANGE_MASK ~0x0f
+#define PCI_PREF_MEMORY_BASE 0x24 /* Prefetchable memory range behind */
+#define PCI_PREF_MEMORY_LIMIT 0x26
+#define PCI_PREF_RANGE_TYPE_MASK 0x0f
+#define PCI_PREF_RANGE_TYPE_32 0x00
+#define PCI_PREF_RANGE_TYPE_64 0x01
+#define PCI_PREF_RANGE_MASK ~0x0f
+#define PCI_PREF_BASE_UPPER32 0x28 /* Upper half of prefetchable memory range */
+#define PCI_PREF_LIMIT_UPPER32 0x2c
+#define PCI_IO_BASE_UPPER16 0x30 /* Upper half of I/O addresses */
+#define PCI_IO_LIMIT_UPPER16 0x32
+/* 0x34 same as for htype 0 */
+/* 0x35-0x3b are reserved */
+#define PCI_ROM_ADDRESS1 0x38 /* Same as PCI_ROM_ADDRESS, but for htype 1 */
+/* 0x3c-0x3d are same as for htype 0 */
+#define PCI_BRIDGE_CONTROL 0x3e
+#define PCI_BRIDGE_CTL_PARITY 0x01 /* Enable parity detection on secondary interface */
+#define PCI_BRIDGE_CTL_SERR 0x02 /* The same for SERR forwarding */
+#define PCI_BRIDGE_CTL_NO_ISA 0x04 /* Disable bridging of ISA ports */
+#define PCI_BRIDGE_CTL_VGA 0x08 /* Forward VGA addresses */
+#define PCI_BRIDGE_CTL_MASTER_ABORT 0x20 /* Report master aborts */
+#define PCI_BRIDGE_CTL_BUS_RESET 0x40 /* Secondary bus reset */
+#define PCI_BRIDGE_CTL_FAST_BACK 0x80 /* Fast Back2Back enabled on secondary interface */
+#define PCI_BRIDGE_CTL_PRI_DISCARD_TIMER 0x100 /* PCI-X? */
+#define PCI_BRIDGE_CTL_SEC_DISCARD_TIMER 0x200 /* PCI-X? */
+#define PCI_BRIDGE_CTL_DISCARD_TIMER_STATUS 0x400 /* PCI-X? */
+#define PCI_BRIDGE_CTL_DISCARD_TIMER_SERR_EN 0x800 /* PCI-X? */
+
+/* Header type 2 (CardBus bridges) */
+/* 0x14-0x15 reserved */
+#define PCI_CB_SEC_STATUS 0x16 /* Secondary status */
+#define PCI_CB_PRIMARY_BUS 0x18 /* PCI bus number */
+#define PCI_CB_CARD_BUS 0x19 /* CardBus bus number */
+#define PCI_CB_SUBORDINATE_BUS 0x1a /* Subordinate bus number */
+#define PCI_CB_LATENCY_TIMER 0x1b /* CardBus latency timer */
+#define PCI_CB_MEMORY_BASE_0 0x1c
+#define PCI_CB_MEMORY_LIMIT_0 0x20
+#define PCI_CB_MEMORY_BASE_1 0x24
+#define PCI_CB_MEMORY_LIMIT_1 0x28
+#define PCI_CB_IO_BASE_0 0x2c
+#define PCI_CB_IO_BASE_0_HI 0x2e
+#define PCI_CB_IO_LIMIT_0 0x30
+#define PCI_CB_IO_LIMIT_0_HI 0x32
+#define PCI_CB_IO_BASE_1 0x34
+#define PCI_CB_IO_BASE_1_HI 0x36
+#define PCI_CB_IO_LIMIT_1 0x38
+#define PCI_CB_IO_LIMIT_1_HI 0x3a
+#define PCI_CB_IO_RANGE_MASK ~0x03
+/* 0x3c-0x3d are same as for htype 0 */
+#define PCI_CB_BRIDGE_CONTROL 0x3e
+#define PCI_CB_BRIDGE_CTL_PARITY 0x01 /* Similar to standard bridge control register */
+#define PCI_CB_BRIDGE_CTL_SERR 0x02
+#define PCI_CB_BRIDGE_CTL_ISA 0x04
+#define PCI_CB_BRIDGE_CTL_VGA 0x08
+#define PCI_CB_BRIDGE_CTL_MASTER_ABORT 0x20
+#define PCI_CB_BRIDGE_CTL_CB_RESET 0x40 /* CardBus reset */
+#define PCI_CB_BRIDGE_CTL_16BIT_INT 0x80 /* Enable interrupt for 16-bit cards */
+#define PCI_CB_BRIDGE_CTL_PREFETCH_MEM0 0x100 /* Prefetch enable for both memory regions */
+#define PCI_CB_BRIDGE_CTL_PREFETCH_MEM1 0x200
+#define PCI_CB_BRIDGE_CTL_POST_WRITES 0x400
+#define PCI_CB_SUBSYSTEM_VENDOR_ID 0x40
+#define PCI_CB_SUBSYSTEM_ID 0x42
+#define PCI_CB_LEGACY_MODE_BASE 0x44 /* 16-bit PC Card legacy mode base address (ExCa) */
+/* 0x48-0x7f reserved */
+
+/* Capability lists */
+
+#define PCI_CAP_LIST_ID 0 /* Capability ID */
+#define PCI_CAP_ID_PM 0x01 /* Power Management */
+#define PCI_CAP_ID_AGP 0x02 /* Accelerated Graphics Port */
+#define PCI_CAP_ID_VPD 0x03 /* Vital Product Data */
+#define PCI_CAP_ID_SLOTID 0x04 /* Slot Identification */
+#define PCI_CAP_ID_MSI 0x05 /* Message Signaled Interrupts */
+#define PCI_CAP_ID_CHSWP 0x06 /* CompactPCI HotSwap */
+#define PCI_CAP_ID_PCIX 0x07 /* PCI-X */
+#define PCI_CAP_ID_HT 0x08 /* HyperTransport */
+#define PCI_CAP_ID_VNDR 0x09 /* Vendor specific */
+#define PCI_CAP_ID_DBG 0x0A /* Debug port */
+#define PCI_CAP_ID_CCRC 0x0B /* CompactPCI Central Resource Control */
+#define PCI_CAP_ID_HOTPLUG 0x0C /* PCI hot-plug */
+#define PCI_CAP_ID_SSVID 0x0D /* Bridge subsystem vendor/device ID */
+#define PCI_CAP_ID_AGP3 0x0E /* AGP 8x */
+#define PCI_CAP_ID_SECURE 0x0F /* Secure device (?) */
+#define PCI_CAP_ID_EXP 0x10 /* PCI Express */
+#define PCI_CAP_ID_MSIX 0x11 /* MSI-X */
+#define PCI_CAP_ID_SATA 0x12 /* Serial-ATA HBA */
+#define PCI_CAP_ID_AF 0x13 /* Advanced features of PCI devices integrated in PCIe root cplx */
+#define PCI_CAP_LIST_NEXT 1 /* Next capability in the list */
+#define PCI_CAP_FLAGS 2 /* Capability defined flags (16 bits) */
+#define PCI_CAP_SIZEOF 4
+
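For orientation, these offsets are consumed by walking the linked capability list that starts at PCI_CAPABILITY_LIST (0x34) once PCI_STATUS_CAP_LIST is set in the status register. A sketch assuming a hypothetical cfg_read8(dev, offset) byte accessor for configuration space (neither the accessor nor the function below is part of this header):

    /* Return the config-space offset of capability `id`, or 0 if it is absent.
       cfg_read8 is a hypothetical byte accessor for the device's config space. */
    static unsigned pci_find_capability(void *dev, unsigned char id,
                                        unsigned char (*cfg_read8)(void *, unsigned))
    {
        unsigned pos = cfg_read8(dev, 0x34) & ~3u;   /* PCI_CAPABILITY_LIST */
        int ttl = 48;                                /* guard against malformed lists */
        while (pos != 0 && ttl-- > 0) {
            if (cfg_read8(dev, pos) == id)           /* PCI_CAP_LIST_ID   */
                return pos;
            pos = cfg_read8(dev, pos + 1) & ~3u;     /* PCI_CAP_LIST_NEXT */
        }
        return 0;
    }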
+/* Capabilities residing in the PCI Express extended configuration space */
+
+#define PCI_EXT_CAP_ID_AER 0x01 /* Advanced Error Reporting */
+#define PCI_EXT_CAP_ID_VC 0x02 /* Virtual Channel */
+#define PCI_EXT_CAP_ID_DSN 0x03 /* Device Serial Number */
+#define PCI_EXT_CAP_ID_PB 0x04 /* Power Budgeting */
+#define PCI_EXT_CAP_ID_RCLINK 0x05 /* Root Complex Link Declaration */
+#define PCI_EXT_CAP_ID_RCILINK 0x06 /* Root Complex Internal Link Declaration */
+#define PCI_EXT_CAP_ID_RCECOLL 0x07 /* Root Complex Event Collector */
+#define PCI_EXT_CAP_ID_MFVC 0x08 /* Multi-Function Virtual Channel */
+#define PCI_EXT_CAP_ID_VC2 0x09 /* Virtual Channel (2nd ID) */
+#define PCI_EXT_CAP_ID_RBCB 0x0a /* Root Bridge Control Block */
+#define PCI_EXT_CAP_ID_VNDR 0x0b /* Vendor specific */
+#define PCI_EXT_CAP_ID_ACS 0x0d /* Access Controls */
+#define PCI_EXT_CAP_ID_ARI 0x0e /* Alternative Routing-ID Interpretation */
+#define PCI_EXT_CAP_ID_ATS 0x0f /* Address Translation Service */
+#define PCI_EXT_CAP_ID_SRIOV 0x10 /* Single Root I/O Virtualization */
+#define PCI_EXT_CAP_ID_TPH 0x17 /* Transaction processing hints */
+#define PCI_EXT_CAP_ID_LTR 0x18 /* Latency Tolerance Reporting */
+
+/*** Definitions of capabilities ***/
+
+/* Power Management Registers */
+
+#define PCI_PM_CAP_VER_MASK 0x0007 /* Version (2=PM1.1) */
+#define PCI_PM_CAP_PME_CLOCK 0x0008 /* Clock required for PME generation */
+#define PCI_PM_CAP_DSI 0x0020 /* Device specific initialization required */
+#define PCI_PM_CAP_AUX_C_MASK 0x01c0 /* Maximum aux current required in D3cold */
+#define PCI_PM_CAP_D1 0x0200 /* D1 power state support */
+#define PCI_PM_CAP_D2 0x0400 /* D2 power state support */
+#define PCI_PM_CAP_PME_D0 0x0800 /* PME can be asserted from D0 */
+#define PCI_PM_CAP_PME_D1 0x1000 /* PME can be asserted from D1 */
+#define PCI_PM_CAP_PME_D2 0x2000 /* PME can be asserted from D2 */
+#define PCI_PM_CAP_PME_D3_HOT 0x4000 /* PME can be asserted from D3hot */
+#define PCI_PM_CAP_PME_D3_COLD 0x8000 /* PME can be asserted from D3cold */
+#define PCI_PM_CTRL 4 /* PM control and status register */
+#define PCI_PM_CTRL_STATE_MASK 0x0003 /* Current power state (D0 to D3) */
+#define PCI_PM_CTRL_NO_SOFT_RST 0x0008 /* No Soft Reset from D3hot to D0 */
+#define PCI_PM_CTRL_PME_ENABLE 0x0100 /* PME pin enable */
+#define PCI_PM_CTRL_DATA_SEL_MASK 0x1e00 /* PM table data index */
+#define PCI_PM_CTRL_DATA_SCALE_MASK 0x6000 /* PM table data scaling factor */
+#define PCI_PM_CTRL_PME_STATUS 0x8000 /* PME pin status */
+#define PCI_PM_PPB_EXTENSIONS 6 /* PPB support extensions */
+#define PCI_PM_PPB_B2_B3 0x40 /* If bridge enters D3hot, bus enters: 0=B3, 1=B2 */
+#define PCI_PM_BPCC_ENABLE 0x80 /* Secondary bus is power managed */
+#define PCI_PM_DATA_REGISTER 7 /* PM table contents read here */
+#define PCI_PM_SIZEOF 8
+
+/* AGP registers */
+
+#define PCI_AGP_VERSION 2 /* BCD version number */
+#define PCI_AGP_RFU 3 /* Rest of capability flags */
+#define PCI_AGP_STATUS 4 /* Status register */
+#define PCI_AGP_STATUS_RQ_MASK 0xff000000 /* Maximum number of requests - 1 */
+#define PCI_AGP_STATUS_ISOCH 0x10000 /* Isochronous transactions supported */
+#define PCI_AGP_STATUS_ARQSZ_MASK 0xe000 /* log2(optimum async req size in bytes) - 4 */
+#define PCI_AGP_STATUS_CAL_MASK 0x1c00 /* Calibration cycle timing */
+#define PCI_AGP_STATUS_SBA 0x0200 /* Sideband addressing supported */
+#define PCI_AGP_STATUS_ITA_COH 0x0100 /* In-aperture accesses always coherent */
+#define PCI_AGP_STATUS_GART64 0x0080 /* 64-bit GART entries supported */
+#define PCI_AGP_STATUS_HTRANS 0x0040 /* If 0, core logic can xlate host CPU accesses thru aperture */
+#define PCI_AGP_STATUS_64BIT 0x0020 /* 64-bit addressing cycles supported */
+#define PCI_AGP_STATUS_FW 0x0010 /* Fast write transfers supported */
+#define PCI_AGP_STATUS_AGP3 0x0008 /* AGP3 mode supported */
+#define PCI_AGP_STATUS_RATE4 0x0004 /* 4x transfer rate supported (RFU in AGP3 mode) */
+#define PCI_AGP_STATUS_RATE2 0x0002 /* 2x transfer rate supported (8x in AGP3 mode) */
+#define PCI_AGP_STATUS_RATE1 0x0001 /* 1x transfer rate supported (4x in AGP3 mode) */
+#define PCI_AGP_COMMAND 8 /* Control register */
+#define PCI_AGP_COMMAND_RQ_MASK 0xff000000 /* Master: Maximum number of requests */
+#define PCI_AGP_COMMAND_ARQSZ_MASK 0xe000 /* log2(optimum async req size in bytes) - 4 */
+#define PCI_AGP_COMMAND_CAL_MASK 0x1c00 /* Calibration cycle timing */
+#define PCI_AGP_COMMAND_SBA 0x0200 /* Sideband addressing enabled */
+#define PCI_AGP_COMMAND_AGP 0x0100 /* Allow processing of AGP transactions */
+#define PCI_AGP_COMMAND_GART64 0x0080 /* 64-bit GART entries enabled */
+#define PCI_AGP_COMMAND_64BIT 0x0020 /* Allow generation of 64-bit addr cycles */
+#define PCI_AGP_COMMAND_FW 0x0010 /* Enable FW transfers */
+#define PCI_AGP_COMMAND_RATE4 0x0004 /* Use 4x rate (RFU in AGP3 mode) */
+#define PCI_AGP_COMMAND_RATE2 0x0002 /* Use 2x rate (8x in AGP3 mode) */
+#define PCI_AGP_COMMAND_RATE1 0x0001 /* Use 1x rate (4x in AGP3 mode) */
+#define PCI_AGP_SIZEOF 12
+
+/* Vital Product Data */
+
+#define PCI_VPD_ADDR 2 /* Address to access (15 bits!) */
+#define PCI_VPD_ADDR_MASK 0x7fff /* Address mask */
+#define PCI_VPD_ADDR_F 0x8000 /* Write 0, 1 indicates completion */
+#define PCI_VPD_DATA 4 /* 32-bits of data returned here */
+
+/* Slot Identification */
+
+#define PCI_SID_ESR 2 /* Expansion Slot Register */
+#define PCI_SID_ESR_NSLOTS 0x1f /* Number of expansion slots available */
+#define PCI_SID_ESR_FIC 0x20 /* First In Chassis Flag */
+#define PCI_SID_CHASSIS_NR 3 /* Chassis Number */
+
+/* Message Signaled Interrupts registers */
+
+#define PCI_MSI_FLAGS 2 /* Various flags */
+#define PCI_MSI_FLAGS_MASK_BIT 0x100 /* interrupt masking & reporting supported */
+#define PCI_MSI_FLAGS_64BIT 0x080 /* 64-bit addresses allowed */
+#define PCI_MSI_FLAGS_QSIZE 0x070 /* Message queue size configured */
+#define PCI_MSI_FLAGS_QMASK 0x00e /* Maximum queue size available */
+#define PCI_MSI_FLAGS_ENABLE 0x001 /* MSI feature enabled */
+#define PCI_MSI_RFU 3 /* Rest of capability flags */
+#define PCI_MSI_ADDRESS_LO 4 /* Lower 32 bits */
+#define PCI_MSI_ADDRESS_HI 8 /* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */
+#define PCI_MSI_DATA_32 8 /* 16 bits of data for 32-bit devices */
+#define PCI_MSI_DATA_64 12 /* 16 bits of data for 64-bit devices */
+#define PCI_MSI_MASK_BIT_32 12 /* per-vector masking for 32-bit devices */
+#define PCI_MSI_MASK_BIT_64 16 /* per-vector masking for 64-bit devices */
+#define PCI_MSI_PENDING_32 16 /* per-vector interrupt pending for 32-bit devices */
+#define PCI_MSI_PENDING_64 20 /* per-vector interrupt pending for 64-bit devices */
+
+/* PCI-X */
+#define PCI_PCIX_COMMAND 2 /* Command register offset */
+#define PCI_PCIX_COMMAND_DPERE 0x0001 /* Data Parity Error Recover Enable */
+#define PCI_PCIX_COMMAND_ERO 0x0002 /* Enable Relaxed Ordering */
+#define PCI_PCIX_COMMAND_MAX_MEM_READ_BYTE_COUNT 0x000c /* Maximum Memory Read Byte Count */
+#define PCI_PCIX_COMMAND_MAX_OUTSTANDING_SPLIT_TRANS 0x0070
+#define PCI_PCIX_COMMAND_RESERVED 0xf80
+#define PCI_PCIX_STATUS 4 /* Status register offset */
+#define PCI_PCIX_STATUS_FUNCTION 0x00000007
+#define PCI_PCIX_STATUS_DEVICE 0x000000f8
+#define PCI_PCIX_STATUS_BUS 0x0000ff00
+#define PCI_PCIX_STATUS_64BIT 0x00010000
+#define PCI_PCIX_STATUS_133MHZ 0x00020000
+#define PCI_PCIX_STATUS_SC_DISCARDED 0x00040000 /* Split Completion Discarded */
+#define PCI_PCIX_STATUS_UNEXPECTED_SC 0x00080000 /* Unexpected Split Completion */
+#define PCI_PCIX_STATUS_DEVICE_COMPLEXITY 0x00100000 /* 0 = simple device, 1 = bridge device */
+#define PCI_PCIX_STATUS_DESIGNED_MAX_MEM_READ_BYTE_COUNT 0x00600000 /* 0 = 512 bytes, 1 = 1024, 2 = 2048, 3 = 4096 */
+#define PCI_PCIX_STATUS_DESIGNED_MAX_OUTSTANDING_SPLIT_TRANS 0x03800000
+#define PCI_PCIX_STATUS_DESIGNED_MAX_CUMULATIVE_READ_SIZE 0x1c000000
+#define PCI_PCIX_STATUS_RCVD_SC_ERR_MESS 0x20000000 /* Received Split Completion Error Message */
+#define PCI_PCIX_STATUS_266MHZ 0x40000000 /* 266 MHz capable */
+#define PCI_PCIX_STATUS_533MHZ 0x80000000 /* 533 MHz capable */
+#define PCI_PCIX_SIZEOF 4
+
+/* PCI-X Bridges */
+#define PCI_PCIX_BRIDGE_SEC_STATUS 2 /* Secondary bus status register offset */
+#define PCI_PCIX_BRIDGE_SEC_STATUS_64BIT 0x0001
+#define PCI_PCIX_BRIDGE_SEC_STATUS_133MHZ 0x0002
+#define PCI_PCIX_BRIDGE_SEC_STATUS_SC_DISCARDED 0x0004 /* Split Completion Discarded on secondary bus */
+#define PCI_PCIX_BRIDGE_SEC_STATUS_UNEXPECTED_SC 0x0008 /* Unexpected Split Completion on secondary bus */
+#define PCI_PCIX_BRIDGE_SEC_STATUS_SC_OVERRUN 0x0010 /* Split Completion Overrun on secondary bus */
+#define PCI_PCIX_BRIDGE_SEC_STATUS_SPLIT_REQUEST_DELAYED 0x0020
+#define PCI_PCIX_BRIDGE_SEC_STATUS_CLOCK_FREQ 0x01c0
+#define PCI_PCIX_BRIDGE_SEC_STATUS_RESERVED 0xfe00
+#define PCI_PCIX_BRIDGE_STATUS 4 /* Primary bus status register offset */
+#define PCI_PCIX_BRIDGE_STATUS_FUNCTION 0x00000007
+#define PCI_PCIX_BRIDGE_STATUS_DEVICE 0x000000f8
+#define PCI_PCIX_BRIDGE_STATUS_BUS 0x0000ff00
+#define PCI_PCIX_BRIDGE_STATUS_64BIT 0x00010000
+#define PCI_PCIX_BRIDGE_STATUS_133MHZ 0x00020000
+#define PCI_PCIX_BRIDGE_STATUS_SC_DISCARDED 0x00040000 /* Split Completion Discarded */
+#define PCI_PCIX_BRIDGE_STATUS_UNEXPECTED_SC 0x00080000 /* Unexpected Split Completion */
+#define PCI_PCIX_BRIDGE_STATUS_SC_OVERRUN 0x00100000 /* Split Completion Overrun */
+#define PCI_PCIX_BRIDGE_STATUS_SPLIT_REQUEST_DELAYED 0x00200000
+#define PCI_PCIX_BRIDGE_STATUS_RESERVED 0xffc00000
+#define PCI_PCIX_BRIDGE_UPSTREAM_SPLIT_TRANS_CTRL 8 /* Upstream Split Transaction Register offset */
+#define PCI_PCIX_BRIDGE_DOWNSTREAM_SPLIT_TRANS_CTRL 12 /* Downstream Split Transaction Register offset */
+#define PCI_PCIX_BRIDGE_STR_CAPACITY 0x0000ffff
+#define PCI_PCIX_BRIDGE_STR_COMMITMENT_LIMIT 0xffff0000
+#define PCI_PCIX_BRIDGE_SIZEOF 12
+
+/* HyperTransport (as of spec rev. 2.00) */
+#define PCI_HT_CMD 2 /* Command Register */
+#define PCI_HT_CMD_TYP_HI 0xe000 /* Capability Type high part */
+#define PCI_HT_CMD_TYP_HI_PRI 0x0000 /* Slave or Primary Interface */
+#define PCI_HT_CMD_TYP_HI_SEC 0x2000 /* Host or Secondary Interface */
+#define PCI_HT_CMD_TYP 0xf800 /* Capability Type */
+#define PCI_HT_CMD_TYP_SW 0x4000 /* Switch */
+#define PCI_HT_CMD_TYP_IDC 0x8000 /* Interrupt Discovery and Configuration */
+#define PCI_HT_CMD_TYP_RID 0x8800 /* Revision ID */
+#define PCI_HT_CMD_TYP_UIDC 0x9000 /* UnitID Clumping */
+#define PCI_HT_CMD_TYP_ECSA 0x9800 /* Extended Configuration Space Access */
+#define PCI_HT_CMD_TYP_AM 0xa000 /* Address Mapping */
+#define PCI_HT_CMD_TYP_MSIM 0xa800 /* MSI Mapping */
+#define PCI_HT_CMD_TYP_DR 0xb000 /* DirectRoute */
+#define PCI_HT_CMD_TYP_VCS 0xb800 /* VCSet */
+#define PCI_HT_CMD_TYP_RM 0xc000 /* Retry Mode */
+#define PCI_HT_CMD_TYP_X86 0xc800 /* X86 (reserved) */
+
+ /* Link Control Register */
+#define PCI_HT_LCTR_CFLE 0x0002 /* CRC Flood Enable */
+#define PCI_HT_LCTR_CST 0x0004 /* CRC Start Test */
+#define PCI_HT_LCTR_CFE 0x0008 /* CRC Force Error */
+#define PCI_HT_LCTR_LKFAIL 0x0010 /* Link Failure */
+#define PCI_HT_LCTR_INIT 0x0020 /* Initialization Complete */
+#define PCI_HT_LCTR_EOC 0x0040 /* End of Chain */
+#define PCI_HT_LCTR_TXO 0x0080 /* Transmitter Off */
+#define PCI_HT_LCTR_CRCERR 0x0f00 /* CRC Error */
+#define PCI_HT_LCTR_ISOCEN 0x1000 /* Isochronous Flow Control Enable */
+#define PCI_HT_LCTR_LSEN 0x2000 /* LDTSTOP# Tristate Enable */
+#define PCI_HT_LCTR_EXTCTL 0x4000 /* Extended CTL Time */
+#define PCI_HT_LCTR_64B 0x8000 /* 64-bit Addressing Enable */
+
+ /* Link Configuration Register */
+#define PCI_HT_LCNF_MLWI 0x0007 /* Max Link Width In */
+#define PCI_HT_LCNF_LW_8B 0x0 /* Link Width 8 bits */
+#define PCI_HT_LCNF_LW_16B 0x1 /* Link Width 16 bits */
+#define PCI_HT_LCNF_LW_32B 0x3 /* Link Width 32 bits */
+#define PCI_HT_LCNF_LW_2B 0x4 /* Link Width 2 bits */
+#define PCI_HT_LCNF_LW_4B 0x5 /* Link Width 4 bits */
+#define PCI_HT_LCNF_LW_NC 0x7 /* Link physically not connected */
+#define PCI_HT_LCNF_DFI 0x0008 /* Doubleword Flow Control In */
+#define PCI_HT_LCNF_MLWO 0x0070 /* Max Link Width Out */
+#define PCI_HT_LCNF_DFO 0x0080 /* Doubleword Flow Control Out */
+#define PCI_HT_LCNF_LWI 0x0700 /* Link Width In */
+#define PCI_HT_LCNF_DFIE 0x0800 /* Doubleword Flow Control In Enable */
+#define PCI_HT_LCNF_LWO 0x7000 /* Link Width Out */
+#define PCI_HT_LCNF_DFOE 0x8000 /* Doubleword Flow Control Out Enable */
+
+ /* Revision ID Register */
+#define PCI_HT_RID_MIN 0x1f /* Minor Revision */
+#define PCI_HT_RID_MAJ 0xe0 /* Major Revision */
+
+ /* Link Frequency/Error Register */
+#define PCI_HT_LFRER_FREQ 0x0f /* Transmitter Clock Frequency */
+#define PCI_HT_LFRER_200 0x00 /* 200MHz */
+#define PCI_HT_LFRER_300 0x01 /* 300MHz */
+#define PCI_HT_LFRER_400 0x02 /* 400MHz */
+#define PCI_HT_LFRER_500 0x03 /* 500MHz */
+#define PCI_HT_LFRER_600 0x04 /* 600MHz */
+#define PCI_HT_LFRER_800 0x05 /* 800MHz */
+#define PCI_HT_LFRER_1000 0x06 /* 1.0GHz */
+#define PCI_HT_LFRER_1200 0x07 /* 1.2GHz */
+#define PCI_HT_LFRER_1400 0x08 /* 1.4GHz */
+#define PCI_HT_LFRER_1600 0x09 /* 1.6GHz */
+#define PCI_HT_LFRER_VEND 0x0f /* Vendor-Specific */
+#define PCI_HT_LFRER_ERR 0xf0 /* Link Error */
+#define PCI_HT_LFRER_PROT 0x10 /* Protocol Error */
+#define PCI_HT_LFRER_OV 0x20 /* Overflow Error */
+#define PCI_HT_LFRER_EOC 0x40 /* End of Chain Error */
+#define PCI_HT_LFRER_CTLT 0x80 /* CTL Timeout */
+
+ /* Link Frequency Capability Register */
+#define PCI_HT_LFCAP_200 0x0001 /* 200MHz */
+#define PCI_HT_LFCAP_300 0x0002 /* 300MHz */
+#define PCI_HT_LFCAP_400 0x0004 /* 400MHz */
+#define PCI_HT_LFCAP_500 0x0008 /* 500MHz */
+#define PCI_HT_LFCAP_600 0x0010 /* 600MHz */
+#define PCI_HT_LFCAP_800 0x0020 /* 800MHz */
+#define PCI_HT_LFCAP_1000 0x0040 /* 1.0GHz */
+#define PCI_HT_LFCAP_1200 0x0080 /* 1.2GHz */
+#define PCI_HT_LFCAP_1400 0x0100 /* 1.4GHz */
+#define PCI_HT_LFCAP_1600 0x0200 /* 1.6GHz */
+#define PCI_HT_LFCAP_VEND 0x8000 /* Vendor-Specific */
+
+ /* Feature Register */
+#define PCI_HT_FTR_ISOCFC 0x0001 /* Isochronous Flow Control Mode */
+#define PCI_HT_FTR_LDTSTOP 0x0002 /* LDTSTOP# Supported */
+#define PCI_HT_FTR_CRCTM 0x0004 /* CRC Test Mode */
+#define PCI_HT_FTR_ECTLT 0x0008 /* Extended CTL Time Required */
+#define PCI_HT_FTR_64BA 0x0010 /* 64-bit Addressing */
+#define PCI_HT_FTR_UIDRD 0x0020 /* UnitID Reorder Disable */
+
+ /* Error Handling Register */
+#define PCI_HT_EH_PFLE 0x0001 /* Protocol Error Flood Enable */
+#define PCI_HT_EH_OFLE 0x0002 /* Overflow Error Flood Enable */
+#define PCI_HT_EH_PFE 0x0004 /* Protocol Error Fatal Enable */
+#define PCI_HT_EH_OFE 0x0008 /* Overflow Error Fatal Enable */
+#define PCI_HT_EH_EOCFE 0x0010 /* End of Chain Error Fatal Enable */
+#define PCI_HT_EH_RFE 0x0020 /* Response Error Fatal Enable */
+#define PCI_HT_EH_CRCFE 0x0040 /* CRC Error Fatal Enable */
+#define PCI_HT_EH_SERRFE 0x0080 /* System Error Fatal Enable */
+#define PCI_HT_EH_CF 0x0100 /* Chain Fail */
+#define PCI_HT_EH_RE 0x0200 /* Response Error */
+#define PCI_HT_EH_PNFE 0x0400 /* Protocol Error Nonfatal Enable */
+#define PCI_HT_EH_ONFE 0x0800 /* Overflow Error Nonfatal Enable */
+#define PCI_HT_EH_EOCNFE 0x1000 /* End of Chain Error Nonfatal Enable */
+#define PCI_HT_EH_RNFE 0x2000 /* Response Error Nonfatal Enable */
+#define PCI_HT_EH_CRCNFE 0x4000 /* CRC Error Nonfatal Enable */
+#define PCI_HT_EH_SERRNFE 0x8000 /* System Error Nonfatal Enable */
+
+/* HyperTransport: Slave or Primary Interface */
+#define PCI_HT_PRI_CMD 2 /* Command Register */
+#define PCI_HT_PRI_CMD_BUID 0x001f /* Base UnitID */
+#define PCI_HT_PRI_CMD_UC 0x03e0 /* Unit Count */
+#define PCI_HT_PRI_CMD_MH 0x0400 /* Master Host */
+#define PCI_HT_PRI_CMD_DD 0x0800 /* Default Direction */
+#define PCI_HT_PRI_CMD_DUL 0x1000 /* Drop on Uninitialized Link */
+
+#define PCI_HT_PRI_LCTR0 4 /* Link Control 0 Register */
+#define PCI_HT_PRI_LCNF0 6 /* Link Config 0 Register */
+#define PCI_HT_PRI_LCTR1 8 /* Link Control 1 Register */
+#define PCI_HT_PRI_LCNF1 10 /* Link Config 1 Register */
+#define PCI_HT_PRI_RID 12 /* Revision ID Register */
+#define PCI_HT_PRI_LFRER0 13 /* Link Frequency/Error 0 Register */
+#define PCI_HT_PRI_LFCAP0 14 /* Link Frequency Capability 0 Register */
+#define PCI_HT_PRI_FTR 16 /* Feature Register */
+#define PCI_HT_PRI_LFRER1 17 /* Link Frequency/Error 1 Register */
+#define PCI_HT_PRI_LFCAP1 18 /* Link Frequency Capability 1 Register */
+#define PCI_HT_PRI_ES 20 /* Enumeration Scratchpad Register */
+#define PCI_HT_PRI_EH 22 /* Error Handling Register */
+#define PCI_HT_PRI_MBU 24 /* Memory Base Upper Register */
+#define PCI_HT_PRI_MLU 25 /* Memory Limit Upper Register */
+#define PCI_HT_PRI_BN 26 /* Bus Number Register */
+#define PCI_HT_PRI_SIZEOF 28
+
+/* HyperTransport: Host or Secondary Interface */
+#define PCI_HT_SEC_CMD 2 /* Command Register */
+#define PCI_HT_SEC_CMD_WR 0x0001 /* Warm Reset */
+#define PCI_HT_SEC_CMD_DE 0x0002 /* Double-Ended */
+#define PCI_HT_SEC_CMD_DN 0x0076 /* Device Number */
+#define PCI_HT_SEC_CMD_CS 0x0080 /* Chain Side */
+#define PCI_HT_SEC_CMD_HH 0x0100 /* Host Hide */
+#define PCI_HT_SEC_CMD_AS 0x0400 /* Act as Slave */
+#define PCI_HT_SEC_CMD_HIECE 0x0800 /* Host Inbound End of Chain Error */
+#define PCI_HT_SEC_CMD_DUL 0x1000 /* Drop on Uninitialized Link */
+
+#define PCI_HT_SEC_LCTR 4 /* Link Control Register */
+#define PCI_HT_SEC_LCNF 6 /* Link Config Register */
+#define PCI_HT_SEC_RID 8 /* Revision ID Register */
+#define PCI_HT_SEC_LFRER 9 /* Link Frequency/Error Register */
+#define PCI_HT_SEC_LFCAP 10 /* Link Frequency Capability Register */
+#define PCI_HT_SEC_FTR 12 /* Feature Register */
+#define PCI_HT_SEC_FTR_EXTRS 0x0100 /* Extended Register Set */
+#define PCI_HT_SEC_FTR_UCNFE 0x0200 /* Upstream Configuration Enable */
+#define PCI_HT_SEC_ES 16 /* Enumeration Scratchpad Register */
+#define PCI_HT_SEC_EH 18 /* Error Handling Register */
+#define PCI_HT_SEC_MBU 20 /* Memory Base Upper Register */
+#define PCI_HT_SEC_MLU 21 /* Memory Limit Upper Register */
+#define PCI_HT_SEC_SIZEOF 24
+
+/* HyperTransport: Switch */
+#define PCI_HT_SW_CMD 2 /* Switch Command Register */
+#define PCI_HT_SW_CMD_VIBERR 0x0080 /* VIB Error */
+#define PCI_HT_SW_CMD_VIBFL 0x0100 /* VIB Flood */
+#define PCI_HT_SW_CMD_VIBFT 0x0200 /* VIB Fatal */
+#define PCI_HT_SW_CMD_VIBNFT 0x0400 /* VIB Nonfatal */
+#define PCI_HT_SW_PMASK 4 /* Partition Mask Register */
+#define PCI_HT_SW_SWINF 8 /* Switch Info Register */
+#define PCI_HT_SW_SWINF_DP 0x0000001f /* Default Port */
+#define PCI_HT_SW_SWINF_EN 0x00000020 /* Enable Decode */
+#define PCI_HT_SW_SWINF_CR 0x00000040 /* Cold Reset */
+#define PCI_HT_SW_SWINF_PCIDX 0x00000f00 /* Performance Counter Index */
+#define PCI_HT_SW_SWINF_BLRIDX 0x0003f000 /* Base/Limit Range Index */
+#define PCI_HT_SW_SWINF_SBIDX 0x00002000 /* Secondary Base Range Index */
+#define PCI_HT_SW_SWINF_HP 0x00040000 /* Hot Plug */
+#define PCI_HT_SW_SWINF_HIDE 0x00080000 /* Hide Port */
+#define PCI_HT_SW_PCD 12 /* Performance Counter Data Register */
+#define PCI_HT_SW_BLRD 16 /* Base/Limit Range Data Register */
+#define PCI_HT_SW_SBD 20 /* Secondary Base Data Register */
+#define PCI_HT_SW_SIZEOF 24
+
+ /* Counter indices */
+#define PCI_HT_SW_PC_PCR 0x0 /* Posted Command Receive */
+#define PCI_HT_SW_PC_NPCR 0x1 /* Nonposted Command Receive */
+#define PCI_HT_SW_PC_RCR 0x2 /* Response Command Receive */
+#define PCI_HT_SW_PC_PDWR 0x3 /* Posted DW Receive */
+#define PCI_HT_SW_PC_NPDWR 0x4 /* Nonposted DW Receive */
+#define PCI_HT_SW_PC_RDWR 0x5 /* Response DW Receive */
+#define PCI_HT_SW_PC_PCT 0x6 /* Posted Command Transmit */
+#define PCI_HT_SW_PC_NPCT 0x7 /* Nonposted Command Transmit */
+#define PCI_HT_SW_PC_RCT 0x8 /* Response Command Transmit */
+#define PCI_HT_SW_PC_PDWT 0x9 /* Posted DW Transmit */
+#define PCI_HT_SW_PC_NPDWT 0xa /* Nonposted DW Transmit */
+#define PCI_HT_SW_PC_RDWT 0xb /* Response DW Transmit */
+
+ /* Base/Limit Range indices */
+#define PCI_HT_SW_BLR_BASE0_LO 0x0 /* Base 0[31:1], Enable */
+#define PCI_HT_SW_BLR_BASE0_HI 0x1 /* Base 0 Upper */
+#define PCI_HT_SW_BLR_LIM0_LO 0x2 /* Limit 0 Lower */
+#define PCI_HT_SW_BLR_LIM0_HI 0x3 /* Limit 0 Upper */
+
+ /* Secondary Base indices */
+#define PCI_HT_SW_SB_LO 0x0 /* Secondary Base[31:1], Enable */
+#define PCI_HT_SW_S0_HI 0x1 /* Secondary Base Upper */
+
+/* HyperTransport: Interrupt Discovery and Configuration */
+#define PCI_HT_IDC_IDX 2 /* Index Register */
+#define PCI_HT_IDC_DATA 4 /* Data Register */
+#define PCI_HT_IDC_SIZEOF 8
+
+ /* Register indices */
+#define PCI_HT_IDC_IDX_LINT 0x01 /* Last Interrupt Register */
+#define PCI_HT_IDC_LINT 0x00ff0000 /* Last interrupt definition */
+#define PCI_HT_IDC_IDX_IDR 0x10 /* Interrupt Definition Registers */
+ /* Low part (at index) */
+#define PCI_HT_IDC_IDR_MASK 0x10000001 /* Mask */
+#define PCI_HT_IDC_IDR_POL 0x10000002 /* Polarity */
+#define PCI_HT_IDC_IDR_II_2 0x1000001c /* IntrInfo[4:2]: Message Type */
+#define PCI_HT_IDC_IDR_II_5 0x10000020 /* IntrInfo[5]: Request EOI */
+#define PCI_HT_IDC_IDR_II_6 0x00ffffc0 /* IntrInfo[23:6] */
+#define PCI_HT_IDC_IDR_II_24 0xff000000 /* IntrInfo[31:24] */
+ /* High part (at index + 1) */
+#define PCI_HT_IDC_IDR_II_32 0x00ffffff /* IntrInfo[55:32] */
+#define PCI_HT_IDC_IDR_PASSPW 0x40000000 /* PassPW setting for messages */
+#define PCI_HT_IDC_IDR_WEOI 0x80000000 /* Waiting for EOI */
+
+/* HyperTransport: Revision ID */
+#define PCI_HT_RID_RID 2 /* Revision Register */
+#define PCI_HT_RID_SIZEOF 4
+
+/* HyperTransport: UnitID Clumping */
+#define PCI_HT_UIDC_CS 4 /* Clumping Support Register */
+#define PCI_HT_UIDC_CE 8 /* Clumping Enable Register */
+#define PCI_HT_UIDC_SIZEOF 12
+
+/* HyperTransport: Extended Configuration Space Access */
+#define PCI_HT_ECSA_ADDR 4 /* Configuration Address Register */
+#define PCI_HT_ECSA_ADDR_REG 0x00000ffc /* Register */
+#define PCI_HT_ECSA_ADDR_FUN 0x00007000 /* Function */
+#define PCI_HT_ECSA_ADDR_DEV 0x000f1000 /* Device */
+#define PCI_HT_ECSA_ADDR_BUS 0x0ff00000 /* Bus Number */
+#define PCI_HT_ECSA_ADDR_TYPE 0x10000000 /* Access Type */
+#define PCI_HT_ECSA_DATA 8 /* Configuration Data Register */
+#define PCI_HT_ECSA_SIZEOF 12
+
+/* HyperTransport: Address Mapping */
+#define PCI_HT_AM_CMD 2 /* Command Register */
+#define PCI_HT_AM_CMD_NDMA 0x000f /* Number of DMA Mappings */
+#define PCI_HT_AM_CMD_IOSIZ 0x01f0 /* I/O Size */
+#define PCI_HT_AM_CMD_MT 0x0600 /* Map Type */
+#define PCI_HT_AM_CMD_MT_40B 0x0000 /* 40-bit */
+#define PCI_HT_AM_CMD_MT_64B 0x0200 /* 64-bit */
+
+ /* Window Control Register bits */
+#define PCI_HT_AM_SBW_CTR_COMP 0x1 /* Compat */
+#define PCI_HT_AM_SBW_CTR_NCOH 0x2 /* NonCoherent */
+#define PCI_HT_AM_SBW_CTR_ISOC 0x4 /* Isochronous */
+#define PCI_HT_AM_SBW_CTR_EN 0x8 /* Enable */
+
+/* HyperTransport: 40-bit Address Mapping */
+#define PCI_HT_AM40_SBNPW 4 /* Secondary Bus Non-Prefetchable Window Register */
+#define PCI_HT_AM40_SBW_BASE 0x000fffff /* Window Base */
+#define PCI_HT_AM40_SBW_CTR 0xf0000000 /* Window Control */
+#define PCI_HT_AM40_SBPW 8 /* Secondary Bus Prefetchable Window Register */
+#define PCI_HT_AM40_DMA_PBASE0 12 /* DMA Window Primary Base 0 Register */
+#define PCI_HT_AM40_DMA_CTR0 15 /* DMA Window Control 0 Register */
+#define PCI_HT_AM40_DMA_CTR_CTR 0xf0 /* Window Control */
+#define PCI_HT_AM40_DMA_SLIM0 16 /* DMA Window Secondary Limit 0 Register */
+#define PCI_HT_AM40_DMA_SBASE0 18 /* DMA Window Secondary Base 0 Register */
+#define PCI_HT_AM40_SIZEOF 12 /* size is variable: 12 + 8 * NDMA */
+
+/* HyperTransport: 64-bit Address Mapping */
+#define PCI_HT_AM64_IDX 4 /* Index Register */
+#define PCI_HT_AM64_DATA_LO 8 /* Data Lower Register */
+#define PCI_HT_AM64_DATA_HI 12 /* Data Upper Register */
+#define PCI_HT_AM64_SIZEOF 16
+
+ /* Register indices */
+#define PCI_HT_AM64_IDX_SBNPW 0x00 /* Secondary Bus Non-Prefetchable Window Register */
+#define PCI_HT_AM64_W_BASE_LO 0xfff00000 /* Window Base Lower */
+#define PCI_HT_AM64_W_CTR 0x0000000f /* Window Control */
+#define PCI_HT_AM64_IDX_SBPW 0x01 /* Secondary Bus Prefetchable Window Register */
+#define PCI_HT_AM64_IDX_PBNPW 0x02 /* Primary Bus Non-Prefetchable Window Register */
+#define PCI_HT_AM64_IDX_DMAPB0 0x04 /* DMA Window Primary Base 0 Register */
+#define PCI_HT_AM64_IDX_DMASB0 0x05 /* DMA Window Secondary Base 0 Register */
+#define PCI_HT_AM64_IDX_DMASL0 0x06 /* DMA Window Secondary Limit 0 Register */
+
+/* HyperTransport: MSI Mapping */
+#define PCI_HT_MSIM_CMD 2 /* Command Register */
+#define PCI_HT_MSIM_CMD_EN 0x0001 /* Mapping Active */
+#define PCI_HT_MSIM_CMD_FIXD 0x0002 /* MSI Mapping Address Fixed */
+#define PCI_HT_MSIM_ADDR_LO 4 /* MSI Mapping Address Lower Register */
+#define PCI_HT_MSIM_ADDR_HI 8 /* MSI Mapping Address Upper Register */
+#define PCI_HT_MSIM_SIZEOF 12
+
+/* HyperTransport: DirectRoute */
+#define PCI_HT_DR_CMD 2 /* Command Register */
+#define PCI_HT_DR_CMD_NDRS 0x000f /* Number of DirectRoute Spaces */
+#define PCI_HT_DR_CMD_IDX 0x01f0 /* Index */
+#define PCI_HT_DR_EN 4 /* Enable Vector Register */
+#define PCI_HT_DR_DATA 8 /* Data Register */
+#define PCI_HT_DR_SIZEOF 12
+
+ /* Register indices */
+#define PCI_HT_DR_IDX_BASE_LO 0x00 /* DirectRoute Base Lower Register */
+#define PCI_HT_DR_OTNRD 0x00000001 /* Opposite to Normal Request Direction */
+#define PCI_HT_DR_BL_LO 0xffffff00 /* Base/Limit Lower */
+#define PCI_HT_DR_IDX_BASE_HI 0x01 /* DirectRoute Base Upper Register */
+#define PCI_HT_DR_IDX_LIMIT_LO 0x02 /* DirectRoute Limit Lower Register */
+#define PCI_HT_DR_IDX_LIMIT_HI 0x03 /* DirectRoute Limit Upper Register */
+
+/* HyperTransport: VCSet */
+#define PCI_HT_VCS_SUP 4 /* VCSets Supported Register */
+#define PCI_HT_VCS_L1EN 5 /* Link 1 VCSets Enabled Register */
+#define PCI_HT_VCS_L0EN 6 /* Link 0 VCSets Enabled Register */
+#define PCI_HT_VCS_SBD 8 /* Stream Bucket Depth Register */
+#define PCI_HT_VCS_SINT 9 /* Stream Interval Register */
+#define PCI_HT_VCS_SSUP 10 /* Number of Streaming VCs Supported Register */
+#define PCI_HT_VCS_SSUP_0 0x00 /* Streaming VC 0 */
+#define PCI_HT_VCS_SSUP_3 0x01 /* Streaming VCs 0-3 */
+#define PCI_HT_VCS_SSUP_15 0x02 /* Streaming VCs 0-15 */
+#define PCI_HT_VCS_NFCBD 12 /* Non-FC Bucket Depth Register */
+#define PCI_HT_VCS_NFCINT 13 /* Non-FC Bucket Interval Register */
+#define PCI_HT_VCS_SIZEOF 16
+
+/* HyperTransport: Retry Mode */
+#define PCI_HT_RM_CTR0 4 /* Control 0 Register */
+#define PCI_HT_RM_CTR_LRETEN 0x01 /* Link Retry Enable */
+#define PCI_HT_RM_CTR_FSER 0x02 /* Force Single Error */
+#define PCI_HT_RM_CTR_ROLNEN 0x04 /* Rollover Nonfatal Enable */
+#define PCI_HT_RM_CTR_FSS 0x08 /* Force Single Stomp */
+#define PCI_HT_RM_CTR_RETNEN 0x10 /* Retry Nonfatal Enable */
+#define PCI_HT_RM_CTR_RETFEN 0x20 /* Retry Fatal Enable */
+#define PCI_HT_RM_CTR_AA 0xc0 /* Allowed Attempts */
+#define PCI_HT_RM_STS0 5 /* Status 0 Register */
+#define PCI_HT_RM_STS_RETSNT 0x01 /* Retry Sent */
+#define PCI_HT_RM_STS_CNTROL 0x02 /* Count Rollover */
+#define PCI_HT_RM_STS_SRCV 0x04 /* Stomp Received */
+#define PCI_HT_RM_CTR1 6 /* Control 1 Register */
+#define PCI_HT_RM_STS1 7 /* Status 1 Register */
+#define PCI_HT_RM_CNT0 8 /* Retry Count 0 Register */
+#define PCI_HT_RM_CNT1 10 /* Retry Count 1 Register */
+#define PCI_HT_RM_SIZEOF 12
+
+/* Vendor-Specific Capability (see PCI_EVNDR_xxx for the PCIe version) */
+#define PCI_VNDR_LENGTH 2 /* Length byte */
+
+/* PCI Express */
+#define PCI_EXP_FLAGS 0x2 /* Capabilities register */
+#define PCI_EXP_FLAGS_VERS 0x000f /* Capability version */
+#define PCI_EXP_FLAGS_TYPE 0x00f0 /* Device/Port type */
+#define PCI_EXP_TYPE_ENDPOINT 0x0 /* Express Endpoint */
+#define PCI_EXP_TYPE_LEG_END 0x1 /* Legacy Endpoint */
+#define PCI_EXP_TYPE_ROOT_PORT 0x4 /* Root Port */
+#define PCI_EXP_TYPE_UPSTREAM 0x5 /* Upstream Port */
+#define PCI_EXP_TYPE_DOWNSTREAM 0x6 /* Downstream Port */
+#define PCI_EXP_TYPE_PCI_BRIDGE 0x7 /* PCI/PCI-X Bridge */
+#define PCI_EXP_TYPE_PCIE_BRIDGE 0x8 /* PCI/PCI-X to PCIE Bridge */
+#define PCI_EXP_TYPE_ROOT_INT_EP 0x9 /* Root Complex Integrated Endpoint */
+#define PCI_EXP_TYPE_ROOT_EC 0xa /* Root Complex Event Collector */
+#define PCI_EXP_FLAGS_SLOT 0x0100 /* Slot implemented */
+#define PCI_EXP_FLAGS_IRQ 0x3e00 /* Interrupt message number */
+#define PCI_EXP_DEVCAP 0x4 /* Device capabilities */
+#define PCI_EXP_DEVCAP_PAYLOAD 0x07 /* Max_Payload_Size */
+#define PCI_EXP_DEVCAP_PHANTOM 0x18 /* Phantom functions */
+#define PCI_EXP_DEVCAP_EXT_TAG 0x20 /* Extended tags */
+#define PCI_EXP_DEVCAP_L0S 0x1c0 /* L0s Acceptable Latency */
+#define PCI_EXP_DEVCAP_L1 0xe00 /* L1 Acceptable Latency */
+#define PCI_EXP_DEVCAP_ATN_BUT 0x1000 /* Attention Button Present */
+#define PCI_EXP_DEVCAP_ATN_IND 0x2000 /* Attention Indicator Present */
+#define PCI_EXP_DEVCAP_PWR_IND 0x4000 /* Power Indicator Present */
+#define PCI_EXP_DEVCAP_RBE 0x8000 /* Role-Based Error Reporting */
+#define PCI_EXP_DEVCAP_PWR_VAL 0x3fc0000 /* Slot Power Limit Value */
+#define PCI_EXP_DEVCAP_PWR_SCL 0xc000000 /* Slot Power Limit Scale */
+#define PCI_EXP_DEVCAP_FLRESET 0x10000000 /* Function-Level Reset */
+#define PCI_EXP_DEVCTL 0x8 /* Device Control */
+#define PCI_EXP_DEVCTL_CERE 0x0001 /* Correctable Error Reporting En. */
+#define PCI_EXP_DEVCTL_NFERE 0x0002 /* Non-Fatal Error Reporting Enable */
+#define PCI_EXP_DEVCTL_FERE 0x0004 /* Fatal Error Reporting Enable */
+#define PCI_EXP_DEVCTL_URRE 0x0008 /* Unsupported Request Reporting En. */
+#define PCI_EXP_DEVCTL_RELAXED 0x0010 /* Enable Relaxed Ordering */
+#define PCI_EXP_DEVCTL_PAYLOAD 0x00e0 /* Max_Payload_Size */
+#define PCI_EXP_DEVCTL_EXT_TAG 0x0100 /* Extended Tag Field Enable */
+#define PCI_EXP_DEVCTL_PHANTOM 0x0200 /* Phantom Functions Enable */
+#define PCI_EXP_DEVCTL_AUX_PME 0x0400 /* Auxiliary Power PM Enable */
+#define PCI_EXP_DEVCTL_NOSNOOP 0x0800 /* Enable No Snoop */
+#define PCI_EXP_DEVCTL_READRQ 0x7000 /* Max_Read_Request_Size */
+#define PCI_EXP_DEVCTL_BCRE 0x8000 /* Bridge Configuration Retry Enable */
+#define PCI_EXP_DEVCTL_FLRESET 0x8000 /* Function-Level Reset [bit shared with BCRE] */
+#define PCI_EXP_DEVSTA 0xa /* Device Status */
+#define PCI_EXP_DEVSTA_CED 0x01 /* Correctable Error Detected */
+#define PCI_EXP_DEVSTA_NFED 0x02 /* Non-Fatal Error Detected */
+#define PCI_EXP_DEVSTA_FED 0x04 /* Fatal Error Detected */
+#define PCI_EXP_DEVSTA_URD 0x08 /* Unsupported Request Detected */
+#define PCI_EXP_DEVSTA_AUXPD 0x10 /* AUX Power Detected */
+#define PCI_EXP_DEVSTA_TRPND 0x20 /* Transactions Pending */
+#define PCI_EXP_LNKCAP 0xc /* Link Capabilities */
+#define PCI_EXP_LNKCAP_SPEED 0x0000f /* Maximum Link Speed */
+#define PCI_EXP_LNKCAP_WIDTH 0x003f0 /* Maximum Link Width */
+#define PCI_EXP_LNKCAP_ASPM 0x00c00 /* Active State Power Management */
+#define PCI_EXP_LNKCAP_L0S 0x07000 /* L0s Acceptable Latency */
+#define PCI_EXP_LNKCAP_L1 0x38000 /* L1 Acceptable Latency */
+#define PCI_EXP_LNKCAP_CLOCKPM 0x40000 /* Clock Power Management */
+#define PCI_EXP_LNKCAP_SURPRISE 0x80000 /* Surprise Down Error Reporting */
+#define PCI_EXP_LNKCAP_DLLA 0x100000 /* Data Link Layer Active Reporting */
+#define PCI_EXP_LNKCAP_LBNC 0x200000 /* Link Bandwidth Notification Capability */
+#define PCI_EXP_LNKCAP_PORT 0xff000000 /* Port Number */
+#define PCI_EXP_LNKCTL 0x10 /* Link Control */
+#define PCI_EXP_LNKCTL_ASPM 0x0003 /* ASPM Control */
+#define PCI_EXP_LNKCTL_RCB 0x0008 /* Read Completion Boundary */
+#define PCI_EXP_LNKCTL_DISABLE 0x0010 /* Link Disable */
+#define PCI_EXP_LNKCTL_RETRAIN 0x0020 /* Retrain Link */
+#define PCI_EXP_LNKCTL_CLOCK 0x0040 /* Common Clock Configuration */
+#define PCI_EXP_LNKCTL_XSYNCH 0x0080 /* Extended Synch */
+#define PCI_EXP_LNKCTL_CLOCKPM 0x0100 /* Clock Power Management */
+#define PCI_EXP_LNKCTL_HWAUTWD 0x0200 /* Hardware Autonomous Width Disable */
+#define PCI_EXP_LNKCTL_BWMIE 0x0400 /* Bandwidth Mgmt Interrupt Enable */
+#define PCI_EXP_LNKCTL_AUTBWIE 0x0800 /* Autonomous Bandwidth Mgmt Interrupt Enable */
+#define PCI_EXP_LNKSTA 0x12 /* Link Status */
+#define PCI_EXP_LNKSTA_SPEED 0x000f /* Negotiated Link Speed */
+#define PCI_EXP_LNKSTA_WIDTH 0x03f0 /* Negotiated Link Width */
+#define PCI_EXP_LNKSTA_TR_ERR 0x0400 /* Training Error (obsolete) */
+#define PCI_EXP_LNKSTA_TRAIN 0x0800 /* Link Training */
+#define PCI_EXP_LNKSTA_SL_CLK 0x1000 /* Slot Clock Configuration */
+#define PCI_EXP_LNKSTA_DL_ACT 0x2000 /* Data Link Layer in DL_Active State */
+#define PCI_EXP_LNKSTA_BWMGMT 0x4000 /* Bandwidth Mgmt Status */
+#define PCI_EXP_LNKSTA_AUTBW 0x8000 /* Autonomous Bandwidth Mgmt Status */
+#define PCI_EXP_SLTCAP 0x14 /* Slot Capabilities */
+#define PCI_EXP_SLTCAP_ATNB 0x0001 /* Attention Button Present */
+#define PCI_EXP_SLTCAP_PWRC 0x0002 /* Power Controller Present */
+#define PCI_EXP_SLTCAP_MRL 0x0004 /* MRL Sensor Present */
+#define PCI_EXP_SLTCAP_ATNI 0x0008 /* Attention Indicator Present */
+#define PCI_EXP_SLTCAP_PWRI 0x0010 /* Power Indicator Present */
+#define PCI_EXP_SLTCAP_HPS 0x0020 /* Hot-Plug Surprise */
+#define PCI_EXP_SLTCAP_HPC 0x0040 /* Hot-Plug Capable */
+#define PCI_EXP_SLTCAP_PWR_VAL 0x00007f80 /* Slot Power Limit Value */
+#define PCI_EXP_SLTCAP_PWR_SCL 0x00018000 /* Slot Power Limit Scale */
+#define PCI_EXP_SLTCAP_INTERLOCK 0x020000 /* Electromechanical Interlock Present */
+#define PCI_EXP_SLTCAP_NOCMDCOMP 0x040000 /* No Command Completed Support */
+#define PCI_EXP_SLTCAP_PSN 0xfff80000 /* Physical Slot Number */
+#define PCI_EXP_SLTCTL 0x18 /* Slot Control */
+#define PCI_EXP_SLTCTL_ATNB 0x0001 /* Attention Button Pressed Enable */
+#define PCI_EXP_SLTCTL_PWRF 0x0002 /* Power Fault Detected Enable */
+#define PCI_EXP_SLTCTL_MRLS 0x0004 /* MRL Sensor Changed Enable */
+#define PCI_EXP_SLTCTL_PRSD 0x0008 /* Presence Detect Changed Enable */
+#define PCI_EXP_SLTCTL_CMDC 0x0010 /* Command Completed Interrupt Enable */
+#define PCI_EXP_SLTCTL_HPIE 0x0020 /* Hot-Plug Interrupt Enable */
+#define PCI_EXP_SLTCTL_ATNI 0x00c0 /* Attention Indicator Control */
+#define PCI_EXP_SLTCTL_PWRI 0x0300 /* Power Indicator Control */
+#define PCI_EXP_SLTCTL_PWRC 0x0400 /* Power Controller Control */
+#define PCI_EXP_SLTCTL_INTERLOCK 0x0800 /* Electromechanical Interlock Control */
+#define PCI_EXP_SLTCTL_LLCHG 0x1000 /* Data Link Layer State Changed Enable */
+#define PCI_EXP_SLTSTA 0x1a /* Slot Status */
+#define PCI_EXP_SLTSTA_ATNB 0x0001 /* Attention Button Pressed */
+#define PCI_EXP_SLTSTA_PWRF 0x0002 /* Power Fault Detected */
+#define PCI_EXP_SLTSTA_MRLS 0x0004 /* MRL Sensor Changed */
+#define PCI_EXP_SLTSTA_PRSD 0x0008 /* Presence Detect Changed */
+#define PCI_EXP_SLTSTA_CMDC 0x0010 /* Command Completed */
+#define PCI_EXP_SLTSTA_MRL_ST 0x0020 /* MRL Sensor State */
+#define PCI_EXP_SLTSTA_PRES 0x0040 /* Presence Detect State */
+#define PCI_EXP_SLTSTA_INTERLOCK 0x0080 /* Electromechanical Interlock Status */
+#define PCI_EXP_SLTSTA_LLCHG 0x0100 /* Data Link Layer State Changed */
+#define PCI_EXP_RTCTL 0x1c /* Root Control */
+#define PCI_EXP_RTCTL_SECEE 0x0001 /* System Error on Correctable Error */
+#define PCI_EXP_RTCTL_SENFEE 0x0002 /* System Error on Non-Fatal Error */
+#define PCI_EXP_RTCTL_SEFEE 0x0004 /* System Error on Fatal Error */
+#define PCI_EXP_RTCTL_PMEIE 0x0008 /* PME Interrupt Enable */
+#define PCI_EXP_RTCTL_CRSVIS 0x0010 /* Configuration Request Retry Status Visible to SW */
+#define PCI_EXP_RTCAP 0x1e /* Root Capabilities */
+#define PCI_EXP_RTCAP_CRSVIS 0x0010 /* Configuration Request Retry Status Visible to SW */
+#define PCI_EXP_RTSTA 0x20 /* Root Status */
+#define PCI_EXP_RTSTA_PME_REQID 0x0000ffff /* PME Requester ID */
+#define PCI_EXP_RTSTA_PME_STATUS 0x00010000 /* PME Status */
+#define PCI_EXP_RTSTA_PME_PENDING 0x00020000 /* PME is Pending */
+#define PCI_EXP_DEVCAP2 0x24 /* Device capabilities 2 */
+#define PCI_EXP_DEVCTL2 0x28 /* Device Control */
+#define PCI_EXP_DEV2_TIMEOUT_RANGE(x) ((x) & 0xf) /* Completion Timeout Ranges Supported */
+#define PCI_EXP_DEV2_TIMEOUT_VALUE(x) ((x) & 0xf) /* Completion Timeout Value */
+#define PCI_EXP_DEV2_TIMEOUT_DIS 0x0010 /* Completion Timeout Disable Supported */
+#define PCI_EXP_DEV2_ARI 0x0020 /* ARI Forwarding */
+#define PCI_EXP_DEVSTA2 0x2a /* Device Status */
+#define PCI_EXP_LNKCAP2 0x2c /* Link Capabilities */
+#define PCI_EXP_LNKCTL2 0x30 /* Link Control */
+#define PCI_EXP_LNKCTL2_SPEED(x) ((x) & 0xf) /* Target Link Speed */
+#define PCI_EXP_LNKCTL2_CMPLNC 0x0010 /* Enter Compliance */
+#define PCI_EXP_LNKCTL2_SPEED_DIS 0x0020 /* Hardware Autonomous Speed Disable */
+#define PCI_EXP_LNKCTL2_DEEMPHASIS(x) (((x) >> 6) & 1) /* Selectable De-emphasis */
+#define PCI_EXP_LNKCTL2_MARGIN(x) (((x) >> 7) & 7) /* Transmit Margin */
+#define PCI_EXP_LNKCTL2_MOD_CMPLNC 0x0400 /* Enter Modified Compliance */
+#define PCI_EXP_LNKCTL2_CMPLNC_SOS 0x0800 /* Compliance SOS */
+#define PCI_EXP_LNKCTL2_COM_DEEMPHASIS(x) (((x) >> 12) & 1) /* Compliance De-emphasis */
+#define PCI_EXP_LNKSTA2 0x32 /* Link Status */
+#define PCI_EXP_LINKSTA2_DEEMPHASIS(x) ((x) & 1) /* Current De-emphasis Level */
+#define PCI_EXP_SLTCAP2 0x34 /* Slot Capabilities */
+#define PCI_EXP_SLTCTL2 0x38 /* Slot Control */
+#define PCI_EXP_SLTSTA2 0x3a /* Slot Status */
+
+/* MSI-X */
+#define PCI_MSIX_ENABLE 0x8000
+#define PCI_MSIX_MASK 0x4000
+#define PCI_MSIX_TABSIZE 0x07ff
+#define PCI_MSIX_TABLE 4
+#define PCI_MSIX_PBA 8
+#define PCI_MSIX_BIR 0x7
+
+/* Subsystem vendor/device ID for PCI bridges */
+#define PCI_SSVID_VENDOR 4
+#define PCI_SSVID_DEVICE 6
+
+/* PCI Advanced Features */
+#define PCI_AF_CAP 3
+#define PCI_AF_CAP_TP 0x01
+#define PCI_AF_CAP_FLR 0x02
+#define PCI_AF_CTRL 4
+#define PCI_AF_CTRL_FLR 0x01
+#define PCI_AF_STATUS 5
+#define PCI_AF_STATUS_TP 0x01
+
+/* SATA Host Bus Adapter */
+#define PCI_SATA_HBA_BARS 4
+#define PCI_SATA_HBA_REG0 8
+
+/*** Definitions of extended capabilities ***/
+
+/* Advanced Error Reporting */
+#define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */
+#define PCI_ERR_UNC_TRAIN 0x00000001 /* Undefined in PCIe rev1.1 & 2.0 spec */
+#define PCI_ERR_UNC_DLP 0x00000010 /* Data Link Protocol */
+#define PCI_ERR_UNC_SDES 0x00000020 /* Surprise Down Error */
+#define PCI_ERR_UNC_POISON_TLP 0x00001000 /* Poisoned TLP */
+#define PCI_ERR_UNC_FCP 0x00002000 /* Flow Control Protocol */
+#define PCI_ERR_UNC_COMP_TIME 0x00004000 /* Completion Timeout */
+#define PCI_ERR_UNC_COMP_ABORT 0x00008000 /* Completer Abort */
+#define PCI_ERR_UNC_UNX_COMP 0x00010000 /* Unexpected Completion */
+#define PCI_ERR_UNC_RX_OVER 0x00020000 /* Receiver Overflow */
+#define PCI_ERR_UNC_MALF_TLP 0x00040000 /* Malformed TLP */
+#define PCI_ERR_UNC_ECRC 0x00080000 /* ECRC Error Status */
+#define PCI_ERR_UNC_UNSUP 0x00100000 /* Unsupported Request */
+#define PCI_ERR_UNC_ACS_VIOL 0x00200000 /* ACS Violation */
+#define PCI_ERR_UNCOR_MASK 8 /* Uncorrectable Error Mask */
+ /* Same bits as above */
+#define PCI_ERR_UNCOR_SEVER 12 /* Uncorrectable Error Severity */
+ /* Same bits as above */
+#define PCI_ERR_COR_STATUS 16 /* Correctable Error Status */
+#define PCI_ERR_COR_RCVR 0x00000001 /* Receiver Error Status */
+#define PCI_ERR_COR_BAD_TLP 0x00000040 /* Bad TLP Status */
+#define PCI_ERR_COR_BAD_DLLP 0x00000080 /* Bad DLLP Status */
+#define PCI_ERR_COR_REP_ROLL 0x00000100 /* REPLAY_NUM Rollover */
+#define PCI_ERR_COR_REP_TIMER 0x00001000 /* Replay Timer Timeout */
+#define PCI_ERR_COR_REP_ANFE 0x00002000 /* Advisory Non-Fatal Error */
+#define PCI_ERR_COR_MASK 20 /* Correctable Error Mask */
+ /* Same bits as above */
+#define PCI_ERR_CAP 24 /* Advanced Error Capabilities */
+#define PCI_ERR_CAP_FEP(x) ((x) & 31) /* First Error Pointer */
+#define PCI_ERR_CAP_ECRC_GENC 0x00000020 /* ECRC Generation Capable */
+#define PCI_ERR_CAP_ECRC_GENE 0x00000040 /* ECRC Generation Enable */
+#define PCI_ERR_CAP_ECRC_CHKC 0x00000080 /* ECRC Check Capable */
+#define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */
+#define PCI_ERR_HEADER_LOG 28 /* Header Log Register (16 bytes) */
+#define PCI_ERR_ROOT_COMMAND 44 /* Root Error Command */
+#define PCI_ERR_ROOT_STATUS 48
+#define PCI_ERR_ROOT_COR_SRC 52
+#define PCI_ERR_ROOT_SRC 54
+
+/* Virtual Channel */
+#define PCI_VC_PORT_REG1 4
+#define PCI_VC_PORT_REG2 8
+#define PCI_VC_PORT_CTRL 12
+#define PCI_VC_PORT_STATUS 14
+#define PCI_VC_RES_CAP 16
+#define PCI_VC_RES_CTRL 20
+#define PCI_VC_RES_STATUS 26
+
+/* Power Budgeting */
+#define PCI_PWR_DSR 4 /* Data Select Register */
+#define PCI_PWR_DATA 8 /* Data Register */
+#define PCI_PWR_DATA_BASE(x) ((x) & 0xff) /* Base Power */
+#define PCI_PWR_DATA_SCALE(x) (((x) >> 8) & 3) /* Data Scale */
+#define PCI_PWR_DATA_PM_SUB(x) (((x) >> 10) & 7) /* PM Sub State */
+#define PCI_PWR_DATA_PM_STATE(x) (((x) >> 13) & 3) /* PM State */
+#define PCI_PWR_DATA_TYPE(x) (((x) >> 15) & 7) /* Type */
+#define PCI_PWR_DATA_RAIL(x) (((x) >> 18) & 7) /* Power Rail */
+#define PCI_PWR_CAP 12 /* Capability */
+#define PCI_PWR_CAP_BUDGET(x) ((x) & 1) /* Included in system budget */
+
+/* Root Complex Link */
+#define PCI_RCLINK_ESD 4 /* Element Self Description */
+#define PCI_RCLINK_LINK1 16 /* First Link Entry */
+#define PCI_RCLINK_LINK_DESC 0 /* Link Entry: Description */
+#define PCI_RCLINK_LINK_ADDR 8 /* Link Entry: Address (64-bit) */
+#define PCI_RCLINK_LINK_SIZE 16 /* Link Entry: sizeof */
+
+/* PCIe Vendor-Specific Capability */
+#define PCI_EVNDR_HEADER 4 /* Vendor-Specific Header */
+#define PCI_EVNDR_REGISTERS 8 /* Vendor-Specific Registers */
+
+/* Access Control Services */
+#define PCI_ACS_CAP 0x04 /* ACS Capability Register */
+#define PCI_ACS_CAP_VALID 0x0001 /* ACS Source Validation */
+#define PCI_ACS_CAP_BLOCK 0x0002 /* ACS Translation Blocking */
+#define PCI_ACS_CAP_REQ_RED 0x0004 /* ACS P2P Request Redirect */
+#define PCI_ACS_CAP_CMPLT_RED 0x0008 /* ACS P2P Completion Redirect */
+#define PCI_ACS_CAP_FORWARD 0x0010 /* ACS Upstream Forwarding */
+#define PCI_ACS_CAP_EGRESS 0x0020 /* ACS P2P Egress Control */
+#define PCI_ACS_CAP_TRANS 0x0040 /* ACS Direct Translated P2P */
+#define PCI_ACS_CAP_VECTOR(x) (((x) >> 8) & 0xff) /* Egress Control Vector Size */
+#define PCI_ACS_CTRL 0x06 /* ACS Control Register */
+#define PCI_ACS_CTRL_VALID 0x0001 /* ACS Source Validation Enable */
+#define PCI_ACS_CTRL_BLOCK 0x0002 /* ACS Translation Blocking Enable */
+#define PCI_ACS_CTRL_REQ_RED 0x0004 /* ACS P2P Request Redirect Enable */
+#define PCI_ACS_CTRL_CMPLT_RED 0x0008 /* ACS P2P Completion Redirect Enable */
+#define PCI_ACS_CTRL_FORWARD 0x0010 /* ACS Upstream Forwarding Enable */
+#define PCI_ACS_CTRL_EGRESS 0x0020 /* ACS P2P Egress Control Enable */
+#define PCI_ACS_CTRL_TRANS 0x0040 /* ACS Direct Translated P2P Enable */
+#define PCI_ACS_EGRESS_CTRL 0x08 /* Egress Control Vector */
+
+/* Alternative Routing-ID Interpretation */
+#define PCI_ARI_CAP 0x04 /* ARI Capability Register */
+#define PCI_ARI_CAP_MFVC 0x0001 /* MFVC Function Groups Capability */
+#define PCI_ARI_CAP_ACS 0x0002 /* ACS Function Groups Capability */
+#define PCI_ARI_CAP_NFN(x) (((x) >> 8) & 0xff) /* Next Function Number */
+#define PCI_ARI_CTRL 0x06 /* ARI Control Register */
+#define PCI_ARI_CTRL_MFVC 0x0001 /* MFVC Function Groups Enable */
+#define PCI_ARI_CTRL_ACS 0x0002 /* ACS Function Groups Enable */
+#define PCI_ARI_CTRL_FG(x) (((x) >> 4) & 7) /* Function Group */
+
+/* Address Translation Service */
+#define PCI_ATS_CAP 0x04 /* ATS Capability Register */
+#define PCI_ATS_CAP_IQD(x) ((x) & 0x1f) /* Invalidate Queue Depth */
+#define PCI_ATS_CTRL 0x06 /* ATS Control Register */
+#define PCI_ATS_CTRL_STU(x) ((x) & 0x1f) /* Smallest Translation Unit */
+#define PCI_ATS_CTRL_ENABLE 0x8000 /* ATS Enable */
+
+/* Single Root I/O Virtualization */
+#define PCI_IOV_CAP 0x04 /* SR-IOV Capability Register */
+#define PCI_IOV_CAP_VFM 0x00000001 /* VF Migration Capable */
+#define PCI_IOV_CAP_IMN(x) ((x) >> 21) /* VF Migration Interrupt Message Number */
+#define PCI_IOV_CTRL 0x08 /* SR-IOV Control Register */
+#define PCI_IOV_CTRL_VFE 0x0001 /* VF Enable */
+#define PCI_IOV_CTRL_VFME 0x0002 /* VF Migration Enable */
+#define PCI_IOV_CTRL_VFMIE 0x0004 /* VF Migration Interrupt Enable */
+#define PCI_IOV_CTRL_MSE 0x0008 /* VF MSE */
+#define PCI_IOV_CTRL_ARI 0x0010 /* ARI Capable Hierarchy */
+#define PCI_IOV_STATUS 0x0a /* SR-IOV Status Register */
+#define PCI_IOV_STATUS_MS 0x0001 /* VF Migration Status */
+#define PCI_IOV_INITIALVF 0x0c /* Number of VFs that are initially associated */
+#define PCI_IOV_TOTALVF 0x0e /* Maximum number of VFs that could be associated */
+#define PCI_IOV_NUMVF 0x10 /* Number of VFs that are available */
+#define PCI_IOV_FDL 0x12 /* Function Dependency Link */
+#define PCI_IOV_OFFSET 0x14 /* First VF Offset */
+#define PCI_IOV_STRIDE 0x16 /* Routing ID offset from one VF to the next one */
+#define PCI_IOV_DID 0x1a /* VF Device ID */
+#define PCI_IOV_SUPPS 0x1c /* Supported Page Sizes */
+#define PCI_IOV_SYSPS 0x20 /* System Page Size */
+#define PCI_IOV_BAR_BASE 0x24 /* VF BAR0, VF BAR1, ... VF BAR5 */
+#define PCI_IOV_NUM_BAR 6 /* Number of VF BARs */
+#define PCI_IOV_MSAO 0x3c /* VF Migration State Array Offset */
+#define PCI_IOV_MSA_BIR(x) ((x) & 7) /* VF Migration State BIR */
+#define PCI_IOV_MSA_OFFSET(x) ((x) & 0xfffffff8) /* VF Migration State Offset */
+
+/* Transaction Processing Hints */
+#define PCI_TPH_CAPABILITIES 4
+#define PCI_TPH_INTVEC_SUP (1<<1) /* Supports interrupt vector mode */
+#define PCI_TPH_DEV_SUP (1<<2) /* Device specific mode supported */
+#define PCI_TPH_EXT_REQ_SUP (1<<8) /* Supports extended requests */
+#define PCI_TPH_ST_LOC_MASK (3<<9) /* Steering table location bits */
+#define PCI_TPH_ST_NONE (0<<9) /* No steering table */
+#define PCI_TPH_ST_CAP (1<<9) /* Steering table in TPH cap */
+#define PCI_TPH_ST_MSIX (2<<9) /* Steering table in MSI-X table */
+#define PCI_TPH_ST_SIZE_SHIFT (16) /* Encoded as size - 1 */
+
+/* Latency Tolerance Reporting */
+#define PCI_LTR_MAX_SNOOP 4 /* 16 bit value */
+#define PCI_LTR_VALUE_MASK (0x3ff)
+#define PCI_LTR_SCALE_SHIFT (10)
+#define PCI_LTR_SCALE_MASK (7)
+#define PCI_LTR_MAX_NOSNOOP 6 /* 16 bit value */
+
+/*
+ * The PCI interface treats multi-function devices as independent
+ * devices. The slot/function address of each device is encoded
+ * in a single byte as follows:
+ *
+ * 7:3 = slot
+ * 2:0 = function
+ */
+#define PCI_DEVFN(slot,func) ((((slot) & 0x1f) << 3) | ((func) & 0x07))
+#define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f)
+#define PCI_FUNC(devfn) ((devfn) & 0x07)
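A minimal illustrative sketch of how these macros round-trip a slot/function pair through the single devfn byte; the values are arbitrary and <pci/pci.h> (which pulls in this header) plus <stdio.h> are assumed to be on the include path.

    #include <stdio.h>
    #include <pci/pci.h>

    int main(void)
    {
        unsigned int slot = 0x1f, func = 0x07;       /* maximum legal values */
        unsigned int devfn = PCI_DEVFN(slot, func);  /* bits 7:3 = slot, 2:0 = function */

        /* Decoding recovers the original pair. */
        printf("devfn=0x%02x slot=0x%02x func=%u\n",
               devfn, PCI_SLOT(devfn), PCI_FUNC(devfn));
        return 0;
    }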
+
+/* Device classes and subclasses */
+
+#define PCI_CLASS_NOT_DEFINED 0x0000
+#define PCI_CLASS_NOT_DEFINED_VGA 0x0001
+
+#define PCI_BASE_CLASS_STORAGE 0x01
+#define PCI_CLASS_STORAGE_SCSI 0x0100
+#define PCI_CLASS_STORAGE_IDE 0x0101
+#define PCI_CLASS_STORAGE_FLOPPY 0x0102
+#define PCI_CLASS_STORAGE_IPI 0x0103
+#define PCI_CLASS_STORAGE_RAID 0x0104
+#define PCI_CLASS_STORAGE_ATA 0x0105
+#define PCI_CLASS_STORAGE_SATA 0x0106
+#define PCI_CLASS_STORAGE_SAS 0x0107
+#define PCI_CLASS_STORAGE_OTHER 0x0180
+
+#define PCI_BASE_CLASS_NETWORK 0x02
+#define PCI_CLASS_NETWORK_ETHERNET 0x0200
+#define PCI_CLASS_NETWORK_TOKEN_RING 0x0201
+#define PCI_CLASS_NETWORK_FDDI 0x0202
+#define PCI_CLASS_NETWORK_ATM 0x0203
+#define PCI_CLASS_NETWORK_ISDN 0x0204
+#define PCI_CLASS_NETWORK_OTHER 0x0280
+
+#define PCI_BASE_CLASS_DISPLAY 0x03
+#define PCI_CLASS_DISPLAY_VGA 0x0300
+#define PCI_CLASS_DISPLAY_XGA 0x0301
+#define PCI_CLASS_DISPLAY_3D 0x0302
+#define PCI_CLASS_DISPLAY_OTHER 0x0380
+
+#define PCI_BASE_CLASS_MULTIMEDIA 0x04
+#define PCI_CLASS_MULTIMEDIA_VIDEO 0x0400
+#define PCI_CLASS_MULTIMEDIA_AUDIO 0x0401
+#define PCI_CLASS_MULTIMEDIA_PHONE 0x0402
+#define PCI_CLASS_MULTIMEDIA_AUDIO_DEV 0x0403
+#define PCI_CLASS_MULTIMEDIA_OTHER 0x0480
+
+#define PCI_BASE_CLASS_MEMORY 0x05
+#define PCI_CLASS_MEMORY_RAM 0x0500
+#define PCI_CLASS_MEMORY_FLASH 0x0501
+#define PCI_CLASS_MEMORY_OTHER 0x0580
+
+#define PCI_BASE_CLASS_BRIDGE 0x06
+#define PCI_CLASS_BRIDGE_HOST 0x0600
+#define PCI_CLASS_BRIDGE_ISA 0x0601
+#define PCI_CLASS_BRIDGE_EISA 0x0602
+#define PCI_CLASS_BRIDGE_MC 0x0603
+#define PCI_CLASS_BRIDGE_PCI 0x0604
+#define PCI_CLASS_BRIDGE_PCMCIA 0x0605
+#define PCI_CLASS_BRIDGE_NUBUS 0x0606
+#define PCI_CLASS_BRIDGE_CARDBUS 0x0607
+#define PCI_CLASS_BRIDGE_RACEWAY 0x0608
+#define PCI_CLASS_BRIDGE_PCI_SEMI 0x0609
+#define PCI_CLASS_BRIDGE_IB_TO_PCI 0x060a
+#define PCI_CLASS_BRIDGE_OTHER 0x0680
+
+#define PCI_BASE_CLASS_COMMUNICATION 0x07
+#define PCI_CLASS_COMMUNICATION_SERIAL 0x0700
+#define PCI_CLASS_COMMUNICATION_PARALLEL 0x0701
+#define PCI_CLASS_COMMUNICATION_MSERIAL 0x0702
+#define PCI_CLASS_COMMUNICATION_MODEM 0x0703
+#define PCI_CLASS_COMMUNICATION_OTHER 0x0780
+
+#define PCI_BASE_CLASS_SYSTEM 0x08
+#define PCI_CLASS_SYSTEM_PIC 0x0800
+#define PCI_CLASS_SYSTEM_DMA 0x0801
+#define PCI_CLASS_SYSTEM_TIMER 0x0802
+#define PCI_CLASS_SYSTEM_RTC 0x0803
+#define PCI_CLASS_SYSTEM_PCI_HOTPLUG 0x0804
+#define PCI_CLASS_SYSTEM_OTHER 0x0880
+
+#define PCI_BASE_CLASS_INPUT 0x09
+#define PCI_CLASS_INPUT_KEYBOARD 0x0900
+#define PCI_CLASS_INPUT_PEN 0x0901
+#define PCI_CLASS_INPUT_MOUSE 0x0902
+#define PCI_CLASS_INPUT_SCANNER 0x0903
+#define PCI_CLASS_INPUT_GAMEPORT 0x0904
+#define PCI_CLASS_INPUT_OTHER 0x0980
+
+#define PCI_BASE_CLASS_DOCKING 0x0a
+#define PCI_CLASS_DOCKING_GENERIC 0x0a00
+#define PCI_CLASS_DOCKING_OTHER 0x0a80
+
+#define PCI_BASE_CLASS_PROCESSOR 0x0b
+#define PCI_CLASS_PROCESSOR_386 0x0b00
+#define PCI_CLASS_PROCESSOR_486 0x0b01
+#define PCI_CLASS_PROCESSOR_PENTIUM 0x0b02
+#define PCI_CLASS_PROCESSOR_ALPHA 0x0b10
+#define PCI_CLASS_PROCESSOR_POWERPC 0x0b20
+#define PCI_CLASS_PROCESSOR_MIPS 0x0b30
+#define PCI_CLASS_PROCESSOR_CO 0x0b40
+
+#define PCI_BASE_CLASS_SERIAL 0x0c
+#define PCI_CLASS_SERIAL_FIREWIRE 0x0c00
+#define PCI_CLASS_SERIAL_ACCESS 0x0c01
+#define PCI_CLASS_SERIAL_SSA 0x0c02
+#define PCI_CLASS_SERIAL_USB 0x0c03
+#define PCI_CLASS_SERIAL_FIBER 0x0c04
+#define PCI_CLASS_SERIAL_SMBUS 0x0c05
+#define PCI_CLASS_SERIAL_INFINIBAND 0x0c06
+
+#define PCI_BASE_CLASS_WIRELESS 0x0d
+#define PCI_CLASS_WIRELESS_IRDA 0x0d00
+#define PCI_CLASS_WIRELESS_CONSUMER_IR 0x0d01
+#define PCI_CLASS_WIRELESS_RF 0x0d10
+#define PCI_CLASS_WIRELESS_OTHER 0x0d80
+
+#define PCI_BASE_CLASS_INTELLIGENT 0x0e
+#define PCI_CLASS_INTELLIGENT_I2O 0x0e00
+
+#define PCI_BASE_CLASS_SATELLITE 0x0f
+#define PCI_CLASS_SATELLITE_TV 0x0f00
+#define PCI_CLASS_SATELLITE_AUDIO 0x0f01
+#define PCI_CLASS_SATELLITE_VOICE 0x0f03
+#define PCI_CLASS_SATELLITE_DATA 0x0f04
+
+#define PCI_BASE_CLASS_CRYPT 0x10
+#define PCI_CLASS_CRYPT_NETWORK 0x1000
+#define PCI_CLASS_CRYPT_ENTERTAINMENT 0x1010
+#define PCI_CLASS_CRYPT_OTHER 0x1080
+
+#define PCI_BASE_CLASS_SIGNAL 0x11
+#define PCI_CLASS_SIGNAL_DPIO 0x1100
+#define PCI_CLASS_SIGNAL_PERF_CTR 0x1101
+#define PCI_CLASS_SIGNAL_SYNCHRONIZER 0x1110
+#define PCI_CLASS_SIGNAL_OTHER 0x1180
+
+#define PCI_CLASS_OTHERS 0xff
+
+/* Several ID's we need in the library */
+
+#define PCI_VENDOR_ID_INTEL 0x8086
+#define PCI_VENDOR_ID_COMPAQ 0x0e11
diff --git a/ext/hwloc/include/pci/pci.h b/ext/hwloc/include/pci/pci.h
new file mode 100644
index 0000000..7a5a6b8
--- /dev/null
+++ b/ext/hwloc/include/pci/pci.h
@@ -0,0 +1,240 @@
+/*
+ * The PCI Library
+ *
+ * Copyright (c) 1997--2009 Martin Mares <mj at ucw.cz>
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#ifndef _PCI_LIB_H
+#define _PCI_LIB_H
+
+#ifndef PCI_CONFIG_H
+#include "config.h"
+#endif
+
+#include "header.h"
+#include "types.h"
+
+#define PCI_LIB_VERSION 0x030100
+
+#ifndef PCI_ABI
+#define PCI_ABI
+#endif
+
+/*
+ * PCI Access Structure
+ */
+
+struct pci_methods;
+
+enum pci_access_type {
+ /* Known access methods, remember to update access.c as well */
+ PCI_ACCESS_AUTO, /* Autodetection */
+ PCI_ACCESS_SYS_BUS_PCI, /* Linux /sys/bus/pci */
+ PCI_ACCESS_PROC_BUS_PCI, /* Linux /proc/bus/pci */
+ PCI_ACCESS_I386_TYPE1, /* i386 ports, type 1 */
+ PCI_ACCESS_I386_TYPE2, /* i386 ports, type 2 */
+ PCI_ACCESS_FBSD_DEVICE, /* FreeBSD /dev/pci */
+ PCI_ACCESS_AIX_DEVICE, /* /dev/pci0, /dev/bus0, etc. */
+ PCI_ACCESS_NBSD_LIBPCI, /* NetBSD libpci */
+ PCI_ACCESS_OBSD_DEVICE, /* OpenBSD /dev/pci */
+ PCI_ACCESS_DUMP, /* Dump file */
+ PCI_ACCESS_MAX
+};
+
+struct pci_access {
+ /* Options you can change: */
+ unsigned int method; /* Access method */
+ int writeable; /* Open in read/write mode */
+ int buscentric; /* Bus-centric view of the world */
+
+ char *id_file_name; /* Name of ID list file (use pci_set_name_list_path()) */
+ int free_id_name; /* Set if id_file_name is malloced */
+ int numeric_ids; /* Enforce PCI_LOOKUP_NUMERIC (>1 => PCI_LOOKUP_MIXED) */
+
+ unsigned int id_lookup_mode; /* pci_lookup_mode flags which are set automatically */
+ /* Default: PCI_LOOKUP_CACHE */
+
+ int debugging; /* Turn on debugging messages */
+
+ /* Functions you can override: */
+ void (*error)(char *msg, ...) PCI_PRINTF(1,2); /* Write error message and quit */
+ void (*warning)(char *msg, ...) PCI_PRINTF(1,2); /* Write a warning message */
+ void (*debug)(char *msg, ...) PCI_PRINTF(1,2); /* Write a debugging message */
+
+ struct pci_dev *devices; /* Devices found on this bus */
+
+ /* Fields used internally: */
+ struct pci_methods *methods;
+ struct pci_param *params;
+ struct id_entry **id_hash; /* names.c */
+ struct id_bucket *current_id_bucket;
+ int id_load_failed;
+ int id_cache_status; /* 0=not read, 1=read, 2=dirty */
+ int fd; /* proc/sys: fd for config space */
+ int fd_rw; /* proc/sys: fd opened read-write */
+ int fd_pos; /* proc/sys: current position */
+ int fd_vpd; /* sys: fd for VPD */
+ struct pci_dev *cached_dev; /* proc/sys: device the fds are for */
+};
+
+/* Initialize PCI access */
+struct pci_access *pci_alloc(void) PCI_ABI;
+void pci_init(struct pci_access *) PCI_ABI;
+void pci_cleanup(struct pci_access *) PCI_ABI;
+
+/* Scanning of devices */
+void pci_scan_bus(struct pci_access *acc) PCI_ABI;
+struct pci_dev *pci_get_dev(struct pci_access *acc, int domain, int bus, int dev, int func) PCI_ABI; /* Raw access to specified device */
+void pci_free_dev(struct pci_dev *) PCI_ABI;
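A minimal sketch of the usual life cycle these functions imply (allocate, initialize, scan, walk acc->devices, clean up); error handling is left to the library's overridable error() callback, and <stdio.h> is assumed.

    #include <stdio.h>
    #include <pci/pci.h>

    int main(void)
    {
        struct pci_access *acc = pci_alloc();   /* allocate the access structure */
        struct pci_dev *dev;

        pci_init(acc);                          /* initialize the selected access method */
        pci_scan_bus(acc);                      /* populate acc->devices */

        for (dev = acc->devices; dev; dev = dev->next)
            printf("%04x:%02x:%02x.%d\n", dev->domain, dev->bus, dev->dev, dev->func);

        pci_cleanup(acc);                       /* release devices and the access structure */
        return 0;
    }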
+
+/* Names of access methods */
+int pci_lookup_method(char *name) PCI_ABI; /* Returns -1 if not found */
+char *pci_get_method_name(int index) PCI_ABI; /* Returns "" if unavailable, NULL if index out of range */
+
+/*
+ * Named parameters
+ */
+
+struct pci_param {
+ struct pci_param *next; /* Please use pci_walk_params() for traversing the list */
+ char *param; /* Name of the parameter */
+ char *value; /* Value of the parameter */
+ int value_malloced; /* used internally */
+ char *help; /* Explanation of the parameter */
+};
+
+char *pci_get_param(struct pci_access *acc, char *param) PCI_ABI;
+int pci_set_param(struct pci_access *acc, char *param, char *value) PCI_ABI; /* 0 on success, -1 if no such parameter */
+/* To traverse the list, call pci_walk_params repeatedly, first with prev=NULL, and do not modify the parameters during traversal. */
+struct pci_param *pci_walk_params(struct pci_access *acc, struct pci_param *prev) PCI_ABI;
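A brief sketch of the traversal described above, starting with prev == NULL; the helper name is illustrative and <stdio.h>/<pci/pci.h> are assumed to be included.

    /* Enumerate back-end parameters and their current values. */
    void list_params(struct pci_access *acc)
    {
        struct pci_param *p = NULL;

        while ((p = pci_walk_params(acc, p)) != NULL)
            printf("%s = %s   (%s)\n", p->param, p->value, p->help);
    }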
+
+/*
+ * Devices
+ */
+
+struct pci_dev {
+ struct pci_dev *next; /* Next device in the chain */
+ u16 domain; /* PCI domain (host bridge) */
+ u8 bus, dev, func; /* Bus inside domain, device and function */
+
+ /* These fields are set by pci_fill_info() */
+ int known_fields; /* Set of info fields already known */
+ u16 vendor_id, device_id; /* Identity of the device */
+ u16 device_class; /* PCI device class */
+ int irq; /* IRQ number */
+ pciaddr_t base_addr[6]; /* Base addresses including flags in lower bits */
+ pciaddr_t size[6]; /* Region sizes */
+ pciaddr_t rom_base_addr; /* Expansion ROM base address */
+ pciaddr_t rom_size; /* Expansion ROM size */
+ struct pci_cap *first_cap; /* List of capabilities */
+ char *phy_slot; /* Physical slot */
+
+ /* Fields used internally: */
+ struct pci_access *access;
+ struct pci_methods *methods;
+ u8 *cache; /* Cached config registers */
+ int cache_len;
+ int hdrtype; /* Cached low 7 bits of header type, -1 if unknown */
+ void *aux; /* Auxiliary data */

+};
+
+#define PCI_ADDR_IO_MASK (~(pciaddr_t) 0x3)
+#define PCI_ADDR_MEM_MASK (~(pciaddr_t) 0xf)
+#define PCI_ADDR_FLAG_MASK 0xf
+
+u8 pci_read_byte(struct pci_dev *, int pos) PCI_ABI; /* Access to configuration space */
+u16 pci_read_word(struct pci_dev *, int pos) PCI_ABI;
+u32 pci_read_long(struct pci_dev *, int pos) PCI_ABI;
+int pci_read_block(struct pci_dev *, int pos, u8 *buf, int len) PCI_ABI;
+int pci_read_vpd(struct pci_dev *d, int pos, u8 *buf, int len) PCI_ABI;
+int pci_write_byte(struct pci_dev *, int pos, u8 data) PCI_ABI;
+int pci_write_word(struct pci_dev *, int pos, u16 data) PCI_ABI;
+int pci_write_long(struct pci_dev *, int pos, u32 data) PCI_ABI;
+int pci_write_block(struct pci_dev *, int pos, u8 *buf, int len) PCI_ABI;
+
+int pci_fill_info(struct pci_dev *, int flags) PCI_ABI; /* Fill in device information */
+
+#define PCI_FILL_IDENT 1
+#define PCI_FILL_IRQ 2
+#define PCI_FILL_BASES 4
+#define PCI_FILL_ROM_BASE 8
+#define PCI_FILL_SIZES 16
+#define PCI_FILL_CLASS 32
+#define PCI_FILL_CAPS 64
+#define PCI_FILL_EXT_CAPS 128
+#define PCI_FILL_PHYS_SLOT 256
+#define PCI_FILL_RESCAN 0x10000
+
+void pci_setup_cache(struct pci_dev *, u8 *cache, int len) PCI_ABI;
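A small sketch of the pci_fill_info() pattern: request the fields of interest with the PCI_FILL_* flags above, then read them from the struct pci_dev; the helper name is illustrative and the device is assumed to come from pci_scan_bus() or pci_get_dev().

    /* Fill and print the identity fields of a device. */
    void print_ident(struct pci_dev *dev)
    {
        pci_fill_info(dev, PCI_FILL_IDENT | PCI_FILL_CLASS);
        printf("%04x:%04x (class %04x)\n",
               dev->vendor_id, dev->device_id, dev->device_class);
    }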
+
+/*
+ * Capabilities
+ */
+
+struct pci_cap {
+ struct pci_cap *next;
+ u16 id; /* PCI_CAP_ID_xxx */
+ u16 type; /* PCI_CAP_xxx */
+ unsigned int addr; /* Position in the config space */
+};
+
+#define PCI_CAP_NORMAL 1 /* Traditional PCI capabilities */
+#define PCI_CAP_EXTENDED 2 /* PCIe extended capabilities */
+
+struct pci_cap *pci_find_cap(struct pci_dev *, unsigned int id, unsigned int type) PCI_ABI;
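A sketch combining pci_find_cap() with the config-space accessors and the PCI Express Link Status masks from header.h; PCI_CAP_ID_EXP is assumed to be the PCI Express capability ID defined earlier in header.h, and the helper name is illustrative.

    /* Decode the negotiated link speed/width of a PCIe device. */
    void print_link_status(struct pci_dev *dev)
    {
        struct pci_cap *cap;

        pci_fill_info(dev, PCI_FILL_CAPS);      /* make sure the capability list is populated */
        cap = pci_find_cap(dev, PCI_CAP_ID_EXP, PCI_CAP_NORMAL);
        if (cap) {
            u16 lnksta = pci_read_word(dev, cap->addr + PCI_EXP_LNKSTA);
            printf("speed code %u, width x%u\n",
                   lnksta & PCI_EXP_LNKSTA_SPEED,
                   (lnksta & PCI_EXP_LNKSTA_WIDTH) >> 4);
        }
    }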
+
+/*
+ * Filters
+ */
+
+struct pci_filter {
+ int domain, bus, slot, func; /* -1 = ANY */
+ int vendor, device;
+};
+
+void pci_filter_init(struct pci_access *, struct pci_filter *) PCI_ABI;
+char *pci_filter_parse_slot(struct pci_filter *, char *) PCI_ABI;
+char *pci_filter_parse_id(struct pci_filter *, char *) PCI_ABI;
+int pci_filter_match(struct pci_filter *, struct pci_dev *) PCI_ABI;
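A sketch of the filter API: parse a user-supplied slot string such as "00:02.0" and match it against a previously scanned device list; the helper name is illustrative and <stdio.h> is assumed.

    /* List scanned devices that match a slot specification. */
    void scan_filtered(struct pci_access *acc, char *slot_spec)
    {
        struct pci_filter filter;
        struct pci_dev *dev;
        char *err;

        pci_filter_init(acc, &filter);
        if ((err = pci_filter_parse_slot(&filter, slot_spec)) != NULL) {
            fprintf(stderr, "bad slot spec: %s\n", err);   /* non-NULL return is an error message */
            return;
        }
        for (dev = acc->devices; dev; dev = dev->next)
            if (pci_filter_match(&filter, dev))
                printf("%02x:%02x.%d matches\n", dev->bus, dev->dev, dev->func);
    }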
+
+/*
+ * Conversion of PCI ID's to names (according to the pci.ids file)
+ *
+ * Call pci_lookup_name() to identify different types of ID's:
+ *
+ * VENDOR (vendorID) -> vendor
+ * DEVICE (vendorID, deviceID) -> device
+ * VENDOR | DEVICE (vendorID, deviceID) -> combined vendor and device
+ * SUBSYSTEM | VENDOR (subvendorID) -> subsystem vendor
+ * SUBSYSTEM | DEVICE (vendorID, deviceID, subvendorID, subdevID) -> subsystem device
+ * SUBSYSTEM | VENDOR | DEVICE (vendorID, deviceID, subvendorID, subdevID) -> combined subsystem v+d
+ * SUBSYSTEM | ... (-1, -1, subvendorID, subdevID) -> generic subsystem
+ * CLASS (classID) -> class
+ * PROGIF (classID, progif) -> programming interface
+ */
+
+char *pci_lookup_name(struct pci_access *a, char *buf, int size, int flags, ...) PCI_ABI;
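A minimal sketch of the lookup described in the comment above: the flags select which ID types the variadic arguments carry, here a combined vendor+device lookup; the helper name is illustrative.

    /* Print the human-readable vendor and device name of a device. */
    void print_name(struct pci_access *acc, struct pci_dev *dev)
    {
        char buf[256];

        printf("%s\n", pci_lookup_name(acc, buf, sizeof(buf),
                                       PCI_LOOKUP_VENDOR | PCI_LOOKUP_DEVICE,
                                       dev->vendor_id, dev->device_id));
    }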
+
+int pci_load_name_list(struct pci_access *a) PCI_ABI; /* Called automatically by pci_lookup_*() when needed; returns success */
+void pci_free_name_list(struct pci_access *a) PCI_ABI; /* Called automatically by pci_cleanup() */
+void pci_set_name_list_path(struct pci_access *a, char *name, int to_be_freed) PCI_ABI;
+void pci_id_cache_flush(struct pci_access *a) PCI_ABI;
+
+enum pci_lookup_mode {
+ PCI_LOOKUP_VENDOR = 1, /* Vendor name (args: vendorID) */
+ PCI_LOOKUP_DEVICE = 2, /* Device name (args: vendorID, deviceID) */
+ PCI_LOOKUP_CLASS = 4, /* Device class (args: classID) */
+ PCI_LOOKUP_SUBSYSTEM = 8,
+ PCI_LOOKUP_PROGIF = 16, /* Programming interface (args: classID, prog_if) */
+ PCI_LOOKUP_NUMERIC = 0x10000, /* Want only formatted numbers; default if access->numeric_ids is set */
+ PCI_LOOKUP_NO_NUMBERS = 0x20000, /* Return NULL if not found in the database; default is to print numerically */
+ PCI_LOOKUP_MIXED = 0x40000, /* Include both numbers and names */
+ PCI_LOOKUP_NETWORK = 0x80000, /* Try to resolve unknown ID's by DNS */
+ PCI_LOOKUP_SKIP_LOCAL = 0x100000, /* Do not consult local database */
+ PCI_LOOKUP_CACHE = 0x200000, /* Consult the local cache before using DNS */
+ PCI_LOOKUP_REFRESH_CACHE = 0x400000, /* Forget all previously cached entries, but still allow updating the cache */
+};
+
+#endif
diff --git a/ext/hwloc/include/pci/types.h b/ext/hwloc/include/pci/types.h
new file mode 100644
index 0000000..4d23e69
--- /dev/null
+++ b/ext/hwloc/include/pci/types.h
@@ -0,0 +1,65 @@
+/*
+ * The PCI Library -- Types and Format Strings
+ *
+ * Copyright (c) 1997--2008 Martin Mares <mj at ucw.cz>
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#include <sys/types.h>
+
+#ifndef PCI_HAVE_Uxx_TYPES
+
+#ifdef PCI_OS_WINDOWS
+#include <windef.h>
+typedef BYTE u8;
+typedef WORD u16;
+typedef DWORD u32;
+#elif defined(PCI_HAVE_STDINT_H) || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
+#include <stdint.h>
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+#else
+typedef u_int8_t u8;
+typedef u_int16_t u16;
+typedef u_int32_t u32;
+#endif
+
+#ifdef PCI_HAVE_64BIT_ADDRESS
+#include <limits.h>
+#if ULONG_MAX > 0xffffffff
+typedef unsigned long u64;
+#define PCI_U64_FMT "l"
+#else
+typedef unsigned long long u64;
+#define PCI_U64_FMT "ll"
+#endif
+#endif
+
+#endif /* PCI_HAVE_Uxx_TYPES */
+
+#ifdef PCI_HAVE_64BIT_ADDRESS
+typedef u64 pciaddr_t;
+#define PCIADDR_T_FMT "%08" PCI_U64_FMT "x"
+#define PCIADDR_PORT_FMT "%04" PCI_U64_FMT "x"
+#else
+typedef u32 pciaddr_t;
+#define PCIADDR_T_FMT "%08x"
+#define PCIADDR_PORT_FMT "%04x"
+#endif
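A short sketch of how the format strings are meant to be used: PCIADDR_T_FMT already carries the leading '%', so it is spliced directly into the format string, and pciaddr_t is 32 or 64 bits depending on PCI_HAVE_64BIT_ADDRESS; the helper name is illustrative and the base addresses are assumed to have been filled via pci_fill_info(dev, PCI_FILL_BASES | PCI_FILL_SIZES).

    /* Print BAR0 of a device in the library's native width. */
    void print_bar0(struct pci_dev *dev)
    {
        pciaddr_t base = dev->base_addr[0] & PCI_ADDR_MEM_MASK;  /* strip flag bits */

        printf("BAR0 at " PCIADDR_T_FMT ", size " PCIADDR_T_FMT "\n",
               base, dev->size[0]);
    }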
+
+#ifdef PCI_ARCH_SPARC64
+/* On sparc64 Linux the kernel reports remapped port addresses and IRQ numbers */
+#undef PCIADDR_PORT_FMT
+#define PCIADDR_PORT_FMT PCIADDR_T_FMT
+#define PCIIRQ_FMT "%08x"
+#else
+#define PCIIRQ_FMT "%d"
+#endif
+
+#if defined(__GNUC__) && __GNUC__ > 2
+#define PCI_PRINTF(x,y) __attribute__((format(printf, x, y)))
+#else
+#define PCI_PRINTF(x,y)
+#endif
diff --git a/ext/hwloc/include/private/autogen/README.txt b/ext/hwloc/include/private/autogen/README.txt
new file mode 100644
index 0000000..17f7f60
--- /dev/null
+++ b/ext/hwloc/include/private/autogen/README.txt
@@ -0,0 +1,3 @@
+This directory needs to exist in the repo so that the Autotools can
+generate a file here. We have put a token file in this directory so
+that git doesn't ignore the empty directory in the repository.
diff --git a/ext/hwloc/include/private/autogen/config.h b/ext/hwloc/include/private/autogen/config.h
new file mode 100644
index 0000000..966fa78
--- /dev/null
+++ b/ext/hwloc/include/private/autogen/config.h
@@ -0,0 +1,772 @@
+/* include/private/autogen/config.h. Generated from config.h.in by configure. */
+/* include/private/autogen/config.h.in. Generated from configure.ac by autoheader. */
+
+/* -*- c -*-
+ *
+ * Copyright © 2009, 2011, 2012 CNRS, inria., Université Bordeaux All rights reserved.
+ * Copyright © 2009-2014 Cisco Systems, Inc. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ * This file is automatically generated by configure. Edits will be lost
+ * the next time you run configure!
+ */
+
+#ifndef HWLOC_CONFIGURE_H
+#define HWLOC_CONFIGURE_H
+
+
+/* Define to 1 if gcc's __atomic builtins are available */
+/* #undef HAVE_ATOMIC_BUILTINS */
+
+/* Define to 1 if the system has the type `CACHE_DESCRIPTOR'. */
+/* #undef HAVE_CACHE_DESCRIPTOR */
+
+/* Define to 1 if the system has the type `CACHE_RELATIONSHIP'. */
+/* #undef HAVE_CACHE_RELATIONSHIP */
+
+/* Define to 1 if you have the `close' function. */
+#define HAVE_CLOSE 1
+
+/* Define to 1 if you have the `clz' function. */
+/* #undef HAVE_CLZ */
+
+/* Define to 1 if you have the `clzl' function. */
+/* #undef HAVE_CLZL */
+
+/* Define to 1 if you have the <CL/cl_ext.h> header file. */
+/* #undef HAVE_CL_CL_EXT_H */
+
+/* Define to 1 if you have the `cpuset_setaffinity' function. */
+/* #undef HAVE_CPUSET_SETAFFINITY */
+
+/* Define to 1 if you have the `cpuset_setid' function. */
+/* #undef HAVE_CPUSET_SETID */
+
+/* Define to 1 if you have the <ctype.h> header file. */
+#define HAVE_CTYPE_H 1
+
+/* Define to 1 if we have -lcuda */
+/* #undef HAVE_CUDA */
+
+/* Define to 1 if you have the <cuda.h> header file. */
+/* #undef HAVE_CUDA_H */
+
+/* Define to 1 if you have the <cuda_runtime_api.h> header file. */
+/* #undef HAVE_CUDA_RUNTIME_API_H */
+
+/* Define to 1 if you have the declaration of `CL_DEVICE_TOPOLOGY_AMD', and to
+ 0 if you don't. */
+/* #undef HAVE_DECL_CL_DEVICE_TOPOLOGY_AMD */
+
+/* Define to 1 if you have the declaration of `CTL_HW', and to 0 if you don't.
+ */
+#define HAVE_DECL_CTL_HW 0
+
+/* Define to 1 if you have the declaration of `fabsf', and to 0 if you don't.
+ */
+#define HAVE_DECL_FABSF 1
+
+/* Define to 1 if you have the declaration of `getexecname', and to 0 if you
+ don't. */
+#define HAVE_DECL_GETEXECNAME 0
+
+/* Define to 1 if you have the declaration of `GetModuleFileName', and to 0 if
+ you don't. */
+#define HAVE_DECL_GETMODULEFILENAME 0
+
+/* Define to 1 if you have the declaration of `getprogname', and to 0 if you
+ don't. */
+#define HAVE_DECL_GETPROGNAME 0
+
+/* Define to 1 if you have the declaration of `HW_NCPU', and to 0 if you
+ don't. */
+#define HAVE_DECL_HW_NCPU 0
+
+/* Define to 1 if you have the declaration of
+ `nvmlDeviceGetMaxPcieLinkGeneration', and to 0 if you don't. */
+/* #undef HAVE_DECL_NVMLDEVICEGETMAXPCIELINKGENERATION */
+
+/* Define to 1 if you have the declaration of `pthread_getaffinity_np', and to
+ 0 if you don't. */
+#define HAVE_DECL_PTHREAD_GETAFFINITY_NP 1
+
+/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to
+ 0 if you don't. */
+#define HAVE_DECL_PTHREAD_SETAFFINITY_NP 1
+
+/* Define to 1 if you have the declaration of `strtoull', and to 0 if you
+ don't. */
+#define HAVE_DECL_STRTOULL 1
+
+/* Define to 1 if you have the declaration of `_SC_LARGE_PAGESIZE', and to 0
+ if you don't. */
+#define HAVE_DECL__SC_LARGE_PAGESIZE 0
+
+/* Define to 1 if you have the declaration of `_SC_NPROCESSORS_CONF', and to 0
+ if you don't. */
+#define HAVE_DECL__SC_NPROCESSORS_CONF 1
+
+/* Define to 1 if you have the declaration of `_SC_NPROCESSORS_ONLN', and to 0
+ if you don't. */
+#define HAVE_DECL__SC_NPROCESSORS_ONLN 1
+
+/* Define to 1 if you have the declaration of `_SC_NPROC_CONF', and to 0 if
+ you don't. */
+#define HAVE_DECL__SC_NPROC_CONF 0
+
+/* Define to 1 if you have the declaration of `_SC_NPROC_ONLN', and to 0 if
+ you don't. */
+#define HAVE_DECL__SC_NPROC_ONLN 0
+
+/* Define to 1 if you have the declaration of `_SC_PAGESIZE', and to 0 if you
+ don't. */
+#define HAVE_DECL__SC_PAGESIZE 1
+
+/* Define to 1 if you have the declaration of `_SC_PAGE_SIZE', and to 0 if you
+ don't. */
+#define HAVE_DECL__SC_PAGE_SIZE 1
+
+/* Define to 1 if you have the <dirent.h> header file. */
+#define HAVE_DIRENT_H 1
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Define to 1 if you have the <endian.h> header file. */
+#define HAVE_ENDIAN_H 1
+
+/* Define to 1 if you have the <fcntl.h> header file. */
+#define HAVE_FCNTL_H 1
+
+/* Define to 1 if you have the `ffs' function. */
+#define HAVE_FFS 1
+
+/* Define to 1 if you have the `ffsl' function. */
+#define HAVE_FFSL 1
+
+/* Define to 1 if you have the `fls' function. */
+/* #undef HAVE_FLS */
+
+/* Define to 1 if you have the `flsl' function. */
+/* #undef HAVE_FLSL */
+
+/* Define to 1 if you have the `getpagesize' function. */
+#define HAVE_GETPAGESIZE 1
+
+/* Define to 1 if you have the `getpid' function. */
+#define HAVE_GETPID 1
+
+/* Define to 1 if you have the `gettimeofday' function. */
+#define HAVE_GETTIMEOFDAY 1
+
+/* Define to 1 if the system has the type `GROUP_AFFINITY'. */
+/* #undef HAVE_GROUP_AFFINITY */
+
+/* Define to 1 if the system has the type `GROUP_RELATIONSHIP'. */
+/* #undef HAVE_GROUP_RELATIONSHIP */
+
+/* Define to 1 if you have the `host_info' function. */
+/* #undef HAVE_HOST_INFO */
+
+/* Define to 1 if you have the <infiniband/verbs.h> header file. */
+/* #undef HAVE_INFINIBAND_VERBS_H */
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if the system has the type `KAFFINITY'. */
+/* #undef HAVE_KAFFINITY */
+
+/* Define to 1 if you have the <kstat.h> header file. */
+/* #undef HAVE_KSTAT_H */
+
+/* Define to 1 if you have the <langinfo.h> header file. */
+#define HAVE_LANGINFO_H 1
+
+/* Define to 1 if we have -lgdi32 */
+/* #undef HAVE_LIBGDI32 */
+
+/* Define to 1 if we have -libverbs */
+/* #undef HAVE_LIBIBVERBS */
+
+/* Define to 1 if we have -lkstat */
+/* #undef HAVE_LIBKSTAT */
+
+/* Define to 1 if we have -llgrp */
+/* #undef HAVE_LIBLGRP */
+
+/* Define to 1 if you have the <libudev.h> header file. */
+/* #undef HAVE_LIBUDEV_H */
+
+/* Define to 1 if you have the `localeconv' function. */
+#define HAVE_LOCALECONV 1
+
+/* Define to 1 if you have the <locale.h> header file. */
+#define HAVE_LOCALE_H 1
+
+/* Define to 1 if the system has the type `LOGICAL_PROCESSOR_RELATIONSHIP'. */
+/* #undef HAVE_LOGICAL_PROCESSOR_RELATIONSHIP */
+
+/* Define to 1 if the system has the type 'long long int'. */
+#define HAVE_LONG_LONG_INT 1
+
+/* Define to 1 if you have the <mach/mach_host.h> header file. */
+/* #undef HAVE_MACH_MACH_HOST_H */
+
+/* Define to 1 if you have the <mach/mach_init.h> header file. */
+/* #undef HAVE_MACH_MACH_INIT_H */
+
+/* Define to 1 if you have the <malloc.h> header file. */
+#define HAVE_MALLOC_H 1
+
+/* Define to 1 if you have the `memalign' function. */
+#define HAVE_MEMALIGN 1
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if we have -lmyriexpress */
+/* #undef HAVE_MYRIEXPRESS */
+
+/* Define to 1 if you have the <myriexpress.h> header file. */
+/* #undef HAVE_MYRIEXPRESS_H */
+
+/* Define to 1 if you have the `nl_langinfo' function. */
+#define HAVE_NL_LANGINFO 1
+
+/* Define to 1 if you have the <numaif.h> header file. */
+/* #undef HAVE_NUMAIF_H */
+
+/* Define to 1 if the system has the type `NUMA_NODE_RELATIONSHIP'. */
+/* #undef HAVE_NUMA_NODE_RELATIONSHIP */
+
+/* Define to 1 if you have the <NVCtrl/NVCtrl.h> header file. */
+/* #undef HAVE_NVCTRL_NVCTRL_H */
+
+/* Define to 1 if you have the <nvml.h> header file. */
+/* #undef HAVE_NVML_H */
+
+/* Define to 1 if you have the `open' function. */
+#define HAVE_OPEN 1
+
+/* Define to 1 if you have the `openat' function. */
+#define HAVE_OPENAT 1
+
+/* Define to 1 if you have the <picl.h> header file. */
+/* #undef HAVE_PICL_H */
+
+/* Define to 1 if you have the `posix_memalign' function. */
+#define HAVE_POSIX_MEMALIGN 1
+
+/* Define to 1 if the system has the type `PROCESSOR_CACHE_TYPE'. */
+/* #undef HAVE_PROCESSOR_CACHE_TYPE */
+
+/* Define to 1 if the system has the type `PROCESSOR_GROUP_INFO'. */
+/* #undef HAVE_PROCESSOR_GROUP_INFO */
+
+/* Define to 1 if the system has the type `PROCESSOR_RELATIONSHIP'. */
+/* #undef HAVE_PROCESSOR_RELATIONSHIP */
+
+/* Define to '1' if program_invocation_name is present and usable */
+#define HAVE_PROGRAM_INVOCATION_NAME 1
+
+/* Define to 1 if the system has the type `PSAPI_WORKING_SET_EX_BLOCK'. */
+/* #undef HAVE_PSAPI_WORKING_SET_EX_BLOCK */
+
+/* Define to 1 if the system has the type `PSAPI_WORKING_SET_EX_INFORMATION'.
+ */
+/* #undef HAVE_PSAPI_WORKING_SET_EX_INFORMATION */
+
+/* Define to 1 if you have the <pthread_np.h> header file. */
+/* #undef HAVE_PTHREAD_NP_H */
+
+/* Define to 1 if the system has the type `pthread_t'. */
+#define HAVE_PTHREAD_T 1
+
+/* Define to 1 if you have the `putwc' function. */
+#define HAVE_PUTWC 1
+
+/* Define to 1 if you have the `read' function. */
+#define HAVE_READ 1
+
+/* Define to 1 if the system has the type `RelationProcessorPackage'. */
+/* #undef HAVE_RELATIONPROCESSORPACKAGE */
+
+/* Define to 1 if you have the <sched.h> header file. */
+#define HAVE_SCHED_H 1
+
+/* Define to 1 if you have the `sched_yield' function. */
+#define HAVE_SCHED_YIELD 1
+
+/* Define to 1 if you have the `setlocale' function. */
+#define HAVE_SETLOCALE 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the `strftime' function. */
+#define HAVE_STRFTIME 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strncasecmp' function. */
+#define HAVE_STRNCASECMP 1
+
+/* Define to 1 if you have the `strtoll' function. */
+#define HAVE_STRTOLL 1
+
+/* Define to 1 if gcc's __sync builtins are available */
+#define HAVE_SYNC_BUILTINS 1
+
+/* Define to '1' if sysctl is present and usable */
+#define HAVE_SYSCTL 1
+
+/* Define to '1' if sysctlbyname is present and usable */
+/* #undef HAVE_SYSCTLBYNAME */
+
+/* Define to 1 if the system has the type
+ `SYSTEM_LOGICAL_PROCESSOR_INFORMATION'. */
+/* #undef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION */
+
+/* Define to 1 if the system has the type
+ `SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX'. */
+/* #undef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX */
+
+/* Define to 1 if you have the <sys/cpuset.h> header file. */
+/* #undef HAVE_SYS_CPUSET_H */
+
+/* Define to 1 if you have the <sys/lgrp_user.h> header file. */
+/* #undef HAVE_SYS_LGRP_USER_H */
+
+/* Define to 1 if you have the <sys/mman.h> header file. */
+#define HAVE_SYS_MMAN_H 1
+
+/* Define to 1 if you have the <sys/param.h> header file. */
+#define HAVE_SYS_PARAM_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/sysctl.h> header file. */
+#define HAVE_SYS_SYSCTL_H 1
+
+/* Define to 1 if you have the <sys/time.h> header file. */
+#define HAVE_SYS_TIME_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <sys/utsname.h> header file. */
+#define HAVE_SYS_UTSNAME_H 1
+
+/* Define to 1 if you have the `uname' function. */
+#define HAVE_UNAME 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Define to 1 if the system has the type 'unsigned long long int'. */
+#define HAVE_UNSIGNED_LONG_LONG_INT 1
+
+/* Define to 1 if you have the `uselocale' function. */
+#define HAVE_USELOCALE 1
+
+/* Define to 1 if the system has the type `wchar_t'. */
+#define HAVE_WCHAR_T 1
+
+/* Define to 1 if you have the <X11/keysym.h> header file. */
+#define HAVE_X11_KEYSYM_H 1
+
+/* Define to 1 if you have the <X11/Xlib.h> header file. */
+#define HAVE_X11_XLIB_H 1
+
+/* Define to 1 if you have the <X11/Xutil.h> header file. */
+#define HAVE_X11_XUTIL_H 1
+
+/* Define to 1 if you have the <xlocale.h> header file. */
+#define HAVE_XLOCALE_H 1
+
+/* Define to '1' if __progname is present and usable */
+#define HAVE___PROGNAME 1
+
+/* Define to 1 on AIX */
+/* #undef HWLOC_AIX_SYS */
+
+/* Define to 1 on BlueGene/Q */
+/* #undef HWLOC_BGQ_SYS */
+
+/* Whether C compiler supports symbol visibility or not */
+#define HWLOC_C_HAVE_VISIBILITY 1
+
+/* Define to 1 on Darwin */
+/* #undef HWLOC_DARWIN_SYS */
+
+/* Whether we are in debugging mode or not */
+/* #undef HWLOC_DEBUG */
+
+/* Define to 1 on *FREEBSD */
+/* #undef HWLOC_FREEBSD_SYS */
+
+/* Whether your compiler has __attribute__ or not */
+#define HWLOC_HAVE_ATTRIBUTE 1
+
+/* Whether your compiler has __attribute__ aligned or not */
+#define HWLOC_HAVE_ATTRIBUTE_ALIGNED 1
+
+/* Whether your compiler has __attribute__ always_inline or not */
+#define HWLOC_HAVE_ATTRIBUTE_ALWAYS_INLINE 1
+
+/* Whether your compiler has __attribute__ cold or not */
+#define HWLOC_HAVE_ATTRIBUTE_COLD 1
+
+/* Whether your compiler has __attribute__ const or not */
+#define HWLOC_HAVE_ATTRIBUTE_CONST 1
+
+/* Whether your compiler has __attribute__ deprecated or not */
+#define HWLOC_HAVE_ATTRIBUTE_DEPRECATED 1
+
+/* Whether your compiler has __attribute__ format or not */
+#define HWLOC_HAVE_ATTRIBUTE_FORMAT 1
+
+/* Whether your compiler has __attribute__ hot or not */
+#define HWLOC_HAVE_ATTRIBUTE_HOT 1
+
+/* Whether your compiler has __attribute__ malloc or not */
+#define HWLOC_HAVE_ATTRIBUTE_MALLOC 1
+
+/* Whether your compiler has __attribute__ may_alias or not */
+#define HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 1
+
+/* Whether your compiler has __attribute__ nonnull or not */
+#define HWLOC_HAVE_ATTRIBUTE_NONNULL 1
+
+/* Whether your compiler has __attribute__ noreturn or not */
+#define HWLOC_HAVE_ATTRIBUTE_NORETURN 1
+
+/* Whether your compiler has __attribute__ no_instrument_function or not */
+#define HWLOC_HAVE_ATTRIBUTE_NO_INSTRUMENT_FUNCTION 1
+
+/* Whether your compiler has __attribute__ packed or not */
+#define HWLOC_HAVE_ATTRIBUTE_PACKED 1
+
+/* Whether your compiler has __attribute__ pure or not */
+#define HWLOC_HAVE_ATTRIBUTE_PURE 1
+
+/* Whether your compiler has __attribute__ sentinel or not */
+#define HWLOC_HAVE_ATTRIBUTE_SENTINEL 1
+
+/* Whether your compiler has __attribute__ unused or not */
+#define HWLOC_HAVE_ATTRIBUTE_UNUSED 1
+
+/* Whether your compiler has __attribute__ warn unused result or not */
+#define HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 1
+
+/* Whether your compiler has __attribute__ weak alias or not */
+#define HWLOC_HAVE_ATTRIBUTE_WEAK_ALIAS 1
+
+/* Define to 1 if your `ffs' function is known to be broken. */
+/* #undef HWLOC_HAVE_BROKEN_FFS */
+
+/* Define to 1 if you have the `cairo' library. */
+#define HWLOC_HAVE_CAIRO 1
+
+/* Define to 1 if you have the `clz' function. */
+/* #undef HWLOC_HAVE_CLZ */
+
+/* Define to 1 if you have the `clzl' function. */
+/* #undef HWLOC_HAVE_CLZL */
+
+/* Define to 1 if the CPU_SET macro works */
+#define HWLOC_HAVE_CPU_SET 1
+
+/* Define to 1 if the CPU_SET_S macro works */
+#define HWLOC_HAVE_CPU_SET_S 1
+
+/* Define to 1 if you have the `cudart' SDK. */
+/* #undef HWLOC_HAVE_CUDART */
+
+/* Define to 1 if function `clz' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_CLZ */
+
+/* Define to 1 if function `clzl' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_CLZL */
+
+/* Define to 1 if function `ffs' is declared by system headers */
+#define HWLOC_HAVE_DECL_FFS 1
+
+/* Define to 1 if function `ffsl' is declared by system headers */
+#define HWLOC_HAVE_DECL_FFSL 1
+
+/* Define to 1 if function `fls' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_FLS */
+
+/* Define to 1 if function `flsl' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_FLSL */
+
+/* Define to 1 if function `strncasecmp' is declared by system headers */
+#define HWLOC_HAVE_DECL_STRNCASECMP 1
+
+/* Define to 1 if you have the `ffs' function. */
+#define HWLOC_HAVE_FFS 1
+
+/* Define to 1 if you have the `ffsl' function. */
+#define HWLOC_HAVE_FFSL 1
+
+/* Define to 1 if you have the `fls' function. */
+/* #undef HWLOC_HAVE_FLS */
+
+/* Define to 1 if you have the `flsl' function. */
+/* #undef HWLOC_HAVE_FLSL */
+
+/* Define to 1 if you have the GL module components. */
+/* #undef HWLOC_HAVE_GL */
+
+/* Define to 1 if you have a library providing the termcap interface */
+/* #undef HWLOC_HAVE_LIBTERMCAP */
+
+/* Define to 1 if you have the `libxml2' library. */
+/* #undef HWLOC_HAVE_LIBXML2 */
+
+/* Define to 1 if building the Linux PCI component */
+#define HWLOC_HAVE_LINUXPCI 1
+
+/* Define to 1 if mbind is available. */
+/* #undef HWLOC_HAVE_MBIND */
+
+/* Define to 1 if migrate_pages is available. */
+/* #undef HWLOC_HAVE_MIGRATE_PAGES */
+
+/* Define to 1 if you have the `NVML' library. */
+/* #undef HWLOC_HAVE_NVML */
+
+/* Define to 1 if glibc provides the old prototype (without length) of
+ sched_setaffinity() */
+/* #undef HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+
+/* Define to 1 if you have the `OpenCL' library. */
+/* #undef HWLOC_HAVE_OPENCL */
+
+/* Define to 1 if the hwloc library should support dynamically-loaded plugins
+ */
+/* #undef HWLOC_HAVE_PLUGINS */
+
+/* Define to 1 if you have `pthread_getthrds_np' */
+/* #undef HWLOC_HAVE_PTHREAD_GETTHRDS_NP */
+
+/* Define to 1 if pthread mutexes are available */
+#define HWLOC_HAVE_PTHREAD_MUTEX 1
+
+/* Define to 1 if glibc provides a prototype of sched_setaffinity() */
+#define HWLOC_HAVE_SCHED_SETAFFINITY 1
+
+/* Define to 1 if set_mempolicy is available. */
+/* #undef HWLOC_HAVE_SET_MEMPOLICY */
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HWLOC_HAVE_STDINT_H 1
+
+/* Define to 1 if you have the `windows.h' header. */
+/* #undef HWLOC_HAVE_WINDOWS_H */
+
+/* Define to 1 if X11 headers including Xutil.h and keysym.h are available. */
+#define HWLOC_HAVE_X11_KEYSYM 1
+
+/* Define to 1 if you have x86 cpuid */
+#define HWLOC_HAVE_X86_CPUID 1
+
+/* Define to 1 if the _syscall3 macro works */
+/* #undef HWLOC_HAVE__SYSCALL3 */
+
+/* Define to 1 on HP-UX */
+/* #undef HWLOC_HPUX_SYS */
+
+/* Define to 1 on Irix */
+/* #undef HWLOC_IRIX_SYS */
+
+/* Define to 1 on Linux */
+#define HWLOC_LINUX_SYS 1
+
+/* Define to 1 on *NETBSD */
+/* #undef HWLOC_NETBSD_SYS */
+
+/* Define to 1 on OSF */
+/* #undef HWLOC_OSF_SYS */
+
+/* The size of `unsigned int', as computed by sizeof */
+#define HWLOC_SIZEOF_UNSIGNED_INT 4
+
+/* The size of `unsigned long', as computed by sizeof */
+#ifdef __x86_64
+#define HWLOC_SIZEOF_UNSIGNED_LONG 8
+#else
+#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__)
+#define HWLOC_SIZEOF_UNSIGNED_LONG 4
+#endif
+#endif
+
+/* Define to 1 on Solaris */
+/* #undef HWLOC_SOLARIS_SYS */
+
+/* The hwloc symbol prefix */
+#define HWLOC_SYM_PREFIX likwid_
+
+/* The hwloc symbol prefix in all caps */
+#define HWLOC_SYM_PREFIX_CAPS LIKWID_
+
+/* Whether we need to re-define all the hwloc public symbols or not */
+#define HWLOC_SYM_TRANSFORM 1
+
+/* Define to 1 on unsupported systems */
+/* #undef HWLOC_UNSUPPORTED_SYS */
+
+/* Define to 1 if ncurses works, preferred over curses */
+/* #undef HWLOC_USE_NCURSES */
+
+/* The library version, always available, even in embedded mode, contrary to
+ VERSION */
+#define HWLOC_VERSION "2.0.0a1-git"
+
+/* Define to 1 on WINDOWS */
+/* #undef HWLOC_WIN_SYS */
+
+
+/* Define to 1 on x86_64 */
+#ifdef __x86_64
+#define HWLOC_X86_64_ARCH 1
+#else
+#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__)
+/* Define to 1 on x86_32 */
+#define HWLOC_X86_32_ARCH 1
+#endif
+#endif
+
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+ */
+#define LT_OBJDIR ".libs/"
+
+/* Name of package */
+#define PACKAGE "hwloc"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT "http://www.open-mpi.org/projects/hwloc/"
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "hwloc"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "hwloc 2.0.0a1-git"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "hwloc"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL ""
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "2.0.0a1-git"
+
+/* The size of `unsigned int', as computed by sizeof. */
+#define SIZEOF_UNSIGNED_INT 4
+
+/* The size of `unsigned long', as computed by sizeof. */
+#define SIZEOF_UNSIGNED_LONG 8
+
+/* The size of `void *', as computed by sizeof. */
+#define SIZEOF_VOID_P 8
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Enable extensions on HP-UX. */
+#ifndef _HPUX_SOURCE
+# define _HPUX_SOURCE 1
+#endif
+
+
+/* Enable extensions on AIX 3, Interix. */
+#ifndef _ALL_SOURCE
+# define _ALL_SOURCE 1
+#endif
+/* Enable GNU extensions on systems that have them. */
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE 1
+#endif
+/* Enable threading extensions on Solaris. */
+#ifndef _POSIX_PTHREAD_SEMANTICS
+# define _POSIX_PTHREAD_SEMANTICS 1
+#endif
+/* Enable extensions on HP NonStop. */
+#ifndef _TANDEM_SOURCE
+# define _TANDEM_SOURCE 1
+#endif
+/* Enable general extensions on Solaris. */
+#ifndef __EXTENSIONS__
+# define __EXTENSIONS__ 1
+#endif
+
+
+/* Define to 1 if /dev/urandom should be used for seeding the hash function */
+#define USE_URANDOM 1
+
+/* Define to 1 if CryptGenRandom should be used for seeding the hash function
+ */
+#define USE_WINDOWS_CRYPTOAPI 1
+
+/* Version number of package */
+#define VERSION "2.0.0a1-git"
+
+/* Define to 1 if the X Window System is missing or not being used. */
+/* #undef X_DISPLAY_MISSING */
+
+/* Are we building for HP-UX? */
+#define _HPUX_SOURCE 1
+
+/* Define to 1 if on MINIX. */
+/* #undef _MINIX */
+
+/* Define to 2 if the system does not provide POSIX.1 features except with
+ this defined. */
+/* #undef _POSIX_1_SOURCE */
+
+/* Define to 1 if you need to in order for `stat' and other things to work. */
+/* #undef _POSIX_SOURCE */
+
+/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
+ <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
+ #define below would cause a syntax error. */
+/* #undef _UINT32_T */
+
+/* Define this to the process ID type */
+#define hwloc_pid_t pid_t
+
+/* Define this to the thread ID type */
+#define hwloc_thread_t pthread_t
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+ calls it, or to nothing if 'inline' is not supported under any name. */
+#ifndef __cplusplus
+/* #undef inline */
+#endif
+
+/* Define to the type of a signed integer type of width exactly 32 bits if
+ such a type exists and the standard includes do not define it. */
+/* #undef int32_t */
+
+/* Define to the type of an unsigned integer type of width exactly 32 bits if
+ such a type exists and the standard includes do not define it. */
+/* #undef uint32_t */
+
+
+#endif /* HWLOC_CONFIGURE_H */
+
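
The HWLOC_SYM_PREFIX / HWLOC_SYM_TRANSFORM settings above exist because likwid embeds its own hwloc copy and must not clash with a system-wide libhwloc. A minimal sketch of what the prefixing means, assuming hwloc's usual rename machinery (the macro names below are modelled on hwloc/rename.h and are illustrative only):

    /* Sketch, not part of the imported sources: every public hwloc symbol is
     * redirected to a prefixed name so the embedded copy stays private. */
    #define HWLOC_MUNGE_NAME2(a, b) a ## b
    #define HWLOC_MUNGE_NAME(a, b)  HWLOC_MUNGE_NAME2(a, b)
    #define HWLOC_NAME(name)        HWLOC_MUNGE_NAME(HWLOC_SYM_PREFIX, hwloc_ ## name)

    #define hwloc_topology_init HWLOC_NAME(topology_init)  /* -> likwid_hwloc_topology_init */
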
diff --git a/ext/hwloc/include/private/components.h b/ext/hwloc/include/private/components.h
new file mode 100644
index 0000000..b366345
--- /dev/null
+++ b/ext/hwloc/include/private/components.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2012 Inria. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+
+#ifdef HWLOC_INSIDE_PLUGIN
+/*
+ * these declarations are internal only, they are not available to plugins
+ * (many functions below are internal static symbols).
+ */
+#error This file should not be used in plugins
+#endif
+
+
+#ifndef PRIVATE_COMPONENTS_H
+#define PRIVATE_COMPONENTS_H 1
+
+#include <hwloc/plugins.h>
+
+struct hwloc_topology;
+
+extern int hwloc_disc_component_force_enable(struct hwloc_topology *topology,
+ int envvar_forced, /* 1 if forced through envvar, 0 if forced through API */
+ int type, const char *name,
+ const void *data1, const void *data2, const void *data3);
+extern void hwloc_disc_components_enable_others(struct hwloc_topology *topology);
+
+/* Compute the topology is_thissystem flag based on enabled backends */
+extern void hwloc_backends_is_thissystem(struct hwloc_topology *topology);
+
+/* Disable and destroy all backends used by a topology */
+extern void hwloc_backends_disable_all(struct hwloc_topology *topology);
+
+/* Used by the core to setup/destroy the list of components */
+extern void hwloc_components_init(struct hwloc_topology *topology); /* increases components refcount, should be called exactly once per topology (during init) */
+extern void hwloc_components_destroy_all(struct hwloc_topology *topology); /* decreases components refcount, should be called exactly once per topology (during destroy) */
+
+#endif /* PRIVATE_COMPONENTS_H */
+
diff --git a/ext/hwloc/include/private/cpuid-x86.h b/ext/hwloc/include/private/cpuid-x86.h
new file mode 100644
index 0000000..8a8c48e
--- /dev/null
+++ b/ext/hwloc/include/private/cpuid-x86.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright © 2010-2012, 2014 Université Bordeaux
+ * Copyright © 2010 Cisco Systems, Inc. All rights reserved.
+ * Copyright © 2014 Inria. All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ */
+
+/* Internals for x86's cpuid. */
+
+#ifndef HWLOC_PRIVATE_CPUID_X86_H
+#define HWLOC_PRIVATE_CPUID_X86_H
+
+#if (defined HWLOC_X86_32_ARCH) && (!defined HWLOC_HAVE_MSVC_CPUIDEX)
+static __hwloc_inline int hwloc_have_x86_cpuid(void)
+{
+ int ret;
+ unsigned tmp, tmp2;
+ __asm__(
+ "mov $0,%0\n\t" /* Not supported a priori */
+
+ "pushfl \n\t" /* Save flags */
+
+ "pushfl \n\t" \
+ "pop %1 \n\t" /* Get flags */ \
+
+#define TRY_TOGGLE \
+ "xor $0x00200000,%1\n\t" /* Try to toggle ID */ \
+ "mov %1,%2\n\t" /* Save expected value */ \
+ "push %1 \n\t" \
+ "popfl \n\t" /* Try to toggle */ \
+ "pushfl \n\t" \
+ "pop %1 \n\t" \
+ "cmp %1,%2\n\t" /* Compare with expected value */ \
+ "jnz 0f\n\t" /* Unexpected, failure */ \
+
+ TRY_TOGGLE /* Try to set/clear */
+ TRY_TOGGLE /* Try to clear/set */
+
+ "mov $1,%0\n\t" /* Passed the test! */
+
+ "0: \n\t"
+ "popfl \n\t" /* Restore flags */
+
+ : "=r" (ret), "=&r" (tmp), "=&r" (tmp2));
+ return ret;
+}
+#endif /* !defined HWLOC_X86_32_ARCH && !defined HWLOC_HAVE_MSVC_CPUIDEX*/
+#if (defined HWLOC_X86_64_ARCH) || (defined HWLOC_HAVE_MSVC_CPUIDEX)
+static __hwloc_inline int hwloc_have_x86_cpuid(void) { return 1; }
+#endif /* HWLOC_X86_64_ARCH */
+
+static __hwloc_inline void hwloc_x86_cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
+{
+#ifdef HWLOC_HAVE_MSVC_CPUIDEX
+ int regs[4];
+ __cpuidex(regs, *eax, *ecx);
+ *eax = regs[0];
+ *ebx = regs[1];
+ *ecx = regs[2];
+ *edx = regs[3];
+#else /* HWLOC_HAVE_MSVC_CPUIDEX */
+ /* Note: gcc might want to use bx or the stack for %1 addressing, so we can't
+ * use them :/ */
+#ifdef HWLOC_X86_64_ARCH
+ hwloc_uint64_t sav_rbx;
+ __asm__(
+ "mov %%rbx,%2\n\t"
+ "cpuid\n\t"
+ "xchg %2,%%rbx\n\t"
+ "movl %k2,%1\n\t"
+ : "+a" (*eax), "=m" (*ebx), "=&r"(sav_rbx),
+ "+c" (*ecx), "=&d" (*edx));
+#elif defined(HWLOC_X86_32_ARCH)
+ unsigned long sav_ebx;
+ __asm__(
+ "mov %%ebx,%2\n\t"
+ "cpuid\n\t"
+ "xchg %2,%%ebx\n\t"
+ "movl %k2,%1\n\t"
+ : "+a" (*eax), "=m" (*ebx), "=&r"(sav_ebx),
+ "+c" (*ecx), "=&d" (*edx));
+#else
+#error unknown architecture
+#endif
+#endif /* HWLOC_HAVE_MSVC_CPUIDEX */
+}
+
+#endif /* HWLOC_PRIVATE_X86_CPUID_H */
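
hwloc_x86_cpuid() above is the wrapper the x86 backend uses to issue CPUID while keeping %rbx/%ebx intact for the compiler. A hedged usage sketch (the print_vendor helper is hypothetical; the leaf-0 register layout, vendor bytes in EBX/EDX/ECX, is standard CPUID behaviour):

    #include <stdio.h>
    #include <string.h>

    /* Hypothetical caller: read the maximum basic leaf and the vendor string
     * via CPUID leaf 0 using the wrapper defined above. */
    static void print_vendor(void)
    {
        unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
        char vendor[13];

        if (!hwloc_have_x86_cpuid())
            return;                      /* very old 32-bit CPUs may lack CPUID */
        hwloc_x86_cpuid(&eax, &ebx, &ecx, &edx);
        memcpy(vendor + 0, &ebx, 4);
        memcpy(vendor + 4, &edx, 4);
        memcpy(vendor + 8, &ecx, 4);
        vendor[12] = '\0';
        printf("max basic leaf %u, vendor %s\n", eax, vendor);
    }
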
diff --git a/ext/hwloc/include/private/cpuid.h b/ext/hwloc/include/private/cpuid.h
new file mode 100644
index 0000000..214ab38
--- /dev/null
+++ b/ext/hwloc/include/private/cpuid.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2010-2012 Université Bordeaux 1
+ * Copyright © 2010 Cisco Systems, Inc. All rights reserved.
+ * Copyright © 2014 Inria. All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ */
+
+/* Internals for x86's cpuid. */
+
+#ifndef HWLOC_PRIVATE_CPUID_H
+#define HWLOC_PRIVATE_CPUID_H
+
+#ifdef HWLOC_X86_32_ARCH
+static __hwloc_inline int hwloc_have_cpuid(void)
+{
+ int ret;
+ unsigned tmp, tmp2;
+ asm(
+ "mov $0,%0\n\t" /* Not supported a priori */
+
+ "pushfl \n\t" /* Save flags */
+
+ "pushfl \n\t" \
+ "pop %1 \n\t" /* Get flags */ \
+
+#define TRY_TOGGLE \
+ "xor $0x00200000,%1\n\t" /* Try to toggle ID */ \
+ "mov %1,%2\n\t" /* Save expected value */ \
+ "push %1 \n\t" \
+ "popfl \n\t" /* Try to toggle */ \
+ "pushfl \n\t" \
+ "pop %1 \n\t" \
+ "cmp %1,%2\n\t" /* Compare with expected value */ \
+ "jnz Lhwloc1\n\t" /* Unexpected, failure */ \
+
+ TRY_TOGGLE /* Try to set/clear */
+ TRY_TOGGLE /* Try to clear/set */
+
+ "mov $1,%0\n\t" /* Passed the test! */
+
+ "Lhwloc1: \n\t"
+ "popfl \n\t" /* Restore flags */
+
+ : "=r" (ret), "=&r" (tmp), "=&r" (tmp2));
+ return ret;
+}
+#endif /* HWLOC_X86_32_ARCH */
+#ifdef HWLOC_X86_64_ARCH
+static __hwloc_inline int hwloc_have_cpuid(void) { return 1; }
+#endif /* HWLOC_X86_64_ARCH */
+
+static __hwloc_inline void hwloc_cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
+{
+ /* Note: gcc might want to use bx or the stack for %1 addressing, so we can't
+ * use them :/ */
+#ifdef HWLOC_X86_64_ARCH
+ hwloc_uint64_t sav_rbx;
+ asm(
+ "mov %%rbx,%2\n\t"
+ "cpuid\n\t"
+ "xchg %2,%%rbx\n\t"
+ "movl %k2,%1\n\t"
+ : "+a" (*eax), "=m" (*ebx), "=&r"(sav_rbx),
+ "+c" (*ecx), "=&d" (*edx));
+#elif defined(HWLOC_X86_32_ARCH)
+ unsigned long sav_ebx;
+ asm(
+ "mov %%ebx,%2\n\t"
+ "cpuid\n\t"
+ "xchg %2,%%ebx\n\t"
+ "movl %k2,%1\n\t"
+ : "+a" (*eax), "=m" (*ebx), "=&r"(sav_ebx),
+ "+c" (*ecx), "=&d" (*edx));
+#else
+#error unknown architecture
+#endif
+}
+
+#endif /* HWLOC_PRIVATE_CPUID_H */
diff --git a/ext/hwloc/include/private/debug.h b/ext/hwloc/include/private/debug.h
new file mode 100644
index 0000000..4de91bf
--- /dev/null
+++ b/ext/hwloc/include/private/debug.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2012 Inria. All rights reserved.
+ * Copyright © 2009, 2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* The configuration file */
+
+#ifndef HWLOC_DEBUG_H
+#define HWLOC_DEBUG_H
+
+#include <private/autogen/config.h>
+
+#ifdef HWLOC_DEBUG
+#include <stdarg.h>
+#include <stdio.h>
+#endif
+
+static __hwloc_inline void hwloc_debug(const char *s __hwloc_attribute_unused, ...)
+{
+#ifdef HWLOC_DEBUG
+ va_list ap;
+
+ va_start(ap, s);
+ vfprintf(stderr, s, ap);
+ va_end(ap);
+#endif
+}
+
+#ifdef HWLOC_DEBUG
+#define hwloc_debug_bitmap(fmt, bitmap) do { \
+ char *s; \
+ hwloc_bitmap_asprintf(&s, bitmap); \
+ fprintf(stderr, fmt, s); \
+ free(s); \
+} while (0)
+#define hwloc_debug_1arg_bitmap(fmt, arg1, bitmap) do { \
+ char *s; \
+ hwloc_bitmap_asprintf(&s, bitmap); \
+ fprintf(stderr, fmt, arg1, s); \
+ free(s); \
+} while (0)
+#define hwloc_debug_2args_bitmap(fmt, arg1, arg2, bitmap) do { \
+ char *s; \
+ hwloc_bitmap_asprintf(&s, bitmap); \
+ fprintf(stderr, fmt, arg1, arg2, s); \
+ free(s); \
+} while (0)
+#else
+#define hwloc_debug_bitmap(s, bitmap) do { } while(0)
+#define hwloc_debug_1arg_bitmap(s, arg1, bitmap) do { } while(0)
+#define hwloc_debug_2args_bitmap(s, arg1, arg2, bitmap) do { } while(0)
+#endif
+
+#endif /* HWLOC_DEBUG_H */
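
A short usage sketch for the helpers above; output is only produced when HWLOC_DEBUG is defined, and the report_pu helper is hypothetical (it assumes the hwloc_obj fields from hwloc.h):

    /* Hypothetical helper: log one PU object during discovery. */
    static void report_pu(hwloc_obj_t pu)
    {
        hwloc_debug("found PU with OS index %u\n", pu->os_index);
        hwloc_debug_1arg_bitmap("PU %u cpuset: %s\n", pu->os_index, pu->cpuset);
    }
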
diff --git a/ext/hwloc/include/private/map.h b/ext/hwloc/include/private/map.h
new file mode 100644
index 0000000..77c18a5
--- /dev/null
+++ b/ext/hwloc/include/private/map.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright © 2013 Inria. All rights reserved.
+ * Copyright © 2013 Cisco Systems, Inc. All rights reserved.
+ * Copyright © 2013-2014 University of Wisconsin-La Crosse.
+ * All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ *
+ * $HEADER$
+ */
+
+#ifndef _PRIVATE_NETLOC_MAP_H_
+#define _PRIVATE_NETLOC_MAP_H_
+
+#include <hwloc.h>
+#include <netloc.h>
+
+
+struct netloc_map__subnet;
+struct netloc_map__server;
+
+struct netloc_map__port {
+ struct netloc_map__subnet * subnet;
+ struct netloc_map__server * server;
+
+ netloc_edge_t * edge;
+
+ unsigned hwloc_obj_depth;
+ unsigned hwloc_obj_index;
+ hwloc_obj_t hwloc_obj; /* cached from depth/index above,
+ * only non-NULL if the topology hasn't been compressed in the meantime.
+ */
+
+ struct netloc_map__port *prev, *next;
+
+ char id[0];
+};
+
+struct netloc_map__subnet {
+ netloc_topology_t topology;
+ netloc_network_type_t type;
+
+ int port_by_id_ready;
+ struct netloc_dt_lookup_table port_by_id;
+
+ struct netloc_map__subnet *prev, *next;
+
+ struct netloc_map__port *port_first, *port_last;
+ unsigned ports_nr;
+
+ char id[0];
+};
+
+struct netloc_map__server {
+ hwloc_topology_t topology; /* NULL if compressed */
+#if HWLOC_API_VERSION >= 0x00010800
+ hwloc_topology_diff_t topology_diff;
+ struct netloc_map__server *topology_diff_refserver;
+#endif
+
+ int usecount; /* references from the application,
+ * or from topology diff for other servers.
+ * no compression when > 0
+ */
+
+ unsigned nr_ports;
+ unsigned nr_ports_allocated;
+ struct netloc_map__port ** ports;
+
+ struct netloc_map__server *prev, *next;
+ struct netloc_map *map;
+
+ char name[0];
+};
+
+enum netloc_map_verbose_flags_e {
+ NETLOC_MAP_VERBOSE_FLAG_COMPRESS = (1<<0)
+};
+
+struct netloc_map {
+ unsigned long flags;
+ unsigned long verbose_flags;
+
+ unsigned server_ports_nr; /* needed during build, to create large-enough hash tables */
+
+ char *hwloc_xml_path;
+ struct netloc_dt_lookup_table server_by_name;
+ struct netloc_map__server *server_first, *server_last;
+ unsigned servers_nr;
+
+ char *netloc_data_path;
+ struct netloc_dt_lookup_table subnet_by_id[NETLOC_NETWORK_TYPE_INVALID]; /* enough room for existing types */
+ struct netloc_map__subnet *subnet_first, *subnet_last;
+ unsigned subnets_nr;
+
+ int merged;
+};
+
+struct netloc_map__paths {
+ struct netloc_map *map;
+ unsigned long flags;
+ unsigned nr_paths;
+ struct netloc_map__path {
+ /* FIXME: cache the subnet */
+ unsigned nr_edges;
+ struct netloc_map_edge_s *edges;
+ } * paths;
+};
+
+#endif /* _PRIVATE_NETLOC_MAP_H_ */
diff --git a/ext/hwloc/include/private/misc.h b/ext/hwloc/include/private/misc.h
new file mode 100644
index 0000000..d0e6a46
--- /dev/null
+++ b/ext/hwloc/include/private/misc.h
@@ -0,0 +1,382 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria. All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* Misc macros and inlines. */
+
+#ifndef HWLOC_PRIVATE_MISC_H
+#define HWLOC_PRIVATE_MISC_H
+
+#include <hwloc/autogen/config.h>
+#include <private/autogen/config.h>
+
+#ifdef HWLOC_HAVE_DECL_STRNCASECMP
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#else
+#ifdef HAVE_CTYPE_H
+#include <ctype.h>
+#endif
+#endif
+
+/* Compile-time assertion */
+#define HWLOC_BUILD_ASSERT(condition) ((void)sizeof(char[1 - 2*!(condition)]))
+
+#define HWLOC_BITS_PER_LONG (HWLOC_SIZEOF_UNSIGNED_LONG * 8)
+#define HWLOC_BITS_PER_INT (HWLOC_SIZEOF_UNSIGNED_INT * 8)
+
+#if (HWLOC_BITS_PER_LONG != 32) && (HWLOC_BITS_PER_LONG != 64)
+#error "unknown size for unsigned long."
+#endif
+
+#if (HWLOC_BITS_PER_INT != 16) && (HWLOC_BITS_PER_INT != 32) && (HWLOC_BITS_PER_INT != 64)
+#error "unknown size for unsigned int."
+#endif
+
+
+/**
+ * ffsl helpers.
+ */
+
+#if defined(HWLOC_HAVE_BROKEN_FFS)
+
+/* System has a broken ffs().
+ * We must check this before __GNUC__ or HWLOC_HAVE_FFSL
+ */
+# define HWLOC_NO_FFS
+
+#elif defined(__GNUC__)
+
+# if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))
+ /* Starting from 3.4, gcc has a long variant. */
+# define hwloc_ffsl(x) __builtin_ffsl(x)
+# else
+# define hwloc_ffs(x) __builtin_ffs(x)
+# define HWLOC_NEED_FFSL
+# endif
+
+#elif defined(HWLOC_HAVE_FFSL)
+
+# ifndef HWLOC_HAVE_DECL_FFSL
+extern int ffsl(long) __hwloc_attribute_const;
+# endif
+
+# define hwloc_ffsl(x) ffsl(x)
+
+#elif defined(HWLOC_HAVE_FFS)
+
+# ifndef HWLOC_HAVE_DECL_FFS
+extern int ffs(int) __hwloc_attribute_const;
+# endif
+
+# define hwloc_ffs(x) ffs(x)
+# define HWLOC_NEED_FFSL
+
+#else /* no ffs implementation */
+
+# define HWLOC_NO_FFS
+
+#endif
+
+#ifdef HWLOC_NO_FFS
+
+/* no ffs or it is known to be broken */
+static __hwloc_inline int
+hwloc_ffsl_manual(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_ffsl_manual(unsigned long x)
+{
+ int i;
+
+ if (!x)
+ return 0;
+
+ i = 1;
+#if HWLOC_BITS_PER_LONG >= 64
+ if (!(x & 0xfffffffful)) {
+ x >>= 32;
+ i += 32;
+ }
+#endif
+ if (!(x & 0xffffu)) {
+ x >>= 16;
+ i += 16;
+ }
+ if (!(x & 0xff)) {
+ x >>= 8;
+ i += 8;
+ }
+ if (!(x & 0xf)) {
+ x >>= 4;
+ i += 4;
+ }
+ if (!(x & 0x3)) {
+ x >>= 2;
+ i += 2;
+ }
+ if (!(x & 0x1)) {
+ x >>= 1;
+ i += 1;
+ }
+
+ return i;
+}
+/* always define hwloc_ffsl as a macro, to avoid renaming breakage */
+#define hwloc_ffsl hwloc_ffsl_manual
+
+#elif defined(HWLOC_NEED_FFSL)
+
+/* We only have an int ffs(int) implementation, build a long one. */
+
+/* First make it 32 bits if it was only 16. */
+static __hwloc_inline int
+hwloc_ffs32(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_ffs32(unsigned long x)
+{
+#if HWLOC_BITS_PER_INT == 16
+ int low_ffs, hi_ffs;
+
+ low_ffs = hwloc_ffs(x & 0xfffful);
+ if (low_ffs)
+ return low_ffs;
+
+ hi_ffs = hwloc_ffs(x >> 16);
+ if (hi_ffs)
+ return hi_ffs + 16;
+
+ return 0;
+#else
+ return hwloc_ffs(x);
+#endif
+}
+
+/* Then make it 64 bit if longs are. */
+static __hwloc_inline int
+hwloc_ffsl_from_ffs32(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_ffsl_from_ffs32(unsigned long x)
+{
+#if HWLOC_BITS_PER_LONG == 64
+ int low_ffs, hi_ffs;
+
+ low_ffs = hwloc_ffs32(x & 0xfffffffful);
+ if (low_ffs)
+ return low_ffs;
+
+ hi_ffs = hwloc_ffs32(x >> 32);
+ if (hi_ffs)
+ return hi_ffs + 32;
+
+ return 0;
+#else
+ return hwloc_ffs32(x);
+#endif
+}
+/* always define hwloc_ffsl as a macro, to avoid renaming breakage */
+#define hwloc_ffsl hwloc_ffsl_from_ffs32
+
+#endif
+
+/**
+ * flsl helpers.
+ */
+#ifdef __GNUC__
+
+# if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))
+# define hwloc_flsl(x) (x ? 8*sizeof(long) - __builtin_clzl(x) : 0)
+# else
+# define hwloc_fls(x) (x ? 8*sizeof(int) - __builtin_clz(x) : 0)
+# define HWLOC_NEED_FLSL
+# endif
+
+#elif defined(HWLOC_HAVE_FLSL)
+
+# ifndef HWLOC_HAVE_DECL_FLSL
+extern int flsl(long) __hwloc_attribute_const;
+# endif
+
+# define hwloc_flsl(x) flsl(x)
+
+#elif defined(HWLOC_HAVE_CLZL)
+
+# ifndef HWLOC_HAVE_DECL_CLZL
+extern int clzl(long) __hwloc_attribute_const;
+# endif
+
+# define hwloc_flsl(x) (x ? 8*sizeof(long) - clzl(x) : 0)
+
+#elif defined(HWLOC_HAVE_FLS)
+
+# ifndef HWLOC_HAVE_DECL_FLS
+extern int fls(int) __hwloc_attribute_const;
+# endif
+
+# define hwloc_fls(x) fls(x)
+# define HWLOC_NEED_FLSL
+
+#elif defined(HWLOC_HAVE_CLZ)
+
+# ifndef HWLOC_HAVE_DECL_CLZ
+extern int clz(int) __hwloc_attribute_const;
+# endif
+
+# define hwloc_fls(x) (x ? 8*sizeof(int) - clz(x) : 0)
+# define HWLOC_NEED_FLSL
+
+#else /* no fls implementation */
+
+static __hwloc_inline int
+hwloc_flsl_manual(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_flsl_manual(unsigned long x)
+{
+ int i = 0;
+
+ if (!x)
+ return 0;
+
+ i = 1;
+#if HWLOC_BITS_PER_LONG >= 64
+ if ((x & 0xffffffff00000000ul)) {
+ x >>= 32;
+ i += 32;
+ }
+#endif
+ if ((x & 0xffff0000u)) {
+ x >>= 16;
+ i += 16;
+ }
+ if ((x & 0xff00)) {
+ x >>= 8;
+ i += 8;
+ }
+ if ((x & 0xf0)) {
+ x >>= 4;
+ i += 4;
+ }
+ if ((x & 0xc)) {
+ x >>= 2;
+ i += 2;
+ }
+ if ((x & 0x2)) {
+ x >>= 1;
+ i += 1;
+ }
+
+ return i;
+}
+/* always define hwloc_flsl as a macro, to avoid renaming breakage */
+#define hwloc_flsl hwloc_flsl_manual
+
+#endif
+
+#ifdef HWLOC_NEED_FLSL
+
+/* We only have an int fls(int) implementation, build a long one. */
+
+/* First make it 32 bits if it was only 16. */
+static __hwloc_inline int
+hwloc_fls32(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_fls32(unsigned long x)
+{
+#if HWLOC_BITS_PER_INT == 16
+ int low_fls, hi_fls;
+
+ hi_fls = hwloc_fls(x >> 16);
+ if (hi_fls)
+ return hi_fls + 16;
+
+ low_fls = hwloc_fls(x & 0xfffful);
+ if (low_fls)
+ return low_fls;
+
+ return 0;
+#else
+ return hwloc_fls(x);
+#endif
+}
+
+/* Then make it 64 bit if longs are. */
+static __hwloc_inline int
+hwloc_flsl_from_fls32(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_flsl_from_fls32(unsigned long x)
+{
+#if HWLOC_BITS_PER_LONG == 64
+ int low_fls, hi_fls;
+
+ hi_fls = hwloc_fls32(x >> 32);
+ if (hi_fls)
+ return hi_fls + 32;
+
+ low_fls = hwloc_fls32(x & 0xfffffffful);
+ if (low_fls)
+ return low_fls;
+
+ return 0;
+#else
+ return hwloc_fls32(x);
+#endif
+}
+/* always define hwloc_flsl as a macro, to avoid renaming breakage */
+#define hwloc_flsl hwloc_flsl_from_fls32
+
+#endif
+
+static __hwloc_inline int
+hwloc_weight_long(unsigned long w) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_weight_long(unsigned long w)
+{
+#if HWLOC_BITS_PER_LONG == 32
+#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__) >= 4)
+ return __builtin_popcount(w);
+#else
+ unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
+ res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
+ res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
+ res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
+ return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
+#endif
+#else /* HWLOC_BITS_PER_LONG == 32 */
+#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__) >= 4)
+ return __builtin_popcountll(w);
+#else
+ unsigned long res;
+ res = (w & 0x5555555555555555ul) + ((w >> 1) & 0x5555555555555555ul);
+ res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul);
+ res = (res & 0x0F0F0F0F0F0F0F0Ful) + ((res >> 4) & 0x0F0F0F0F0F0F0F0Ful);
+ res = (res & 0x00FF00FF00FF00FFul) + ((res >> 8) & 0x00FF00FF00FF00FFul);
+ res = (res & 0x0000FFFF0000FFFFul) + ((res >> 16) & 0x0000FFFF0000FFFFul);
+ return (res & 0x00000000FFFFFFFFul) + ((res >> 32) & 0x00000000FFFFFFFFul);
+#endif
+#endif /* HWLOC_BITS_PER_LONG == 64 */
+}
+
+#if !HAVE_DECL_STRTOULL
+unsigned long long int strtoull(const char *nptr, char **endptr, int base);
+#endif
+
+static __hwloc_inline int hwloc_strncasecmp(const char *s1, const char *s2, size_t n)
+{
+#ifdef HWLOC_HAVE_DECL_STRNCASECMP
+ return strncasecmp(s1, s2, n);
+#else
+ while (n) {
+ char c1 = tolower(*s1), c2 = tolower(*s2);
+ if (!c1 || !c2 || c1 != c2)
+ return c1-c2;
+ n--; s1++; s2++;
+ }
+ return 0;
+#endif
+}
+
+#endif /* HWLOC_PRIVATE_MISC_H */
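
The bit helpers above all follow the classic 1-based convention, whichever fallback gets selected. A small illustrative check (the check_bit_helpers wrapper is not part of the sources; the expected values follow directly from the definitions above):

    #include <assert.h>

    static void check_bit_helpers(void)
    {
        assert(hwloc_ffsl(0UL) == 0);            /* no bit set */
        assert(hwloc_ffsl(0x1UL) == 1);          /* least significant set bit, 1-based */
        assert(hwloc_ffsl(0x80UL) == 8);
        assert(hwloc_flsl(0UL) == 0);
        assert(hwloc_flsl(0x80UL) == 8);         /* most significant set bit, 1-based */
        assert(hwloc_weight_long(0xF0UL) == 4);  /* population count */
    }
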
diff --git a/ext/hwloc/include/private/private.h b/ext/hwloc/include/private/private.h
new file mode 100644
index 0000000..fa344ac
--- /dev/null
+++ b/ext/hwloc/include/private/private.h
@@ -0,0 +1,335 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria. All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ */
+
+/* Internal types and helpers. */
+
+
+#ifdef HWLOC_INSIDE_PLUGIN
+/*
+ * these declarations are internal only, they are not available to plugins
+ * (many functions below are internal static symbols).
+ */
+#error This file should not be used in plugins
+#endif
+
+
+#ifndef HWLOC_PRIVATE_H
+#define HWLOC_PRIVATE_H
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <hwloc/bitmap.h>
+#include <private/components.h>
+#include <private/debug.h>
+#include <sys/types.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#ifdef HAVE_SYS_UTSNAME_H
+#include <sys/utsname.h>
+#endif
+#include <string.h>
+
+enum hwloc_ignore_type_e {
+ HWLOC_IGNORE_TYPE_NEVER = 0,
+ HWLOC_IGNORE_TYPE_KEEP_STRUCTURE,
+ HWLOC_IGNORE_TYPE_ALWAYS
+};
+
+#define HWLOC_DEPTH_MAX 128
+
+struct hwloc_topology {
+ unsigned nb_levels; /* Number of horizontal levels */
+ unsigned next_group_depth; /* Depth of the next Group object that we may create */
+ unsigned level_nbobjects[HWLOC_DEPTH_MAX]; /* Number of objects on each horizontal level */
+ struct hwloc_obj **levels[HWLOC_DEPTH_MAX]; /* Direct access to levels, levels[l = 0 .. nblevels-1][0..level_nbobjects[l]] */
+ unsigned long flags;
+ int type_depth[HWLOC_OBJ_TYPE_MAX];
+ enum hwloc_ignore_type_e ignored_types[HWLOC_OBJ_TYPE_MAX];
+ int is_thissystem;
+ int is_loaded;
+ int modified; /* >0 if objects were added/removed recently, which means a reconnect is needed */
+ hwloc_pid_t pid; /* Process ID the topology is viewed from, 0 for self */
+ void *userdata;
+
+ unsigned bridge_nbobjects;
+ struct hwloc_obj **bridge_level;
+ struct hwloc_obj *first_bridge, *last_bridge;
+ unsigned pcidev_nbobjects;
+ struct hwloc_obj **pcidev_level;
+ struct hwloc_obj *first_pcidev, *last_pcidev;
+ unsigned osdev_nbobjects;
+ struct hwloc_obj **osdev_level;
+ struct hwloc_obj *first_osdev, *last_osdev;
+ unsigned misc_nbobjects;
+ struct hwloc_obj **misc_level;
+ struct hwloc_obj *first_misc, *last_misc;
+
+ struct hwloc_binding_hooks {
+ int (*set_thisproc_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags);
+ int (*get_thisproc_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+ int (*set_thisthread_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags);
+ int (*get_thisthread_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+ int (*set_proc_cpubind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, int flags);
+ int (*get_proc_cpubind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+#ifdef hwloc_thread_t
+ int (*set_thread_cpubind)(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_const_cpuset_t set, int flags);
+ int (*get_thread_cpubind)(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_cpuset_t set, int flags);
+#endif
+
+ int (*get_thisproc_last_cpu_location)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+ int (*get_thisthread_last_cpu_location)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+ int (*get_proc_last_cpu_location)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+
+ int (*set_thisproc_membind)(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+ int (*get_thisproc_membind)(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+ int (*set_thisthread_membind)(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+ int (*get_thisthread_membind)(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+ int (*set_proc_membind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+ int (*get_proc_membind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+ int (*set_area_membind)(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+ int (*get_area_membind)(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+ /* This has to return the same kind of pointer as alloc_membind, so that free_membind can be used on it */
+ void *(*alloc)(hwloc_topology_t topology, size_t len);
+ /* alloc_membind has to always succeed if !(flags & HWLOC_MEMBIND_STRICT).
+ * see hwloc_alloc_or_fail which is convenient for that. */
+ void *(*alloc_membind)(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+ int (*free_membind)(hwloc_topology_t topology, void *addr, size_t len);
+ } binding_hooks;
+
+ struct hwloc_topology_support support;
+
+ void (*userdata_export_cb)(void *reserved, struct hwloc_topology *topology, struct hwloc_obj *obj);
+ void (*userdata_import_cb)(struct hwloc_topology *topology, struct hwloc_obj *obj, const char *name, const void *buffer, size_t length);
+
+ struct hwloc_os_distances_s {
+ hwloc_obj_type_t type;
+ int nbobjs;
+ unsigned *indexes; /* array of OS indexes before we can convert them into objs. always available.
+ */
+ struct hwloc_obj **objs; /* array of objects, in the same order as above.
+ * either given (by a backend) together with the indexes array above.
+ * or built from the above indexes array when not given (by the user).
+ */
+ float *distances; /* distance matrices, ordered according to the above indexes/objs array.
+ * distance from i to j is stored in slot i*nbnodes+j.
+ * will be copied into the main logical-index-ordered distance at the end of the discovery.
+ */
+ int forced; /* set if the user forced a matrix to ignore the OS one */
+
+ struct hwloc_os_distances_s *prev, *next;
+ } *first_osdist, *last_osdist;
+
+ /* list of enabled backends. */
+ struct hwloc_backend * backends;
+};
+
+extern void hwloc_alloc_obj_cpusets(hwloc_obj_t obj);
+extern void hwloc_setup_pu_level(struct hwloc_topology *topology, unsigned nb_pus);
+extern int hwloc_get_sysctlbyname(const char *name, int64_t *n);
+extern int hwloc_get_sysctl(int name[], unsigned namelen, int *n);
+extern unsigned hwloc_fallback_nbprocessors(struct hwloc_topology *topology);
+extern void hwloc_connect_children(hwloc_obj_t obj);
+extern int hwloc_connect_levels(hwloc_topology_t topology);
+
+extern int hwloc__object_cpusets_compare_first(hwloc_obj_t obj1, hwloc_obj_t obj2);
+extern void hwloc__reorder_children(hwloc_obj_t parent);
+
+extern void hwloc_topology_setup_defaults(struct hwloc_topology *topology);
+extern void hwloc_topology_clear(struct hwloc_topology *topology);
+
+extern void hwloc__add_info(struct hwloc_obj_info_s **infosp, unsigned *countp, const char *name, const char *value);
+extern char ** hwloc__find_info_slot(struct hwloc_obj_info_s **infosp, unsigned *countp, const char *name);
+extern void hwloc__move_infos(struct hwloc_obj_info_s **dst_infosp, unsigned *dst_countp, struct hwloc_obj_info_s **src_infosp, unsigned *src_countp);
+extern void hwloc__free_infos(struct hwloc_obj_info_s *infos, unsigned count);
+
+/* set native OS binding hooks */
+extern void hwloc_set_native_binding_hooks(struct hwloc_binding_hooks *hooks, struct hwloc_topology_support *support);
+/* set either native OS binding hooks (if thissystem), or dummy ones */
+extern void hwloc_set_binding_hooks(struct hwloc_topology *topology);
+
+#if defined(HWLOC_LINUX_SYS)
+extern void hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_LINUX_SYS */
+
+#if defined(HWLOC_BGQ_SYS)
+extern void hwloc_set_bgq_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_BGQ_SYS */
+
+#ifdef HWLOC_SOLARIS_SYS
+extern void hwloc_set_solaris_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_SOLARIS_SYS */
+
+#ifdef HWLOC_AIX_SYS
+extern void hwloc_set_aix_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_AIX_SYS */
+
+#ifdef HWLOC_OSF_SYS
+extern void hwloc_set_osf_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_OSF_SYS */
+
+#ifdef HWLOC_WIN_SYS
+extern void hwloc_set_windows_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_WIN_SYS */
+
+#ifdef HWLOC_DARWIN_SYS
+extern void hwloc_set_darwin_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_DARWIN_SYS */
+
+#ifdef HWLOC_FREEBSD_SYS
+extern void hwloc_set_freebsd_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_FREEBSD_SYS */
+
+#ifdef HWLOC_NETBSD_SYS
+extern void hwloc_set_netbsd_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_NETBSD_SYS */
+
+#ifdef HWLOC_HPUX_SYS
+extern void hwloc_set_hpux_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_HPUX_SYS */
+
+/* Insert uname-specific names/values in the object infos array.
+ * If cached_uname isn't NULL, it is used as a struct utsname instead of recalling uname.
+ * Any field that starts with \0 is ignored.
+ */
+extern void hwloc_add_uname_info(struct hwloc_topology *topology, void *cached_uname);
+
+/* Free obj and its attributes assuming it doesn't have any children/parent anymore */
+extern void hwloc_free_unlinked_object(hwloc_obj_t obj);
+
+/* Duplicate src and its children under newparent in newtopology */
+extern void hwloc__duplicate_objects(struct hwloc_topology *newtopology, struct hwloc_obj *newparent, struct hwloc_obj *src);
+
+/* This can be used for the alloc field to get allocated data that can be freed by free() */
+void *hwloc_alloc_heap(hwloc_topology_t topology, size_t len);
+
+/* This can be used for the alloc field to get allocated data that can be freed by munmap() */
+void *hwloc_alloc_mmap(hwloc_topology_t topology, size_t len);
+
+/* This can be used for the free_membind field to free data using free() */
+int hwloc_free_heap(hwloc_topology_t topology, void *addr, size_t len);
+
+/* This can be used for the free_membind field to free data using munmap() */
+int hwloc_free_mmap(hwloc_topology_t topology, void *addr, size_t len);
+
+/* Allocates unbound memory or fails, depending on whether STRICT is requested
+ * or not */
+static __hwloc_inline void *
+hwloc_alloc_or_fail(hwloc_topology_t topology, size_t len, int flags)
+{
+ if (flags & HWLOC_MEMBIND_STRICT)
+ return NULL;
+ return hwloc_alloc(topology, len);
+}
+
+extern void hwloc_distances_init(struct hwloc_topology *topology);
+extern void hwloc_distances_destroy(struct hwloc_topology *topology);
+extern void hwloc_distances_set(struct hwloc_topology *topology, hwloc_obj_type_t type, unsigned nbobjs, unsigned *indexes, hwloc_obj_t *objs, float *distances, int force);
+extern void hwloc_distances_set_from_env(struct hwloc_topology *topology);
+extern void hwloc_distances_restrict_os(struct hwloc_topology *topology);
+extern void hwloc_distances_restrict(struct hwloc_topology *topology, unsigned long flags);
+extern void hwloc_distances_finalize_os(struct hwloc_topology *topology);
+extern void hwloc_distances_finalize_logical(struct hwloc_topology *topology);
+extern void hwloc_clear_object_distances(struct hwloc_obj *obj);
+extern void hwloc_clear_object_distances_one(struct hwloc_distances_s *distances);
+extern void hwloc_group_by_distances(struct hwloc_topology *topology);
+
+#ifdef HAVE_USELOCALE
+#include "locale.h"
+#ifdef HAVE_XLOCALE_H
+#include "xlocale.h"
+#endif
+#define hwloc_localeswitch_declare locale_t __old_locale = (locale_t)0, __new_locale
+#define hwloc_localeswitch_init() do { \
+ __new_locale = newlocale(LC_ALL_MASK, "C", (locale_t)0); \
+ if (__new_locale != (locale_t)0) \
+ __old_locale = uselocale(__new_locale); \
+} while (0)
+#define hwloc_localeswitch_fini() do { \
+ if (__new_locale != (locale_t)0) { \
+ uselocale(__old_locale); \
+ freelocale(__new_locale); \
+ } \
+} while(0)
+#else /* HAVE_USELOCALE */
+#define hwloc_localeswitch_declare int __dummy_nolocale __hwloc_attribute_unused
+#define hwloc_localeswitch_init()
+#define hwloc_localeswitch_fini()
+#endif /* HAVE_USELOCALE */
+
+#if !HAVE_DECL_FABSF
+#define fabsf(f) fabs((double)(f))
+#endif
+
+#if HAVE_DECL__SC_PAGE_SIZE
+#define hwloc_getpagesize() sysconf(_SC_PAGE_SIZE)
+#elif HAVE_DECL__SC_PAGESIZE
+#define hwloc_getpagesize() sysconf(_SC_PAGESIZE)
+#elif defined HAVE_GETPAGESIZE
+#define hwloc_getpagesize() getpagesize()
+#else
+#undef hwloc_getpagesize
+#endif
+
+/* encode src buffer into target buffer.
+ * targsize must be at least 4*((srclength+2)/3)+1.
+ * target will be 0-terminated.
+ */
+extern int hwloc_encode_to_base64(const char *src, size_t srclength, char *target, size_t targsize);
+/* decode src buffer into target buffer.
+ * src is 0-terminated.
+ * targsize must be at least srclength*3/4+1 (srclength not including \0)
+ * but only srclength*3/4 characters will be meaningful
+ * (the next one may be partially written during decoding, but it should be ignored).
+ */
+extern int hwloc_decode_from_base64(char const *src, char *target, size_t targsize);
+
+/* Check whether needle matches the beginning of haystack, for at least n
+ * characters, and up to a colon or \0 */
+extern int hwloc_namecoloncmp(const char *haystack, const char *needle, size_t n);
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_FORMAT
+# if HWLOC_HAVE_ATTRIBUTE_FORMAT
+# define __hwloc_attribute_format(type, str, arg) __attribute__((__format__(type, str, arg)))
+# else
+# define __hwloc_attribute_format(type, str, arg)
+# endif
+#else
+# define __hwloc_attribute_format(type, str, arg)
+#endif
+
+#define hwloc_memory_size_printf_value(_size, _verbose) \
+ ((_size) < (10ULL<<20) || _verbose ? (((_size)>>9)+1)>>1 : (_size) < (10ULL<<30) ? (((_size)>>19)+1)>>1 : (_size) < (10ULL<<40) ? (((_size)>>29)+1)>>1 : (((_size)>>39)+1)>>1)
+#define hwloc_memory_size_printf_unit(_size, _verbose) \
+ ((_size) < (10ULL<<20) || _verbose ? "KB" : (_size) < (10ULL<<30) ? "MB" : (_size) < (10ULL<<40) ? "GB" : "TB")
+
+/* On some systems, snprintf returns the size of written data, not the actually
+ * required size. hwloc_snprintf always reports the actually required size. */
+extern int hwloc_snprintf(char *str, size_t size, const char *format, ...) __hwloc_attribute_format(printf, 3, 4);
+
+extern void hwloc_obj_add_info_nodup(hwloc_obj_t obj, const char *name, const char *value, int nodup);
+
+/* Return the name of the currently running program, if supported.
+ * If not NULL, must be freed by the caller.
+ */
+extern char * hwloc_progname(struct hwloc_topology *topology);
+
+#define HWLOC_BITMAP_EQUAL 0 /* Bitmaps are equal */
+#define HWLOC_BITMAP_INCLUDED 1 /* First bitmap included in second */
+#define HWLOC_BITMAP_CONTAINS 2 /* First bitmap contains second */
+#define HWLOC_BITMAP_INTERSECTS 3 /* Bitmaps intersect without any inclusion */
+#define HWLOC_BITMAP_DIFFERENT 4 /* Bitmaps do not intersect */
+
+/** \brief Compare bitmaps \p bitmap1 and \p bitmap2 from an inclusion point of view.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_compare_inclusion(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+#endif /* HWLOC_PRIVATE_H */
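
The base64 helpers declared above come with explicit buffer-size requirements. A hedged sketch of an encoding call that respects them (encode_copy is hypothetical, and a negative return value is assumed to indicate failure):

    #include <stdlib.h>
    #include <string.h>

    /* Hypothetical wrapper: base64-encode a C string into a freshly
     * allocated, 0-terminated buffer sized as documented above. */
    static char *encode_copy(const char *src)
    {
        size_t n = strlen(src);
        size_t outsize = 4 * ((n + 2) / 3) + 1;   /* required by hwloc_encode_to_base64 */
        char *out = malloc(outsize);

        if (out && hwloc_encode_to_base64(src, n, out, outsize) < 0) {
            free(out);                            /* assumed failure convention */
            out = NULL;
        }
        return out;
    }
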
diff --git a/ext/hwloc/include/private/solaris-chiptype.h b/ext/hwloc/include/private/solaris-chiptype.h
new file mode 100644
index 0000000..4af80d8
--- /dev/null
+++ b/ext/hwloc/include/private/solaris-chiptype.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright © 2009-2010 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+
+#ifdef HWLOC_INSIDE_PLUGIN
+/*
+ * these declarations are internal only, they are not available to plugins
+ * (functions below are internal static symbols).
+ */
+#error This file should not be used in plugins
+#endif
+
+
+#ifndef HWLOC_PRIVATE_SOLARIS_CHIPTYPE_H
+#define HWLOC_PRIVATE_SOLARIS_CHIPTYPE_H
+
+/* SPARC Chip Modes. */
+#define MODE_UNKNOWN 0
+#define MODE_SPITFIRE 1
+#define MODE_BLACKBIRD 2
+#define MODE_CHEETAH 3
+#define MODE_SPARC64_VI 4
+#define MODE_T1 5
+#define MODE_T2 6
+#define MODE_SPARC64_VII 7
+#define MODE_ROCK 8
+
+/* SPARC Chip Implementations. */
+#define IMPL_SPARC64_VI 0x6
+#define IMPL_SPARC64_VII 0x7
+#define IMPL_SPITFIRE 0x10
+#define IMPL_BLACKBIRD 0x11
+#define IMPL_SABRE 0x12
+#define IMPL_HUMMINGBIRD 0x13
+#define IMPL_CHEETAH 0x14
+#define IMPL_CHEETAHPLUS 0x15
+#define IMPL_JALAPENO 0x16
+#define IMPL_JAGUAR 0x18
+#define IMPL_PANTHER 0x19
+#define IMPL_NIAGARA 0x23
+#define IMPL_NIAGARA_2 0x24
+#define IMPL_ROCK 0x25
+
+/* Default Mfg, Cache, Speed settings */
+#define TI_MANUFACTURER 0x17
+#define TWO_MEG_CACHE 2097152
+#define SPITFIRE_SPEED 142943750
+
+char* hwloc_solaris_get_chip_type(void);
+char* hwloc_solaris_get_chip_model(void);
+
+#endif /* HWLOC_PRIVATE_SOLARIS_CHIPTYPE_H */
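
The implementation codes and chip modes above correspond roughly one-to-one. A hedged sketch of the mapping (the impl_to_mode helper is hypothetical and only lists the obviously matching pairs):

    /* Hypothetical helper: map a SPARC implementation code to a chip mode. */
    static int impl_to_mode(int impl)
    {
        switch (impl) {
        case IMPL_SPITFIRE:    return MODE_SPITFIRE;
        case IMPL_BLACKBIRD:   return MODE_BLACKBIRD;
        case IMPL_CHEETAH:     return MODE_CHEETAH;
        case IMPL_SPARC64_VI:  return MODE_SPARC64_VI;
        case IMPL_SPARC64_VII: return MODE_SPARC64_VII;
        case IMPL_NIAGARA:     return MODE_T1;   /* UltraSPARC T1 */
        case IMPL_NIAGARA_2:   return MODE_T2;   /* UltraSPARC T2 */
        case IMPL_ROCK:        return MODE_ROCK;
        default:               return MODE_UNKNOWN;
        }
    }
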
diff --git a/ext/hwloc/include/private/xml.h b/ext/hwloc/include/private/xml.h
new file mode 100644
index 0000000..75c6c43
--- /dev/null
+++ b/ext/hwloc/include/private/xml.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright © 2009-2014 Inria. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#ifndef PRIVATE_XML_H
+#define PRIVATE_XML_H 1
+
+#include <hwloc.h>
+
+#include <sys/types.h>
+
+HWLOC_DECLSPEC int hwloc__xml_verbose(void);
+
+/**************
+ * XML import *
+ **************/
+
+typedef struct hwloc__xml_import_state_s {
+ struct hwloc__xml_import_state_s *parent;
+
+ /* globals shared across the entire stack of states during import */
+ struct hwloc_xml_backend_data_s *global;
+
+ /* opaque data used to store backend-specific data.
+ * statically allocated to allow stack-allocation by the common code without knowing actual backend needs.
+ */
+ char data[32];
+} * hwloc__xml_import_state_t;
+
+HWLOC_DECLSPEC int hwloc__xml_import_diff(hwloc__xml_import_state_t state, hwloc_topology_diff_t *firstdiffp);
+
+struct hwloc_xml_backend_data_s {
+ /* xml backend parameters */
+ int (*look_init)(struct hwloc_xml_backend_data_s *bdata, struct hwloc__xml_import_state_s *state);
+ void (*look_failed)(struct hwloc_xml_backend_data_s *bdata);
+ void (*backend_exit)(struct hwloc_xml_backend_data_s *bdata);
+ int (*next_attr)(struct hwloc__xml_import_state_s * state, char **namep, char **valuep);
+ int (*find_child)(struct hwloc__xml_import_state_s * state, struct hwloc__xml_import_state_s * childstate, char **tagp);
+ int (*close_tag)(struct hwloc__xml_import_state_s * state); /* look for an explicit closing tag </name> */
+ void (*close_child)(struct hwloc__xml_import_state_s * state);
+ int (*get_content)(struct hwloc__xml_import_state_s * state, char **beginp, size_t expected_length);
+ void (*close_content)(struct hwloc__xml_import_state_s * state);
+ char * msgprefix;
+ void *data; /* libxml2 doc, or nolibxml buffer */
+ int nbnumanodes;
+ struct hwloc_xml_imported_distances_s {
+ hwloc_obj_t root;
+ struct hwloc_distances_s distances;
+ struct hwloc_xml_imported_distances_s *prev, *next;
+ } *first_distances, *last_distances;
+};
+
+/**************
+ * XML export *
+ **************/
+
+typedef struct hwloc__xml_export_state_s {
+ struct hwloc__xml_export_state_s *parent;
+
+ void (*new_child)(struct hwloc__xml_export_state_s *parentstate, struct hwloc__xml_export_state_s *state, const char *name);
+ void (*new_prop)(struct hwloc__xml_export_state_s *state, const char *name, const char *value);
+ void (*add_content)(struct hwloc__xml_export_state_s *state, const char *buffer, size_t length);
+ void (*end_object)(struct hwloc__xml_export_state_s *state, const char *name);
+
+ /* opaque data used to store backend-specific data.
+ * statically allocated to allow stack-allocation by the common code without knowing actual backend needs.
+ */
+ char data[40];
+} * hwloc__xml_export_state_t;
+
+HWLOC_DECLSPEC void hwloc__xml_export_object (hwloc__xml_export_state_t state, struct hwloc_topology *topology, struct hwloc_obj *obj);
+
+HWLOC_DECLSPEC void hwloc__xml_export_diff(hwloc__xml_export_state_t parentstate, hwloc_topology_diff_t diff);
+
+/******************
+ * XML components *
+ ******************/
+
+struct hwloc_xml_callbacks {
+ int (*backend_init)(struct hwloc_xml_backend_data_s *bdata, const char *xmlpath, const char *xmlbuffer, int xmlbuflen);
+ int (*export_file)(struct hwloc_topology *topology, const char *filename);
+ int (*export_buffer)(struct hwloc_topology *topology, char **xmlbuffer, int *buflen);
+ void (*free_buffer)(void *xmlbuffer);
+ int (*import_diff)(struct hwloc__xml_import_state_s *state, const char *xmlpath, const char *xmlbuffer, int xmlbuflen, hwloc_topology_diff_t *diff, char **refnamep);
+ int (*export_diff_file)(union hwloc_topology_diff_u *diff, const char *refname, const char *filename);
+ int (*export_diff_buffer)(union hwloc_topology_diff_u *diff, const char *refname, char **xmlbuffer, int *buflen);
+};
+
+struct hwloc_xml_component {
+ struct hwloc_xml_callbacks *nolibxml_callbacks;
+ struct hwloc_xml_callbacks *libxml_callbacks;
+};
+
+HWLOC_DECLSPEC void hwloc_xml_callbacks_register(struct hwloc_xml_component *component);
+HWLOC_DECLSPEC void hwloc_xml_callbacks_reset(void);
+
+#endif /* PRIVATE_XML_H */
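For context, a hedged sketch (not taken from the upstream sources) of how an XML backend might wire itself up through the types declared above: it fills a struct hwloc_xml_callbacks, wraps it in a struct hwloc_xml_component and hands it to hwloc_xml_callbacks_register(). The my_* function names are placeholders invented for this sketch.

    /* Placeholder callbacks; the signatures follow the struct hwloc_xml_callbacks
     * declaration above, the function names are invented for this sketch. */
    static int  my_backend_init(struct hwloc_xml_backend_data_s *bdata,
                                const char *xmlpath, const char *xmlbuffer, int xmlbuflen);
    static int  my_export_file(struct hwloc_topology *topology, const char *filename);
    static int  my_export_buffer(struct hwloc_topology *topology, char **xmlbuffer, int *buflen);
    static void my_free_buffer(void *xmlbuffer);

    static struct hwloc_xml_callbacks my_nolibxml_callbacks = {
        .backend_init  = my_backend_init,
        .export_file   = my_export_file,
        .export_buffer = my_export_buffer,
        .free_buffer   = my_free_buffer,
    };

    static struct hwloc_xml_component my_xml_component = {
        .nolibxml_callbacks = &my_nolibxml_callbacks,
        .libxml_callbacks   = NULL,   /* no libxml2-based variant in this sketch */
    };

    void register_my_xml_backend(void)
    {
        hwloc_xml_callbacks_register(&my_xml_component);
    }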
diff --git a/ext/hwloc/include/static-components.h b/ext/hwloc/include/static-components.h
new file mode 100644
index 0000000..ad23185
--- /dev/null
+++ b/ext/hwloc/include/static-components.h
@@ -0,0 +1,17 @@
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_noos_component;
+//HWLOC_DECLSPEC extern const struct hwloc_component hwloc_xml_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_synthetic_component;
+//HWLOC_DECLSPEC extern const struct hwloc_component hwloc_xml_nolibxml_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_linux_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_linuxpci_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_x86_component;
+static const struct hwloc_component * hwloc_static_components[] = {
+ &hwloc_noos_component,
+// &hwloc_xml_component,
+ &hwloc_synthetic_component,
+// &hwloc_xml_nolibxml_component,
+ &hwloc_linux_component,
+ &hwloc_linuxpci_component,
+ &hwloc_x86_component,
+ NULL
+};
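Note that the array above is NULL-terminated (the XML entries are commented out in this import), so callers can walk it without a separate length; a trivial sketch:

    /* Count the statically linked components by walking to the NULL terminator. */
    static unsigned count_static_components(void)
    {
        unsigned n = 0;
        while (hwloc_static_components[n] != NULL)
            n++;
        return n;
    }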
diff --git a/filters/csv b/filters/csv
deleted file mode 100755
index 654f204..0000000
--- a/filters/csv
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use warnings;
-
-my $FILTERTYPE = 'csv';
-
-my $SEP = ',';
-my $NL = "\n";
-
-if ($#ARGV < 1) {
- die "Filter failed! Please report bug.\n";
-}
-
-my $filename = $ARGV[0];
-my $fileType = $ARGV[1];
-my $infile = $filename;
-
-open INFILE,"< $filename";
-$filename =~ s/\.tmp/\.$FILTERTYPE/;
-open OUTFILE,"> $filename";
-
-if ($fileType eq 'topology') {
- my $region = 'topo';
- print OUTFILE 'THREADS'.$NL;
-
- while (<INFILE>) {
-
- if (/Cache Topology/) {
- $region = 'cache';
- print OUTFILE 'CACHES'.$NL;
- } elsif (/NUMA Topology/) {
- $region = 'numa';
- print OUTFILE 'NUMA'.$NL;
- }
-
- if ($region eq 'topo') {
- if (/(CPU type):\t(.*)/) {
- print OUTFILE $1.$SEP.$2.$NL;
- }
- elsif (/([A-Za-z ]*):\t([0-9]*)/) {
- print OUTFILE $1.$SEP.$2.$NL;
- } elsif (/(HWThread)\t(Thread)\t\t(Core)\t\t(Socket)/) {
- print OUTFILE $1.$SEP.$2.$SEP.$3.$SEP.$4.$NL;
- } elsif (/([0-9]*)\t\t([0-9]*)\t\t([0-9]*)\t\t([0-9]*)/) {
- print OUTFILE $1.$SEP.$2.$SEP.$3.$SEP.$4.$NL;
- }
- } elsif ($region eq 'cache') {
- if (/(Size):\t([0-9]*) ([kMB]*)/) {
- my $size = $2;
- if ($3 eq 'MB') {
- $size *= 1024;
- }
- print OUTFILE $1.'[kB]'.$SEP.$size.$NL;
- } elsif (/(Cache groups):\t*(.*)/) {
- my @groups = split('\) \(',$2);
-
- my $grpId = 0;
- foreach (@groups) {
- /([0-9 ]+)/;
- print OUTFILE 'Cache group '.$grpId.$SEP.$1.$NL;
- $grpId++;
- }
- } elsif (/(.*):\t*(.*)/) {
- print OUTFILE $1.$SEP.$2.$NL;
- }
- } elsif ($region eq 'numa') {
- if (/Domain ([0-9]*)/) {
- print OUTFILE 'Domain ID'.$SEP.$1.$NL;
- } elsif (/Memory: ([0-9.]+) MB free of total ([0-9.]+) MB/) {
- print OUTFILE 'Free Memory [MB]'.$SEP.$1.$NL;
- print OUTFILE 'Total Memory [MB]'.$SEP.$2.$NL;
- } elsif (/(.*):\t*[ ]*(.*)/) {
- print OUTFILE $1.$SEP.$2.$NL;
- }
- }
- }
-} elsif ($fileType eq 'perfctr') {
- my $header = 0;
- while (<INFILE>) {
- if (/Event[ ]*\|[ ]*(core.*)\|/) {
- if (not $header) {
- my @col = split('\|',$1);
- my $numcol = $#col+1;
- print OUTFILE 'NumColumns'.$SEP.$numcol.$NL;
- print OUTFILE 'Event/Metric';
- foreach (@col) {
- s/[ ]//g;
- print OUTFILE $SEP.$_;
- }
- print OUTFILE $NL;
- $header = 1;
- }
- }elsif (/STAT/) {
-
- }elsif (/\|[ ]+([A-Z0-9_]+)[ ]+\|[ ]*(.*)\|/) {
- my @col = split('\|',$2);
- print OUTFILE $1;
- foreach (@col) {
- s/[ ]//g;
- print OUTFILE $SEP.$_;
- }
- print OUTFILE $NL;
- }
- }
-} else {
- die "Filter failed! Unknown application type $fileType!\n";
-}
-
-unlink($infile);
-close INFILE;
-close OUTFILE;
-
-
diff --git a/filters/xml b/filters/xml
index b72c430..fa24a9d 100755
--- a/filters/xml
+++ b/filters/xml
@@ -15,62 +15,91 @@ my $filename = $ARGV[0];
my $fileType = $ARGV[1];
my $infile = $filename;
-open INFILE,"< $filename";
+if (! -e $filename)
+{
+ die "Input file does not exist!\n";
+}
+
+open INFILE,"<$filename";
$filename =~ s/\.tmp/\.$FILTERTYPE/;
-open OUTFILE,"> $filename";
+open OUTFILE,">$filename";
if ($fileType eq 'topology') {
my $region = 'topo';
my $indomain = 0;
print OUTFILE '<node>'.$NL;
+ print OUTFILE '<info>'.$NL;
while (<INFILE>) {
-
- if (/Cache Topology/) {
+ if (/STRUCT,Cache Topology L1/) {
$region = 'cache';
print OUTFILE '<caches>'.$NL;
- } elsif (/NUMA Topology/) {
+ } elsif (/STRUCT,NUMA Topology/) {
print OUTFILE '</caches>'.$NL;
print OUTFILE '<numa>'.$NL;
$region = 'numa';
}
if ($region eq 'topo') {
- if (/(CPU type):\t([\w ]*)/) {
+ if (/(CPU type):,([\w ]*),/) {
print OUTFILE '<cpu>'.$2.'</cpu>'.$NL;
- } elsif (/CPU clock:\t([\d.]) GHz/) {
+ } elsif (/CPU name:,([^,]+),/) {
+ print OUTFILE '<name>'.$1.'</name>'.$NL;
+ } elsif (/CPU stepping:,(\d+),/) {
+ print OUTFILE '<stepping>'.$1.'</stepping>'.$NL;
+ } elsif (/CPU clock:,([\d.]+) GHz/) {
print OUTFILE '<clock>'.$1.'</clock>'.$NL;
- } elsif (/(Sockets):\t(\d*)/) {
+ } elsif (/(Sockets):,(\d+),/) {
print OUTFILE '<socketsPerNode>'.$2.'</socketsPerNode>'.$NL;
- } elsif (/(Cores per socket):\t(\d*)/) {
+ } elsif (/(Cores per socket):,(\d+),/) {
print OUTFILE '<coresPerSocket>'.$2.'</coresPerSocket>'.$NL;
- } elsif (/(Threads per core):\t(\d*)/) {
+ } elsif (/(Threads per core):,(\d+),/) {
print OUTFILE '<threadsPerCore>'.$2.'</threadsPerCore>'.$NL;
- } elsif (/([0-9]*)\t\t([0-9]*)\t\t([0-9]*)\t\t([0-9]*)/) {
+ } elsif (/HWThread,Thread,Core,Socket,Available/) {
+ print OUTFILE '</info>'.$NL;
+ print OUTFILE '<threads>'.$NL;
+ } elsif (/(\d+),(\d+),(\d+),(\d+),/) {
#TODO Build tree for XML output from table!
+ print OUTFILE '<thread>'.$NL;
+ print OUTFILE '<id>'.$1.'</id>'.$NL;
+ print OUTFILE '<threadid>'.$2.'</threadid>'.$NL;
+ print OUTFILE '<coreid>'.$3.'</coreid>'.$NL;
+ print OUTFILE '<socketid>'.$4.'</socketid>'.$NL;
+ print OUTFILE '</thread>'.$NL;
+ } elsif (/STRUCT,Sockets,/) {
+ print OUTFILE '</threads>'.$NL;
+ $region = 'cache';
}
} elsif ($region eq 'cache') {
- if (/(Size):\t([0-9]*) ([kMB]*)/) {
+ if (/(Size):,(\d+) ([kMB]*)/) {
my $size = $2;
if ($3 eq 'MB') {
$size *= 1024;
}
print OUTFILE '<size>'.$size.'</size>'.$NL;
- } elsif (/(Cache groups):\t*(.*)/) {
+ } elsif (/(Cache groups):,([\d ]+),/) {
print OUTFILE '</cache>'.$NL;
- } elsif (/(Associativity):\t*(.*)/) {
+ } elsif (/Type:,(\w+) cache,/) {
+ print OUTFILE '<type>'.lc $1.'</type>'.$NL;
+ } elsif (/(Associativity):,(\d+)/) {
print OUTFILE '<associativity>'.$2.'</associativity>'.$NL;
- } elsif (/(Number of sets):\t*(.*)/) {
+ } elsif (/(Number of sets):,(\d+)/) {
print OUTFILE '<sets>'.$2.'</sets>'.$NL;
- } elsif (/(Cache line size):\t*(.*)/) {
+ } elsif (/(Cache line size):,(\d+)/) {
print OUTFILE '<linesize>'.$2.'</linesize>'.$NL;
- } elsif (/(Level):\t*(.*)/) {
+ } elsif (/Shared by threads:,(\d+),/) {
+ print OUTFILE '<sharedby>'.$1.'</sharedby>'.$NL;
+ } elsif (/Cache type:,Inclusive/) {
+ print OUTFILE '<inclusive>true</inclusive>'.$NL;
+ } elsif (/Cache type:,Non Inclusive/) {
+ print OUTFILE '<inclusive>false</inclusive>'.$NL;
+ } elsif (/(Level):,(\d+)/) {
print OUTFILE '<cache>'.$NL;
print OUTFILE '<level>'.$2.'</level>'.$NL;
}
} elsif ($region eq 'numa') {
- if (/Domain ([0-9]*)/) {
+ if (/Domain:,(\d+),/) {
if ($indomain )
{
print OUTFILE '</domain>'.$NL;
@@ -78,10 +107,11 @@ if ($fileType eq 'topology') {
print OUTFILE '<domain>'.$NL;
print OUTFILE '<id>'.$1.'</id>'.$NL;
$indomain = 1
- } elsif (/Memory: ([0-9.]+) MB free of total ([0-9.]+) MB/) {
+ } elsif (/Free memory:,([\d.]+) MB,/) {
print OUTFILE '<freememory>'.$1.'</freememory>'.$NL;
- print OUTFILE '<totalmemory>'.$2.'</totalmemory>'.$NL;
- } elsif (/Processors:[ ]+([0-9. ]+)/) {
+ } elsif (/Total memory:,([\d.]+) MB,/) {
+ print OUTFILE '<totalmemory>'.$1.'</totalmemory>'.$NL;
+ } elsif (/Processors:,([\d, ]+)/) {
print OUTFILE '<processors>'.$1.'</processors>'.$NL;
}
}
@@ -96,41 +126,105 @@ if ($fileType eq 'topology') {
} elsif ($fileType eq 'perfctr') {
my $header = 0;
my @col;
+ my @cpus;
+ my $region = 'info';
+ my $group = "1";
print OUTFILE '<perfctr>'.$NL;
while (<INFILE>) {
- if (/Event[ ]*\|[ ]*(core.*)\|/) {
- if (not $header) {
- @col = split('\|',$1);
- foreach (@col) {
- s/core //g;
- s/[ ]//g;
+ if (/TABLE,Info/) {
+ $region = 'info';
+ print OUTFILE '<info>'.$NL;
+ } elsif (/TABLE,Group (\d+) Raw/) {
+ $group = $1;
+ if (/Stat/) {
+ $region = '';
+ } else {
+ if ($region eq 'info') {
+ print OUTFILE '</info>'.$NL;
+ }
+ $region = 'raw';
- $header = 1;
+ print OUTFILE '<group'.$group.'>'.$NL;
+ print OUTFILE '<rawvalues>'.$NL;
}
- }elsif (/STAT/) {
-
- }elsif (/\|[ ]+([A-Z0-9_]+)[ ]+\|[ ]*(.*)\|/) {
- my @rescol = split('\|',$2);
- my $id = 0;
- print OUTFILE '<result>'.$NL;
- print OUTFILE '<event>'.$1.'</event>'.$NL;
- foreach (@rescol) {
- s/[ ]//g;
- print OUTFILE '<core>'.$NL;
- print OUTFILE '<id>'.$col[$id].'</id>'.$NL;
- print OUTFILE '<value>'.$_.'</value>'.$NL;
- print OUTFILE '</core>'.$NL;
- $id++;
+ } elsif (/TABLE,Group (\d+) Metric/) {
+ $group = $1;
+ if (/Stat/) {
+ if ($region eq 'metric')
+ {
+ print OUTFILE '</metrics>'.$NL;
+ print OUTFILE '</group'.$group.'>'.$NL;
+ }
+ $region = '';
+ } else {
+ $region = 'metric';
+ print OUTFILE '</rawvalues>'.$NL;
+ print OUTFILE '<metrics>'.$NL;
}
- print OUTFILE '</result>'.$NL;
- }
+ }
+ if ($region eq 'info') {
+ if (/(CPU type):,([\w ]*),/) {
+ print OUTFILE '<cpu>'.$2.'</cpu>'.$NL;
+ } elsif (/CPU name:,([^,]+),/) {
+ print OUTFILE '<name>'.$1.'</name>'.$NL;
+ } elsif (/CPU clock:,([\d.]+) GHz/) {
+ print OUTFILE '<clock>'.$1.'</clock>'.$NL;
+ }
+ } elsif ($region eq 'raw') {
+ if (/Event,Counter,(.*)/) {
+ if (not $header) {
+ @cpus = split(',',$1);
+ foreach (@cpus) {
+ s/Core //g;
+ s/[ ]//g;
+ }
+ $header = 1;
+ }
+ } elsif (!/TABLE/) {
+ @col = split(',',$_);
+ print OUTFILE '<event>'.$NL;
+ print OUTFILE '<name>'.$col[0].'</name>'.$NL;
+ print OUTFILE '<counter>'.$col[1].'</counter>'.$NL;
+
+
+ for (my $i=0; $i<@cpus; $i++) {
+
+ print OUTFILE '<cpu'.$cpus[$i].'>'.$col[2+$i].'</cpu'.$cpus[$i].'>'.$NL;
+ }
+ print OUTFILE '</event>'.$NL;
+ }
+ } elsif ($region eq 'metric') {
+ if ((!/Metric,Core/) and (!/TABLE/)) {
+ @col = split(',',$_);
+ print OUTFILE '<metric>'.$NL;
+ my $name = "";
+ my $unit = "";
+ if ($col[0] =~ /\[.*\]/) {
+ $col[0] =~ m/(.*)\s\[(.*)\]/;
+ $name = $1;
+ $unit = $2
+ } else {
+ $name = $col[0]
+ }
+ print OUTFILE '<name>'.$name.'</name>'.$NL;
+ if ($unit ne "")
+ {
+ print OUTFILE '<unit>'.$unit.'</unit>'.$NL;
+ }
+ for (my $i=0; $i<@cpus; $i++) {
+ print OUTFILE '<cpu'.$cpus[$i].'>'.$col[1+$i].'</cpu'.$cpus[$i].'>'.$NL;
+ }
+ print OUTFILE '</metric>'.$NL;
+ }
+ } elsif (/STAT/) {
+
+ }
}
print OUTFILE '</perfctr>'.$NL;
} else {
die "Filter failed! Unknown application type $fileType!\n";
}
-#unlink($infile);
+unlink($infile);
close INFILE;
close OUTFILE;
diff --git a/groups/atom/BRANCH.txt b/groups/atom/BRANCH.txt
index 51d2ddd..4213114 100644
--- a/groups/atom/BRANCH.txt
+++ b/groups/atom/BRANCH.txt
@@ -3,11 +3,14 @@ SHORT Branch prediction miss rate/ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
PMC0 BR_INST_RETIRED_ANY
PMC1 BR_INST_RETIRED_MISPRED
METRICS
-Runtime [s] FIXC1*inverseClock
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Branch rate PMC0/FIXC0
Branch misprediction rate PMC1/FIXC0
@@ -15,5 +18,14 @@ Branch misprediction ratio PMC1/PMC0
Instructions per branch FIXC0/PMC0
LONG
-Bla Bla
+Formulas:
+Branch rate = BR_INST_RETIRED_ANY/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_INST_RETIRED_MISPRED/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_INST_RETIRED_MISPRED/BR_INST_RETIRED_ANY
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ANY
+-
+The rates state how often, on average, a branch or a mispredicted branch occurred
+per retired instruction. The branch misprediction ratio directly relates the
+mispredicted branches to all branch instructions, i.e. which fraction of all
+branch instructions was mispredicted. Instructions per branch is 1/branch rate.
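To make the derived metrics concrete, here is a small standalone C sketch (not part of the group files) that applies the formulas above to made-up counter values:

    #include <stdio.h>

    int main(void)
    {
        /* Made-up raw counter values for illustration. */
        double instr_retired_any       = 1.0e9;  /* FIXC0 */
        double br_inst_retired_any     = 2.0e8;  /* PMC0  */
        double br_inst_retired_mispred = 4.0e6;  /* PMC1  */

        double branch_rate      = br_inst_retired_any     / instr_retired_any;
        double mispredict_rate  = br_inst_retired_mispred / instr_retired_any;
        double mispredict_ratio = br_inst_retired_mispred / br_inst_retired_any;
        double instr_per_branch = instr_retired_any       / br_inst_retired_any;

        printf("branch rate %.3f, misprediction rate %.5f, "
               "misprediction ratio %.3f, instructions per branch %.1f\n",
               branch_rate, mispredict_rate, mispredict_ratio, instr_per_branch);
        return 0;
    }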
diff --git a/groups/atom/DATA.txt b/groups/atom/DATA.txt
index 1c0f4ae..9349354 100644
--- a/groups/atom/DATA.txt
+++ b/groups/atom/DATA.txt
@@ -3,14 +3,20 @@ SHORT Load to store ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
PMC0 L1D_CACHE_LD
PMC1 L1D_CACHE_ST
METRICS
-Runtime [s] FIXC1*inverseClock
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
LONG
-Bla Bla
+Formulas:
+Load to store ratio = L1D_CACHE_LD/L1D_CACHE_ST
+-
+This is a simple metric to determine your load to store ratio.
diff --git a/groups/atom/FLOPS_DP.txt b/groups/atom/FLOPS_DP.txt
index 12905c6..8d966cc 100644
--- a/groups/atom/FLOPS_DP.txt
+++ b/groups/atom/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -9,9 +9,9 @@ PMC1 SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE
METRICS
Runtime [s] FIXC1*inverseClock
CPI FIXC1/FIXC0
-DP MFlops/s 1.0E-06*(PMC0*2.0+PMC1)/time
+DP MFLOP/s 1.0E-06*(PMC0*2.0+PMC1)/time
LONG
-Double Precision MFlops/s Double Precision MFlops/s
+Double Precision MFLOP/s Double Precision MFLOP/s
diff --git a/groups/atom/FLOPS_SP.txt b/groups/atom/FLOPS_SP.txt
index f064f38..49ca1f3 100644
--- a/groups/atom/FLOPS_SP.txt
+++ b/groups/atom/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -9,8 +9,8 @@ PMC1 SIMD_COMP_INST_RETIRED_SCALAR_SINGLE
METRICS
Runtime [s] FIXC1*inverseClock
CPI FIXC1/FIXC0
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+SP MFLOP/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
LONG
-Single Precision MFlops/s Double Precision MFlops/s
+Single Precision MFLOP/s Single Precision MFLOP/s
diff --git a/groups/atom/FLOPS_X87.txt b/groups/atom/FLOPS_X87.txt
index ad14a4d..57d2d81 100644
--- a/groups/atom/FLOPS_X87.txt
+++ b/groups/atom/FLOPS_X87.txt
@@ -1,4 +1,4 @@
-SHORT X87 MFlops/s
+SHORT X87 MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -8,8 +8,8 @@ PMC0 X87_COMP_OPS_EXE_ANY_AR
METRICS
Runtime [s] FIXC1*inverseClock
CPI FIXC1/FIXC0
-X87 MFlops/s 1.0E-06*PMC0/time
+X87 MFLOP/s 1.0E-06*PMC0/time
LONG
-X87 MFlops/s
+X87 MFLOP/s
diff --git a/groups/atom/MEM.txt b/groups/atom/MEM.txt
index faf9a0a..db580e5 100644
--- a/groups/atom/MEM.txt
+++ b/groups/atom/MEM.txt
@@ -3,13 +3,21 @@ SHORT Main memory bandwidth in MBytes/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
PMC0 BUS_TRANS_MEM_THIS_CORE_THIS_A
METRICS
-Runtime [s] FIXC1*inverseClock
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+Memory data volume [GBytes] 1.0E-09*PMC0*64.0
LONG
-Bla Bla
+Formulas:
+Memory bandwidth [MBytes/s] = 1.0E-06*BUS_TRANS_MEM_THIS_CORE_THIS_A*64/time
+Memory data volume [GBytes] = 1.0E-09*BUS_TRANS_MEM_THIS_CORE_THIS_A*64.0
+-
+Profiling group to measure memory bandwidth drawn by this core.
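The bandwidth metric simply scales the counted bus transactions by the 64-byte cache line size; a short C sketch with invented numbers:

    #include <stdio.h>

    int main(void)
    {
        /* Invented values for illustration. */
        double bus_trans_mem = 5.0e7;  /* BUS_TRANS_MEM_THIS_CORE_THIS_A */
        double time_s        = 2.0;    /* measured runtime in seconds    */

        double bw_mbytes_s   = 1.0e-6 * bus_trans_mem * 64.0 / time_s;
        double volume_gbytes = 1.0e-9 * bus_trans_mem * 64.0;

        printf("memory bandwidth %.1f MBytes/s, data volume %.3f GBytes\n",
               bw_mbytes_s, volume_gbytes);
        return 0;
    }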
diff --git a/groups/atom/TLB.txt b/groups/atom/TLB.txt
index d36b413..4952e6c 100644
--- a/groups/atom/TLB.txt
+++ b/groups/atom/TLB.txt
@@ -8,8 +8,9 @@ PMC0 DATA_TLB_MISSES_DTLB_MISS
METRICS
Runtime [s] FIXC1*inverseClock
CPI FIXC1/FIXC0
+DTLB misses PMC0
DTLB miss rate PMC0/FIXC0
LONG
-Bla Bla
+The DTLB miss rate gives a measure of how often a TLB miss occurred per retired instruction.
diff --git a/groups/broadwell/BRANCH.txt b/groups/broadwell/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/groups/broadwell/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often, on average, a branch or a mispredicted branch occurred
+per retired instruction. The branch misprediction ratio directly relates the
+mispredicted branches to all branch instructions, i.e. which fraction of all
+branch instructions was mispredicted. Instructions per branch is 1/branch rate.
+
diff --git a/groups/broadwell/CLOCK.txt b/groups/broadwell/CLOCK.txt
new file mode 100644
index 0000000..595d3a1
--- /dev/null
+++ b/groups/broadwell/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+-
+Broadwell implements the new RAPL interface. This interface enables
+monitoring of the consumed energy on the package (socket) level.
+
diff --git a/groups/broadwell/DATA.txt b/groups/broadwell/DATA.txt
new file mode 100644
index 0000000..967cbad
--- /dev/null
+++ b/groups/broadwell/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_UOPS_RETIRED_LOADS
+PMC1 MEM_UOPS_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/broadwell/ENERGY.txt b/groups/broadwell/ENERGY.txt
new file mode 100644
index 0000000..ae1756f
--- /dev/null
+++ b/groups/broadwell/ENERGY.txt
@@ -0,0 +1,39 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0 TEMP_CORE
+PWR0 PWR_PKG_ENERGY
+PWR1 PWR_PP0_ENERGY
+PWR2 PWR_PP1_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Temperature [C] TMP0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy PP0 [J] PWR1
+Power PP0 [W] PWR1/time
+Energy PP1 [J] PWR2
+Power PP1 [W] PWR2/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+Broadwell implements the new RAPL interface. This interface enables
+monitoring of the consumed energy on the package (socket) and DRAM level.
+
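Since RAPL reports energy, the power metrics are just energy divided by the measured runtime; a tiny C sketch with assumed readings:

    #include <stdio.h>

    int main(void)
    {
        /* Assumed RAPL readings for illustration. */
        double pkg_energy_j  = 120.0;  /* PWR_PKG_ENERGY  over the measurement */
        double dram_energy_j =  18.5;  /* PWR_DRAM_ENERGY over the measurement */
        double time_s        =   2.0;

        printf("package power %.1f W, DRAM power %.1f W\n",
               pkg_energy_j / time_s, dram_energy_j / time_s);
        return 0;
    }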
diff --git a/groups/broadwell/FALSE_SHARE.txt b/groups/broadwell/FALSE_SHARE.txt
new file mode 100644
index 0000000..bb26898
--- /dev/null
+++ b/groups/broadwell/FALSE_SHARE.txt
@@ -0,0 +1,25 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM
+PMC2 MEM_UOPS_RETIRED_LOADS_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_UOPS_RETIRED_LOADS_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory load UOPs as reference.
diff --git a/groups/broadwell/FLOPS_AVX.txt b/groups/broadwell/FLOPS_AVX.txt
new file mode 100644
index 0000000..eb047fa
--- /dev/null
+++ b/groups/broadwell/FLOPS_AVX.txt
@@ -0,0 +1,24 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Packed SP MFLOP/s 1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s 1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+Packed SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+-
+FLOP rates of 256 bit packed floating-point instructions
+
diff --git a/groups/broadwell/FLOPS_DP.txt b/groups/broadwell/FLOPS_DP.txt
new file mode 100644
index 0000000..60b5d5a
--- /dev/null
+++ b/groups/broadwell/FLOPS_DP.txt
@@ -0,0 +1,29 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+-
+AVX/SSE scalar and packed double precision FLOP rates.
+
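To spell out the weighting in the MFLOP/s formula: each retired 128-bit packed double-precision instruction contributes 2 FLOPs, each 256-bit packed instruction 4, and each scalar instruction 1. A C sketch with invented counter values:

    #include <stdio.h>

    int main(void)
    {
        /* Invented counter values for illustration. */
        double packed128 = 1.0e8;  /* FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE */
        double scalar    = 5.0e7;  /* FP_ARITH_INST_RETIRED_SCALAR_DOUBLE      */
        double packed256 = 2.0e8;  /* FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE */
        double time_s    = 1.5;

        double mflops     = 1.0e-6 * (packed128 * 2.0 + scalar + packed256 * 4.0) / time_s;
        double avx_mflops = 1.0e-6 * (packed256 * 4.0) / time_s;

        printf("DP MFLOP/s %.1f (AVX part %.1f)\n", mflops, avx_mflops);
        return 0;
    }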
diff --git a/groups/broadwell/FLOPS_SP.txt b/groups/broadwell/FLOPS_SP.txt
new file mode 100644
index 0000000..2818d94
--- /dev/null
+++ b/groups/broadwell/FLOPS_SP.txt
@@ -0,0 +1,29 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+-
+AVX/SSE scalar and packed single precision FLOP rates.
+
diff --git a/groups/broadwell/ICACHE.txt b/groups/broadwell/ICACHE.txt
new file mode 100644
index 0000000..5f11ad6
--- /dev/null
+++ b/groups/broadwell/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ICACHE_ACCESSES
+PMC1 ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/broadwell/L2.txt b/groups/broadwell/L2.txt
new file mode 100644
index 0000000..60c7f79
--- /dev/null
+++ b/groups/broadwell/L2.txt
@@ -0,0 +1,37 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L2_TRANS_L1D_WB
+PMC2 ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the
+number of cache lines loaded from the L2 into the L1 data cache and the writebacks
+from the L1 data cache to the L2 cache. The group also outputs the total data volume
+transferred between L2 and L1. Note that this bandwidth also includes data transfers
+due to a write-allocate load on a store miss in L1 and cache lines transferred into
+the instruction cache.
diff --git a/groups/broadwell/L2CACHE.txt b/groups/broadwell/L2CACHE.txt
new file mode 100644
index 0000000..9b5dd4b
--- /dev/null
+++ b/groups/broadwell/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_TRANS_ALL_REQUESTS
+PMC1 L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. The L2 request rate tells you how data intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get cache
+lines from a higher cache level or memory. Finally, the L2 miss ratio tells you
+how many of your memory references required a cache line to be loaded from a
+higher level. While the data cache miss rate might be dictated by your algorithm,
+you should try to keep the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/broadwell/L3.txt b/groups/broadwell/L3.txt
new file mode 100644
index 0000000..4026f85
--- /dev/null
+++ b/groups/broadwell/L3.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_LINES_IN_ALL
+PMC1 L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs the data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write-allocate load on a store miss in L2.
+
diff --git a/groups/broadwell/L3CACHE.txt b/groups/broadwell/L3CACHE.txt
new file mode 100644
index 0000000..f863daa
--- /dev/null
+++ b/groups/broadwell/L3CACHE.txt
@@ -0,0 +1,35 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2 UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. The L3 request rate tells you how data intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure of how often it was necessary to get cache
+lines from memory. Finally, the L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be dictated by your algorithm, you should
+try to keep the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/broadwell/RECOVERY.txt b/groups/broadwell/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/broadwell/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 INT_MISC_RECOVERY_CYCLES
+PMC1 INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exceptions, memory
+disambiguation, etc.
diff --git a/groups/broadwell/TLB_DATA.txt b/groups/broadwell/TLB_DATA.txt
new file mode 100644
index 0000000..8d94e05
--- /dev/null
+++ b/groups/broadwell/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2 DTLB_LOAD_MISSES_WALK_DURATION
+PMC3 DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB load misses PMC0
+L1 DTLB load miss rate PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses PMC1
+L1 DTLB store miss rate PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per retired instruction. The duration measures how long, in cycles, a page table walk took.
+
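The miss duration metric is the accumulated walk cycles divided by the number of walks; a brief C sketch with assumed counts:

    #include <stdio.h>

    int main(void)
    {
        /* Assumed counter values for illustration. */
        double instr         = 1.0e9;  /* INSTR_RETIRED_ANY              */
        double load_walks    = 2.0e5;  /* DTLB_LOAD_MISSES_CAUSES_A_WALK */
        double load_walk_cyc = 8.0e6;  /* DTLB_LOAD_MISSES_WALK_DURATION */

        printf("L1 DTLB load miss rate %.2e, avg walk duration %.1f cycles\n",
               load_walks / instr, load_walk_cyc / load_walks);
        return 0;
    }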
diff --git a/groups/broadwell/TLB_INSTR.txt b/groups/broadwell/TLB_INSTR.txt
new file mode 100644
index 0000000..235d977
--- /dev/null
+++ b/groups/broadwell/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ITLB_MISSES_CAUSES_A_WALK
+PMC1 ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per retired instruction. The duration measures how long, in cycles, a page table walk took.
+
diff --git a/groups/broadwellD/BRANCH.txt b/groups/broadwellD/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/groups/broadwellD/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often, on average, a branch or a mispredicted branch occurred
+per retired instruction. The branch misprediction ratio directly relates the
+mispredicted branches to all branch instructions, i.e. which fraction of all
+branch instructions was mispredicted. Instructions per branch is 1/branch rate.
+
diff --git a/groups/broadwellD/CACHES.txt b/groups/broadwellD/CACHES.txt
new file mode 100644
index 0000000..3c13a52
--- /dev/null
+++ b/groups/broadwellD/CACHES.txt
@@ -0,0 +1,123 @@
+SHORT Cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L1D_M_EVICT
+PMC2 L2_LINES_IN_ALL
+PMC3 L2_TRANS_L2_WB
+CBOX0C0 LLC_LOOKUP_DATA_READ
+CBOX1C0 LLC_LOOKUP_DATA_READ
+CBOX2C0 LLC_LOOKUP_DATA_READ
+CBOX3C0 LLC_LOOKUP_DATA_READ
+CBOX4C0 LLC_LOOKUP_DATA_READ
+CBOX5C0 LLC_LOOKUP_DATA_READ
+CBOX6C0 LLC_LOOKUP_DATA_READ
+CBOX7C0 LLC_LOOKUP_DATA_READ
+CBOX8C0 LLC_LOOKUP_DATA_READ
+CBOX9C0 LLC_LOOKUP_DATA_READ
+CBOX10C0 LLC_LOOKUP_DATA_READ
+CBOX11C0 LLC_LOOKUP_DATA_READ
+CBOX12C0 LLC_LOOKUP_DATA_READ
+CBOX13C0 LLC_LOOKUP_DATA_READ
+CBOX14C0 LLC_LOOKUP_DATA_READ
+CBOX15C0 LLC_LOOKUP_DATA_READ
+CBOX16C0 LLC_LOOKUP_DATA_READ
+CBOX17C0 LLC_LOOKUP_DATA_READ
+CBOX0C1 LLC_VICTIMS_M
+CBOX1C1 LLC_VICTIMS_M
+CBOX2C1 LLC_VICTIMS_M
+CBOX3C1 LLC_VICTIMS_M
+CBOX4C1 LLC_VICTIMS_M
+CBOX5C1 LLC_VICTIMS_M
+CBOX6C1 LLC_VICTIMS_M
+CBOX7C1 LLC_VICTIMS_M
+CBOX8C1 LLC_VICTIMS_M
+CBOX9C1 LLC_VICTIMS_M
+CBOX10C1 LLC_VICTIMS_M
+CBOX11C1 LLC_VICTIMS_M
+CBOX12C1 LLC_VICTIMS_M
+CBOX13C1 LLC_VICTIMS_M
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time
+L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0
+L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
+L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0
+L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0)*64.0/time
+System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0)*64.0
+L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64/time
+L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64
+L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64.0/time
+L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64.0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
+L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
+L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64
+L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
+L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
+L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
+L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time
+L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64
+L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time
+System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64
+L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M))*64/time
+L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M))*64
+L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64/time
+L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
+-
+Group to measure cache transfers between L1 and memory. Please note that the
+L3 to/from system metrics contain any traffic to the system (memory,
+Intel QPI, etc.) but do not seem to capture everything, because the memory read
+bandwidth and the L3 to L2 bandwidth are commonly higher than the system to L3 bandwidth.
+
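The SUM() shorthand in the formulas stands for adding the same event over all uncore boxes (CBOX/MBOX instances); a compact C sketch of that aggregation with fabricated per-channel counts:

    #include <stdio.h>

    /* Fabricated per-channel CAS_COUNT_RD values (one per memory channel/MBOX). */
    static const double cas_count_rd[8] = {
        1.1e7, 1.0e7, 0.9e7, 1.2e7, 1.0e7, 1.1e7, 0.8e7, 1.0e7
    };

    int main(void)
    {
        double time_s = 1.0, sum = 0.0;
        for (int i = 0; i < 8; i++)
            sum += cas_count_rd[i];

        /* Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time */
        printf("memory read bandwidth %.1f MBytes/s\n", 1.0e-6 * sum * 64.0 / time_s);
        return 0;
    }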
diff --git a/groups/broadwellD/CLOCK.txt b/groups/broadwellD/CLOCK.txt
new file mode 100644
index 0000000..595d3a1
--- /dev/null
+++ b/groups/broadwellD/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+-
+Broadwell implements the new RAPL interface. This interface enables
+monitoring of the consumed energy on the package (socket) level.
+
diff --git a/groups/broadwellD/DATA.txt b/groups/broadwellD/DATA.txt
new file mode 100644
index 0000000..967cbad
--- /dev/null
+++ b/groups/broadwellD/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_UOPS_RETIRED_LOADS
+PMC1 MEM_UOPS_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/broadwellD/ENERGY.txt b/groups/broadwellD/ENERGY.txt
new file mode 100644
index 0000000..ae1756f
--- /dev/null
+++ b/groups/broadwellD/ENERGY.txt
@@ -0,0 +1,39 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0 TEMP_CORE
+PWR0 PWR_PKG_ENERGY
+PWR1 PWR_PP0_ENERGY
+PWR2 PWR_PP1_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Temperature [C] TMP0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy PP0 [J] PWR1
+Power PP0 [W] PWR1/time
+Energy PP1 [J] PWR2
+Power PP1 [W] PWR2/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+Broadwell implements the new RAPL interface. This interface enables
+monitoring of the consumed energy on the package (socket) and DRAM level.
+
diff --git a/groups/broadwellD/FALSE_SHARE.txt b/groups/broadwellD/FALSE_SHARE.txt
new file mode 100644
index 0000000..a87f7d4
--- /dev/null
+++ b/groups/broadwellD/FALSE_SHARE.txt
@@ -0,0 +1,25 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM
+PMC2 MEM_LOAD_UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory loads as reference.
diff --git a/groups/broadwellD/FLOPS_AVX.txt b/groups/broadwellD/FLOPS_AVX.txt
new file mode 100644
index 0000000..eb047fa
--- /dev/null
+++ b/groups/broadwellD/FLOPS_AVX.txt
@@ -0,0 +1,24 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Packed SP MFLOP/s 1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s 1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+Packed SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+-
+FLOP rates of 256 bit packed floating-point instructions
+
diff --git a/groups/broadwellD/FLOPS_DP.txt b/groups/broadwellD/FLOPS_DP.txt
new file mode 100644
index 0000000..60b5d5a
--- /dev/null
+++ b/groups/broadwellD/FLOPS_DP.txt
@@ -0,0 +1,29 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+-
+AVX/SSE scalar and packed double precision FLOP rates.
+
diff --git a/groups/broadwellD/FLOPS_SP.txt b/groups/broadwellD/FLOPS_SP.txt
new file mode 100644
index 0000000..2818d94
--- /dev/null
+++ b/groups/broadwellD/FLOPS_SP.txt
@@ -0,0 +1,29 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+-
+AVX/SSE scalar and packed single precision FLOP rates.
+
diff --git a/groups/broadwellD/HA.txt b/groups/broadwellD/HA.txt
new file mode 100644
index 0000000..1e5a700
--- /dev/null
+++ b/groups/broadwellD/HA.txt
@@ -0,0 +1,40 @@
+SHORT Main memory bandwidth in MBytes/s seen from Home agent
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+BBOX0C0 IMC_READS_NORMAL
+BBOX0C1 BYPASS_IMC_TAKEN
+BBOX0C2 IMC_WRITES_ALL
+BBOX1C0 IMC_READS_NORMAL
+BBOX1C1 BYPASS_IMC_TAKEN
+BBOX1C2 IMC_WRITES_ALL
+
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(BBOX0C2+BBOX1C2)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(BBOX0C2+BBOX1C2)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_WRITES_ALL))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(IMC_WRITES_ALL))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0
+-
+This group derives the same metrics as the MEM group but uses the events of the
+Home Agent, a central unit that is responsible for the protocol side of memory
+interactions.
diff --git a/groups/broadwellD/ICACHE.txt b/groups/broadwellD/ICACHE.txt
new file mode 100644
index 0000000..5f11ad6
--- /dev/null
+++ b/groups/broadwellD/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ICACHE_ACCESSES
+PMC1 ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/broadwellD/L2.txt b/groups/broadwellD/L2.txt
new file mode 100644
index 0000000..60c7f79
--- /dev/null
+++ b/groups/broadwellD/L2.txt
@@ -0,0 +1,37 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L2_TRANS_L1D_WB
+PMC2 ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the
+number of cache lines loaded from the L2 into the L1 data cache and the writebacks
+from the L1 data cache to the L2 cache. The group also outputs the total data volume
+transferred between L2 and L1. Note that this bandwidth also includes data transfers
+due to a write-allocate load on a store miss in L1 and cache lines transferred into
+the instruction cache.
diff --git a/groups/broadwellD/L2CACHE.txt b/groups/broadwellD/L2CACHE.txt
new file mode 100644
index 0000000..9b5dd4b
--- /dev/null
+++ b/groups/broadwellD/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_TRANS_ALL_REQUESTS
+PMC1 L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. The L2 request rate tells you how data intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get cache
+lines from a higher cache level or memory. Finally, the L2 miss ratio tells you
+how many of your memory references required a cache line to be loaded from a
+higher level. While the data cache miss rate might be dictated by your algorithm,
+you should try to keep the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/broadwellD/L3.txt b/groups/broadwellD/L3.txt
new file mode 100644
index 0000000..4026f85
--- /dev/null
+++ b/groups/broadwellD/L3.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_LINES_IN_ALL
+PMC1 L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs the data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/broadwellD/L3CACHE.txt b/groups/broadwellD/L3CACHE.txt
new file mode 100644
index 0000000..f863daa
--- /dev/null
+++ b/groups/broadwellD/L3CACHE.txt
@@ -0,0 +1,35 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2 UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. The L3 request rate tells you how data intensive your code is,
+i.e. how many L3 data accesses occur on average per retired micro-op.
+The L3 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. Finally, the L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/broadwellD/MEM.txt b/groups/broadwellD/MEM.txt
new file mode 100644
index 0000000..2a17a2c
--- /dev/null
+++ b/groups/broadwellD/MEM.txt
@@ -0,0 +1,52 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+-
+Profiling group to measure the memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events, it can only measure on a
+per-socket basis. Some of the counters may not be available on your system.
+The group also outputs the total data volume transferred from main memory.
+The same metrics are provided by the HA group.
+
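
The SUM(MBOXxC0) / SUM(MBOXxC1) shorthand in the formulas above means summing the
CAS counts over all memory channels (MBOX units). A short Python sketch of the read
side with hypothetical per-channel counts:

    # Invented CAS_COUNT_RD values for MBOX0..MBOX7 (unpopulated channels count 0).
    cas_count_rd = [5.1e8, 5.0e8, 4.9e8, 5.2e8, 0, 0, 0, 0]
    time = 2.0  # runtime in seconds

    read_bw_MBs = 1.0E-06 * sum(cas_count_rd) * 64.0 / time
    read_vol_GB = 1.0E-09 * sum(cas_count_rd) * 64.0
    print(read_bw_MBs, read_vol_GB)
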
diff --git a/groups/broadwellD/MEM_DP.txt b/groups/broadwellD/MEM_DP.txt
new file mode 100644
index 0000000..bfea358
--- /dev/null
+++ b/groups/broadwellD/MEM_DP.txt
@@ -0,0 +1,66 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure the memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events, it can only measure on
+a per-socket basis. The group also outputs the total data volume transferred from
+main memory together with SSE scalar and packed double-precision FLOP rates, and
+reports the packed 256-bit AVX FLOP rate.
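
The FLOP weighting in the MFLOP/s formula above counts 2 operations per 128-bit
packed, 4 per 256-bit packed and 1 per scalar double-precision instruction. A small
Python sketch with invented counter values:

    # Hypothetical retired instruction counts (values are invented).
    packed_128b_dp = 1.0e9   # FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
    scalar_dp      = 2.0e8   # FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
    packed_256b_dp = 3.0e9   # FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
    time = 1.0               # runtime in seconds

    mflops     = 1.0E-06 * (packed_128b_dp * 2.0 + scalar_dp + packed_256b_dp * 4.0) / time
    avx_mflops = 1.0E-06 * (packed_256b_dp * 4.0) / time
    print(mflops, avx_mflops)
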
diff --git a/groups/broadwellD/MEM_SP.txt b/groups/broadwellD/MEM_SP.txt
new file mode 100644
index 0000000..e7d4642
--- /dev/null
+++ b/groups/broadwellD/MEM_SP.txt
@@ -0,0 +1,68 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formula:
+Power [W] = PWR_PKG_ENERGY/runtime
+Power DRAM [W] = PWR_DRAM_ENERGY/runtime
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure the memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events, it can only measure on
+a per-socket basis. The group also outputs the total data volume transferred from
+main memory together with SSE scalar and packed single-precision FLOP rates, and
+reports the packed 256-bit AVX FLOP rate.
diff --git a/groups/broadwellD/RECOVERY.txt b/groups/broadwellD/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/broadwellD/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 INT_MISC_RECOVERY_CYCLES
+PMC1 INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exceptions, memory
+disambiguation, etc.
diff --git a/groups/broadwellD/TLB_DATA.txt b/groups/broadwellD/TLB_DATA.txt
new file mode 100644
index 0000000..8d94e05
--- /dev/null
+++ b/groups/broadwellD/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2 DTLB_LOAD_MISSES_WALK_DURATION
+PMC3 DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB load misses PMC0
+L1 DTLB load miss rate PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses PMC1
+L1 DTLB store miss rate PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how long, in cycles, a page table walk took.
+
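
The miss duration metric above is simply the accumulated walk cycles divided by the
number of walks. A minimal sketch with invented counts:

    # Hypothetical counts (values are invented).
    dtlb_load_walks       = 1_200_000    # DTLB_LOAD_MISSES_CAUSES_A_WALK
    dtlb_load_walk_cycles = 36_000_000   # DTLB_LOAD_MISSES_WALK_DURATION
    instr_retired         = 4.0e9        # INSTR_RETIRED_ANY

    load_miss_rate     = dtlb_load_walks / instr_retired          # misses per instruction
    load_miss_duration = dtlb_load_walk_cycles / dtlb_load_walks  # cycles per walk
    print(load_miss_rate, load_miss_duration)
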
diff --git a/groups/broadwellD/TLB_INSTR.txt b/groups/broadwellD/TLB_INSTR.txt
new file mode 100644
index 0000000..235d977
--- /dev/null
+++ b/groups/broadwellD/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ITLB_MISSES_CAUSES_A_WALK
+PMC1 ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long, in cycles, a page table walk took.
+
diff --git a/groups/broadwellEP/BRANCH.txt b/groups/broadwellEP/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/groups/broadwellEP/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often, on average, a branch or a mispredicted branch occurred
+per retired instruction. The branch misprediction ratio directly states what
+fraction of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/groups/broadwellEP/CACHES.txt b/groups/broadwellEP/CACHES.txt
new file mode 100644
index 0000000..3c13a52
--- /dev/null
+++ b/groups/broadwellEP/CACHES.txt
@@ -0,0 +1,123 @@
+SHORT Cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L1D_M_EVICT
+PMC2 L2_LINES_IN_ALL
+PMC3 L2_TRANS_L2_WB
+CBOX0C0 LLC_LOOKUP_DATA_READ
+CBOX1C0 LLC_LOOKUP_DATA_READ
+CBOX2C0 LLC_LOOKUP_DATA_READ
+CBOX3C0 LLC_LOOKUP_DATA_READ
+CBOX4C0 LLC_LOOKUP_DATA_READ
+CBOX5C0 LLC_LOOKUP_DATA_READ
+CBOX6C0 LLC_LOOKUP_DATA_READ
+CBOX7C0 LLC_LOOKUP_DATA_READ
+CBOX8C0 LLC_LOOKUP_DATA_READ
+CBOX9C0 LLC_LOOKUP_DATA_READ
+CBOX10C0 LLC_LOOKUP_DATA_READ
+CBOX11C0 LLC_LOOKUP_DATA_READ
+CBOX12C0 LLC_LOOKUP_DATA_READ
+CBOX13C0 LLC_LOOKUP_DATA_READ
+CBOX14C0 LLC_LOOKUP_DATA_READ
+CBOX15C0 LLC_LOOKUP_DATA_READ
+CBOX16C0 LLC_LOOKUP_DATA_READ
+CBOX17C0 LLC_LOOKUP_DATA_READ
+CBOX0C1 LLC_VICTIMS_M
+CBOX1C1 LLC_VICTIMS_M
+CBOX2C1 LLC_VICTIMS_M
+CBOX3C1 LLC_VICTIMS_M
+CBOX4C1 LLC_VICTIMS_M
+CBOX5C1 LLC_VICTIMS_M
+CBOX6C1 LLC_VICTIMS_M
+CBOX7C1 LLC_VICTIMS_M
+CBOX8C1 LLC_VICTIMS_M
+CBOX9C1 LLC_VICTIMS_M
+CBOX10C1 LLC_VICTIMS_M
+CBOX11C1 LLC_VICTIMS_M
+CBOX12C1 LLC_VICTIMS_M
+CBOX13C1 LLC_VICTIMS_M
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time
+L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0
+L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
+L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0
+L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0)*64.0/time
+System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0)*64.0
+L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64/time
+L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64
+L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64.0/time
+L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64.0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
+L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
+L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64
+L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
+L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
+L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
+L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time
+L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64
+L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time
+System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64
+L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M))*64/time
+L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M))*64
+L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64/time
+L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
+-
+Group to measure cache transfers between L1 and memory. Please note that the
+L3 to/from system metrics contain any traffic to the system (memory,
+Intel QPI, etc.) but do not seem to capture all of it, because the memory read
+bandwidth and the L3 to L2 bandwidth are commonly higher than the system to L3 bandwidth.
+
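
The SUM(...) shorthand in the CACHES formulas above means adding a counter over all
CBOX or MBOX units. As an illustration, the system-to-L3 traffic could be evaluated
like this (the per-box values are invented):

    # Hypothetical per-CBOX counts for CBOX0..CBOX13 (values are invented).
    llc_lookup_data_read = [3.0e7] * 14   # LLC_LOOKUP_DATA_READ
    llc_victims_m        = [1.0e7] * 14   # LLC_VICTIMS_M
    time = 1.0  # runtime in seconds

    sys_to_l3_bw = 1.0E-06 * sum(llc_lookup_data_read) * 64.0 / time
    l3_to_sys_bw = 1.0E-06 * sum(llc_victims_m) * 64.0 / time
    print(sys_to_l3_bw, l3_to_sys_bw, sys_to_l3_bw + l3_to_sys_bw)
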
diff --git a/groups/broadwellEP/CLOCK.txt b/groups/broadwellEP/CLOCK.txt
new file mode 100644
index 0000000..595d3a1
--- /dev/null
+++ b/groups/broadwellEP/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+-
+Broadwell implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy at the package (socket) level.
+
diff --git a/groups/broadwellEP/DATA.txt b/groups/broadwellEP/DATA.txt
new file mode 100644
index 0000000..967cbad
--- /dev/null
+++ b/groups/broadwellEP/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_UOPS_RETIRED_LOADS
+PMC1 MEM_UOPS_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/broadwellEP/ENERGY.txt b/groups/broadwellEP/ENERGY.txt
new file mode 100644
index 0000000..28f0256
--- /dev/null
+++ b/groups/broadwellEP/ENERGY.txt
@@ -0,0 +1,35 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0 TEMP_CORE
+PWR0 PWR_PKG_ENERGY
+PWR1 PWR_PP0_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Temperature [C] TMP0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy PP0 [J] PWR1
+Power PP0 [W] PWR1/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+Broadwell implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy at the package (socket) and DRAM level.
+
diff --git a/groups/broadwellEP/FALSE_SHARE.txt b/groups/broadwellEP/FALSE_SHARE.txt
new file mode 100644
index 0000000..9f8a30e
--- /dev/null
+++ b/groups/broadwellEP/FALSE_SHARE.txt
@@ -0,0 +1,29 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM
+PMC2 MEM_UOPS_RETIRED_LOADS_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+Remote LLC false sharing [MByte] 1.E-06*PMC1*64
+Remote LLC false sharing rate PMC1/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_UOPS_RETIRED_LOADS_ALL
+Remote LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM*64
+Remote LLC false sharing rate = MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM/MEM_UOPS_RETIRED_LOADS_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all retired memory load UOPs as the reference.
diff --git a/groups/broadwellEP/FLOPS_AVX.txt b/groups/broadwellEP/FLOPS_AVX.txt
new file mode 100644
index 0000000..eb047fa
--- /dev/null
+++ b/groups/broadwellEP/FLOPS_AVX.txt
@@ -0,0 +1,24 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Packed SP MFLOP/s 1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s 1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+Packed SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+-
+FLOP rates of 256-bit packed floating-point instructions.
+
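
The factors 8 and 4 in the formulas above come from the number of operations per
256-bit packed instruction (8 single-precision or 4 double-precision). A short
sketch with hypothetical counts:

    # Hypothetical retired 256-bit packed instruction counts (values are invented).
    packed_256b_sp = 2.0e9   # FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
    packed_256b_dp = 1.0e9   # FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
    time = 1.0               # runtime in seconds

    packed_sp_mflops = 1.0E-06 * (packed_256b_sp * 8.0) / time
    packed_dp_mflops = 1.0E-06 * (packed_256b_dp * 4.0) / time
    print(packed_sp_mflops, packed_dp_mflops)
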
diff --git a/groups/broadwellEP/FLOPS_DP.txt b/groups/broadwellEP/FLOPS_DP.txt
new file mode 100644
index 0000000..60b5d5a
--- /dev/null
+++ b/groups/broadwellEP/FLOPS_DP.txt
@@ -0,0 +1,29 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+-
+AVX/SSE scalar and packed double precision FLOP rates.
+
diff --git a/groups/broadwellEP/FLOPS_SP.txt b/groups/broadwellEP/FLOPS_SP.txt
new file mode 100644
index 0000000..2818d94
--- /dev/null
+++ b/groups/broadwellEP/FLOPS_SP.txt
@@ -0,0 +1,29 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+-
+AVX/SSE scalar and packed single precision FLOP rates.
+
diff --git a/groups/broadwellEP/HA.txt b/groups/broadwellEP/HA.txt
new file mode 100644
index 0000000..1e5a700
--- /dev/null
+++ b/groups/broadwellEP/HA.txt
@@ -0,0 +1,40 @@
+SHORT Main memory bandwidth in MBytes/s seen from Home agent
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+BBOX0C0 IMC_READS_NORMAL
+BBOX0C1 BYPASS_IMC_TAKEN
+BBOX0C2 IMC_WRITES_ALL
+BBOX1C0 IMC_READS_NORMAL
+BBOX1C1 BYPASS_IMC_TAKEN
+BBOX1C2 IMC_WRITES_ALL
+
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(BBOX0C2+BBOX1C2)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(BBOX0C2+BBOX1C2)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_WRITES_ALL))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(IMC_WRITES_ALL))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0
+-
+This group derives the same metrics as the MEM group but uses the events of the
+Home Agent, a central unit that is responsible for the protocol side of memory
+interactions.
diff --git a/groups/broadwellEP/ICACHE.txt b/groups/broadwellEP/ICACHE.txt
new file mode 100644
index 0000000..5f11ad6
--- /dev/null
+++ b/groups/broadwellEP/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ICACHE_ACCESSES
+PMC1 ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/broadwellEP/L2.txt b/groups/broadwellEP/L2.txt
new file mode 100644
index 0000000..60c7f79
--- /dev/null
+++ b/groups/broadwellEP/L2.txt
@@ -0,0 +1,37 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L2_TRANS_L1D_WB
+PMC2 ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the
+number of cache lines loaded from the L2 into the L1 data cache and the writebacks from
+the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and cache lines transferred into the instruction
+cache.
diff --git a/groups/broadwellEP/L2CACHE.txt b/groups/broadwellEP/L2CACHE.txt
new file mode 100644
index 0000000..9b5dd4b
--- /dev/null
+++ b/groups/broadwellEP/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_TRANS_ALL_REQUESTS
+PMC1 L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. The L2 request rate tells you how data intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. Finally, the L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/broadwellEP/L3.txt b/groups/broadwellEP/L3.txt
new file mode 100644
index 0000000..7d84636
--- /dev/null
+++ b/groups/broadwellEP/L3.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_LINES_IN_ALL
+PMC1 L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs the data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/broadwellEP/L3CACHE.txt b/groups/broadwellEP/L3CACHE.txt
new file mode 100644
index 0000000..f863daa
--- /dev/null
+++ b/groups/broadwellEP/L3CACHE.txt
@@ -0,0 +1,35 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2 UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. The L3 request rate tells you how data intensive your code is,
+i.e. how many L3 data accesses occur on average per retired micro-op.
+The L3 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. Finally, the L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/broadwellEP/MEM.txt b/groups/broadwellEP/MEM.txt
new file mode 100644
index 0000000..2a17a2c
--- /dev/null
+++ b/groups/broadwellEP/MEM.txt
@@ -0,0 +1,52 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+-
+Profiling group to measure the memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events, it can only measure on a
+per-socket basis. Some of the counters may not be available on your system.
+The group also outputs the total data volume transferred from main memory.
+The same metrics are provided by the HA group.
+
diff --git a/groups/broadwellEP/MEM_DP.txt b/groups/broadwellEP/MEM_DP.txt
new file mode 100644
index 0000000..bfea358
--- /dev/null
+++ b/groups/broadwellEP/MEM_DP.txt
@@ -0,0 +1,66 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure the memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events, it can only measure on
+a per-socket basis. The group also outputs the total data volume transferred from
+main memory together with SSE scalar and packed double-precision FLOP rates, and
+reports the packed 256-bit AVX FLOP rate.
diff --git a/groups/broadwellEP/MEM_SP.txt b/groups/broadwellEP/MEM_SP.txt
new file mode 100644
index 0000000..e7d4642
--- /dev/null
+++ b/groups/broadwellEP/MEM_SP.txt
@@ -0,0 +1,68 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formula:
+Power [W] = PWR_PKG_ENERGY/runtime
+Power DRAM [W] = PWR_DRAM_ENERGY/runtime
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure the memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events, it can only measure on
+a per-socket basis. The group also outputs the total data volume transferred from
+main memory together with SSE scalar and packed single-precision FLOP rates, and
+reports the packed 256-bit AVX FLOP rate.
diff --git a/groups/broadwellEP/NUMA.txt b/groups/broadwellEP/NUMA.txt
new file mode 100644
index 0000000..8fdd0f1
--- /dev/null
+++ b/groups/broadwellEP/NUMA.txt
@@ -0,0 +1,41 @@
+SHORT Local and remote data transfers
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+BBOX0C0 REQUESTS_READS_LOCAL
+BBOX1C0 REQUESTS_READS_LOCAL
+BBOX0C1 REQUESTS_READS_REMOTE
+BBOX1C1 REQUESTS_READS_REMOTE
+BBOX0C2 REQUESTS_WRITES_LOCAL
+BBOX1C2 REQUESTS_WRITES_LOCAL
+BBOX0C3 REQUESTS_WRITES_REMOTE
+BBOX1C3 REQUESTS_WRITES_REMOTE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local bandwidth [MByte/s] 1.E-06*((BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2)*64)/time
+Local data volume [GByte] 1.E-09*(BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2)*64
+Remote bandwidth [MByte/s] 1.E-06*((BBOX0C1+BBOX1C1+BBOX0C3+BBOX1C3)*64)/time
+Remote data volume [GByte] 1.E-09*(BBOX0C1+BBOX1C1+BBOX0C3+BBOX1C3)*64
+Total bandwidth [MByte/s] 1.E-06*((BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2+BBOX0C1+BBOX1C1+BBOX0C3+BBOX1C3)*64)/time
+Total data volume [GByte] 1.E-09*(BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2+BBOX0C1+BBOX1C1+BBOX0C3+BBOX1C3)*64
+
+
+LONG
+Formula:
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+Local bandwidth [MByte/s] = 1.E-06*((SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL))*64)/time
+Local data volume [GByte] = 1.E-09*(SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL))*64
+Remote bandwidth [MByte/s] = 1.E-06*((SUM(REQUESTS_READS_REMOTE)+SUM(REQUESTS_WRITES_REMOTE))*64)/time
+Remote data volume [GByte] = 1.E-09*(SUM(REQUESTS_READS_REMOTE)+SUM(REQUESTS_WRITES_REMOTE))*64
+Total bandwidth [MByte/s] = 1.E-06*((SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL)+SUM(REQUESTS_READS_REMOTE)+SUM(REQUESTS_WRITES_REMOTE))*64)/time
+Total data volume [GByte] = 1.E-09*(SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL)+SUM(REQUESTS_READS_REMOTE)+SUM(REQUESTS_WRITES_REMOTE))*64
+--
+This performance group measures the local and remote memory data traffic of the
+CPU sockets. It uses the Home Agent counters for the calculation, so the traffic
+may also include data from sources other than the memory controllers.
diff --git a/groups/broadwellEP/QPI.txt b/groups/broadwellEP/QPI.txt
new file mode 100644
index 0000000..20d7cdf
--- /dev/null
+++ b/groups/broadwellEP/QPI.txt
@@ -0,0 +1,49 @@
+SHORT QPI Link Layer data
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+QBOX0C0 RXL_FLITS_G0_DATA
+QBOX1C0 RXL_FLITS_G0_DATA
+QBOX0C1 RXL_FLITS_G0_NON_DATA
+QBOX1C1 RXL_FLITS_G0_NON_DATA
+QBOX0C2 TXL_FLITS_G0_DATA
+QBOX1C2 TXL_FLITS_G0_DATA
+QBOX0C3 TXL_FLITS_G0_NON_DATA
+QBOX1C3 TXL_FLITS_G0_NON_DATA
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+QPI send data volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2)*8
+QPI send data bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2)*8/time
+QPI send link volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8
+QPI send link bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8/time
+QPI receive data volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0)*8
+QPI receive data bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0)*8/time
+QPI receive link volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C1+QBOX1C1)*8
+QPI receive link bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C1+QBOX1C1)*8/time
+QPI total transfer volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8
+QPI total bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8/time
+
+LONG
+Formula:
+QPI send data volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)*8)
+QPI send data bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)/runtime
+QPI send link volume [GByte] = 1.E-09*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)
+QPI send link bandwidth [MByte/s] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)/runtime
+QPI receive data volume [GByte] = 1.E-09*(sum(RXL_FLITS_G0_DATA)*8)
+QPI receive data bandwidth [MByte/s] = 1.E-06*(sum(RXL_FLITS_G0_DATA)*8)/runtime
+QPI receive link volume [GByte] = 1.E-09*((sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8)
+QPI receive link bandwidth [MByte/s] = 1.E-06*((sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8)/runtime
+QPI total transfer volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA)+sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8
+QPI total bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA)+sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8/time
+--
+The Intel QPI Link Layer is responsible for packetizing requests from the caching agents (CBOXes)
+on the way out to the system interface. On Broadwell EP systems, the Link Layer and the
+Ring interface are separated. The QPI link volume contains header, data and trailer flits, while the
+QPI data volume counts only the data flits.
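
The factor 8 in the QPI formulas above converts flit counts to bytes, i.e. each flit
is counted as 8 bytes. A minimal sketch of the send-side metrics with invented flit
counts for the two links:

    # Hypothetical flit counts for QBOX0 and QBOX1 (values are invented).
    txl_flits_g0_data     = [6.0e8, 5.5e8]   # TXL_FLITS_G0_DATA
    txl_flits_g0_non_data = [2.0e8, 1.8e8]   # TXL_FLITS_G0_NON_DATA
    time = 2.0        # runtime in seconds
    flit_bytes = 8.0

    send_data_bw = 1.0E-06 * sum(txl_flits_g0_data) * flit_bytes / time
    send_link_bw = 1.0E-06 * (sum(txl_flits_g0_data) + sum(txl_flits_g0_non_data)) * flit_bytes / time
    print(send_data_bw, send_link_bw)
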
diff --git a/groups/broadwellEP/TLB_DATA.txt b/groups/broadwellEP/TLB_DATA.txt
new file mode 100644
index 0000000..89841d5
--- /dev/null
+++ b/groups/broadwellEP/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2 DTLB_LOAD_MISSES_WALK_DURATION
+PMC3 DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB load misses PMC0
+L1 DTLB load miss rate PMC0/FIXC0
+L1 DTLB load miss duration PMC2
+L1 DTLB store misses PMC1
+L1 DTLB store miss rate PMC1/FIXC0
+L1 DTLB store miss duration PMC3
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration = DTLB_LOAD_MISSES_WALK_DURATION
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration = DTLB_STORE_MISSES_WALK_DURATION
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how long, in cycles, a page table walk took.
+
diff --git a/groups/broadwellEP/TLB_INSTR.txt b/groups/broadwellEP/TLB_INSTR.txt
new file mode 100644
index 0000000..b195452
--- /dev/null
+++ b/groups/broadwellEP/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ITLB_MISSES_CAUSES_A_WALK
+PMC1 ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration PMC1
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration = ITLB_MISSES_WALK_DURATION
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long, in cycles, a page table walk took.
+
diff --git a/groups/core2/BRANCH.txt b/groups/core2/BRANCH.txt
index 2515d6c..3c66c00 100644
--- a/groups/core2/BRANCH.txt
+++ b/groups/core2/BRANCH.txt
@@ -19,12 +19,12 @@ Instructions per branch FIXC0/PMC0
LONG
Formulas:
-Branch rate = BR_INST_RETIRED_ANY / INSTR_RETIRED_ANY
-Branch misprediction rate = BR_INST_RETIRED_MISPRED / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_INST_RETIRED_MISPRED / BR_INST_RETIRED_ANY
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ANY
+Branch rate = BR_INST_RETIRED_ANY/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_INST_RETIRED_MISPRED/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_INST_RETIRED_MISPRED/BR_INST_RETIRED_ANY
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ANY
-
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
-into relation what ration of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate.
+The rates state how often, on average, a branch or a mispredicted branch occurred
+per retired instruction. The branch misprediction ratio directly states what
+fraction of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
diff --git a/groups/core2/CACHE.txt b/groups/core2/CACHE.txt
index fd2af0c..1f446b8 100644
--- a/groups/core2/CACHE.txt
+++ b/groups/core2/CACHE.txt
@@ -10,26 +10,25 @@ PMC1 L1D_ALL_CACHE_REF
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Data cache misses PMC0
-Data cache request rate PMC1/FIXC0
-Data cache miss rate PMC0/FIXC0
-Data cache miss ratio PMC0/PMC1
+data cache misses PMC0
+data cache request rate PMC1/FIXC0
+data cache miss rate PMC0/FIXC0
+data cache miss ratio PMC0/PMC1
LONG
Formulas:
-Data cache request rate = L1D_ALL_CACHE_REF / INSTR_RETIRED_ANY
-Data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
-Data cache miss ratio = L1D_REPL / L1D_ALL_CACHE_REF
+data cache request rate = L1D_ALL_CACHE_REF / INSTR_RETIRED_ANY
+data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
+data cache miss ratio = L1D_REPL / L1D_ALL_CACHE_REF
-
This group measures the locality of your data accesses with regard to the
-L1 Cache. Data cache request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The Data cache miss rate gives a measure how often it was necessary to get
-cachelines from higher levels of the memory hierarchy. And finally
-Data cache miss ratio tells you how many of your memory references required
-a cacheline to be loaded from a higher level. While the Data cache miss rate
-might be given by your algorithm you should try to get Data cache miss ratio
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure of how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm you should try to get data cache miss ratio
as low as possible by increasing your cache reuse.
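As a quick illustration of how these derived metrics follow from the raw counts, here is a minimal Python sketch; the counter values are made-up example numbers, not measurements:

    # hypothetical raw counts read from the event set above
    INSTR_RETIRED_ANY = 1_000_000   # FIXC0
    L1D_ALL_CACHE_REF = 400_000     # PMC1
    L1D_REPL          = 20_000      # PMC0

    request_rate = L1D_ALL_CACHE_REF / INSTR_RETIRED_ANY   # data cache request rate
    miss_rate    = L1D_REPL / INSTR_RETIRED_ANY            # data cache miss rate
    miss_ratio   = L1D_REPL / L1D_ALL_CACHE_REF            # data cache miss ratio
    print(request_rate, miss_rate, miss_ratio)             # 0.4 0.02 0.05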
diff --git a/groups/core2/CLOCK.txt b/groups/core2/CLOCK.txt
new file mode 100644
index 0000000..4a5986f
--- /dev/null
+++ b/groups/core2/CLOCK.txt
@@ -0,0 +1,19 @@
+SHORT CPU clock information
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
+Formula:
+CPI = CPU_CLK_UNHALTED_CORE / INSTR_RETIRED_ANY
+-
+Most basic performance group measuring the clock frequency of the machine.
+
diff --git a/groups/core2/DATA.txt b/groups/core2/DATA.txt
index c48ad99..0f5bca5 100644
--- a/groups/core2/DATA.txt
+++ b/groups/core2/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
LONG
Formulas:
-Load to Store ratio = INST_RETIRED_LOADS / INST_RETIRED_STORES
+Load to store ratio = INST_RETIRED_LOADS/INST_RETIRED_STORES
-
-This is a simple metric to determine your Load to store ratio.
+This is a simple metric to determine your load to store ratio.
diff --git a/groups/core2/FLOPS_DP.txt b/groups/core2/FLOPS_DP.txt
index 8e72f07..8164fd3 100644
--- a/groups/core2/FLOPS_DP.txt
+++ b/groups/core2/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -10,15 +10,14 @@ PMC1 SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-DP MFlops/s 1.0E-06*(PMC0*2.0+PMC1)/time
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1)/time
LONG
Formulas:
-DP MFlops/s = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE*2+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE)/time
+MFLOP/s = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE*2+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE)/time
-
-Profiling group to measure double SSE flops. Dont forget that your code might also execute X87 flops.
+Profiling group to measure double SSE FLOPs. Don't forget that your code might also execute X87 FLOPs.
On the number of SIMD_COMP_INST_RETIRED_PACKED_DOUBLE you can see how well your code was vectorized.
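For example, a minimal Python sketch of the MFLOP/s formula above, with invented counter values purely for illustration:

    packed_dp = 5_000_000    # SIMD_COMP_INST_RETIRED_PACKED_DOUBLE (PMC0), hypothetical
    scalar_dp = 1_000_000    # SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE (PMC1), hypothetical
    time_s    = 0.5          # measurement time in seconds

    # each packed SSE double instruction performs 2 FLOPs, each scalar one 1 FLOP
    mflops = 1.0e-06 * (packed_dp * 2.0 + scalar_dp) / time_s
    print(f"{mflops:.1f} MFLOP/s")   # 22.0 MFLOP/s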
diff --git a/groups/core2/FLOPS_SP.txt b/groups/core2/FLOPS_SP.txt
index acd2df7..181be78 100644
--- a/groups/core2/FLOPS_SP.txt
+++ b/groups/core2/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -10,15 +10,14 @@ PMC1 SIMD_COMP_INST_RETIRED_SCALAR_SINGLE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-SP MFlops/s 1.0E-06*(PMC0*4.0+PMC1)/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1)/time
LONG
Formulas:
-SP MFlops/s = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_SINGLE*4+SIMD_COMP_INST_RETIRED_SCALAR_SINGLE)/time
+MFLOP/s = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_SINGLE*4+SIMD_COMP_INST_RETIRED_SCALAR_SINGLE)/time
-
-Profiling group to measure single precision SSE flops. Dont forget that your code might also execute X87 flops.
+Profiling group to measure single precision SSE FLOPs. Don't forget that your code might also execute X87 FLOPs.
On the number of SIMD_COMP_INST_RETIRED_PACKED_SINGLE you can see how well your code was vectorized.
diff --git a/groups/core2/FLOPS_X87.txt b/groups/core2/FLOPS_X87.txt
index 052356e..d44a2fa 100644
--- a/groups/core2/FLOPS_X87.txt
+++ b/groups/core2/FLOPS_X87.txt
@@ -1,4 +1,4 @@
-SHORT X87 MFlops/s
+SHORT X87 MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -9,14 +9,13 @@ PMC0 X87_OPS_RETIRED_ANY
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-X87 MFlops/s 1.0E-06*PMC0/time
+X87 MFLOP/s 1.0E-06*PMC0/time
LONG
Formulas:
-X87 MFlops/s = 1.0E-06*X87_OPS_RETIRED_ANY/time
+X87 MFLOP/s = 1.0E-06*X87_OPS_RETIRED_ANY/time
-
-Profiling group to measure X87 flops. Note that also non computational operations
+Profiling group to measure X87 FLOPs. Note that also non computational operations
are measured by this event.
diff --git a/groups/core2/L2.txt b/groups/core2/L2.txt
index 88c75c5..d8cbe0d 100644
--- a/groups/core2/L2.txt
+++ b/groups/core2/L2.txt
@@ -10,23 +10,26 @@ PMC1 L1D_M_EVICT
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L2 load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 evict [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
LONG
Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPL*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64.0
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64.0
-
Profiling group to measure L2 cache bandwidth. The bandwidth is
-computed by the number of cacheline allocated in the L1 and the
-number of modified cachelines evicted from the L1.
+computed by the number of cache lines allocated in the L1 and the
+number of modified cache lines evicted from the L1.
Note that this bandwidth also includes data transfers due to a
write allocate load on a store miss in L1.
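The bandwidth and data volume metrics above are just cache-line counts scaled by the 64-byte line size; a small Python sketch with assumed counter values:

    L1D_REPL    = 3_000_000   # cache lines loaded into L1 (hypothetical)
    L1D_M_EVICT = 1_000_000   # modified lines evicted from L1 (hypothetical)
    time_s      = 1.0

    l2_bandwidth_MBps = 1.0e-06 * (L1D_REPL + L1D_M_EVICT) * 64.0 / time_s
    l2_volume_GB      = 1.0e-09 * (L1D_REPL + L1D_M_EVICT) * 64.0
    print(l2_bandwidth_MBps, l2_volume_GB)   # 256.0 MBytes/s, 0.256 GBytes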
diff --git a/groups/core2/L2CACHE.txt b/groups/core2/L2CACHE.txt
index 34c607a..d3b8776 100644
--- a/groups/core2/L2CACHE.txt
+++ b/groups/core2/L2CACHE.txt
@@ -23,13 +23,12 @@ L2 miss rate = L2_RQSTS_SELF_I_STATE / INSTR_RETIRED_ANY
L2 miss ratio = L2_RQSTS_SELF_I_STATE / L2_RQSTS_THIS_CORE_ALL_MESI
-
This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
diff --git a/groups/core2/MEM.txt b/groups/core2/MEM.txt
index b205dc4..f6522ba 100644
--- a/groups/core2/MEM.txt
+++ b/groups/core2/MEM.txt
@@ -5,18 +5,19 @@ FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 BUS_TRANS_MEM_THIS_CORE_THIS_A
+PMC1 BUS_TRANS_WB_THIS_CORE_ALL_A
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-Memory data volume [GBytes] 1.0E-09*PMC0*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
LONG
Formulas:
Memory bandwidth [MBytes/s] = 1.0E-06*BUS_TRANS_MEM_THIS_CORE_THIS_A*64/time
-Memory data volume [GBytes] 1.0E-09*BUS_TRANS_MEM_THIS_CORE_THIS_A*64.0
+Memory data volume [GBytes] = 1.0E-09*BUS_TRANS_MEM_THIS_CORE_THIS_A*64.0
-
-Profiling group to measure memory bandwidth drawn by this core.
+Profiling group to measure memory bandwidth drawn by this core.
diff --git a/groups/core2/TLB.txt b/groups/core2/TLB.txt
index d536d88..80742f4 100644
--- a/groups/core2/TLB.txt
+++ b/groups/core2/TLB.txt
@@ -10,7 +10,6 @@ PMC1 L1D_ALL_CACHE_REF
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
L1 DTLB request rate PMC1/FIXC0
DTLB miss rate PMC0/FIXC0
@@ -22,9 +21,9 @@ L1 DTLB request rate = L1D_ALL_CACHE_REF / INSTR_RETIRED_ANY
DTLB miss rate = DTLB_MISSES_ANY / INSTR_RETIRED_ANY
L1 DTLB miss ratio = DTLB_MISSES_ANY / L1D_ALL_CACHE_REF
-
-L1 DTLB request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The DTLB miss rate gives a measure how often a TLB miss occured
+L1 DTLB request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The DTLB miss rate gives a measure how often a TLB miss occurred
per instruction. And finally L1 DTLB miss ratio tells you how many
-of your memory references required caused a TLB miss in average.
+of your memory references caused a TLB miss on average.
diff --git a/groups/core2/UOPS.txt b/groups/core2/UOPS.txt
new file mode 100644
index 0000000..8167416
--- /dev/null
+++ b/groups/core2/UOPS.txt
@@ -0,0 +1,22 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 RS_UOPS_DISPATCHED_ALL
+PMC1 UOPS_RETIRED_ANY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Executed UOPs PMC0
+Retired UOPs PMC1
+
+LONG
+This performance group measures the executed and retired micro ops. The difference
+between executed and retired uOPs is the number of speculatively executed uOPs.
diff --git a/groups/core2/UOPS_RETIRE.txt b/groups/core2/UOPS_RETIRE.txt
new file mode 100644
index 0000000..be0bf73
--- /dev/null
+++ b/groups/core2/UOPS_RETIRE.txt
@@ -0,0 +1,25 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_RETIRED_USED_CYCLES
+PMC1 UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio PMC0/FIXC1
+Unused cycles ratio PMC1/FIXC1
+
+
+LONG
+Formulas:
+Used cycles ratio = UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio = UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+-
+This performance group returns the ratios of used and unused CPU cycles. Here
+unused cycles are cycles where no operation is performed due to some stall.
diff --git a/groups/haswell/BRANCH.txt b/groups/haswell/BRANCH.txt
index cbaf834..b8d41b2 100644
--- a/groups/haswell/BRANCH.txt
+++ b/groups/haswell/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch FIXC0/PMC0
LONG
Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
-
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio expresses
what fraction of all branch instructions were mispredicted.
-Instructions per branch is 1/Branch rate.
+Instructions per branch is 1/branch rate.
diff --git a/groups/haswell/CACHES.txt b/groups/haswell/CACHES.txt
new file mode 100644
index 0000000..d0d6f33
--- /dev/null
+++ b/groups/haswell/CACHES.txt
@@ -0,0 +1,71 @@
+SHORT Cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L1D_M_EVICT
+PMC2 L2_LINES_IN_ALL
+PMC3 L2_TRANS_L2_WB
+CBOX0C0 CACHE_LOOKUP_READ_MESI
+CBOX1C0 CACHE_LOOKUP_READ_MESI
+CBOX2C0 CACHE_LOOKUP_READ_MESI
+CBOX3C0 CACHE_LOOKUP_READ_MESI
+CBOX0C1 CACHE_LOOKUP_WRITE_MESI
+CBOX1C1 CACHE_LOOKUP_WRITE_MESI
+CBOX2C1 CACHE_LOOKUP_WRITE_MESI
+CBOX3C1 CACHE_LOOKUP_WRITE_MESI
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time
+L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0
+L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
+L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0
+L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0)*64.0/time
+System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0)*64.0
+L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1)*64/time
+L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1)*64
+L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1)*64.0/time
+L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1)*64.0
+
+LONG
+Formulas:
+L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
+L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
+L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64
+L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
+L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
+L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
+L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time
+L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64
+L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(CACHE_LOOKUP_READ_MESI))*64/time
+System to L3 data volume [GBytes] = 1.0E-09*(SUM(CACHE_LOOKUP_READ_MESI))*64
+L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(CACHE_LOOKUP_WRITE_MESI))*64/time
+L3 to system data volume [GBytes] = 1.0E-09*(SUM(CACHE_LOOKUP_WRITE_MESI))*64
+L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(CACHE_LOOKUP_READ_MESI)+SUM(CACHE_LOOKUP_WRITE_MESI))*64/time
+L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(CACHE_LOOKUP_READ_MESI)+SUM(CACHE_LOOKUP_WRITE_MESI))*64
+-
+Group to measure cache transfers between L1 and Memory. Please notice that the
+L3 to/from system metrics contain any traffic to the system (memory,
+Intel QPI, etc.) but don't seem to capture everything, because commonly the memory read
+bandwidth and the L3 to L2 bandwidth are higher than the memory to L3 bandwidth.
+
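The SUM(...) notation in the formulas simply adds the same event over all CBOX counters; a hedged Python sketch of the "System to L3 bandwidth" metric, assuming the per-box counts are available in a list (all numbers invented):

    # hypothetical per-CBOX counts of CACHE_LOOKUP_READ_MESI
    cbox_reads = [120_000, 118_000, 121_500, 119_500]
    time_s = 0.25

    system_to_l3_MBps = 1.0e-06 * sum(cbox_reads) * 64.0 / time_s
    print(system_to_l3_MBps)   # 122.624 MBytes/s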
diff --git a/groups/haswell/CLOCK.txt b/groups/haswell/CLOCK.txt
index 276cf16..a2556b4 100644
--- a/groups/haswell/CLOCK.txt
+++ b/groups/haswell/CLOCK.txt
@@ -7,7 +7,7 @@ FIXC2 CPU_CLK_UNHALTED_REF
PWR0 PWR_PKG_ENERGY
METRICS
-Runtime (RDTSC) [s] time
+Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
diff --git a/groups/haswell/DATA.txt b/groups/haswell/DATA.txt
index 5f04a23..17948d4 100644
--- a/groups/haswell/DATA.txt
+++ b/groups/haswell/DATA.txt
@@ -4,19 +4,24 @@ EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
-PMC0 MEM_UOP_RETIRED_LOADS
-PMC1 MEM_UOP_RETIRED_STORES
+PMC0 MEM_UOPS_RETIRED_LOADS
+PMC1 MEM_UOPS_RETIRED_STORES
+PMC2 UOPS_RETIRED_ALL
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
+Load ratio PMC0/PMC2
+Store ratio PMC1/PMC2
LONG
Formulas:
-Load to Store ratio = MEM_UOP_RETIRED_LOADS / MEM_UOP_RETIRED_STORES
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+Load ratio = MEM_UOPS_RETIRED_LOADS/UOPS_RETIRED_ALL
+Store ratio = MEM_UOPS_RETIRED_STORES/UOPS_RETIRED_ALL
-
This is a metric to determine your load to store ratio.
diff --git a/groups/haswell/ENERGY.txt b/groups/haswell/ENERGY.txt
index 15b1c45..e8bed3a 100644
--- a/groups/haswell/ENERGY.txt
+++ b/groups/haswell/ENERGY.txt
@@ -7,10 +7,13 @@ FIXC2 CPU_CLK_UNHALTED_REF
TMP0 TEMP_CORE
PWR0 PWR_PKG_ENERGY
PWR1 PWR_PP0_ENERGY
+PWR2 PWR_PP1_ENERGY
PWR3 PWR_DRAM_ENERGY
+
+
METRICS
-Runtime (RDTSC) [s] time
+Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
@@ -19,16 +22,18 @@ Energy [J] PWR0
Power [W] PWR0/time
Energy PP0 [J] PWR1
Power PP0 [W] PWR1/time
+Energy PP1 [J] PWR2
+Power PP1 [W] PWR2/time
Energy DRAM [J] PWR3
Power DRAM [W] PWR3/time
LONG
Formula:
-Power = PWR_PKG_ENERGY / time
+Power = PWR_PKG_ENERGY / time
Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
Power DRAM = PWR_DRAM_ENERGY / time
-
Haswell implements the new RAPL interface. This interface enables to
monitor the consumed energy on the package (socket) and DRAM level.
-The PP0 energy domain is often refered to an integrated GPU.
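The power metrics are simply the RAPL energy counters divided by the measurement time; a minimal Python sketch with invented readings:

    pkg_energy_J  = 45.0   # PWR_PKG_ENERGY over the interval (hypothetical)
    dram_energy_J = 9.0    # PWR_DRAM_ENERGY over the interval (hypothetical)
    time_s        = 1.5

    print(pkg_energy_J / time_s, dram_energy_J / time_s)   # 30.0 W package, 6.0 W DRAM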
diff --git a/groups/haswell/FALSE_SHARE.txt b/groups/haswell/FALSE_SHARE.txt
new file mode 100644
index 0000000..43ea23b
--- /dev/null
+++ b/groups/haswell/FALSE_SHARE.txt
@@ -0,0 +1,28 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM
+PMC2 MEM_LOAD_UOPS_RETIRED_ALL_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local LLC hit with false sharing [MByte] 1.E-06*PMC0*64
+Local LLC hit with false sharing rate PMC0/PMC2
+
+LONG
+Formulas:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory loads as reference.
+Please keep in mind that the MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM event may
+undercount by as much as 40% (Errata HSD25).
diff --git a/groups/haswell/FLOPS_AVX.txt b/groups/haswell/FLOPS_AVX.txt
new file mode 100644
index 0000000..9efdd1d
--- /dev/null
+++ b/groups/haswell/FLOPS_AVX.txt
@@ -0,0 +1,28 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 AVX_INSTS_CALC
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Packed SP MFLOP/s 1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s 1.0E-06*(PMC0*4.0)/time
+
+LONG
+Formula:
+Packed SP MFLOP/s = 1.0E-06*(AVX_INSTS_CALC*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(AVX_INSTS_CALC*4)/runtime
+-
+Packed 32b AVX FLOP/s rates. Approximate counts of AVX & AVX2 256-bit instructions.
+May count non-AVX instructions that employ 256-bit operations, including (but
+not necessarily limited to) rep string instructions that use 256-bit loads and
+stores for optimized performance, XSAVE* and XRSTOR*, and operations that
+transition the x87 FPU data registers between x87 and MMX.
+Caution: The event AVX_INSTS_CALC counts the insertf128 instruction often used
+by the Intel C compilers for (unaligned) vector loads.
diff --git a/groups/haswell/ICACHE.txt b/groups/haswell/ICACHE.txt
index 6ce3ce8..f1e2335 100644
--- a/groups/haswell/ICACHE.txt
+++ b/groups/haswell/ICACHE.txt
@@ -6,6 +6,8 @@ FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 ICACHE_ACCESSES
PMC1 ICACHE_MISSES
+PMC2 ICACHE_IFETCH_STALL
+PMC3 ILD_STALL_IQ_FULL
METRICS
Runtime (RDTSC) [s] time
@@ -15,11 +17,17 @@ CPI FIXC1/FIXC0
L1I request rate PMC0/FIXC0
L1I miss rate PMC1/FIXC0
L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+L1I queue full stalls PMC3
+L1I queue full stall rate PMC3/FIXC0
LONG
Formulas:
-L2 request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
-L2 miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
-L2 miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I stalls = ICACHE_IFETCH_STALL
+L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY
-
This group measures some L1 instruction cache metrics.
diff --git a/groups/haswell/L2.txt b/groups/haswell/L2.txt
index 47d8ec7..60c7f79 100644
--- a/groups/haswell/L2.txt
+++ b/groups/haswell/L2.txt
@@ -6,6 +6,7 @@ FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 L1D_REPLACEMENT
PMC1 L2_TRANS_L1D_WB
+PMC2 ICACHE_MISSES
METRICS
Runtime (RDTSC) [s] time
@@ -21,13 +22,16 @@ L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
LONG
Formulas:
-L2D load bandwidth [MBytes/s] 1.0E-06*L1D_REPLACEMENT*64.0/time
-L2D load data volume [GBytes] 1.0E-09*L1D_REPLACEMENT*64.0
-L2D evict bandwidth [MBytes/s] 1.0E-06*L2_TRANS_L1D_WB*64.0/time
-L2D evict data volume [GBytes] 1.0E-09*L2_TRANS_L1D_WB*64.0
-L2 bandwidth [MBytes/s] 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0
-
Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
-number of cacheline loaded from the L2 to the L1 data cache and the writebacks from
-the L1 data cache to the L2 cache.
+number of cache lines loaded from the L2 to the L1 data cache and the writebacks from
+the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and cache lines transferred into the instruction
+cache.
diff --git a/groups/haswell/L2CACHE.txt b/groups/haswell/L2CACHE.txt
index 8186f69..9b5dd4b 100644
--- a/groups/haswell/L2CACHE.txt
+++ b/groups/haswell/L2CACHE.txt
@@ -4,7 +4,7 @@ EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
-PMC0 L2_RQSTS_REFERENCES
+PMC0 L2_TRANS_ALL_REQUESTS
PMC1 L2_RQSTS_MISS
METRICS
@@ -18,18 +18,17 @@ L2 miss ratio PMC1/PMC0
LONG
Formulas:
-L2 request rate = L2_RQSTS_REFERENCES / INSTR_RETIRED_ANY
-L2 miss rate = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_RQSTS_REFERENCES
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
-
This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
diff --git a/groups/haswell/L3.txt b/groups/haswell/L3.txt
index 42d6e4a..f63a918 100644
--- a/groups/haswell/L3.txt
+++ b/groups/haswell/L3.txt
@@ -5,28 +5,32 @@ FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 L2_LINES_IN_ALL
-PMC1 L2_LINES_OUT_DEMAND_DIRTY
+PMC1 L2_TRANS_L2_WB
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L3 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
LONG
Formulas:
-L3 Load [MBytes/s] 1.0E-06*L2_LINES_IN_ALL*64/time
-L3 Evict [MBytes/s] 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
-
Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L2 and the number of modified cachelines
-evicted from the L2. This group also outputs data volume transfered between the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs the data volume transferred between the
L3 and measured cores L2 caches. Note that this bandwidth also includes data
transfers due to a write allocate load on a store miss in L2.
diff --git a/groups/haswell/L3CACHE.txt b/groups/haswell/L3CACHE.txt
index d4fd89e..f863daa 100644
--- a/groups/haswell/L3CACHE.txt
+++ b/groups/haswell/L3CACHE.txt
@@ -6,30 +6,30 @@ FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL
PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2 UOPS_RETIRED_ALL
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L3 request rate (PMC0)/FIXC0
-L3 miss rate PMC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
L3 miss ratio PMC1/PMC0
LONG
Formulas:
-L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL / INSTR_RETIRED_ANY
-L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS / INSTR_RETIRED_ANY
-L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS / MEM_LOAD_UOPS_RETIRED_L3_ALL
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
-
This group measures the locality of your data accesses with regard to the
-L3 Cache. L3 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
The L3 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L3 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
diff --git a/groups/haswell/RECOVERY.txt b/groups/haswell/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/haswell/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 INT_MISC_RECOVERY_CYCLES
+PMC1 INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exceptions, memory
+disambiguation, etc.
diff --git a/groups/haswell/TLB_DATA.txt b/groups/haswell/TLB_DATA.txt
index 2f59772..8d94e05 100644
--- a/groups/haswell/TLB_DATA.txt
+++ b/groups/haswell/TLB_DATA.txt
@@ -1,4 +1,4 @@
-SHORT L1 Data TLB miss rate/ratio
+SHORT L2 data TLB miss rate/ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -16,20 +16,20 @@ Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
L1 DTLB load misses PMC0
L1 DTLB load miss rate PMC0/FIXC0
-L1 DTLB load miss duration PMC2
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
L1 DTLB store misses PMC1
L1 DTLB store miss rate PMC1/FIXC0
-L1 DTLB store miss duration PMC3
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
LONG
Formulas:
-L1 DTLB load misses DTLB_LOAD_MISSES_CAUSES_A_WALK
-L1 DTLB load miss rate DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB load miss duration DTLB_LOAD_MISSES_WALK_DURATION
-L1 DTLB store misses DTLB_STORE_MISSES_CAUSES_A_WALK
-L1 DTLB store miss rate DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB store miss duration DTLB_STORE_MISSES_WALK_DURATION
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
-
-The DTLB load and store miss rates gives a measure how often a TLB miss occured
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
per instruction. The duration measures the time in cycles how long a walk did take.
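The duration metrics above are per-walk averages, i.e. total walk cycles divided by the number of walks; for example, in Python with assumed counts:

    walks         = 10_000       # DTLB_LOAD_MISSES_CAUSES_A_WALK (hypothetical)
    walk_cycles   = 350_000      # DTLB_LOAD_MISSES_WALK_DURATION (hypothetical)
    instr_retired = 50_000_000   # INSTR_RETIRED_ANY (hypothetical)

    miss_rate    = walks / instr_retired   # misses per retired instruction
    avg_walk_cyc = walk_cycles / walks     # average walk duration in cycles
    print(miss_rate, avg_walk_cyc)         # 0.0002 35.0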
diff --git a/groups/haswell/TLB_INSTR.txt b/groups/haswell/TLB_INSTR.txt
index f95f78a..235d977 100644
--- a/groups/haswell/TLB_INSTR.txt
+++ b/groups/haswell/TLB_INSTR.txt
@@ -14,15 +14,15 @@ Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
L1 ITLB misses PMC0
L1 ITLB miss rate PMC0/FIXC0
-L1 ITLB miss duration PMC1
+L1 ITLB miss duration [Cyc] PMC1/PMC0
LONG
Formulas:
-L1 ITLB misses ITLB_MISSES_CAUSES_A_WALK
-L1 ITLB miss rate ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 ITLB miss duration ITLB_MISSES_WALK_DURATION
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
-
-The ITLB miss rates gives a measure how often a TLB miss occured
+The ITLB miss rate gives a measure of how often a TLB miss occurred
per instruction. The duration measures the time in cycles how long a walk did take.
diff --git a/groups/haswell/UOPS.txt b/groups/haswell/UOPS.txt
new file mode 100644
index 0000000..178aec5
--- /dev/null
+++ b/groups/haswell/UOPS.txt
@@ -0,0 +1,35 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_ANY
+PMC1 UOPS_EXECUTED_THREAD
+PMC2 UOPS_RETIRED_ALL
+PMC3 UOPS_ISSUED_FLAGS_MERGE
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Issued UOPs PMC0
+Merged UOPs PMC3
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed and retired uOPs and returns the number of uOPs which were issued
+but not executed as well as the number of uOPs which were executed but never retired.
+The executed but not retired uOPs commonly come from speculatively executed branches.
+
diff --git a/groups/haswell/UOPS_EXEC.txt b/groups/haswell/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/groups/haswell/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_EXECUTED_USED_CYCLES
+PMC1 UOPS_EXECUTED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
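The EDGEDETECT modifier turns the stall-cycles count into a count of stall events (one per rising edge), so dividing total stall cycles by it gives the average stall length; a Python sketch with made-up numbers:

    stall_cycles = 800_000     # UOPS_EXECUTED_STALL_CYCLES (hypothetical)
    stall_events = 20_000      # same event with :EDGEDETECT (hypothetical)
    total_cycles = 4_000_000   # CPU_CLOCK_UNHALTED_TOTAL_CYCLES (hypothetical)

    unused_ratio_pct   = 100.0 * stall_cycles / total_cycles   # 20.0 %
    avg_stall_duration = stall_cycles / stall_events           # 40.0 cycles
    print(unused_ratio_pct, avg_stall_duration)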
diff --git a/groups/haswell/UOPS_ISSUE.txt b/groups/haswell/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/groups/haswell/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_USED_CYCLES
+PMC1 UOPS_ISSUED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/haswell/UOPS_RETIRE.txt b/groups/haswell/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/groups/haswell/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_RETIRED_USED_CYCLES
+PMC1 UOPS_RETIRED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/groups/haswellEP/BRANCH.txt b/groups/haswellEP/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/groups/haswellEP/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio expresses
+what fraction of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/groups/haswellEP/CACHES.txt b/groups/haswellEP/CACHES.txt
new file mode 100644
index 0000000..3c13a52
--- /dev/null
+++ b/groups/haswellEP/CACHES.txt
@@ -0,0 +1,123 @@
+SHORT Cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L1D_M_EVICT
+PMC2 L2_LINES_IN_ALL
+PMC3 L2_TRANS_L2_WB
+CBOX0C0 LLC_LOOKUP_DATA_READ
+CBOX1C0 LLC_LOOKUP_DATA_READ
+CBOX2C0 LLC_LOOKUP_DATA_READ
+CBOX3C0 LLC_LOOKUP_DATA_READ
+CBOX4C0 LLC_LOOKUP_DATA_READ
+CBOX5C0 LLC_LOOKUP_DATA_READ
+CBOX6C0 LLC_LOOKUP_DATA_READ
+CBOX7C0 LLC_LOOKUP_DATA_READ
+CBOX8C0 LLC_LOOKUP_DATA_READ
+CBOX9C0 LLC_LOOKUP_DATA_READ
+CBOX10C0 LLC_LOOKUP_DATA_READ
+CBOX11C0 LLC_LOOKUP_DATA_READ
+CBOX12C0 LLC_LOOKUP_DATA_READ
+CBOX13C0 LLC_LOOKUP_DATA_READ
+CBOX14C0 LLC_LOOKUP_DATA_READ
+CBOX15C0 LLC_LOOKUP_DATA_READ
+CBOX16C0 LLC_LOOKUP_DATA_READ
+CBOX17C0 LLC_LOOKUP_DATA_READ
+CBOX0C1 LLC_VICTIMS_M
+CBOX1C1 LLC_VICTIMS_M
+CBOX2C1 LLC_VICTIMS_M
+CBOX3C1 LLC_VICTIMS_M
+CBOX4C1 LLC_VICTIMS_M
+CBOX5C1 LLC_VICTIMS_M
+CBOX6C1 LLC_VICTIMS_M
+CBOX7C1 LLC_VICTIMS_M
+CBOX8C1 LLC_VICTIMS_M
+CBOX9C1 LLC_VICTIMS_M
+CBOX10C1 LLC_VICTIMS_M
+CBOX11C1 LLC_VICTIMS_M
+CBOX12C1 LLC_VICTIMS_M
+CBOX13C1 LLC_VICTIMS_M
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time
+L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0
+L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
+L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0
+L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0)*64.0/time
+System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0)*64.0
+L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64/time
+L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64
+L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64.0/time
+L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64.0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
+L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
+L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64
+L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
+L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
+L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
+L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time
+L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64
+L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time
+System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64
+L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M))*64/time
+L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M))*64
+L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64/time
+L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
+-
+Group to measure cache transfers between L1 and Memory. Please notice that the
+L3 to/from system metrics contain any traffic to the system (memory,
+Intel QPI, etc.) but don't seem to capture everything, because commonly the memory read
+bandwidth and the L3 to L2 bandwidth are higher than the memory to L3 bandwidth.
+
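On the uncore side the memory metrics sum CAS_COUNT_RD/CAS_COUNT_WR over all memory channels (the MBOX counters) and scale by the 64-byte burst; a minimal Python sketch, again with invented per-channel counts:

    cas_rd = [1_000_000] * 8   # CAS_COUNT_RD per MBOX channel (hypothetical)
    cas_wr = [  500_000] * 8   # CAS_COUNT_WR per MBOX channel (hypothetical)
    time_s = 1.0

    read_bw_MBps  = 1.0e-06 * sum(cas_rd) * 64.0 / time_s                   # 512.0
    write_bw_MBps = 1.0e-06 * sum(cas_wr) * 64.0 / time_s                   # 256.0
    total_bw_MBps = 1.0e-06 * (sum(cas_rd) + sum(cas_wr)) * 64.0 / time_s   # 768.0
    print(read_bw_MBps, write_bw_MBps, total_bw_MBps)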
diff --git a/groups/haswellEP/CBOX.txt b/groups/haswellEP/CBOX.txt
new file mode 100644
index 0000000..d9cc13c
--- /dev/null
+++ b/groups/haswellEP/CBOX.txt
@@ -0,0 +1,61 @@
+SHORT CBOX related data and metrics
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+CBOX0C0 LLC_VICTIMS_M
+CBOX1C0 LLC_VICTIMS_M
+CBOX2C0 LLC_VICTIMS_M
+CBOX3C0 LLC_VICTIMS_M
+CBOX4C0 LLC_VICTIMS_M
+CBOX5C0 LLC_VICTIMS_M
+CBOX6C0 LLC_VICTIMS_M
+CBOX7C0 LLC_VICTIMS_M
+CBOX8C0 LLC_VICTIMS_M
+CBOX9C0 LLC_VICTIMS_M
+CBOX10C0 LLC_VICTIMS_M
+CBOX11C0 LLC_VICTIMS_M
+CBOX12C0 LLC_VICTIMS_M
+CBOX13C0 LLC_VICTIMS_M
+CBOX14C0 LLC_VICTIMS_M
+CBOX15C0 LLC_VICTIMS_M
+CBOX16C0 LLC_VICTIMS_M
+CBOX17C0 LLC_VICTIMS_M
+CBOX0C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX1C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX2C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX3C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX4C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX5C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX6C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX7C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX8C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX9C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX10C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX11C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX12C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX13C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX14C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX15C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX16C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX17C1:STATE=0x1 LLC_LOOKUP_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+LLC misses per instruction (CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0)/FIXC0
+LLC data written to MEM [MBytes] 1E-6*(CBOX0C1:STATE=0x1+CBOX1C1:STATE=0x1+CBOX2C1:STATE=0x1+CBOX3C1:STATE=0x1+CBOX4C1:STATE=0x1+CBOX5C1:STATE=0x1+CBOX6C1:STATE=0x1+CBOX7C1:STATE=0x1+CBOX8C1:STATE=0x1+CBOX9C1:STATE=0x1+CBOX10C1:STATE=0x1+CBOX11C1:STATE=0x1+CBOX12C1:STATE=0x1+CBOX13C1:STATE=0x1+CBOX14C1:STATE=0x1+CBOX15C1:STATE=0x1+CBOX16C1:STATE=0x1+CBOX17C1:STATE=0x1)*64
+
+
+LONG
+Formulas:
+LLC Misses Per Instruction = sum(LLC_VICTIMS_M)/INSTR_RETIRED_ANY
+LLC data written to MEM [MBytes] = sum(LLC_LOOKUP_ANY)*64*1E-6
+-
+The CBOXes mediate the traffic from the L2 cache to the segmented L3 cache. Each
+CBOX is responsible for one segment (2.5 MByte). The boxes maintain the coherence between all
+CPU cores of the socket. Depending on the CPU core count, some CBOXes are not attached
+to a 2.5 MByte slice but are still active and track the traffic.
diff --git a/groups/haswellEP/CLOCK.txt b/groups/haswellEP/CLOCK.txt
new file mode 100644
index 0000000..a2556b4
--- /dev/null
+++ b/groups/haswellEP/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+-
+Haswell implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy on the package (socket) level.
+
diff --git a/groups/haswellEP/DATA.txt b/groups/haswellEP/DATA.txt
new file mode 100644
index 0000000..967cbad
--- /dev/null
+++ b/groups/haswellEP/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_UOPS_RETIRED_LOADS
+PMC1 MEM_UOPS_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/haswellEP/ENERGY.txt b/groups/haswellEP/ENERGY.txt
new file mode 100644
index 0000000..6c26b30
--- /dev/null
+++ b/groups/haswellEP/ENERGY.txt
@@ -0,0 +1,35 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0 TEMP_CORE
+PWR0 PWR_PKG_ENERGY
+PWR1 PWR_PP0_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Temperature [C] TMP0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy PP0 [J] PWR1
+Power PP0 [W] PWR1/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+Haswell implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy on the package (socket) and DRAM level.
+
diff --git a/groups/haswellEP/FALSE_SHARE.txt b/groups/haswellEP/FALSE_SHARE.txt
new file mode 100644
index 0000000..ce1a8bb
--- /dev/null
+++ b/groups/haswellEP/FALSE_SHARE.txt
@@ -0,0 +1,34 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM
+PMC1 MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM
+PMC2 MEM_LOAD_UOPS_RETIRED_ALL_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local LLC hit with false sharing [MByte] 1.E-06*PMC0*64
+Local LLC hit with false sharing rate PMC0/PMC2
+Remote LLC false sharing [MByte] 1.E-06*PMC1*64
+Remote LLC false sharing rate PMC1/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
+Remote LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM*64
+Remote LLC false sharing rate = MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM/MEM_LOAD_UOPS_RETIRED_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory loads as reference.
+For systems with multiple CPU sockets, this performance group also measures the
+false-sharing of cache lines over socket boundaries.
+Please keep in mind that the MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM event may
+undercount by as much as 40% (Errata HSW150).
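The false-sharing metrics scale the HITM load counts to a data volume and relate them to all retired load uOPs; a small Python sketch with assumed counts:

    local_hitm  = 50_000        # MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM (hypothetical)
    remote_hitm = 10_000        # MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM (hypothetical)
    all_loads   = 100_000_000   # MEM_LOAD_UOPS_RETIRED_ALL_ALL (hypothetical)

    local_volume_MB = 1.0e-06 * local_hitm * 64   # 3.2 MByte
    local_rate      = local_hitm / all_loads      # 0.0005
    remote_rate     = remote_hitm / all_loads     # 0.0001
    print(local_volume_MB, local_rate, remote_rate)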
diff --git a/groups/haswellEP/FLOPS_AVX.txt b/groups/haswellEP/FLOPS_AVX.txt
new file mode 100644
index 0000000..9efdd1d
--- /dev/null
+++ b/groups/haswellEP/FLOPS_AVX.txt
@@ -0,0 +1,28 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 AVX_INSTS_CALC
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Packed SP MFLOP/s 1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s 1.0E-06*(PMC0*4.0)/time
+
+LONG
+Formula:
+Packed SP MFLOP/s = 1.0E-06*(AVX_INSTS_CALC*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(AVX_INSTS_CALC*4)/runtime
+-
+Packed 32b AVX FLOP/s rates. Approximate counts of AVX & AVX2 256-bit instructions.
+May count non-AVX instructions that employ 256-bit operations, including (but
+not necessarily limited to) rep string instructions that use 256-bit loads and
+stores for optimized performance, XSAVE* and XRSTOR*, and operations that
+transition the x87 FPU data registers between x87 and MMX.
+Caution: The event AVX_INSTS_CALC counts the insertf128 instruction often used
+by the Intel C compilers for (unaligned) vector loads.
diff --git a/groups/haswellEP/HA.txt b/groups/haswellEP/HA.txt
new file mode 100644
index 0000000..1e5a700
--- /dev/null
+++ b/groups/haswellEP/HA.txt
@@ -0,0 +1,40 @@
+SHORT Main memory bandwidth in MBytes/s seen from Home agent
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+BBOX0C0 IMC_READS_NORMAL
+BBOX0C1 BYPASS_IMC_TAKEN
+BBOX0C2 IMC_WRITES_ALL
+BBOX1C0 IMC_READS_NORMAL
+BBOX1C1 BYPASS_IMC_TAKEN
+BBOX1C2 IMC_WRITES_ALL
+
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(BBOX0C2+BBOX1C2)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(BBOX0C2+BBOX1C2)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_WRITES_ALL))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(IMC_WRITES_ALL))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0
+-
+This group derives the same metrics as the MEM group but uses the events of the
+Home Agent, a central unit that is responsible for the protocol side of memory
+interactions.
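To make the summation over the two Home Agent boxes concrete, a small Python sketch of the bandwidth formulas above; the per-box counts and the runtime are invented for illustration:

# Hypothetical per-box counts for one socket over a 2.0 s run.
runtime = 2.0
imc_reads_normal = [5.0e8, 4.8e8]   # BBOX0C0, BBOX1C0
bypass_imc_taken = [1.0e7, 9.0e6]   # BBOX0C1, BBOX1C1
imc_writes_all   = [2.5e8, 2.4e8]   # BBOX0C2, BBOX1C2

# Each counted access moves one 64-byte cache line.
read_bw_mbytes  = 1.0e-6 * (sum(imc_reads_normal) + sum(bypass_imc_taken)) * 64.0 / runtime
write_bw_mbytes = 1.0e-6 * sum(imc_writes_all) * 64.0 / runtime
total_bw_mbytes = read_bw_mbytes + write_bw_mbytes

print(read_bw_mbytes, write_bw_mbytes, total_bw_mbytes)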
diff --git a/groups/haswellEP/ICACHE.txt b/groups/haswellEP/ICACHE.txt
new file mode 100644
index 0000000..f1e2335
--- /dev/null
+++ b/groups/haswellEP/ICACHE.txt
@@ -0,0 +1,33 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ICACHE_ACCESSES
+PMC1 ICACHE_MISSES
+PMC2 ICACHE_IFETCH_STALL
+PMC3 ILD_STALL_IQ_FULL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+L1I queue full stalls PMC3
+L1I queue full stall rate PMC3/FIXC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I stalls = ICACHE_IFETCH_STALL
+L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/haswellEP/L2.txt b/groups/haswellEP/L2.txt
new file mode 100644
index 0000000..60c7f79
--- /dev/null
+++ b/groups/haswellEP/L2.txt
@@ -0,0 +1,37 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L2_TRANS_L1D_WB
+PMC2 ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cache lines loaded from the L2 to the L1 data cache and the writebacks from
+the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and cache lines transferred into the instruction
+cache.
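As a worked example of the L2 formulas above, a short Python sketch with invented counter values:

# Hypothetical counter readings for a 1.5 s measurement.
runtime = 1.5
l1d_replacement = 3.0e8   # PMC0: cache lines loaded from L2 into L1D
l2_trans_l1d_wb = 1.1e8   # PMC1: L1D write-backs into L2
icache_misses   = 2.0e6   # PMC2: cache lines loaded into the L1 instruction cache

l2d_load_bw  = 1.0e-6 * l1d_replacement * 64.0 / runtime
l2d_evict_bw = 1.0e-6 * l2_trans_l1d_wb * 64.0 / runtime
l2_total_bw  = 1.0e-6 * (l1d_replacement + l2_trans_l1d_wb + icache_misses) * 64.0 / runtime

print(l2d_load_bw, l2d_evict_bw, l2_total_bw)   # MBytes/s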
diff --git a/groups/haswellEP/L2CACHE.txt b/groups/haswellEP/L2CACHE.txt
new file mode 100644
index 0000000..9b5dd4b
--- /dev/null
+++ b/groups/haswellEP/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_TRANS_ALL_REQUESTS
+PMC1 L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. And finally, the L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/haswellEP/L3.txt b/groups/haswellEP/L3.txt
new file mode 100644
index 0000000..0109db3
--- /dev/null
+++ b/groups/haswellEP/L3.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_LINES_IN_ALL
+PMC1 L2_TRANS_L2_WB
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs the data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/haswellEP/L3CACHE.txt b/groups/haswellEP/L3CACHE.txt
new file mode 100644
index 0000000..f863daa
--- /dev/null
+++ b/groups/haswellEP/L3CACHE.txt
@@ -0,0 +1,35 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2 UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. And finally, the L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/haswellEP/MEM.txt b/groups/haswellEP/MEM.txt
new file mode 100644
index 0000000..2a17a2c
--- /dev/null
+++ b/groups/haswellEP/MEM.txt
@@ -0,0 +1,52 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+-
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events, it is only possible to measure on a
+per-socket basis. Some of the counters may not be available on your system.
+Also outputs total data volume transferred from main memory.
+The same metrics are provided by the HA group.
+
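For illustration, a minimal Python sketch of how the memory bandwidth is summed over the MBOX channels; the per-channel CAS counts are invented for the example:

# Hypothetical CAS counts for the up to eight memory channels of one socket.
runtime = 1.0
cas_count_rd = [4.0e8, 3.9e8, 4.1e8, 4.0e8, 0.0, 0.0, 0.0, 0.0]  # MBOXxC0
cas_count_wr = [1.5e8, 1.4e8, 1.6e8, 1.5e8, 0.0, 0.0, 0.0, 0.0]  # MBOXxC1

# One CAS command transfers one 64-byte cache line.
read_bw  = 1.0e-6 * sum(cas_count_rd) * 64.0 / runtime
write_bw = 1.0e-6 * sum(cas_count_wr) * 64.0 / runtime
read_vol  = 1.0e-9 * sum(cas_count_rd) * 64.0
write_vol = 1.0e-9 * sum(cas_count_wr) * 64.0

print(read_bw + write_bw, "MBytes/s", read_vol + write_vol, "GBytes")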
diff --git a/groups/haswellEP/NUMA.txt b/groups/haswellEP/NUMA.txt
new file mode 100644
index 0000000..0c1b8fb
--- /dev/null
+++ b/groups/haswellEP/NUMA.txt
@@ -0,0 +1,33 @@
+SHORT Local and remote memory accesses
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM
+PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local DRAM data volume [GByte] 1.E-09*PMC0*64
+Local DRAM bandwidth [MByte/s] 1.E-06*(PMC0*64)/time
+Remote DRAM data volume [GByte] 1.E-09*PMC1*64
+Remote DRAM bandwidth [MByte/s] 1.E-06*(PMC1*64)/time
+Memory data volume [GByte] 1.E-09*(PMC0+PMC1)*64
+Memory bandwidth [MByte/s] 1.E-06*((PMC0+PMC1)*64)/time
+
+LONG
+Formula:
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
+Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
+Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64
+Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time
+Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64
+Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time
+--
+This performance group measures the data traffic of CPU cores to local and remote
+memory.
diff --git a/groups/haswellEP/QPI.txt b/groups/haswellEP/QPI.txt
new file mode 100644
index 0000000..4ad0cf8
--- /dev/null
+++ b/groups/haswellEP/QPI.txt
@@ -0,0 +1,49 @@
+SHORT QPI Link Layer data
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+QBOX0C0 RXL_FLITS_G0_DATA
+QBOX1C0 RXL_FLITS_G0_DATA
+QBOX0C1 RXL_FLITS_G0_NON_DATA
+QBOX1C1 RXL_FLITS_G0_NON_DATA
+QBOX0C2 TXL_FLITS_G0_DATA
+QBOX1C2 TXL_FLITS_G0_DATA
+QBOX0C3 TXL_FLITS_G0_NON_DATA
+QBOX1C3 TXL_FLITS_G0_NON_DATA
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+QPI send data volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2)*8
+QPI send data bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2)*8/time
+QPI send link volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8
+QPI send link bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8/time
+QPI receive data volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0)*8
+QPI receive data bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0)*8/time
+QPI receive link volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C1+QBOX1C1)*8
+QPI receive link bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C1+QBOX1C1)*8/time
+QPI total transfer volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8
+QPI total bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8/time
+
+LONG
+Formula:
+QPI send data volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)*8)
+QPI send data bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)/runtime
+QPI send link volume [GByte] = 1.E-09*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)
+QPI send link bandwidth [MByte/s] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)/runtime
+QPI receive data volume [GByte] = 1.E-09*(sum(RXL_FLITS_G0_DATA)*8)
+QPI receive data bandwidth [MByte/s] = 1.E-06*(sum(RXL_FLITS_G0_DATA)*8)/runtime
+QPI receive link volume [GByte] = 1.E-09*((sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8)
+QPI receive link bandwidth [MByte/s] = 1.E-06*((sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8)/runtime
+QPI total transfer volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA)+sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8
+QPI total bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA)+sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8/time
+--
+The Intel QPI Link Layer is responsible for packetizing requests from the caching agent (CBOXes)
+on the way out to the system interface. For Haswell EP systems, the Link Layer and the
+Ring interface are separated. The QPI link volume contains header, data, and trailer flits, while the
+QPI data volume counts only the data flits.
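To illustrate the flit accounting, a small Python sketch of the volume and bandwidth formulas above; each counted flit carries 8 bytes, and the flit counts are invented for the example:

# Hypothetical flit counts for the two QPI links of one socket, 1.0 s run.
runtime = 1.0
txl_data     = [6.0e7, 5.5e7]   # QBOX0C2, QBOX1C2
txl_non_data = [2.0e7, 1.8e7]   # QBOX0C3, QBOX1C3
rxl_data     = [5.8e7, 5.6e7]   # QBOX0C0, QBOX1C0
rxl_non_data = [1.9e7, 1.9e7]   # QBOX0C1, QBOX1C1

send_data_vol_gbyte = 1.0e-9 * sum(txl_data) * 8
send_link_bw_mbyte  = 1.0e-6 * (sum(txl_data) + sum(txl_non_data)) * 8 / runtime
recv_data_vol_gbyte = 1.0e-9 * sum(rxl_data) * 8
total_bw_mbyte = 1.0e-6 * (sum(txl_data) + sum(txl_non_data) +
                           sum(rxl_data) + sum(rxl_non_data)) * 8 / runtime

print(send_data_vol_gbyte, send_link_bw_mbyte, recv_data_vol_gbyte, total_bw_mbyte)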
diff --git a/groups/haswellEP/RECOVERY.txt b/groups/haswellEP/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/haswellEP/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 INT_MISC_RECOVERY_CYCLES
+PMC1 INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exceptions, memory
+disambiguation, etc.
diff --git a/groups/haswellEP/SBOX.txt b/groups/haswellEP/SBOX.txt
new file mode 100644
index 0000000..246deea
--- /dev/null
+++ b/groups/haswellEP/SBOX.txt
@@ -0,0 +1,28 @@
+SHORT Ring Transfer bandwidth
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+SBOX0C0 RING_BL_USED_ANY
+SBOX1C0 RING_BL_USED_ANY
+SBOX2C0 RING_BL_USED_ANY
+SBOX3C0 RING_BL_USED_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Ring transfer bandwidth [MByte/s] 1.E-06*(SBOX0C0+SBOX1C0+SBOX2C0+SBOX3C0)*32/time
+Ring transfer data volume [GByte] 1.E-09*(SBOX0C0+SBOX1C0+SBOX2C0+SBOX3C0)*32
+
+LONG
+Formula:
+Ring transfer bandwidth [MByte/s] = 1.E-06*(SUM(SBOXxC0)*32)/time
+Ring transfer data volume [GByte] = 1.E-09*(SUM(SBOXxC0)*32)
+--
+The SBOXes manage the transfer between the socket-local ring(s). For microarchitectures
+prior to Haswell, the SBOX and QBOX were similar, as only a single ring was used.
+Haswell systems with a high core count combine two rings that are connected through
+the SBOXes; the traffic between the sockets is handled by the QBOXes.
diff --git a/groups/haswellEP/TLB_DATA.txt b/groups/haswellEP/TLB_DATA.txt
new file mode 100644
index 0000000..8d94e05
--- /dev/null
+++ b/groups/haswellEP/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2 DTLB_LOAD_MISSES_WALK_DURATION
+PMC3 DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB load misses PMC0
+L1 DTLB load miss rate PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses PMC1
+L1 DTLB store miss rate PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration metrics give the average number of cycles a page table walk took.
+
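As a small worked example of the metrics defined above, a Python sketch with invented counter readings:

# Hypothetical counter readings.
instr_retired     = 5.0e9   # FIXC0
dtlb_load_walks   = 2.0e6   # PMC0: DTLB_LOAD_MISSES_CAUSES_A_WALK
dtlb_store_walks  = 4.0e5   # PMC1: DTLB_STORE_MISSES_CAUSES_A_WALK
load_walk_cycles  = 6.0e7   # PMC2: DTLB_LOAD_MISSES_WALK_DURATION
store_walk_cycles = 1.0e7   # PMC3: DTLB_STORE_MISSES_WALK_DURATION

load_miss_rate  = dtlb_load_walks / instr_retired
store_miss_rate = dtlb_store_walks / instr_retired
load_walk_duration  = load_walk_cycles / dtlb_load_walks    # cycles per walk
store_walk_duration = store_walk_cycles / dtlb_store_walks  # cycles per walk

print(load_miss_rate, store_miss_rate, load_walk_duration, store_walk_duration)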
diff --git a/groups/haswellEP/TLB_INSTR.txt b/groups/haswellEP/TLB_INSTR.txt
new file mode 100644
index 0000000..235d977
--- /dev/null
+++ b/groups/haswellEP/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ITLB_MISSES_CAUSES_A_WALK
+PMC1 ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration metric gives the average number of cycles a page table walk took.
+
diff --git a/groups/haswellEP/UOPS.txt b/groups/haswellEP/UOPS.txt
new file mode 100644
index 0000000..178aec5
--- /dev/null
+++ b/groups/haswellEP/UOPS.txt
@@ -0,0 +1,35 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_ANY
+PMC1 UOPS_EXECUTED_THREAD
+PMC2 UOPS_RETIRED_ALL
+PMC3 UOPS_ISSUED_FLAGS_MERGE
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Issued UOPs PMC0
+Merged UOPs PMC3
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed, and retired uOPs; from these counts one can derive the number of uOPs
+that were issued but not executed as well as the number that were executed but never retired.
+The executed but never retired uOPs commonly come from speculatively executed branches.
+
diff --git a/groups/haswellEP/UOPS_EXEC.txt b/groups/haswellEP/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/groups/haswellEP/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_EXECUTED_USED_CYCLES
+PMC1 UOPS_EXECUTED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
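A minimal Python sketch of how the ratios and the edge-detect based average stall duration above are derived; the counter values are invented for the example:

# Hypothetical counter readings for one core.
used_cycles  = 7.0e8   # PMC0: UOPS_EXECUTED_USED_CYCLES
stall_cycles = 3.0e8   # PMC1: UOPS_EXECUTED_STALL_CYCLES
total_cycles = 1.0e9   # PMC2: CPU_CLOCK_UNHALTED_TOTAL_CYCLES
stall_phases = 1.5e6   # PMC3:EDGEDETECT counts transitions into stalling, i.e. distinct stall phases

used_ratio_pct   = 100.0 * used_cycles / total_cycles
unused_ratio_pct = 100.0 * stall_cycles / total_cycles
avg_stall_cycles = stall_cycles / stall_phases   # average length of one stall phase

print(used_ratio_pct, unused_ratio_pct, avg_stall_cycles)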
diff --git a/groups/haswellEP/UOPS_ISSUE.txt b/groups/haswellEP/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/groups/haswellEP/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_USED_CYCLES
+PMC1 UOPS_ISSUED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/haswellEP/UOPS_RETIRE.txt b/groups/haswellEP/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/groups/haswellEP/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_RETIRED_USED_CYCLES
+PMC1 UOPS_RETIRED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/groups/interlagos/BRANCH.txt b/groups/interlagos/BRANCH.txt
index 1ae9f36..7495b74 100644
--- a/groups/interlagos/BRANCH.txt
+++ b/groups/interlagos/BRANCH.txt
@@ -4,29 +4,23 @@ EVENTSET
PMC0 RETIRED_INSTRUCTIONS
PMC1 RETIRED_BRANCH_INSTR
PMC2 RETIRED_MISPREDICTED_BRANCH_INSTR
-PMC3 RETIRED_TAKEN_BRANCH_INSTR
METRICS
Runtime (RDTSC) [s] time
Branch rate PMC1/PMC0
Branch misprediction rate PMC2/PMC0
Branch misprediction ratio PMC2/PMC1
-Branch taken rate PMC3/PMC0
-Branch taken ratio PMC3/PMC1
Instructions per branch PMC0/PMC1
LONG
Formulas:
-Branch rate = RETIRED_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch misprediction rate = RETIRED_MISPREDICTED_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch misprediction ratio = RETIRED_MISPREDICTED_BRANCH_INSTR / RETIRED_BRANCH_INSTR
-Branch taken rate = RETIRED_TAKEN_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch taken ratio = RETIRED_TAKEN_BRANCH_INSTR / RETIRED_BRANCH_INSTR
-Instructions per branch = RETIRED_INSTRUCTIONS / RETIRED_BRANCH_INSTR
+Branch rate = RETIRED_BRANCH_INSTR/RETIRED_INSTRUCTIONS
+Branch misprediction rate = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_INSTRUCTIONS
+Branch misprediction ratio = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_BRANCH_INSTR
+Instructions per branch = RETIRED_INSTRUCTIONS/RETIRED_BRANCH_INSTR
-
-The rates state how often in average a branch or a mispredicted branch occured
+The rates state how often on average a branch or a mispredicted branch occurred
per instruction retired in total. The branch misprediction ratio expresses directly
what fraction of all branch instructions were mispredicted.
-Instructions per branch is 1/Branch rate. The same applies for the branches
-taken metrics.
+Instructions per branch is 1/branch rate.
diff --git a/groups/interlagos/CACHE.txt b/groups/interlagos/CACHE.txt
index 23343a5..0d785fc 100644
--- a/groups/interlagos/CACHE.txt
+++ b/groups/interlagos/CACHE.txt
@@ -8,25 +8,25 @@ PMC3 DATA_CACHE_MISSES_ALL
METRICS
Runtime (RDTSC) [s] time
-Data cache misses PMC3
-Data cache request rate PMC1/PMC0
-Data cache miss rate (PMC2)/PMC0
-Data cache miss ratio (PMC2)/PMC1
+data cache misses PMC3
+data cache request rate PMC1/PMC0
+data cache miss rate (PMC2)/PMC0
+data cache miss ratio (PMC2)/PMC1
LONG
Formulas:
-Data cache misses = DATA_CACHE_MISSES_ALL
-Data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS
-Data cache miss rate = (DATA_CACHE_REFILLS_VALID) / RETIRED_INSTRUCTIONS
-Data cache miss ratio = (DATA_CACHE_REFILLS_VALID)/DATA_CACHE_ACCESSES
+data cache misses = DATA_CACHE_MISSES_ALL
+data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS
+data cache miss rate = (DATA_CACHE_REFILLS_VALID) / RETIRED_INSTRUCTIONS
+data cache miss ratio = (DATA_CACHE_REFILLS_VALID)/DATA_CACHE_ACCESSES
-
This group measures the locality of your data accesses with regard to the
-L1 Cache. Data cache request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The Data cache miss rate gives a measure how often it was necessary to get
-cachelines from higher levels of the memory hierarchy. And finally
-Data cache miss ratio tells you how many of your memory references required
-a cacheline to be loaded from a higher level. While the Data cache miss rate
-might be given by your algorithm you should try to get Data cache miss ratio
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure of how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally, the
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm, you should try to get the data cache miss ratio
as low as possible by increasing your cache reuse.
diff --git a/groups/interlagos/CPI.txt b/groups/interlagos/CPI.txt
index 47711f4..c0746e7 100644
--- a/groups/interlagos/CPI.txt
+++ b/groups/interlagos/CPI.txt
@@ -13,6 +13,11 @@ CPI (based on uops) PMC1/PMC2
IPC PMC0/PMC1
LONG
+Formulas:
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
+IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
+-
This group measures how efficiently the processor works with
regard to instruction throughput. Also important as a standalone
metric is RETIRED_INSTRUCTIONS as it tells you how many instruction
diff --git a/groups/interlagos/DATA.txt b/groups/interlagos/DATA.txt
index 78e4c3c..75f1f60 100644
--- a/groups/interlagos/DATA.txt
+++ b/groups/interlagos/DATA.txt
@@ -6,11 +6,11 @@ PMC1 LS_DISPATCH_STORES
METRICS
Runtime (RDTSC) [s] time
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
LONG
Formulas:
-Load to Store ratio = LS_DISPATCH_LOADS / LS_DISPATCH_STORES
+Load to store ratio = LS_DISPATCH_LOADS/LS_DISPATCH_STORES
-
This is a simple metric to determine your load to store ratio.
diff --git a/groups/interlagos/FLOPS_DP.txt b/groups/interlagos/FLOPS_DP.txt
index d7f5f57..27e58c3 100644
--- a/groups/interlagos/FLOPS_DP.txt
+++ b/groups/interlagos/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
EVENTSET
PMC0 RETIRED_INSTRUCTIONS
@@ -9,15 +9,15 @@ PMC3 RETIRED_FLOPS_DOUBLE_ALL
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] PMC1*inverseClock
-MFlops/s 1.0E-06*(PMC3)/time
+DP MFLOP/s 1.0E-06*(PMC3)/time
CPI PMC1/PMC0
CPI (based on uops) PMC1/PMC2
IPC PMC0/PMC1
LONG
Formulas:
-DP MFlops/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
+DP MFLOP/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
-
-Profiling group to measure double precisision flop rate.
+Profiling group to measure double precision FLOP rate.
diff --git a/groups/interlagos/FLOPS_SP.txt b/groups/interlagos/FLOPS_SP.txt
index 1c4dcc3..7db569f 100644
--- a/groups/interlagos/FLOPS_SP.txt
+++ b/groups/interlagos/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
EVENTSET
PMC0 RETIRED_INSTRUCTIONS
@@ -9,15 +9,15 @@ PMC3 RETIRED_FLOPS_SINGLE_ALL
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] PMC1*inverseClock
-MFlops/s 1.0E-06*(PMC3)/time
+SP MFLOP/s 1.0E-06*(PMC3)/time
CPI PMC1/PMC0
CPI (based on uops) PMC1/PMC2
IPC PMC0/PMC1
LONG
Formulas:
-SP MFlops/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
+SP MFLOP/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
-
-Profiling group to measure single precision flop rate.
+Profiling group to measure single precision FLOP rate.
diff --git a/groups/interlagos/FPU_EXCEPTION.txt b/groups/interlagos/FPU_EXCEPTION.txt
index 5c586e4..0969ae1 100644
--- a/groups/interlagos/FPU_EXCEPTION.txt
+++ b/groups/interlagos/FPU_EXCEPTION.txt
@@ -15,7 +15,7 @@ Formulas:
Overall FP exception rate = FPU_EXCEPTIONS_ALL / INSTRUCTIONS_RETIRED
FP exception rate = FPU_EXCEPTIONS_ALL / FP_INSTRUCTIONS_RETIRED_ALL
-
-Floating point exceptions occur e.g. on the treatment of Denormals.
+Floating point exceptions occur e.g. on the treatment of denormal numbers.
There might be a large penalty if there are too many floating point
exceptions.
diff --git a/groups/interlagos/ICACHE.txt b/groups/interlagos/ICACHE.txt
index be5e5f5..62b91d6 100644
--- a/groups/interlagos/ICACHE.txt
+++ b/groups/interlagos/ICACHE.txt
@@ -8,18 +8,16 @@ PMC3 RETIRED_INSTRUCTIONS
METRICS
Runtime (RDTSC) [s] time
-Instruction cache misses PMC1+PMC2
-Instruction cache request rate PMC0/PMC3
-Instruction cache miss rate (PMC1+PMC2)/PMC3
-Instruction cache miss ratio (PMC1+PMC2)/PMC0
+L1I request rate PMC0/PMC3
+L1I miss rate (PMC1+PMC2)/PMC3
+L1I miss ratio (PMC1+PMC2)/PMC0
LONG
Formulas:
-Instruction cache misses INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS
-Instruction cache request rate INSTRUCTION_CACHE_FETCHES / RETIRED_INSTRUCTIONS
-Instruction cache miss rate (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS
-Instruction cache miss ratio (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/INSTRUCTION_CACHE_FETCHES
+L1I request rate = INSTRUCTION_CACHE_FETCHES / RETIRED_INSTRUCTIONS
+L1I miss rate = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS
+L1I miss ratio = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/INSTRUCTION_CACHE_FETCHES
-
This group measures the locality of your instruction code with regard to the
-L1 I-Cache.
+L1 I-Cache.
diff --git a/groups/interlagos/L2.txt b/groups/interlagos/L2.txt
index a1f5714..5bf1843 100644
--- a/groups/interlagos/L2.txt
+++ b/groups/interlagos/L2.txt
@@ -16,14 +16,14 @@ LONG
Formulas:
L2 bandwidth [MBytes/s] 1.0E-06*(DATA_CACHE_REFILLS_ALL-DATA_CACHE_REFILLS_SYSTEM)*64/time
L2 data volume [GBytes] 1.0E-09*(DATA_CACHE_REFILLS_ALL-DATA_CACHE_REFILLS_SYSTEM)*64
-Cache refill bandwidth System/L2 [MBytes/s] 1.0E-06*DATA_CACHE_REFILLS_ALL*64/time
-Cache refill bandwidth System [MBytes/s] 1.0E-06*DATA_CACHE_REFILLS_SYSTEM*64/time
+Cache refill bandwidth system/L2 [MBytes/s] 1.0E-06*DATA_CACHE_REFILLS_ALL*64/time
+Cache refill bandwidth system [MBytes/s] 1.0E-06*DATA_CACHE_REFILLS_SYSTEM*64/time
-
Profiling group to measure L2 cache bandwidth. The bandwidth is
-computed by the number of cacheline loaded from L2 to L1 and the
-number of modified cachelines evicted from the L1.
-Note that this bandwidth also includes data transfers due to a
+computed by the number of cache lines loaded from L2 to L1 and the
+number of modified cache lines evicted from the L1.
+Note that this bandwidth also includes data transfers due to a
write allocate load on a store miss in L1 and copy back transfers if
-originated from L2. L2-L1 data volume is the total data volume transfered
+originated from L2. The L2-L1 data volume is the total data volume transferred
between L2 and L1.
diff --git a/groups/interlagos/L2CACHE.txt b/groups/interlagos/L2CACHE.txt
index 17209e8..49b9555 100644
--- a/groups/interlagos/L2CACHE.txt
+++ b/groups/interlagos/L2CACHE.txt
@@ -7,23 +7,23 @@ PMC2 L2_CACHE_MISS_DC_FILL
METRICS
Runtime (RDTSC) [s] time
-L2 request rate (PMC1)/PMC0
+L2 request rate PMC1/PMC0
L2 miss rate PMC2/PMC0
-L2 miss ratio PMC2/(PMC1)
+L2 miss ratio PMC2/PMC1
LONG
Formulas:
-L2 request rate = (L2_REQUESTS_ALL)/INSTRUCTIONS_RETIRED
+L2 request rate = L2_REQUESTS_ALL/INSTRUCTIONS_RETIRED
L2 miss rate = L2_MISSES_ALL/INSTRUCTIONS_RETIRED
-L2 miss ratio = L2_MISSES_ALL/(L2_REQUESTS_ALL)
+L2 miss ratio = L2_MISSES_ALL/L2_REQUESTS_ALL
-
This group measures the locality of your data accesses with regard to the L2
Cache. L2 request rate tells you how data intensive your code is or how many
-Data accesses you have in average per instruction. The L2 miss rate gives a
-measure how often it was necessary to get cachelines from memory. And finally
-L2 miss ratio tells you how many of your memory references required a cacheline
-to be loaded from a higher level. While the Data cache miss rate might be
-given by your algorithm you should try to get Data cache miss ratio as low as
+data accesses you have on average per instruction. The L2 miss rate gives a
+measure of how often it was necessary to get cache lines from memory. And finally, the
+L2 miss ratio tells you how many of your memory references required a cache line
+to be loaded from a higher level. While the data cache miss rate might be
+given by your algorithm, you should try to get the data cache miss ratio as low as
possible by increasing your cache reuse. This group is inspired from the
whitepaper -Basic Performance Measurements for AMD Athlon 64, AMD Opteron and
AMD Phenom Processors- from Paul J. Drongowski.
diff --git a/groups/interlagos/L3.txt b/groups/interlagos/L3.txt
index c1a6f17..5c9ea4d 100644
--- a/groups/interlagos/L3.txt
+++ b/groups/interlagos/L3.txt
@@ -7,18 +7,23 @@ PMC2 CPU_CLOCKS_UNHALTED
METRICS
Runtime (RDTSC) [s] time
-L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
-L3 refill bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-L3 evict [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
LONG
Formulas:
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64
-L3 refill bandwidth [MBytes/s] 1.0E-06*L2_FILL_WB_FILL*64/time
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_FILL_WB_FILL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_FILL_WB_FILL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_FILL_WB_WB*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_FILL_WB_WB*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64
-
Profiling group to measure L3 cache bandwidth. The bandwidth is
-computed by the number of cacheline loaded from L3 to L2 and the
-number of modified cachelines evicted from the L2.
+computed by the number of cache lines loaded from L3 to L2 and the
+number of modified cache lines evicted from the L2.
diff --git a/groups/interlagos/L3CACHE.txt b/groups/interlagos/L3CACHE.txt
index 4bef1a7..5a442c6 100644
--- a/groups/interlagos/L3CACHE.txt
+++ b/groups/interlagos/L3CACHE.txt
@@ -16,20 +16,20 @@ L3 average access latency [cycles] UPMC2/UPMC3
LONG
Formulas:
-L3 request rate = (UNC_READ_REQ_TO_L3_ALL)/INSTRUCTIONS_RETIRED
-L3 miss rate = UNC_L3_CACHE_MISS_ALL/INSTRUCTIONS_RETIRED
+L3 request rate = UNC_READ_REQ_TO_L3_ALL/INSTRUCTIONS_RETIRED
+L3 miss rate = UNC_L3_CACHE_MISS_ALL/INSTRUCTIONS_RETIRED
L3 miss ratio = UNC_L3_CACHE_MISS_ALL/UNC_READ_REQ_TO_L3_ALL
L3 average access latency = UNC_L3_LATENCY_CYCLE_COUNT/UNC_L3_LATENCY_REQUEST_COUNT
-
This group measures the locality of your data accesses with regard to the L3
Cache. L3 request rate tells you how data intensive your code is or how many
-Data accesses you have in average per instruction. The L3 miss rate gives a
-measure how often it was necessary to get cachelines from memory. And finally
-L3 miss ratio tells you how many of your memory references required a cacheline
-to be loaded from a higher level. While the Data cache miss rate might be
-given by your algorithm you should try to get Data cache miss ratio as low as
+data accesses you have on average per instruction. The L3 miss rate gives a
+measure of how often it was necessary to get cache lines from memory. And finally, the
+L3 miss ratio tells you how many of your memory references required a cache line
+to be loaded from a higher level. While the data cache miss rate might be
+given by your algorithm, you should try to get the data cache miss ratio as low as
possible by increasing your cache reuse. This group was inspired from the
-whitepaper -Basic Performance Measurements for AMD Athlon 64, AMD Opteron and
-AMD Phenom Processors- from Paul J. Drongowski.
+whitepaper - Basic Performance Measurements for AMD Athlon 64, AMD Opteron and
+AMD Phenom Processors - from Paul J. Drongowski.
diff --git a/groups/interlagos/LINKS.txt b/groups/interlagos/LINKS.txt
index 649f0d1..4b8ac22 100644
--- a/groups/interlagos/LINKS.txt
+++ b/groups/interlagos/LINKS.txt
@@ -20,7 +20,7 @@ Link bandwidth L1 [MBytes/s] 1.0E-06*UNC_LINK_TRANSMIT_BW_L1_USE*4.0/time
Link bandwidth L2 [MBytes/s] 1.0E-06*UNC_LINK_TRANSMIT_BW_L2_USE*4.0/time
Link bandwidth L3 [MBytes/s] 1.0E-06*UNC_LINK_TRANSMIT_BW_L3_USE*4.0/time
-
-Profiling group to measure the Hypertransport link bandwidth for the four links
-of a local node. This indicates the data flow between different ccNUMA nodes.
+Profiling group to measure the HyperTransport link bandwidth for the four links
+of a local node. This indicates the data flow between different ccNUMA nodes.
diff --git a/groups/interlagos/MEM.txt b/groups/interlagos/MEM.txt
index 22aa19e..2fa9dfe 100644
--- a/groups/interlagos/MEM.txt
+++ b/groups/interlagos/MEM.txt
@@ -16,5 +16,5 @@ Memory data volume [GBytes] = 1.0E-09*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1
-
Profiling group to measure memory bandwidth drawn by all cores of a socket.
Note: As this group measures the accesses from all cores it only makes sense
-to measure with one core per socket, similiar as with the Intel Nehalem Uncore events.
+to measure with one core per socket, similar to the Intel Nehalem Uncore events.
diff --git a/groups/interlagos/NUMA.txt b/groups/interlagos/NUMA.txt
index d94e735..ed13dbe 100644
--- a/groups/interlagos/NUMA.txt
+++ b/groups/interlagos/NUMA.txt
@@ -20,8 +20,8 @@ DRAM read/write local to 1 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/ti
DRAM read/write local to 2 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
DRAM read/write local to 3 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
-
-Profiling group to measure the traffic from local CPU to the different
-DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded
+Profiling group to measure the traffic from local CPU to the different
+DRAM NUMA nodes. This group allows you to detect NUMA problems in a threaded
code. You must first determine on which memory domains your code is running.
A code should only have significant traffic to its own memory domain.
diff --git a/groups/interlagos/NUMA_0_3.txt b/groups/interlagos/NUMA_0_3.txt
new file mode 100644
index 0000000..ed13dbe
--- /dev/null
+++ b/groups/interlagos/NUMA_0_3.txt
@@ -0,0 +1,28 @@
+SHORT Read/Write Events between the ccNUMA nodes
+
+EVENTSET
+UPMC0 UNC_CPU_TO_DRAM_LOCAL_TO_0
+UPMC1 UNC_CPU_TO_DRAM_LOCAL_TO_1
+UPMC2 UNC_CPU_TO_DRAM_LOCAL_TO_2
+UPMC3 UNC_CPU_TO_DRAM_LOCAL_TO_3
+
+METRICS
+Runtime (RDTSC) [s] time
+DRAM read/write local to 0 [MegaEvents/s] 1.0E-06*UPMC0/time
+DRAM read/write local to 1 [MegaEvents/s] 1.0E-06*UPMC1/time
+DRAM read/write local to 2 [MegaEvents/s] 1.0E-06*UPMC2/time
+DRAM read/write local to 3 [MegaEvents/s] 1.0E-06*UPMC3/time
+
+LONG
+Formulas:
+DRAM read/write local to 0 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time
+DRAM read/write local to 1 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time
+DRAM read/write local to 2 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
+DRAM read/write local to 3 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
+-
+Profiling group to measure the traffic from local CPU to the different
+DRAM NUMA nodes. This group allows you to detect NUMA problems in a threaded
+code. You must first determine on which memory domains your code is running.
+A code should only have significant traffic to its own memory domain.
+
+
diff --git a/groups/interlagos/NUMA_4_7.txt b/groups/interlagos/NUMA_4_7.txt
new file mode 100644
index 0000000..ae16499
--- /dev/null
+++ b/groups/interlagos/NUMA_4_7.txt
@@ -0,0 +1,28 @@
+SHORT Read/Write Events between the ccNUMA nodes
+
+EVENTSET
+UPMC0 UNC_CPU_TO_DRAM_LOCAL_TO_4
+UPMC1 UNC_CPU_TO_DRAM_LOCAL_TO_5
+UPMC2 UNC_CPU_TO_DRAM_LOCAL_TO_6
+UPMC3 UNC_CPU_TO_DRAM_LOCAL_TO_7
+
+METRICS
+Runtime (RDTSC) [s] time
+DRAM read/write local to 4 [MegaEvents/s] 1.0E-06*UPMC0/time
+DRAM read/write local to 5 [MegaEvents/s] 1.0E-06*UPMC1/time
+DRAM read/write local to 6 [MegaEvents/s] 1.0E-06*UPMC2/time
+DRAM read/write local to 7 [MegaEvents/s] 1.0E-06*UPMC3/time
+
+LONG
+Formulas:
+DRAM read/write local to 4 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_4/time
+DRAM read/write local to 5 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_5/time
+DRAM read/write local to 6 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_6/time
+DRAM read/write local to 7 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_7/time
+-
+Profiling group to measure the traffic from local CPU to the different
+DRAM NUMA nodes. This group allows you to detect NUMA problems in a threaded
+code. You must first determine on which memory domains your code is running.
+A code should only have significant traffic to its own memory domain.
+
+
diff --git a/groups/ivybridge/BRANCH.txt b/groups/ivybridge/BRANCH.txt
index cbaf834..b8d41b2 100644
--- a/groups/ivybridge/BRANCH.txt
+++ b/groups/ivybridge/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch FIXC0/PMC0
LONG
Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
-
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio expresses directly
what fraction of all branch instructions were mispredicted.
-Instructions per branch is 1/Branch rate.
+Instructions per branch is 1/branch rate.
diff --git a/groups/ivybridge/CLOCK.txt b/groups/ivybridge/CLOCK.txt
index 80891d4..278821e 100644
--- a/groups/ivybridge/CLOCK.txt
+++ b/groups/ivybridge/CLOCK.txt
@@ -7,7 +7,7 @@ FIXC2 CPU_CLK_UNHALTED_REF
PWR0 PWR_PKG_ENERGY
METRICS
-Runtime (RDTSC) [s] time
+Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
diff --git a/groups/ivybridge/DATA.txt b/groups/ivybridge/DATA.txt
index 5f04a23..967cbad 100644
--- a/groups/ivybridge/DATA.txt
+++ b/groups/ivybridge/DATA.txt
@@ -4,19 +4,19 @@ EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
-PMC0 MEM_UOP_RETIRED_LOADS
-PMC1 MEM_UOP_RETIRED_STORES
+PMC0 MEM_UOPS_RETIRED_LOADS
+PMC1 MEM_UOPS_RETIRED_STORES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
LONG
Formulas:
-Load to Store ratio = MEM_UOP_RETIRED_LOADS / MEM_UOP_RETIRED_STORES
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
-
This is a metric to determine your load to store ratio.
diff --git a/groups/ivybridge/ENERGY.txt b/groups/ivybridge/ENERGY.txt
index 3f70077..541c3ad 100644
--- a/groups/ivybridge/ENERGY.txt
+++ b/groups/ivybridge/ENERGY.txt
@@ -7,6 +7,7 @@ FIXC2 CPU_CLK_UNHALTED_REF
TMP0 TEMP_CORE
PWR0 PWR_PKG_ENERGY
PWR1 PWR_PP0_ENERGY
+PWR2 PWR_PP1_ENERGY
PWR3 PWR_DRAM_ENERGY
METRICS
@@ -19,15 +20,18 @@ Energy [J] PWR0
Power [W] PWR0/time
Energy PP0 [J] PWR1
Power PP0 [W] PWR1/time
+Energy PP1 [J] PWR2
+Power PP1 [W] PWR2/time
Energy DRAM [J] PWR3
Power DRAM [W] PWR3/time
LONG
Formula:
-Power = PWR_PKG_ENERGY / time
-Power PP0 [W] PWR1/time
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
Power DRAM = PWR_DRAM_ENERGY / time
-
IvyBridge implements the new RAPL interface. This interface makes it possible to
-monitor the consumed energy on the package (socket) and DRAM level.
-
+monitor the consumed energy on the package (socket), the PP0 domain,
+and the DRAM level. The PP0 domain usually covers only the CPU cores.
diff --git a/groups/ivybridge/FALSE_SHARE.txt b/groups/ivybridge/FALSE_SHARE.txt
new file mode 100644
index 0000000..a87f7d4
--- /dev/null
+++ b/groups/ivybridge/FALSE_SHARE.txt
@@ -0,0 +1,25 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM
+PMC2 MEM_LOAD_UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory loads as reference.
diff --git a/groups/ivybridge/FLOPS_AVX.txt b/groups/ivybridge/FLOPS_AVX.txt
index e8074c1..ea459f4 100644
--- a/groups/ivybridge/FLOPS_AVX.txt
+++ b/groups/ivybridge/FLOPS_AVX.txt
@@ -1,4 +1,4 @@
-SHORT Packed AVX MFlops/s
+SHORT Packed AVX MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -12,14 +12,14 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-32b packed SP MFlops/s 1.0E-06*(PMC0*8.0)/time
-32b packed DP MFlops/s 1.0E-06*(PMC1*4.0)/time
+Packed SP MFLOP/s 1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s 1.0E-06*(PMC1*4.0)/time
LONG
Formula:
-SP MFlops/s = (SIMD_FP_256_PACKED_SINGLE*8)/ runtime
-DP MFlops/s = (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
+Packed SP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-
-AVX flops rates. Please note that the current flop measurements on IvyBridge are
+Packed 32b AVX FLOP rates. Please note that the current FLOP measurements on IvyBridge are
potentially wrong. So you cannot trust these counters at the moment!
diff --git a/groups/ivybridge/FLOPS_DP.txt b/groups/ivybridge/FLOPS_DP.txt
index 1e47b50..b5e8273 100644
--- a/groups/ivybridge/FLOPS_DP.txt
+++ b/groups/ivybridge/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -13,16 +13,19 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-MFlops/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX MFlops/s 1.0E-06*(PMC2*4.0)/time
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*4.0)/time
Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
Scalar MUOPS/s 1.0E-06*PMC1/time
LONG
Formula:
-MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s = (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
-
-SSE scalar and packed double precision flop rates. Please note that the current flop measurements on SandyBridge are
-potentially wrong. So you cannot trust these counters at the moment!
+SSE scalar and packed double precision FLOP rates. Please note that the current
+FLOP measurements on IvyBridge are potentially wrong.
+So you cannot trust these counters at the moment!
diff --git a/groups/ivybridge/FLOPS_SP.txt b/groups/ivybridge/FLOPS_SP.txt
index 0be0721..819b81c 100644
--- a/groups/ivybridge/FLOPS_SP.txt
+++ b/groups/ivybridge/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -13,17 +13,19 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-MFlops/s 1.0E-06*(PMC0*4.0+PMC1)/time
-32b AVX MFlops/s 1.0E-06*(PMC2*8.0)/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*8.0)/time
Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
Scalar MUOPS/s 1.0E-06*PMC1/time
LONG
Formula:
-MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s = (FP_256_PACKED_SINGLE*8)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
-
-SSE scalar and packed single precision flop rates. Please note that the current
-flop measurements on IvyBridge are potentially wrong. So you cannot trust
-these counters at the moment!
+SSE scalar and packed single precision FLOP rates. Please note that the current
+FLOP measurements on IvyBridge are potentially wrong.
+So you cannot trust these counters at the moment!
diff --git a/groups/ivybridge/ICACHE.txt b/groups/ivybridge/ICACHE.txt
index 6ce3ce8..f1e2335 100644
--- a/groups/ivybridge/ICACHE.txt
+++ b/groups/ivybridge/ICACHE.txt
@@ -6,6 +6,8 @@ FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 ICACHE_ACCESSES
PMC1 ICACHE_MISSES
+PMC2 ICACHE_IFETCH_STALL
+PMC3 ILD_STALL_IQ_FULL
METRICS
Runtime (RDTSC) [s] time
@@ -15,11 +17,17 @@ CPI FIXC1/FIXC0
L1I request rate PMC0/FIXC0
L1I miss rate PMC1/FIXC0
L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+L1I queue full stalls PMC3
+L1I queue full stall rate PMC3/FIXC0
LONG
Formulas:
-L2 request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
-L2 miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
-L2 miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I stalls = ICACHE_IFETCH_STALL
+L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY
-
This group measures some L1 instruction cache metrics.
diff --git a/groups/ivybridge/L2.txt b/groups/ivybridge/L2.txt
index 5345b7a..376e974 100644
--- a/groups/ivybridge/L2.txt
+++ b/groups/ivybridge/L2.txt
@@ -6,27 +6,33 @@ FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 L1D_REPLACEMENT
PMC1 L1D_M_EVICT
+PMC2 ICACHE_MISSES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
LONG
Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64
-
Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L1 and the number of modified cachelines
-evicted from the L1. The group also output total data volume transfered between
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. The group also outputs total data volume transferred between
L2 and L1. Note that this bandwidth also includes data transfers due to a write
-allocate load on a store miss in L1.
+allocate load on a store miss in L1 and cache lines transferred into the instruction
+cache.
diff --git a/groups/ivybridge/L2CACHE.txt b/groups/ivybridge/L2CACHE.txt
index 3d7c36e..9b5dd4b 100644
--- a/groups/ivybridge/L2CACHE.txt
+++ b/groups/ivybridge/L2CACHE.txt
@@ -18,18 +18,17 @@ L2 miss ratio PMC1/PMC0
LONG
Formulas:
-L2 request rate = L2_TRANS_ALL_REQUESTS / INSTR_RETIRED_ANY
-L2 miss rate = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_TRANS_ALL_REQUESTS
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
-
This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
diff --git a/groups/ivybridge/L3.txt b/groups/ivybridge/L3.txt
index 9a7c914..f0a8aad 100644
--- a/groups/ivybridge/L3.txt
+++ b/groups/ivybridge/L3.txt
@@ -12,21 +12,25 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L3 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
LONG
Formulas:
-L3 Load [MBytes/s] 1.0E-06*L2_LINES_IN_ALL*64/time
-L3 Evict [MBytes/s] 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ALL*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
-
Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L2 and the number of modified cachelines
-evicted from the L2. This group also outputs data volume transfered between the
-L3 and measured cores L2 caches. Note that this bandwidth also includes data
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs the data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
transfers due to a write allocate load on a store miss in L2.
diff --git a/groups/ivybridge/L3CACHE.txt b/groups/ivybridge/L3CACHE.txt
index d4fd89e..9f3036f 100644
--- a/groups/ivybridge/L3CACHE.txt
+++ b/groups/ivybridge/L3CACHE.txt
@@ -6,30 +6,31 @@ FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL
PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2 UOPS_RETIRED_ALL
+
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L3 request rate (PMC0)/FIXC0
-L3 miss rate PMC1/FIXC0
+L3 request rate (PMC0)/PMC2
+L3 miss rate PMC1/PMC2
L3 miss ratio PMC1/PMC0
LONG
Formulas:
-L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL / INSTR_RETIRED_ANY
-L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS / INSTR_RETIRED_ANY
-L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS / MEM_LOAD_UOPS_RETIRED_L3_ALL
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
-
This group measures the locality of your data accesses with regard to the
-L3 Cache. L3 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
The L3 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L3 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
diff --git a/groups/ivybridge/MEM.txt b/groups/ivybridge/MEM.txt
deleted file mode 100644
index 1f9ff4a..0000000
--- a/groups/ivybridge/MEM.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-SHORT Main memory bandwidth in MBytes/s
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC1/FIXC0
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Profiling group to measure main memory bandwidth drawn by all cores of
-a socket. Since this group is based on uncore events it is only possible to
-measure on the granularity of a socket. If a thread group contains multiple
-threads only one thread per socket will show the results. Also outputs total
-data volume transfered from main memory.
-
diff --git a/groups/ivybridge/MEM_DP.txt b/groups/ivybridge/MEM_DP.txt
deleted file mode 100644
index 7bc76cd..0000000
--- a/groups/ivybridge/MEM_DP.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-SHORT Power and Energy consumption
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-TMP0 TEMP_CORE
-PWR0 PWR_PKG_ENERGY
-PWR3 PWR_DRAM_ENERGY
-PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
-PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
-PMC2 SIMD_FP_256_PACKED_DOUBLE
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC1/FIXC0
-Temperature TMP0
-Energy [J] PWR0
-Power [W] PWR0/time
-Energy DRAM [J] PWR3
-Power DRAM [W] PWR3/time
-AVX MFlops/s 1.0E-06*(4.0*PMC2)/time
-MFlops/s 1.0E-06*(PMC0*2.0+PMC1)/time
-Packed MUOPS/s 1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Formula:
-Power = PWR_PKG_ENERGY / runtime
-Power DRAM = PWR_DRAM_ENERGY / runtime
-MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s = (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
-Memory Read BW [MBytes/s] 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
---
-Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Since this group is based on uncore events it is only possible to measure on
-a per socket base. Also outputs total data volume transfered from main memory,
-SSE scalar and packed double precision flop rates as well as consumed energy and
-temperature. Also reports on packed AVX 32b instructions. Please note that the
-current flop measurements on IvyBridge are potentially wrong. So you cannot trust
-these counters at the moment!
-
diff --git a/groups/ivybridge/MEM_SP.txt b/groups/ivybridge/MEM_SP.txt
deleted file mode 100644
index 4388cc4..0000000
--- a/groups/ivybridge/MEM_SP.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-SHORT Power and Energy consumption
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-TMP0 TEMP_CORE
-PWR0 PWR_PKG_ENERGY
-PWR3 PWR_DRAM_ENERGY
-PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
-PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
-PMC2 SIMD_FP_256_PACKED_SINGLE
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC1/FIXC0
-Temperature TMP0
-Energy [J] PWR0
-Power [W] PWR0/time
-Energy DRAM [J] PWR3
-Power DRAM [W] PWR3/time
-AVX MFlops/s 1.0E-06*(8.0*PMC2)/time
-MFlops/s 1.0E-06*(PMC0*4.0+PMC1)/time
-Packed MUOPS/s 1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Formula:
-Power = PWR_PKG_ENERGY / runtime
-Power DRAM = PWR_DRAM_ENERGY / runtime
-MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE * 4 + FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE) / runtime
-AVX MFlops/s = (SIMD_FP_256_PACKED_SINGLE * 8) / runtime
-Memory Read BW [MBytes/s] 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
---
-Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Since this group is based on uncore events it is only possible to measure on
-a per socket base. Also outputs total data volume transfered from main memory.
-SSE scalar and packed single precision flop rates as well as consumed energy and
-temperature. Also reports on packed AVX 32b instructions. Please note that the
-current flop measurements on SandyBridge are potentially wrong. So you cannot
-trust these counters at the moment!
-
diff --git a/groups/ivybridge/RECOVERY.txt b/groups/ivybridge/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/ivybridge/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 INT_MISC_RECOVERY_CYCLES
+PMC1 INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exceptions, memory
+disambiguation, etc.
diff --git a/groups/ivybridge/TLB_DATA.txt b/groups/ivybridge/TLB_DATA.txt
index 2f59772..8d94e05 100644
--- a/groups/ivybridge/TLB_DATA.txt
+++ b/groups/ivybridge/TLB_DATA.txt
@@ -1,4 +1,4 @@
-SHORT L1 Data TLB miss rate/ratio
+SHORT L2 data TLB miss rate/ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -16,20 +16,20 @@ Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
L1 DTLB load misses PMC0
L1 DTLB load miss rate PMC0/FIXC0
-L1 DTLB load miss duration PMC2
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
L1 DTLB store misses PMC1
L1 DTLB store miss rate PMC1/FIXC0
-L1 DTLB store miss duration PMC3
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
LONG
Formulas:
-L1 DTLB load misses DTLB_LOAD_MISSES_CAUSES_A_WALK
-L1 DTLB load miss rate DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB load miss duration DTLB_LOAD_MISSES_WALK_DURATION
-L1 DTLB store misses DTLB_STORE_MISSES_CAUSES_A_WALK
-L1 DTLB store miss rate DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB store miss duration DTLB_STORE_MISSES_WALK_DURATION
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
-
-The DTLB load and store miss rates gives a measure how often a TLB miss occured
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
per instruction. The duration measures the time in cycles how long a walk did take.
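To make the changed duration metric concrete, a small Python sketch (hypothetical counts): the average walk duration is now the total walk cycles divided by the number of walks, rather than the raw duration counter.

    DTLB_LOAD_MISSES_CAUSES_A_WALK = 1_200_000     # page table walks caused by loads
    DTLB_LOAD_MISSES_WALK_DURATION = 36_000_000    # cycles spent in those walks
    INSTR_RETIRED_ANY              = 4_000_000_000

    load_miss_rate    = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
    avg_walk_duration = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK  # cycles per walk
    print(f"L1 DTLB load miss rate: {load_miss_rate:.2e}")
    print(f"L1 DTLB load miss duration [Cyc]: {avg_walk_duration:.1f}")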
diff --git a/groups/ivybridge/TLB_INSTR.txt b/groups/ivybridge/TLB_INSTR.txt
index f95f78a..235d977 100644
--- a/groups/ivybridge/TLB_INSTR.txt
+++ b/groups/ivybridge/TLB_INSTR.txt
@@ -14,15 +14,15 @@ Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
L1 ITLB misses PMC0
L1 ITLB miss rate PMC0/FIXC0
-L1 ITLB miss duration PMC1
+L1 ITLB miss duration [Cyc] PMC1/PMC0
LONG
Formulas:
-L1 ITLB misses ITLB_MISSES_CAUSES_A_WALK
-L1 ITLB miss rate ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 ITLB miss duration ITLB_MISSES_WALK_DURATION
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
-
-The ITLB miss rates gives a measure how often a TLB miss occured
+The ITLB miss rate gives a measure of how often a TLB miss occurred
per instruction. The duration measures the time in cycles how long a walk did take.
diff --git a/groups/ivybridge/UOPS.txt b/groups/ivybridge/UOPS.txt
new file mode 100644
index 0000000..178aec5
--- /dev/null
+++ b/groups/ivybridge/UOPS.txt
@@ -0,0 +1,35 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_ANY
+PMC1 UOPS_EXECUTED_THREAD
+PMC2 UOPS_RETIRED_ALL
+PMC3 UOPS_ISSUED_FLAGS_MERGE
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Issued UOPs PMC0
+Merged UOPs PMC3
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group gives information about the instruction pipeline. It measures the
+issued, executed and retired uOPs. From these counts, the number of uOPs which were
+issued but not executed as well as the number of uOPs which were executed but never
+retired can be derived. The executed but not retired uOPs commonly come from
+speculatively executed branches.
+
diff --git a/groups/ivybridge/UOPS_EXEC.txt b/groups/ivybridge/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/groups/ivybridge/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_EXECUTED_USED_CYCLES
+PMC1 UOPS_EXECUTED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
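A short Python sketch (hypothetical counts) of how the EDGEDETECT counter enters the average stall duration: the edge-detected counter increments once per stall phase, so dividing the total stall cycles by it gives cycles per stall phase.

    UOPS_EXECUTED_STALL_CYCLES            = 800_000_000    # cycles without an executed uOP
    UOPS_EXECUTED_STALL_CYCLES_EDGEDETECT =  20_000_000    # number of distinct stall phases
    CPU_CLOCK_UNHALTED_TOTAL_CYCLES       = 3_000_000_000

    unused_ratio       = 100.0 * UOPS_EXECUTED_STALL_CYCLES / CPU_CLOCK_UNHALTED_TOTAL_CYCLES
    avg_stall_duration = UOPS_EXECUTED_STALL_CYCLES / UOPS_EXECUTED_STALL_CYCLES_EDGEDETECT
    print(f"Unused cycles ratio [%]: {unused_ratio:.1f}")
    print(f"Avg stall duration [cycles]: {avg_stall_duration:.1f}")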
diff --git a/groups/ivybridge/UOPS_ISSUE.txt b/groups/ivybridge/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/groups/ivybridge/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_USED_CYCLES
+PMC1 UOPS_ISSUED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/ivybridge/UOPS_RETIRE.txt b/groups/ivybridge/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/groups/ivybridge/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_RETIRED_USED_CYCLES
+PMC1 UOPS_RETIRED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/groups/ivybridgeEP/BRANCH.txt b/groups/ivybridgeEP/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/groups/ivybridgeEP/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often, on average, a branch or a mispredicted branch occurred
+per retired instruction. The branch misprediction ratio directly relates the number
+of mispredicted branches to all retired branch instructions.
+Instructions per branch is 1/branch rate.
+
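A minimal Python sketch (hypothetical counts) of the branch metrics defined above:

    INSTR_RETIRED_ANY            = 5_000_000_000
    BR_INST_RETIRED_ALL_BRANCHES =   900_000_000
    BR_MISP_RETIRED_ALL_BRANCHES =    10_000_000

    branch_rate             = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
    misprediction_rate      = BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
    misprediction_ratio     = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
    instructions_per_branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES  # = 1/branch rate
    print(branch_rate, misprediction_rate, misprediction_ratio, instructions_per_branch)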
diff --git a/groups/ivybridgeEP/CACHES.txt b/groups/ivybridgeEP/CACHES.txt
new file mode 100644
index 0000000..ad63925
--- /dev/null
+++ b/groups/ivybridgeEP/CACHES.txt
@@ -0,0 +1,121 @@
+SHORT Cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L1D_M_EVICT
+PMC2 L2_LINES_IN_ALL
+PMC3 L2_LINES_OUT_DIRTY_ALL
+CBOX0C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX1C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX2C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX3C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX4C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX5C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX6C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX7C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX8C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX9C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX10C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX11C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX12C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX13C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX14C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX0C1 LLC_VICTIMS_M_STATE
+CBOX1C1 LLC_VICTIMS_M_STATE
+CBOX2C1 LLC_VICTIMS_M_STATE
+CBOX3C1 LLC_VICTIMS_M_STATE
+CBOX4C1 LLC_VICTIMS_M_STATE
+CBOX5C1 LLC_VICTIMS_M_STATE
+CBOX6C1 LLC_VICTIMS_M_STATE
+CBOX7C1 LLC_VICTIMS_M_STATE
+CBOX8C1 LLC_VICTIMS_M_STATE
+CBOX9C1 LLC_VICTIMS_M_STATE
+CBOX10C1 LLC_VICTIMS_M_STATE
+CBOX11C1 LLC_VICTIMS_M_STATE
+CBOX12C1 LLC_VICTIMS_M_STATE
+CBOX13C1 LLC_VICTIMS_M_STATE
+CBOX14C1 LLC_VICTIMS_M_STATE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time
+L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0
+L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
+L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0
+L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX8C0:STATE=0x3F+CBOX9C0:STATE=0x3F+CBOX10C0:STATE=0x3F+CBOX11C0:STATE=0x3F+CBOX12C0:STATE=0x3F+CBOX13C0:STATE=0x3F+CBOX14C0:STATE=0x3F)*64.0/time
+System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX8C0:STATE=0x3F+CBOX9C0:STATE=0x3F+CBOX10C0:STATE=0x3F+CBOX11C0:STATE=0x3F+CBOX12C0:STATE=0x3F+CBOX13C0:STATE=0x3F+CBOX14C0:STATE=0x3F)*64.0
+L3 to memory bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1)*64/time
+L3 to memory data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1)*64
+L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX8C0:STATE=0x3F+CBOX9C0:STATE=0x3F+CBOX10C0:STATE=0x3F+CBOX11C0:STATE=0x3F+CBOX12C0:STATE=0x3F+CBOX13C0:STATE=0x3F+CBOX14C0:STATE=0x3F+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1)*64.0/time
+L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX8C0:STATE=0x3F+CBOX9C0:STATE=0x3F+CBOX10C0:STATE=0x3F+CBOX11C0:STATE=0x3F+CBOX12C0:STATE=0x3F+CBOX13C0:STATE=0x3F+CBOX14C0:STATE=0x3F+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1)*64.0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
+L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
+L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64
+L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
+L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
+L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
+L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64/time
+L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ALL*64
+L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
+L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
+System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F))*64/time
+System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F))*64
+L3 to memory bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M_STATE))*64/time
+L3 to memory data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M_STATE))*64
+L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64/time
+L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
+-
+Group to measure cache transfers between L1 and memory. Please note that the
+L3 to/from system metrics contain all traffic to the system (memory,
+Intel QPI, etc.), but they do not seem to capture every transfer, because the memory
+read bandwidth and the L3 to L2 bandwidth are commonly higher than the system to L3 bandwidth.
+
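The SUM(...) shorthand in the formulas above just adds the per-box counters. A minimal Python sketch with hypothetical per-CBOX counts:

    # Hypothetical LLC_LOOKUP_DATA_READ:STATE=0x3F counts, one per CBOX (15 boxes)
    llc_lookup_data_read = [12_000_000] * 15
    time      = 2.0    # runtime in seconds
    line_size = 64.0

    system_to_l3_bw_MBs = 1.0e-06 * sum(llc_lookup_data_read) * line_size / time
    system_to_l3_vol_GB = 1.0e-09 * sum(llc_lookup_data_read) * line_size
    print(f"System to L3 bandwidth [MBytes/s]: {system_to_l3_bw_MBs:.1f}")
    print(f"System to L3 data volume [GBytes]: {system_to_l3_vol_GB:.2f}")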
diff --git a/groups/ivybridgeEP/CBOX.txt b/groups/ivybridgeEP/CBOX.txt
new file mode 100644
index 0000000..ca6c6d5
--- /dev/null
+++ b/groups/ivybridgeEP/CBOX.txt
@@ -0,0 +1,55 @@
+SHORT CBOX related data and metrics
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+CBOX0C0 LLC_VICTIMS_M_STATE
+CBOX1C0 LLC_VICTIMS_M_STATE
+CBOX2C0 LLC_VICTIMS_M_STATE
+CBOX3C0 LLC_VICTIMS_M_STATE
+CBOX4C0 LLC_VICTIMS_M_STATE
+CBOX5C0 LLC_VICTIMS_M_STATE
+CBOX6C0 LLC_VICTIMS_M_STATE
+CBOX7C0 LLC_VICTIMS_M_STATE
+CBOX8C0 LLC_VICTIMS_M_STATE
+CBOX9C0 LLC_VICTIMS_M_STATE
+CBOX10C0 LLC_VICTIMS_M_STATE
+CBOX11C0 LLC_VICTIMS_M_STATE
+CBOX12C0 LLC_VICTIMS_M_STATE
+CBOX13C0 LLC_VICTIMS_M_STATE
+CBOX14C0 LLC_VICTIMS_M_STATE
+CBOX0C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX1C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX2C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX3C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX4C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX5C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX6C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX7C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX8C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX9C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX10C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX11C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX12C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX13C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX14C1:STATE=0x1 LLC_LOOKUP_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+LLC misses per instruction (CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0)/FIXC0
+LLC data written to MEM [MBytes] 1E-6*(CBOX0C1:STATE=0x1+CBOX1C1:STATE=0x1+CBOX2C1:STATE=0x1+CBOX3C1:STATE=0x1+CBOX4C1:STATE=0x1+CBOX5C1:STATE=0x1+CBOX6C1:STATE=0x1+CBOX7C1:STATE=0x1+CBOX8C1:STATE=0x1+CBOX9C1:STATE=0x1+CBOX10C1:STATE=0x1+CBOX11C1:STATE=0x1+CBOX12C1:STATE=0x1+CBOX13C1:STATE=0x1+CBOX14C1:STATE=0x1)*64
+
+
+LONG
+Formulas:
+LLC misses per instruction = sum(LLC_VICTIMS_M_STATE)/INSTR_RETIRED_ANY
+LLC data written to MEM [MBytes] = sum(LLC_LOOKUP_ANY)*64*1E-6
+--
+The CBOXes mediate the traffic from the L2 cache to the segmented L3 cache. Each
+CBOX is responsible for one segment (2.5 MByte). The boxes maintain the coherence between all
+CPU cores of the socket. Depending on the CPU core count, some CBOXes are not attached
+to a 2.5 MByte slice but are still active and track the traffic.
diff --git a/groups/ivybridgeEP/CLOCK.txt b/groups/ivybridgeEP/CLOCK.txt
new file mode 100644
index 0000000..278821e
--- /dev/null
+++ b/groups/ivybridgeEP/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+-
+IvyBridge implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy on the package (socket) level.
+
diff --git a/groups/ivybridgeEP/DATA.txt b/groups/ivybridgeEP/DATA.txt
new file mode 100644
index 0000000..967cbad
--- /dev/null
+++ b/groups/ivybridgeEP/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_UOPS_RETIRED_LOADS
+PMC1 MEM_UOPS_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/ivybridgeEP/ENERGY.txt b/groups/ivybridgeEP/ENERGY.txt
new file mode 100644
index 0000000..07bc59c
--- /dev/null
+++ b/groups/ivybridgeEP/ENERGY.txt
@@ -0,0 +1,33 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0 TEMP_CORE
+PWR0 PWR_PKG_ENERGY
+PWR1 PWR_PP0_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Temperature [C] TMP0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy PP0 [J] PWR1
+Power PP0 [W] PWR1/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+IvyBridge implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy on the package (socket), the PP0 domain
+and DRAM level. The PP0 domain often refers to the CPU cores only.
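A tiny Python sketch (hypothetical RAPL readings) of the power metrics above, which are simply the energy counters divided by the runtime:

    PWR_PKG_ENERGY  = 180.0   # Joules accumulated over the interval
    PWR_PP0_ENERGY  = 120.0
    PWR_DRAM_ENERGY =  35.0
    time            =   2.0   # runtime in seconds

    print(f"Power [W]:      {PWR_PKG_ENERGY / time:.1f}")
    print(f"Power PP0 [W]:  {PWR_PP0_ENERGY / time:.1f}")
    print(f"Power DRAM [W]: {PWR_DRAM_ENERGY / time:.1f}")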
diff --git a/groups/ivybridgeEP/FALSE_SHARE.txt b/groups/ivybridgeEP/FALSE_SHARE.txt
new file mode 100644
index 0000000..1d0a49e
--- /dev/null
+++ b/groups/ivybridgeEP/FALSE_SHARE.txt
@@ -0,0 +1,32 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM
+PMC1 MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM
+PMC2 MEM_LOAD_UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+Remote LLC false sharing [MByte] 1.E-06*PMC1*64
+Remote LLC false sharing rate PMC1/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
+Remote LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM*64
+Remote LLC false sharing rate = MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM/MEM_LOAD_UOPS_RETIRED_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory loads as reference.
+For systems with multiple CPU sockets, this performance group also measures the
+false-sharing of cache lines over socket boundaries.
diff --git a/groups/ivybridgeEP/FLOPS_AVX.txt b/groups/ivybridgeEP/FLOPS_AVX.txt
new file mode 100644
index 0000000..7ca4aca
--- /dev/null
+++ b/groups/ivybridgeEP/FLOPS_AVX.txt
@@ -0,0 +1,26 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 SIMD_FP_256_PACKED_SINGLE
+PMC1 SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Packed SP MFLOP/s 1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s 1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+Packed SP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+-
+Packed 32b AVX FLOP rates. Please note that the current FLOP measurements on
+IvyBridge are potentially wrong.
+So you cannot trust these counters at the moment!
+
diff --git a/groups/ivybridgeEP/FLOPS_DP.txt b/groups/ivybridgeEP/FLOPS_DP.txt
new file mode 100644
index 0000000..b5e8273
--- /dev/null
+++ b/groups/ivybridgeEP/FLOPS_DP.txt
@@ -0,0 +1,31 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2 SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision FLOP rates. Please note that the current
+FLOP measurements on IvyBridge are potentially wrong.
+So you cannot trust these counters at the moment!
+
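A short Python sketch (hypothetical counts) of the double precision MFLOP/s formula above, weighting each uOP by its FLOPs (packed SSE = 2, scalar = 1, packed AVX = 4):

    FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE = 500_000_000
    FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE = 100_000_000
    SIMD_FP_256_PACKED_DOUBLE            = 250_000_000
    time = 1.0  # runtime in seconds

    mflops = 1.0e-06 * (FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE * 2.0
                        + FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
                        + SIMD_FP_256_PACKED_DOUBLE * 4.0) / time
    avx_mflops = 1.0e-06 * SIMD_FP_256_PACKED_DOUBLE * 4.0 / time
    print(f"MFLOP/s: {mflops:.1f}, AVX MFLOP/s: {avx_mflops:.1f}")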
diff --git a/groups/ivybridgeEP/FLOPS_SP.txt b/groups/ivybridgeEP/FLOPS_SP.txt
new file mode 100644
index 0000000..819b81c
--- /dev/null
+++ b/groups/ivybridgeEP/FLOPS_SP.txt
@@ -0,0 +1,31 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2 SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision FLOP rates. Please note that the current
+FLOP measurements on IvyBridge are potentially wrong.
+So you cannot trust these counters at the moment!
+
diff --git a/groups/ivybridgeEP/ICACHE.txt b/groups/ivybridgeEP/ICACHE.txt
new file mode 100644
index 0000000..f1e2335
--- /dev/null
+++ b/groups/ivybridgeEP/ICACHE.txt
@@ -0,0 +1,33 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ICACHE_ACCESSES
+PMC1 ICACHE_MISSES
+PMC2 ICACHE_IFETCH_STALL
+PMC3 ILD_STALL_IQ_FULL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+L1I queue full stalls PMC3
+L1I queue full stall rate PMC3/FIXC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I stalls = ICACHE_IFETCH_STALL
+L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/ivybridgeEP/L2.txt b/groups/ivybridgeEP/L2.txt
new file mode 100644
index 0000000..376e974
--- /dev/null
+++ b/groups/ivybridgeEP/L2.txt
@@ -0,0 +1,38 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L1D_M_EVICT
+PMC2 ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. The group also outputs total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and cache lines transferred into the instruction
+cache.
+
diff --git a/groups/ivybridgeEP/L2CACHE.txt b/groups/ivybridgeEP/L2CACHE.txt
new file mode 100644
index 0000000..9b5dd4b
--- /dev/null
+++ b/groups/ivybridgeEP/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_TRANS_ALL_REQUESTS
+PMC1 L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/ivybridgeEP/L3.txt b/groups/ivybridgeEP/L3.txt
new file mode 100644
index 0000000..f0a8aad
--- /dev/null
+++ b/groups/ivybridgeEP/L3.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_LINES_IN_ALL
+PMC1 L2_LINES_OUT_DIRTY_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ALL*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs the data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/ivybridgeEP/L3CACHE.txt b/groups/ivybridgeEP/L3CACHE.txt
new file mode 100644
index 0000000..9f3036f
--- /dev/null
+++ b/groups/ivybridgeEP/L3CACHE.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2 UOPS_RETIRED_ALL
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 request rate (PMC0)/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/ivybridgeEP/MEM.txt b/groups/ivybridgeEP/MEM.txt
new file mode 100644
index 0000000..fd80c2c
--- /dev/null
+++ b/groups/ivybridgeEP/MEM.txt
@@ -0,0 +1,49 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+-
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events, it is only possible to measure on a
+per-socket basis. Some of the counters may not be available on your system.
+Also outputs total data volume transferred from main memory.
+
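A minimal Python sketch (hypothetical per-channel counts) of the memory bandwidth formulas, where SUM(MBOXxC0) and SUM(MBOXxC1) add the CAS_COUNT_RD and CAS_COUNT_WR counters of all eight memory channels:

    cas_count_rd = [30_000_000] * 8   # hypothetical CAS_COUNT_RD per MBOX
    cas_count_wr = [10_000_000] * 8   # hypothetical CAS_COUNT_WR per MBOX
    time      = 1.0    # runtime in seconds
    line_size = 64.0

    read_bw_MBs  = 1.0e-06 * sum(cas_count_rd) * line_size / time
    write_bw_MBs = 1.0e-06 * sum(cas_count_wr) * line_size / time
    total_bw_MBs = read_bw_MBs + write_bw_MBs
    volume_GB    = 1.0e-09 * (sum(cas_count_rd) + sum(cas_count_wr)) * line_size
    print(read_bw_MBs, write_bw_MBs, total_bw_MBs, volume_GB)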
diff --git a/groups/ivybridgeEP/MEM_DP.txt b/groups/ivybridgeEP/MEM_DP.txt
new file mode 100644
index 0000000..da40bb9
--- /dev/null
+++ b/groups/ivybridgeEP/MEM_DP.txt
@@ -0,0 +1,68 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2 SIMD_FP_256_PACKED_DOUBLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events, it is only possible to measure on
+a per-socket basis. It also outputs the total data volume transferred from main memory,
+SSE scalar and packed double precision FLOP rates, and packed AVX
+32b instructions. Please note that the current FLOP measurements on IvyBridge
+are potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/groups/ivybridgeEP/MEM_SP.txt b/groups/ivybridgeEP/MEM_SP.txt
new file mode 100644
index 0000000..7fe9ea9
--- /dev/null
+++ b/groups/ivybridgeEP/MEM_SP.txt
@@ -0,0 +1,70 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2 SIMD_FP_256_PACKED_SINGLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formula:
+Power [W] = PWR_PKG_ENERGY/runtime
+Power DRAM [W] = PWR_DRAM_ENERGY/runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events, it is only possible to measure on
+a per-socket basis. It also outputs the total data volume transferred from main memory,
+SSE scalar and packed single precision FLOP rates, and packed AVX
+32b instructions. Please note that the current FLOP measurements on IvyBridge
+are potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/groups/ivybridgeEP/NUMA.txt b/groups/ivybridgeEP/NUMA.txt
new file mode 100644
index 0000000..0c1b8fb
--- /dev/null
+++ b/groups/ivybridgeEP/NUMA.txt
@@ -0,0 +1,33 @@
+SHORT Local and remote memory accesses
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM
+PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local DRAM data volume [GByte] 1.E-09*PMC0*64
+Local DRAM bandwidth [MByte/s] 1.E-06*(PMC0*64)/time
+Remote DRAM data volume [GByte] 1.E-09*PMC1*64
+Remote DRAM bandwidth [MByte/s] 1.E-06*(PMC1*64)/time
+Memory data volume [GByte] 1.E-09*(PMC0+PMC1)*64
+Memory bandwidth [MByte/s] 1.E-06*((PMC0+PMC1)*64)/time
+
+LONG
+Formula:
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
+Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
+Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64
+Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time
+Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64
+Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time
+--
+This performance group measures the data traffic of CPU cores to local and remote
+memory.
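A small Python sketch (hypothetical counts) of the NUMA metrics above; each offcore response counts a full cache line, hence the factor 64:

    OFFCORE_RESPONSE_0_LOCAL_DRAM  = 200_000_000
    OFFCORE_RESPONSE_1_REMOTE_DRAM =  20_000_000
    time = 2.0  # runtime in seconds

    local_bw_MBs  = 1.0e-06 * OFFCORE_RESPONSE_0_LOCAL_DRAM  * 64 / time
    remote_bw_MBs = 1.0e-06 * OFFCORE_RESPONSE_1_REMOTE_DRAM * 64 / time
    remote_share  = OFFCORE_RESPONSE_1_REMOTE_DRAM / (OFFCORE_RESPONSE_0_LOCAL_DRAM
                                                      + OFFCORE_RESPONSE_1_REMOTE_DRAM)
    print(f"Local DRAM bandwidth [MByte/s]:  {local_bw_MBs:.1f}")
    print(f"Remote DRAM bandwidth [MByte/s]: {remote_bw_MBs:.1f}")
    print(f"Remote share of DRAM traffic:    {remote_share:.2%}")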
diff --git a/groups/ivybridgeEP/QPI.txt b/groups/ivybridgeEP/QPI.txt
new file mode 100644
index 0000000..4dbf8a4
--- /dev/null
+++ b/groups/ivybridgeEP/QPI.txt
@@ -0,0 +1,52 @@
+SHORT QPI Link Layer data
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+SBOX0C0 DIRECT2CORE_SUCCESS_RBT_HIT
+SBOX1C0 DIRECT2CORE_SUCCESS_RBT_HIT
+SBOX2C0 DIRECT2CORE_SUCCESS_RBT_HIT
+SBOX0C1 TXL_FLITS_G0_DATA
+SBOX1C1 TXL_FLITS_G0_DATA
+SBOX2C1 TXL_FLITS_G0_DATA
+SBOX0C2 TXL_FLITS_G0_NON_DATA
+SBOX1C2 TXL_FLITS_G0_NON_DATA
+SBOX2C2 TXL_FLITS_G0_NON_DATA
+SBOX0C3 SBOX_CLOCKTICKS
+SBOX1C3 SBOX_CLOCKTICKS
+SBOX2C3 SBOX_CLOCKTICKS
+SBOX0FIX QPI_RATE
+SBOX1FIX QPI_RATE
+SBOX2FIX QPI_RATE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+QPI Speed Link 0 [GT/s] ((SBOX0C3)/time)*inverseClock*(8/1000)
+QPI Speed Link 1 [GT/s] ((SBOX1C3)/time)*inverseClock*(8/1000)
+QPI Speed Link 2 [GT/s] ((SBOX2C3)/time)*inverseClock*(8/1000)
+QPI Rate Link 0 [GT/s] 1.E-09*SBOX0FIX
+QPI Rate Link 1 [GT/s] 1.E-09*SBOX1FIX
+QPI Rate Link 2 [GT/s] 1.E-09*SBOX2FIX
+data from QPI to LLC [MByte] 1.E-06*(SBOX0C0+SBOX1C0+SBOX2C0)*8
+QPI data volume [MByte] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1)*8
+QPI data bandwidth [MByte/s] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1)*8/time
+QPI link volume [MByte] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1+SBOX0C2+SBOX1C2+SBOX2C2)*8
+QPI link bandwidth [MByte/s] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1+SBOX0C2+SBOX1C2+SBOX2C2)*8/time
+
+LONG
+Formula:
+QPI Speed Link 0/1/2 [GT/s] = ((SBOX_CLOCKTICKS)/time)*clock*(8/1000)
+QPI Rate Link 0/1/2 [GT/s] = 1.E-09*(QPI_RATE)
+data from QPI to LLC [MByte] = 1.E-06*(sum(DIRECT2CORE_SUCCESS_RBT_HIT)*64)
+QPI data volume [MByte] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)
+QPI data bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)/runtime
+QPI link volume [MByte] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)
+QPI link bandwidth [MByte/s] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)/runtime
+--
+The Intel QPI Link Layer is responsible for packetizing requests from the caching agent (CBOXes)
+on the way out to the system interface.
+
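
As a rough sketch of how the QPI formulas above are evaluated (the per-link flit counts below are hypothetical), data and non-data flits are summed over all SBOXes and scaled by 8 bytes per flit:

    # Illustrative only: evaluates the QPI data/link volume formulas above.
    runtime = 1.0  # s
    data_flits     = [2.0e9, 1.8e9, 0.0]   # TXL_FLITS_G0_DATA     per link (SBOX0..2)
    non_data_flits = [6.0e8, 5.5e8, 0.0]   # TXL_FLITS_G0_NON_DATA per link (SBOX0..2)

    data_volume = 1.0e-06 * sum(data_flits) * 8                          # MByte
    link_volume = 1.0e-06 * (sum(data_flits) + sum(non_data_flits)) * 8  # MByte
    data_bw     = data_volume / runtime                                  # MByte/s
    link_bw     = link_volume / runtime                                  # MByte/s
    print(f"QPI data {data_bw:.1f} MB/s of {link_bw:.1f} MB/s total link traffic")

The difference between link volume and data volume is protocol overhead (non-data flits).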
diff --git a/groups/ivybridgeEP/RECOVERY.txt b/groups/ivybridgeEP/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/ivybridgeEP/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 INT_MISC_RECOVERY_CYCLES
+PMC1 INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exceptions, memory
+disambiguation, etc.
diff --git a/groups/ivybridgeEP/TLB_DATA.txt b/groups/ivybridgeEP/TLB_DATA.txt
new file mode 100644
index 0000000..8d94e05
--- /dev/null
+++ b/groups/ivybridgeEP/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2 DTLB_LOAD_MISSES_WALK_DURATION
+PMC3 DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB load misses PMC0
+L1 DTLB load miss rate PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses PMC1
+L1 DTLB store miss rate PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The miss durations measure how many cycles a page table walk took
+on average.
+
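
A minimal sketch of the TLB_DATA formulas above, using made-up counter values, shows how miss rate and average walk duration are derived:

    # Illustrative only: evaluates the TLB_DATA formulas above.
    instr_retired     = 1.0e10  # INSTR_RETIRED_ANY               (FIXC0)
    load_walks        = 2.0e6   # DTLB_LOAD_MISSES_CAUSES_A_WALK  (PMC0)
    store_walks       = 5.0e5   # DTLB_STORE_MISSES_CAUSES_A_WALK (PMC1)
    load_walk_cycles  = 6.0e7   # DTLB_LOAD_MISSES_WALK_DURATION  (PMC2)
    store_walk_cycles = 1.5e7   # DTLB_STORE_MISSES_WALK_DURATION (PMC3)

    load_miss_rate      = load_walks / instr_retired      # misses per instruction
    store_miss_rate     = store_walks / instr_retired
    load_miss_duration  = load_walk_cycles / load_walks   # cycles per page walk
    store_miss_duration = store_walk_cycles / store_walks
    print(load_miss_rate, store_miss_rate, load_miss_duration, store_miss_duration)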
diff --git a/groups/ivybridgeEP/TLB_INSTR.txt b/groups/ivybridgeEP/TLB_INSTR.txt
new file mode 100644
index 0000000..235d977
--- /dev/null
+++ b/groups/ivybridgeEP/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ITLB_MISSES_CAUSES_A_WALK
+PMC1 ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The miss duration measures how many cycles a page table walk took
+on average.
+
diff --git a/groups/ivybridgeEP/UNCORECLOCK.txt b/groups/ivybridgeEP/UNCORECLOCK.txt
new file mode 100644
index 0000000..fef0d36
--- /dev/null
+++ b/groups/ivybridgeEP/UNCORECLOCK.txt
@@ -0,0 +1,84 @@
+SHORT All Clocks
+
+EVENTSET
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+CBOX0C0 CBOX_CLOCKTICKS
+CBOX1C0 CBOX_CLOCKTICKS
+CBOX2C0 CBOX_CLOCKTICKS
+CBOX3C0 CBOX_CLOCKTICKS
+CBOX4C0 CBOX_CLOCKTICKS
+CBOX5C0 CBOX_CLOCKTICKS
+CBOX6C0 CBOX_CLOCKTICKS
+CBOX7C0 CBOX_CLOCKTICKS
+CBOX8C0 CBOX_CLOCKTICKS
+CBOX9C0 CBOX_CLOCKTICKS
+CBOX10C0 CBOX_CLOCKTICKS
+CBOX11C0 CBOX_CLOCKTICKS
+CBOX12C0 CBOX_CLOCKTICKS
+CBOX13C0 CBOX_CLOCKTICKS
+CBOX14C0 CBOX_CLOCKTICKS
+MBOX0C0 DRAM_CLOCKTICKS
+MBOX1C0 DRAM_CLOCKTICKS
+MBOX2C0 DRAM_CLOCKTICKS
+MBOX3C0 DRAM_CLOCKTICKS
+MBOX0FIX DRAM_CLOCKTICKS
+MBOX1FIX DRAM_CLOCKTICKS
+MBOX2FIX DRAM_CLOCKTICKS
+MBOX3FIX DRAM_CLOCKTICKS
+SBOX0C0 SBOX_CLOCKTICKS
+SBOX1C0 SBOX_CLOCKTICKS
+SBOX2C0 SBOX_CLOCKTICKS
+UBOXFIX UBOX_CLOCKTICKS
+BBOX0C0 BBOX_CLOCKTICKS
+BBOX1C0 BBOX_CLOCKTICKS
+WBOX0 WBOX_CLOCKTICKS
+PBOX0 PBOX_CLOCKTICKS
+RBOX0C0 RBOX_CLOCKTICKS
+RBOX1C0 RBOX_CLOCKTICKS
+RBOX2C0 RBOX_CLOCKTICKS
+IBOX0 IBOX_CLOCKTICKS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CBOX0 Frequency [GHz] 1.E-09*CBOX0C0/(FIXC1*inverseClock)
+CBOX1 Frequency [GHz] 1.E-09*CBOX1C0/(FIXC1*inverseClock)
+CBOX2 Frequency [GHz] 1.E-09*CBOX2C0/(FIXC1*inverseClock)
+CBOX3 Frequency [GHz] 1.E-09*CBOX3C0/(FIXC1*inverseClock)
+CBOX4 Frequency [GHz] 1.E-09*CBOX4C0/(FIXC1*inverseClock)
+CBOX5 Frequency [GHz] 1.E-09*CBOX5C0/(FIXC1*inverseClock)
+CBOX6 Frequency [GHz] 1.E-09*CBOX6C0/(FIXC1*inverseClock)
+CBOX7 Frequency [GHz] 1.E-09*CBOX7C0/(FIXC1*inverseClock)
+CBOX8 Frequency [GHz] 1.E-09*CBOX8C0/(FIXC1*inverseClock)
+CBOX9 Frequency [GHz] 1.E-09*CBOX9C0/(FIXC1*inverseClock)
+CBOX10 Frequency [GHz] 1.E-09*CBOX10C0/(FIXC1*inverseClock)
+CBOX11 Frequency [GHz] 1.E-09*CBOX11C0/(FIXC1*inverseClock)
+CBOX12 Frequency [GHz] 1.E-09*CBOX12C0/(FIXC1*inverseClock)
+CBOX13 Frequency [GHz] 1.E-09*CBOX13C0/(FIXC1*inverseClock)
+CBOX14 Frequency [GHz] 1.E-09*CBOX14C0/(FIXC1*inverseClock)
+MBOX0 Frequency [GHz] 1.E-09*MBOX0C0/(FIXC1*inverseClock)
+MBOX0FIX Frequency [GHz] 1.E-09*MBOX0FIX/(FIXC1*inverseClock)
+MBOX1 Frequency [GHz] 1.E-09*MBOX1C0/(FIXC1*inverseClock)
+MBOX1FIX Frequency [GHz] 1.E-09*MBOX1FIX/(FIXC1*inverseClock)
+MBOX2 Frequency [GHz] 1.E-09*MBOX2C0/(FIXC1*inverseClock)
+MBOX2FIX Frequency [GHz] 1.E-09*MBOX2FIX/(FIXC1*inverseClock)
+MBOX3 Frequency [GHz] 1.E-09*MBOX3C0/(FIXC1*inverseClock)
+MBOX3FIX Frequency [GHz] 1.E-09*MBOX3FIX/(FIXC1*inverseClock)
+SBOX0 Frequency [GHz] 1.E-09*SBOX0C0/(FIXC1*inverseClock)
+SBOX1 Frequency [GHz] 1.E-09*SBOX1C0/(FIXC1*inverseClock)
+SBOX2 Frequency [GHz] 1.E-09*SBOX2C0/(FIXC1*inverseClock)
+UBOX Frequency [GHz] 1.E-09*UBOXFIX/(FIXC1*inverseClock)
+BBOX0 Frequency [GHz] 1.E-09*BBOX0C0/(FIXC1*inverseClock)
+BBOX1 Frequency [GHz] 1.E-09*BBOX1C0/(FIXC1*inverseClock)
+WBOX Frequency [GHz] 1.E-09*WBOX0/(FIXC1*inverseClock)
+PBOX Frequency [GHz] 1.E-09*PBOX0/(FIXC1*inverseClock)
+RBOX0 Frequency [GHz] 1.E-09*RBOX0C0/(FIXC1*inverseClock)
+RBOX1 Frequency [GHz] 1.E-09*RBOX1C0/(FIXC1*inverseClock)
+RBOX2 Frequency [GHz] 1.E-09*RBOX2C0/(FIXC1*inverseClock)
+IBOX Frequency [GHz] 1.E-09*IBOX0/(FIXC1*inverseClock)
+
+
+LONG
+Formulas:
diff --git a/groups/ivybridgeEP/UOPS.txt b/groups/ivybridgeEP/UOPS.txt
new file mode 100644
index 0000000..178aec5
--- /dev/null
+++ b/groups/ivybridgeEP/UOPS.txt
@@ -0,0 +1,35 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_ANY
+PMC1 UOPS_EXECUTED_THREAD
+PMC2 UOPS_RETIRED_ALL
+PMC3 UOPS_ISSUED_FLAGS_MERGE
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Issued UOPs PMC0
+Merged UOPs PMC3
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed and retired uOPs and returns the number of uOPs which were issued
+but not executed as well as the number of uOPs which were executed but never retired.
+The executed but not retired uOPs commonly come from speculatively executed branches.
+
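
As a sketch of how the four raw counts of the UOPS group are usually compared (this derivation is the editor's illustration, not part of the group, and the values are hypothetical):

    # Illustrative only: comparing the UOPS counters of the group above.
    uops_issued   = 1.20e10  # UOPS_ISSUED_ANY      (PMC0)
    uops_executed = 1.15e10  # UOPS_EXECUTED_THREAD (PMC1)
    uops_retired  = 1.05e10  # UOPS_RETIRED_ALL     (PMC2)

    issued_not_executed  = uops_issued   - uops_executed
    executed_not_retired = uops_executed - uops_retired   # mostly mis-speculated work
    speculation_waste    = executed_not_retired / uops_executed
    print(f"{speculation_waste:.1%} of executed uOPs never retired")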
diff --git a/groups/ivybridgeEP/UOPS_EXEC.txt b/groups/ivybridgeEP/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/groups/ivybridgeEP/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_EXECUTED_USED_CYCLES
+PMC1 UOPS_EXECUTED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
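
A short sketch of the stall metrics above (hypothetical counts): because PMC3 applies the EDGEDETECT option, it counts how many separate stall periods occurred, while PMC1 counts the total cycles spent in them, so their quotient is the average stall length.

    # Illustrative only: evaluates the UOPS_EXEC stall metrics above.
    total_cycles  = 4.0e9  # CPU_CLOCK_UNHALTED_TOTAL_CYCLES       (PMC2)
    stall_cycles  = 8.0e8  # UOPS_EXECUTED_STALL_CYCLES            (PMC1)
    stall_periods = 2.0e6  # UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT (PMC3)

    unused_ratio       = 100.0 * stall_cycles / total_cycles  # % of cycles stalled
    avg_stall_duration = stall_cycles / stall_periods         # cycles per stall period
    print(f"{unused_ratio:.1f}% stalled, {avg_stall_duration:.1f} cycles per stall")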
diff --git a/groups/ivybridgeEP/UOPS_ISSUE.txt b/groups/ivybridgeEP/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/groups/ivybridgeEP/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_USED_CYCLES
+PMC1 UOPS_ISSUED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/ivybridgeEP/UOPS_RETIRE.txt b/groups/ivybridgeEP/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/groups/ivybridgeEP/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_RETIRED_USED_CYCLES
+PMC1 UOPS_RETIRED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/groups/k10/BRANCH.txt b/groups/k10/BRANCH.txt
index cbc6da6..5c4207e 100644
--- a/groups/k10/BRANCH.txt
+++ b/groups/k10/BRANCH.txt
@@ -4,29 +4,23 @@ EVENTSET
PMC0 INSTRUCTIONS_RETIRED
PMC1 BRANCH_RETIRED
PMC2 BRANCH_MISPREDICT_RETIRED
-PMC3 BRANCH_TAKEN_RETIRED
METRICS
Runtime (RDTSC) [s] time
Branch rate PMC1/PMC0
Branch misprediction rate PMC2/PMC0
Branch misprediction ratio PMC2/PMC1
-Branch taken rate PMC3/PMC0
-Branch taken ratio PMC3/PMC1
Instructions per branch PMC0/PMC1
LONG
Formulas:
-Branch rate = BRANCH_RETIRED / INSTRUCTIONS_RETIRED
-Branch misprediction rate = BRANCH_MISPREDICT_RETIRED / INSTRUCTIONS_RETIRED
-Branch misprediction ratio = BRANCH_MISPREDICT_RETIRED / BRANCH_RETIRED
-Branch taken rate = BRANCH_TAKEN_RETIRED / INSTRUCTIONS_RETIRED
-Branch taken ratio = BRANCH_TAKEN_RETIRED / BRANCH_RETIRED
-Instructions per branch = INSTRUCTIONS_RETIRED / BRANCH_RETIRED
+Branch rate = BRANCH_RETIRED/INSTRUCTIONS_RETIRED
+Branch misprediction rate = BRANCH_MISPREDICT_RETIRED/INSTRUCTIONS_RETIRED
+Branch misprediction ratio = BRANCH_MISPREDICT_RETIRED/BRANCH_RETIRED
+Instructions per branch = INSTRUCTIONS_RETIRED/BRANCH_RETIRED
-
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
into relation what ratio of all branch instructions were mispredicted.
-Instructions per branch is 1/Branch rate. The same applies for the branches
-taken metrics.
+Instructions per branch is 1/branch rate.
diff --git a/groups/k10/CACHE.txt b/groups/k10/CACHE.txt
index e70823e..26d799f 100644
--- a/groups/k10/CACHE.txt
+++ b/groups/k10/CACHE.txt
@@ -8,26 +8,26 @@ PMC3 DATA_CACHE_REFILLS_NORTHBRIDGE_ALL
METRICS
Runtime (RDTSC) [s] time
-Data cache misses PMC2+PMC3
-Data cache request rate PMC1/PMC0
-Data cache miss rate (PMC2+PMC3)/PMC0
-Data cache miss ratio (PMC2+PMC3)/PMC1
+data cache misses PMC2+PMC3
+data cache request rate PMC1/PMC0
+data cache miss rate (PMC2+PMC3)/PMC0
+data cache miss ratio (PMC2+PMC3)/PMC1
LONG
Formulas:
-Data cache misses = DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL
-Data cache request rate = DATA_CACHE_ACCESSES / INSTRUCTIONS_RETIRED
-Data cache miss rate = (DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/INSTRUCTIONS_RETIRED
-Data cache miss ratio = (DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/DATA_CACHE_ACCESSES
+data cache misses = DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL
+data cache request rate = DATA_CACHE_ACCESSES / INSTRUCTIONS_RETIRED
+data cache miss rate = (DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/INSTRUCTIONS_RETIRED
+data cache miss ratio = (DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/DATA_CACHE_ACCESSES
-
This group measures the locality of your data accesses with regard to the
-L1 Cache. Data cache request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The Data cache miss rate gives a measure how often it was necessary to get
-cachelines from higher levels of the memory hierarchy. And finally
-Data cache miss ratio tells you how many of your memory references required
-a cacheline to be loaded from a higher level. While the Data cache miss rate
-might be given by your algorithm you should try to get Data cache miss ratio
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm you should try to get data cache miss ratio
as low as possible by increasing your cache reuse.
This group was taken from the whitepaper -Basic Performance Measurements for AMD Athlon 64,
AMD Opteron and AMD Phenom Processors- from Paul J. Drongowski.
diff --git a/groups/k10/CPI.txt b/groups/k10/CPI.txt
index 6595c2d..850afed 100644
--- a/groups/k10/CPI.txt
+++ b/groups/k10/CPI.txt
@@ -13,6 +13,11 @@ CPI (based on uops) PMC1/PMC2
IPC PMC0/PMC1
LONG
+Formulas:
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
+IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
+-
This group measures how efficiently the processor works with
regard to instruction throughput. Also important as a standalone
metric is INSTRUCTIONS_RETIRED as it tells you how many instruction
diff --git a/groups/k10/FLOPS_DP.txt b/groups/k10/FLOPS_DP.txt
index 4eccf8b..aa05d77 100644
--- a/groups/k10/FLOPS_DP.txt
+++ b/groups/k10/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
EVENTSET
PMC0 SSE_RETIRED_ADD_DOUBLE_FLOPS
@@ -8,15 +8,17 @@ PMC2 CPU_CLOCKS_UNHALTED
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] PMC2*inverseClock
-DP MFlops/s 1.0E-06*(PMC0+PMC1)/time
-DP Add MFlops/s 1.0E-06*PMC0/time
-DP Mult MFlops/s 1.0E-06*PMC1/time
+DP MFLOP/s 1.0E-06*(PMC0+PMC1)/time
+DP Add MFLOP/s 1.0E-06*PMC0/time
+DP Mult MFLOP/s 1.0E-06*PMC1/time
LONG
Formulas:
-DP MFlops/s = 1.0E-06*(SSE_RETIRED_ADD_DOUBLE_FLOPS+SSE_RETIRED_MULT_DOUBLE_FLOPS)/time
+DP MFLOP/s = 1.0E-06*(SSE_RETIRED_ADD_DOUBLE_FLOPS+SSE_RETIRED_MULT_DOUBLE_FLOPS)/time
+DP Add MFLOP/s = 1.0E-06*(SSE_RETIRED_ADD_DOUBLE_FLOPS)/time
+DP Mult MFLOP/s = 1.0E-06*(SSE_RETIRED_MULT_DOUBLE_FLOPS)/time
-
-Profiling group to measure double SSE flops.
-Dont forget that your code might also execute X87 flops.
+Profiling group to measure double precision SSE FLOPs.
+Don't forget that your code might also execute X87 FLOPs.
diff --git a/groups/k10/FLOPS_SP.txt b/groups/k10/FLOPS_SP.txt
index 7a0bd52..8869557 100644
--- a/groups/k10/FLOPS_SP.txt
+++ b/groups/k10/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
EVENTSET
PMC0 SSE_RETIRED_ADD_SINGLE_FLOPS
@@ -8,15 +8,17 @@ PMC2 CPU_CLOCKS_UNHALTED
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] PMC2*inverseClock
-SP MFlops/s 1.0E-06*(PMC0+PMC1)/time
-SP Add MFlops/s 1.0E-06*PMC0/time
-SP Mult MFlops/s 1.0E-06*PMC1/time
+SP MFLOP/s 1.0E-06*(PMC0+PMC1)/time
+SP Add MFLOP/s 1.0E-06*PMC0/time
+SP Mult MFLOP/s 1.0E-06*PMC1/time
LONG
Formulas:
-SP MFlops/s = 1.0E-06*(SSE_RETIRED_ADD_SINGLE_FLOPS+SSE_RETIRED_MULT_SINGLE_FLOPS)/time
+SP MFLOP/s = 1.0E-06*(SSE_RETIRED_ADD_SINGLE_FLOPS+SSE_RETIRED_MULT_SINGLE_FLOPS)/time
+SP Add MFLOP/s = 1.0E-06*(SSE_RETIRED_ADD_SINGLE_FLOPS)/time
+SP Mult MFLOP/s = 1.0E-06*(SSE_RETIRED_MULT_SINGLE_FLOPS)/time
-
-Profiling group to measure single precision SSE flops.
-Dont forget that your code might also execute X87 flops.
+Profiling group to measure single precision SSE FLOPs.
+Don't forget that your code might also execute X87 FLOPs.
diff --git a/groups/k10/FLOPS_X87.txt b/groups/k10/FLOPS_X87.txt
index 9a585b4..015ee19 100644
--- a/groups/k10/FLOPS_X87.txt
+++ b/groups/k10/FLOPS_X87.txt
@@ -1,4 +1,4 @@
-SHORT X87 MFlops/s
+SHORT X87 MFLOP/s
EVENTSET
PMC0 X87_FLOPS_RETIRED_ADD
@@ -9,11 +9,17 @@ PMC3 CPU_CLOCKS_UNHALTED
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] PMC3*inverseClock
-X87 MFlops/s 1.0E-06*(PMC0+PMC1+PMC2)/time
-X87 Add MFlops/s 1.0E-06*PMC0/time
-X87 Mult MFlops/s 1.0E-06*PMC1/time
-X87 Div MFlops/s 1.0E-06*PMC2/time
+X87 MFLOP/s 1.0E-06*(PMC0+PMC1+PMC2)/time
+X87 Add MFLOP/s 1.0E-06*PMC0/time
+X87 Mult MFLOP/s 1.0E-06*PMC1/time
+X87 Div MFLOP/s 1.0E-06*PMC2/time
LONG
-Profiling group to measure X87 flop rates.
+Formulas:
+X87 MFLOP/s = 1.0E-06*(X87_FLOPS_RETIRED_ADD+X87_FLOPS_RETIRED_MULT+X87_FLOPS_RETIRED_DIV)/time
+X87 Add MFLOP/s = 1.0E-06*X87_FLOPS_RETIRED_ADD/time
+X87 Mult MFLOP/s = 1.0E-06*X87_FLOPS_RETIRED_MULT/time
+X87 Div MFLOP/s = 1.0E-06*X87_FLOPS_RETIRED_DIV/time
+-
+Profiling group to measure X87 FLOP rates.
diff --git a/groups/k10/FPU_EXCEPTION.txt b/groups/k10/FPU_EXCEPTION.txt
index eff87fc..23d3c54 100644
--- a/groups/k10/FPU_EXCEPTION.txt
+++ b/groups/k10/FPU_EXCEPTION.txt
@@ -15,7 +15,7 @@ Formulas:
Overall FP exception rate = FPU_EXCEPTIONS_ALL / INSTRUCTIONS_RETIRED
FP exception rate = FPU_EXCEPTIONS_ALL / FP_INSTRUCTIONS_RETIRED_ALL
-
-Floating point exceptions occur e.g. on the treatment of Denormals.
+Floating point exceptions occur e.g. on the treatment of denormal numbers.
There might be a large penalty if there are too many floating point
exceptions.
diff --git a/groups/k10/ICACHE.txt b/groups/k10/ICACHE.txt
index 222ea5d..5150496 100644
--- a/groups/k10/ICACHE.txt
+++ b/groups/k10/ICACHE.txt
@@ -8,18 +8,16 @@ PMC3 ICACHE_REFILLS_MEM
METRICS
Runtime (RDTSC) [s] time
-Instruction cache misses PMC2+PMC3
-Instruction cache request rate PMC1/PMC0
-Instruction cache miss rate (PMC2+PMC3)/PMC0
-Instruction cache miss ratio (PMC2+PMC3)/PMC1
+L1I request rate PMC1/PMC0
+L1I miss rate (PMC2+PMC3)/PMC0
+L1I miss ratio (PMC2+PMC3)/PMC1
LONG
Formulas:
-Instruction cache misses ICACHE_REFILLS_L2 + ICACHE_REFILLS_MEM
-Instruction cache request rate ICACHE_FETCHES / INSTRUCTIONS_RETIRED
-Instruction cache miss rate (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/INSTRUCTIONS_RETIRED
-Instruction cache miss ratio (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/ICACHE_FETCHES
+L1I request rate = ICACHE_FETCHES / INSTRUCTIONS_RETIRED
+L1I miss rate = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/INSTRUCTIONS_RETIRED
+L1I miss ratio = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/ICACHE_FETCHES
-
This group measures the locality of your instruction code with regard to the
-L1 I-Cache.
+L1 I-Cache.
diff --git a/groups/k10/L2.txt b/groups/k10/L2.txt
index 8b61bcc..fae6fb0 100644
--- a/groups/k10/L2.txt
+++ b/groups/k10/L2.txt
@@ -8,21 +8,25 @@ PMC2 CPU_CLOCKS_UNHALTED
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] PMC2*inverseClock
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
-L2 refill bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 evict [MBytes/s] 1.0E-06*PMC1*64.0/time
LONG
Formulas:
-L2 bandwidth [MBytes/s] 1.0E-06*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64/time
-L2 data volume [GBytes] 1.0E-09*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64
-L2 refill bandwidth [MBytes/s] 1.0E-06*DATA_CACHE_REFILLS_L2_ALL*64/time
-L2 evict [MBytes/s] 1.0E-06*DATA_CACHE_EVICTED_ALL*64/time
+L2D load bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_L2_ALL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*DATA_CACHE_REFILLS_L2_ALL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_EVICTED_ALL*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*DATA_CACHE_EVICTED_ALL*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64/time
+L2 data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64
-
Profiling group to measure L2 cache bandwidth. The bandwidth is
-computed by the number of cacheline loaded from L2 to L1 and the
-number of modified cachelines evicted from the L1.
+computed by the number of cache lines loaded from L2 to L1 and the
+number of modified cache lines evicted from the L1.
Note that this bandwidth also includes data transfers due to a
write allocate load on a store miss in L1 and copy back transfers if
originated from L2.
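
For illustration (hypothetical counter values), the L2 formulas above reduce to two 64-byte-per-event bandwidth terms plus their sum:

    # Illustrative only: evaluates the k10 L2 formulas above.
    runtime      = 1.0    # s
    l2_refills   = 3.0e8  # DATA_CACHE_REFILLS_L2_ALL (PMC0), lines loaded L2 -> L1
    l1_evictions = 1.0e8  # DATA_CACHE_EVICTED_ALL    (PMC1), modified lines L1 -> L2

    load_bw  = 1.0e-06 * l2_refills   * 64.0 / runtime                 # MBytes/s
    evict_bw = 1.0e-06 * l1_evictions * 64.0 / runtime                 # MBytes/s
    l2_bw    = 1.0e-06 * (l2_refills + l1_evictions) * 64.0 / runtime  # MBytes/s
    print(load_bw, evict_bw, l2_bw)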
diff --git a/groups/k10/L2CACHE.txt b/groups/k10/L2CACHE.txt
index d384c48..2d29e43 100644
--- a/groups/k10/L2CACHE.txt
+++ b/groups/k10/L2CACHE.txt
@@ -19,13 +19,13 @@ L2 miss rate = L2_MISSES_ALL/INSTRUCTIONS_RETIRED
L2 miss ratio = L2_MISSES_ALL/(L2_REQUESTS_ALL+L2_FILL_ALL)
-
This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
This group was taken from the whitepaper -Basic Performance Measurements for AMD Athlon 64,
AMD Opteron and AMD Phenom Processors- from Paul J. Drongowski.
diff --git a/groups/k10/L3CACHE.txt b/groups/k10/L3CACHE.txt
index 85b4522..e3a2d72 100644
--- a/groups/k10/L3CACHE.txt
+++ b/groups/k10/L3CACHE.txt
@@ -13,20 +13,20 @@ L3 miss ratio PMC2/PMC1
LONG
Formulas:
-L3 request rate = L3_READ_REQUEST_ALL_ALL_CORES / INSTRUCTIONS_RETIRED
-L3 miss rate = L3_MISSES_ALL_ALL_CORES / INSTRUCTIONS_RETIRED
-L3 miss ratio = L3_MISSES_ALL_ALL_CORES / L3_READ_REQUEST_ALL_ALL_CORES
+L3 request rate = L3_READ_REQUEST_ALL_ALL_CORES/INSTRUCTIONS_RETIRED
+L3 miss rate = L3_MISSES_ALL_ALL_CORES/INSTRUCTIONS_RETIRED
+L3 miss ratio = L3_MISSES_ALL_ALL_CORES/L3_READ_REQUEST_ALL_ALL_CORES
-
This group measures the locality of your data accesses with regard to the
-L3 Cache. L3 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
The L3 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L3 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
Note: As this group measures the accesses from all cores it only makes sense
-to measure with one core per socket, similiar as with the Intel Nehalem Uncore events.
+to measure with one core per socket, similar to the Intel Nehalem Uncore events.
This group was taken from the whitepaper -Basic Performance Measurements for AMD Athlon 64,
AMD Opteron and AMD Phenom Processors- from Paul J. Drongowski.
diff --git a/groups/k10/MEM.txt b/groups/k10/MEM.txt
index b6c9f33..f9f5a91 100644
--- a/groups/k10/MEM.txt
+++ b/groups/k10/MEM.txt
@@ -8,19 +8,28 @@ PMC3 DRAM_ACCESSES_DCT1_ALL
METRICS
Runtime (RDTSC) [s] time
-Read data bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-Write data bandwidth [MBytes/s] 1.0E-06*PMC1*8.0/time
+Memory read bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+Memory read data volume [GBytes] 1.0E-09*PMC0*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*PMC1*8.0/time
+Memory write data volume [GBytes] 1.0E-09*PMC1*8.0
Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
LONG
Formulas:
-Read data bandwidth (MBytes/s) 1.0E-06*NORTHBRIDGE_READ_RESPONSE_ALL*64/time
-Write data bandwidth (MBytes/s) 1.0E-06*OCTWORDS_WRITE_TRANSFERS*8/time
+Memory read bandwidth [MBytes/s] = 1.0E-06*NORTHBRIDGE_READ_RESPONSE_ALL*64/time
+Memory read data volume [GBytes] = 1.0E-09*NORTHBRIDGE_READ_RESPONSE_ALL*64
+Memory write bandwidth [MBytes/s] = 1.0E-06*OCTWORDS_WRITE_TRANSFERS*8/time
+Memory write data volume [GBytes] = 1.0E-09*OCTWORDS_WRITE_TRANSFERS*8
Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1_ALL)*64/time
Memory data volume [GBytes] = 1.0E-09*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1_ALL)*64
-
Profiling group to measure memory bandwidth drawn by all cores of a socket.
Note: As this group measures the accesses from all cores it only makes sense
-to measure with one core per socket, similiar as with the Intel Nehalem Uncore events.
+to measure with one core per socket, similar to the Intel Nehalem Uncore events.
+The memory read bandwidth contains all data from DRAM, L3, or another cache,
+including data from another core on the same node. The event OCTWORDS_WRITE_TRANSFERS
+counts 16-byte transfers, not 64-byte cache lines.
+
+
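
A sketch of the k10 MEM formulas above (hypothetical counts), highlighting the different scaling factors: reads are counted in 64-byte cache lines, while the write event is scaled with the factor 8 given in the group's formula.

    # Illustrative only: evaluates the k10 MEM formulas above.
    runtime        = 1.0    # s
    nb_read_resp   = 4.0e8  # NORTHBRIDGE_READ_RESPONSE_ALL (PMC0)
    octword_writes = 8.0e8  # OCTWORDS_WRITE_TRANSFERS      (PMC1)
    dram_dct0      = 3.0e8  # DRAM_ACCESSES_DCT0_ALL        (PMC2)
    dram_dct1      = 3.0e8  # DRAM_ACCESSES_DCT1_ALL        (PMC3)

    read_bw  = 1.0e-06 * nb_read_resp   * 64.0 / runtime           # MBytes/s
    write_bw = 1.0e-06 * octword_writes *  8.0 / runtime           # MBytes/s, factor 8 as in the group
    mem_bw   = 1.0e-06 * (dram_dct0 + dram_dct1) * 64.0 / runtime  # MBytes/s
    print(read_bw, write_bw, mem_bw)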
diff --git a/groups/k10/NUMA.txt b/groups/k10/NUMA.txt
deleted file mode 100644
index 9734e3c..0000000
--- a/groups/k10/NUMA.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-SHORT Bandwidth on the Hypertransport links
-
-EVENTSET
-PMC0 CPU_TO_DRAM_LOCAL_TO_0
-PMC1 CPU_TO_DRAM_LOCAL_TO_1
-PMC2 CPU_TO_DRAM_LOCAL_TO_2
-PMC3 CPU_TO_DRAM_LOCAL_TO_3
-
-METRICS
-Runtime (RDTSC) [s] time
-Mega requests per second to Node 0 1.0E-06*PMC0/time
-Mega requests per second to Node 1 1.0E-06*PMC1/time
-Mega requests per second to Node 2 1.0E-06*PMC2/time
-Mega requests per second to Node 3 1.0E-06*PMC3/time
-
-LONG
-Formulas:
-Mega requests per second to Node X 1.0E-06*PMCX/time
--
-Profiling group to measure the traffic from local CPU to the different
-DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded
-code. You must first determine on which memory domains your code is running.
-A code should only have significant traffic to its own memory domain.
-
-
diff --git a/groups/k10/NUMA2.txt b/groups/k10/NUMA2.txt
deleted file mode 100644
index dbfbbb0..0000000
--- a/groups/k10/NUMA2.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-SHORT Bandwidth on the Hypertransport links
-
-EVENTSET
-PMC0 CPU_TO_DRAM_LOCAL_TO_4
-PMC1 CPU_TO_DRAM_LOCAL_TO_5
-PMC2 CPU_TO_DRAM_LOCAL_TO_6
-PMC3 CPU_TO_DRAM_LOCAL_TO_7
-
-METRICS
-Runtime (RDTSC) [s] time
-Hyper Transport link0 bandwidth (MBytes/s) 1.0E-06*PMC0*4.0/time
-Hyper Transport link1 bandwidth (MBytes/s) 1.0E-06*PMC1*4.0/time
-Hyper Transport link2 bandwidth (MBytes/s) 1.0E-06*PMC2*4.0/time
-Hyper Transport link3 bandwidth (MBytes/s) 1.0E-06*PMC3*4.0/time
-
-LONG
-Formulas:
-Hyper Transport linkn bandwidth (MBytes/s) 1.0E-06*HYPERTRANSPORT_LINK0_ALL_SENT*4.0/time
--
-Profiling group to measure the bandwidth over the Hypertransport links. Can be used
-to detect NUMA problems. Usually there should be only limited traffic over the QPI
-links for optimal performance.
-
-
diff --git a/groups/k10/NUMA_0_3.txt b/groups/k10/NUMA_0_3.txt
new file mode 100644
index 0000000..bdda6e0
--- /dev/null
+++ b/groups/k10/NUMA_0_3.txt
@@ -0,0 +1,27 @@
+SHORT Bandwidth on the Hypertransport links
+
+EVENTSET
+PMC0 CPU_TO_DRAM_LOCAL_TO_0
+PMC1 CPU_TO_DRAM_LOCAL_TO_1
+PMC2 CPU_TO_DRAM_LOCAL_TO_2
+PMC3 CPU_TO_DRAM_LOCAL_TO_3
+
+METRICS
+Runtime (RDTSC) [s] time
+Hyper Transport link0 bandwidth [MBytes/s] 1.0E-06*PMC0*4.0/time
+Hyper Transport link1 bandwidth [MBytes/s] 1.0E-06*PMC1*4.0/time
+Hyper Transport link2 bandwidth [MBytes/s] 1.0E-06*PMC2*4.0/time
+Hyper Transport link3 bandwidth [MBytes/s] 1.0E-06*PMC3*4.0/time
+
+LONG
+Formulas:
+Hyper Transport link0 bandwidth [MBytes/s] 1.0E-06*CPU_TO_DRAM_LOCAL_TO_0*4.0/time
+Hyper Transport link1 bandwidth [MBytes/s] 1.0E-06*CPU_TO_DRAM_LOCAL_TO_1*4.0/time
+Hyper Transport link2 bandwidth [MBytes/s] 1.0E-06*CPU_TO_DRAM_LOCAL_TO_2*4.0/time
+Hyper Transport link3 bandwidth [MBytes/s] 1.0E-06*CPU_TO_DRAM_LOCAL_TO_3*4.0/time
+-
+Profiling group to measure the bandwidth over the Hypertransport links. Can be used
+to detect NUMA problems. Usually there should be only limited traffic over the
+Hypertransport links for optimal performance.
+
+
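
As an illustration of the link-bandwidth formulas above (hypothetical counts, 4 bytes per counted event as given in the group):

    # Illustrative only: evaluates the NUMA_0_3 link-bandwidth formulas above.
    runtime = 1.0  # s
    cpu_to_dram = [6.0e8, 2.0e7, 1.0e7, 5.0e6]  # CPU_TO_DRAM_LOCAL_TO_0 .. _TO_3

    for node, count in enumerate(cpu_to_dram):
        bw = 1.0e-06 * count * 4.0 / runtime  # MBytes/s
        print(f"link{node}: {bw:.1f} MBytes/s")

On a well-placed code most of the traffic should go to the local node's link.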
diff --git a/groups/k10/NUMA_4_7.txt b/groups/k10/NUMA_4_7.txt
new file mode 100644
index 0000000..aa10be0
--- /dev/null
+++ b/groups/k10/NUMA_4_7.txt
@@ -0,0 +1,27 @@
+SHORT Bandwidth on the Hypertransport links
+
+EVENTSET
+PMC0 CPU_TO_DRAM_LOCAL_TO_4
+PMC1 CPU_TO_DRAM_LOCAL_TO_5
+PMC2 CPU_TO_DRAM_LOCAL_TO_6
+PMC3 CPU_TO_DRAM_LOCAL_TO_7
+
+METRICS
+Runtime (RDTSC) [s] time
+Hyper Transport link4 bandwidth [MBytes/s] 1.0E-06*PMC0*4.0/time
+Hyper Transport link5 bandwidth [MBytes/s] 1.0E-06*PMC1*4.0/time
+Hyper Transport link6 bandwidth [MBytes/s] 1.0E-06*PMC2*4.0/time
+Hyper Transport link7 bandwidth [MBytes/s] 1.0E-06*PMC3*4.0/time
+
+LONG
+Formulas:
+Hyper Transport link4 bandwidth [MBytes/s] 1.0E-06*CPU_TO_DRAM_LOCAL_TO_4*4.0/time
+Hyper Transport link5 bandwidth [MBytes/s] 1.0E-06*CPU_TO_DRAM_LOCAL_TO_5*4.0/time
+Hyper Transport link6 bandwidth [MBytes/s] 1.0E-06*CPU_TO_DRAM_LOCAL_TO_6*4.0/time
+Hyper Transport link7 bandwidth [MBytes/s] 1.0E-06*CPU_TO_DRAM_LOCAL_TO_7*4.0/time
+-
+Profiling group to measure the bandwidth over the Hypertransport links. Can be used
+to detect NUMA problems. Usually there should be only limited traffic over the
+Hypertransport links for optimal performance.
+
+
diff --git a/groups/k10/TLB.txt b/groups/k10/TLB.txt
index 2984491..2491c8d 100644
--- a/groups/k10/TLB.txt
+++ b/groups/k10/TLB.txt
@@ -26,10 +26,10 @@ L2 DTLB miss rate DTLB_L2_MISS_ALL / INSTRUCTIONS_RETIRED
L2 DTLB miss ratio DTLB_L2_MISS_ALL / (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)
-
L1 DTLB request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The DTLB miss rate gives a measure how often a TLB miss occured
+or how many data accesses you have on average per instruction.
+The DTLB miss rate gives a measure how often a TLB miss occurred
per instruction. And finally L1 DTLB miss ratio tells you how many
-of your memory references required caused a TLB miss in average.
+of your memory references caused a TLB miss on average.
NOTE: The L2 metrics are only relevant if L2 DTLB request rate is equal to the L1 DTLB miss rate!
This group was taken from the whitepaper Basic -Performance Measurements for AMD Athlon 64,
AMD Opteron and AMD Phenom Processors- from Paul J. Drongowski.
diff --git a/groups/k8/BRANCH.txt b/groups/k8/BRANCH.txt
index 64e10cd..f465335 100644
--- a/groups/k8/BRANCH.txt
+++ b/groups/k8/BRANCH.txt
@@ -4,28 +4,22 @@ EVENTSET
PMC0 INSTRUCTIONS_RETIRED
PMC1 BRANCH_RETIRED
PMC2 BRANCH_MISPREDICT_RETIRED
-PMC3 BRANCH_TAKEN_RETIRED
METRICS
Runtime (RDTSC) [s] time
Branch rate PMC1/PMC0
Branch misprediction rate PMC2/PMC0
Branch misprediction ratio PMC2/PMC1
-Branch taken rate PMC3/PMC0
-Branch taken ratio PMC3/PMC1
Instructions per branch PMC0/PMC1
LONG
Formulas:
-Branch rate = BRANCH_RETIRED / INSTRUCTIONS_RETIRED
-Branch misprediction rate = BRANCH_MISPREDICT_RETIRED / INSTRUCTIONS_RETIRED
-Branch misprediction ratio = BRANCH_MISPREDICT_RETIRED / BRANCH_RETIRED
-Branch taken rate = BRANCH_TAKEN_RETIRED / INSTRUCTIONS_RETIRED
-Branch taken ratio = BRANCH_TAKEN_RETIRED / BRANCH_RETIRED
-Instructions per branch = INSTRUCTIONS_RETIRED / BRANCH_RETIRED
+Branch rate = BRANCH_RETIRED/INSTRUCTIONS_RETIRED
+Branch misprediction rate = BRANCH_MISPREDICT_RETIRED/INSTRUCTIONS_RETIRED
+Branch misprediction ratio = BRANCH_MISPREDICT_RETIRED/BRANCH_RETIRED
+Instructions per branch = INSTRUCTIONS_RETIRED/BRANCH_RETIRED
-
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
into relation what ratio of all branch instructions were mispredicted.
-Instructions per branch is 1/Branch rate. The same applies for the branches
-taken metrics.
+Instructions per branch is 1/branch rate.
diff --git a/groups/k8/CACHE.txt b/groups/k8/CACHE.txt
index ff20b5e..e5e813e 100644
--- a/groups/k8/CACHE.txt
+++ b/groups/k8/CACHE.txt
@@ -8,26 +8,26 @@ PMC3 DATA_CACHE_REFILLS_NORTHBRIDGE_ALL
METRICS
Runtime (RDTSC) [s] time
-Data cache misses PMC2+PMC3
-Data cache request rate PMC1/PMC0
-Data cache miss rate (PMC2+PMC3)/PMC0
-Data cache miss ratio (PMC2+PMC3)/PMC1
+data cache misses PMC2+PMC3
+data cache request rate PMC1/PMC0
+data cache miss rate (PMC2+PMC3)/PMC0
+data cache miss ratio (PMC2+PMC3)/PMC1
LONG
Formulas:
-Data cache misses = DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL
-Data cache request rate = DATA_CACHE_ACCESSES / INSTRUCTIONS_RETIRED
-Data cache miss rate = (DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/INSTRUCTIONS_RETIRED
-Data cache miss ratio = (DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/DATA_CACHE_ACCESSES
+data cache misses = DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL
+data cache request rate = DATA_CACHE_ACCESSES / INSTRUCTIONS_RETIRED
+data cache miss rate = (DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/INSTRUCTIONS_RETIRED
+data cache miss ratio = (DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/DATA_CACHE_ACCESSES
-
This group measures the locality of your data accesses with regard to the
-L1 Cache. Data cache request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The Data cache miss rate gives a measure how often it was necessary to get
-cachelines from higher levels of the memory hierarchy. And finally
-Data cache miss ratio tells you how many of your memory references required
-a cacheline to be loaded from a higher level. While the Data cache miss rate
-might be given by your algorithm you should try to get Data cache miss ratio
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm you should try to get data cache miss ratio
as low as possible by increasing your cache reuse.
This group was taken from the whitepaper -Basic Performance Measurements for AMD Athlon 64,
AMD Opteron and AMD Phenom Processors- from Paul J. Drongowski.
diff --git a/groups/k8/CPI.txt b/groups/k8/CPI.txt
index 6595c2d..850afed 100644
--- a/groups/k8/CPI.txt
+++ b/groups/k8/CPI.txt
@@ -13,6 +13,11 @@ CPI (based on uops) PMC1/PMC2
IPC PMC0/PMC1
LONG
+Formulas:
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
+IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
+-
This group measures how efficiently the processor works with
regard to instruction throughput. Also important as a standalone
metric is INSTRUCTIONS_RETIRED as it tells you how many instruction
diff --git a/groups/k8/ICACHE.txt b/groups/k8/ICACHE.txt
index 222ea5d..5150496 100644
--- a/groups/k8/ICACHE.txt
+++ b/groups/k8/ICACHE.txt
@@ -8,18 +8,16 @@ PMC3 ICACHE_REFILLS_MEM
METRICS
Runtime (RDTSC) [s] time
-Instruction cache misses PMC2+PMC3
-Instruction cache request rate PMC1/PMC0
-Instruction cache miss rate (PMC2+PMC3)/PMC0
-Instruction cache miss ratio (PMC2+PMC3)/PMC1
+L1I request rate PMC1/PMC0
+L1I miss rate (PMC2+PMC3)/PMC0
+L1I miss ratio (PMC2+PMC3)/PMC1
LONG
Formulas:
-Instruction cache misses ICACHE_REFILLS_L2 + ICACHE_REFILLS_MEM
-Instruction cache request rate ICACHE_FETCHES / INSTRUCTIONS_RETIRED
-Instruction cache miss rate (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/INSTRUCTIONS_RETIRED
-Instruction cache miss ratio (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/ICACHE_FETCHES
+L1I request rate = ICACHE_FETCHES / INSTRUCTIONS_RETIRED
+L1I miss rate = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/INSTRUCTIONS_RETIRED
+L1I miss ratio = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/ICACHE_FETCHES
-
This group measures the locality of your instruction code with regard to the
-L1 I-Cache.
+L1 I-Cache.
diff --git a/groups/k8/L2.txt b/groups/k8/L2.txt
index 58eae3b..c3ad517 100644
--- a/groups/k8/L2.txt
+++ b/groups/k8/L2.txt
@@ -21,8 +21,8 @@ L2 refill bandwidth [MBytes/s] 1.0E-06*DATA_CACHE_REFILLS_L2_ALL*64/time
L2 evict [MBytes/s] 1.0E-06*DATA_CACHE_EVICTED_ALL*64/time
-
Profiling group to measure L2 cache bandwidth. The bandwidth is
-computed by the number of cacheline loaded from L2 to L1 and the
-number of modified cachelines evicted from the L1.
+computed by the number of cache lines loaded from L2 to L1 and the
+number of modified cache lines evicted from the L1.
Note that this bandwidth also includes data transfers due to a
write allocate load on a store miss in L1 and copy back transfers if
originated from L2.
diff --git a/groups/kabini/BRANCH.txt b/groups/kabini/BRANCH.txt
index 1ae9f36..7495b74 100644
--- a/groups/kabini/BRANCH.txt
+++ b/groups/kabini/BRANCH.txt
@@ -4,29 +4,23 @@ EVENTSET
PMC0 RETIRED_INSTRUCTIONS
PMC1 RETIRED_BRANCH_INSTR
PMC2 RETIRED_MISPREDICTED_BRANCH_INSTR
-PMC3 RETIRED_TAKEN_BRANCH_INSTR
METRICS
Runtime (RDTSC) [s] time
Branch rate PMC1/PMC0
Branch misprediction rate PMC2/PMC0
Branch misprediction ratio PMC2/PMC1
-Branch taken rate PMC3/PMC0
-Branch taken ratio PMC3/PMC1
Instructions per branch PMC0/PMC1
LONG
Formulas:
-Branch rate = RETIRED_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch misprediction rate = RETIRED_MISPREDICTED_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch misprediction ratio = RETIRED_MISPREDICTED_BRANCH_INSTR / RETIRED_BRANCH_INSTR
-Branch taken rate = RETIRED_TAKEN_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch taken ratio = RETIRED_TAKEN_BRANCH_INSTR / RETIRED_BRANCH_INSTR
-Instructions per branch = RETIRED_INSTRUCTIONS / RETIRED_BRANCH_INSTR
+Branch rate = RETIRED_BRANCH_INSTR/RETIRED_INSTRUCTIONS
+Branch misprediction rate = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_INSTRUCTIONS
+Branch misprediction ratio = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_BRANCH_INSTR
+Instructions per branch = RETIRED_INSTRUCTIONS/RETIRED_BRANCH_INSTR
-
-The rates state how often in average a branch or a mispredicted branch occured
+The rates state how often on average a branch or a mispredicted branch occurred
per instruction retired in total. The branch misprediction ratio sets directly
into relation what ratio of all branch instructions were mispredicted.
-Instructions per branch is 1/Branch rate. The same applies for the branches
-taken metrics.
+Instructions per branch is 1/branch rate.
diff --git a/groups/kabini/CACHE.txt b/groups/kabini/CACHE.txt
index ef62f76..8a59288 100644
--- a/groups/kabini/CACHE.txt
+++ b/groups/kabini/CACHE.txt
@@ -8,25 +8,25 @@ PMC3 DATA_CACHE_REFILLS_NB_ALL
METRICS
Runtime (RDTSC) [s] time
-Data cache misses PMC2+PMC3
-Data cache request rate PMC1/PMC0
-Data cache miss rate (PMC2+PMC3)/PMC0
-Data cache miss ratio (PMC2+PMC3)/PMC1
+data cache misses PMC2+PMC3
+data cache request rate PMC1/PMC0
+data cache miss rate (PMC2+PMC3)/PMC0
+data cache miss ratio (PMC2+PMC3)/PMC1
LONG
Formulas:
-Data cache misses = DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL
-Data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS
-Data cache miss rate = (DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL)/RETIRED_INSTRUCTIONS
-Data cache miss ratio = (DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL)/DATA_CACHE_ACCESSES
+data cache misses = DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL
+data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS
+data cache miss rate = (DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL)/RETIRED_INSTRUCTIONS
+data cache miss ratio = (DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL)/DATA_CACHE_ACCESSES
-
This group measures the locality of your data accesses with regard to the
-L1 Cache. Data cache request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The Data cache miss rate gives a measure how often it was necessary to get
-cachelines from higher levels of the memory hierarchy. And finally
-Data cache miss ratio tells you how many of your memory references required
-a cacheline to be loaded from a higher level. While the Data cache miss rate
-might be given by your algorithm you should try to get Data cache miss ratio
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm you should try to get data cache miss ratio
as low as possible by increasing your cache reuse.
diff --git a/groups/kabini/CPI.txt b/groups/kabini/CPI.txt
index 47711f4..c0746e7 100644
--- a/groups/kabini/CPI.txt
+++ b/groups/kabini/CPI.txt
@@ -13,6 +13,11 @@ CPI (based on uops) PMC1/PMC2
IPC PMC0/PMC1
LONG
+Formulas:
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
+IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
+-
This group measures how efficiently the processor works with
regard to instruction throughput. Also important as a standalone
metric is RETIRED_INSTRUCTIONS as it tells you how many instruction
diff --git a/groups/kabini/DATA.txt b/groups/kabini/DATA.txt
index 78e4c3c..75f1f60 100644
--- a/groups/kabini/DATA.txt
+++ b/groups/kabini/DATA.txt
@@ -6,11 +6,11 @@ PMC1 LS_DISPATCH_STORES
METRICS
Runtime (RDTSC) [s] time
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
LONG
Formulas:
-Load to Store ratio = LS_DISPATCH_LOADS / LS_DISPATCH_STORES
+Load to store ratio = LS_DISPATCH_LOADS/LS_DISPATCH_STORES
-
This is a simple metric to determine your load to store ratio.
diff --git a/groups/kabini/FLOPS_DP.txt b/groups/kabini/FLOPS_DP.txt
index d7f5f57..d6af2e2 100644
--- a/groups/kabini/FLOPS_DP.txt
+++ b/groups/kabini/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
EVENTSET
PMC0 RETIRED_INSTRUCTIONS
@@ -9,15 +9,18 @@ PMC3 RETIRED_FLOPS_DOUBLE_ALL
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] PMC1*inverseClock
-MFlops/s 1.0E-06*(PMC3)/time
+DP MFLOP/s 1.0E-06*(PMC3)/time
CPI PMC1/PMC0
CPI (based on uops) PMC1/PMC2
IPC PMC0/PMC1
LONG
Formulas:
-DP MFlops/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
+DP MFLOP/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
+IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
-
-Profiling group to measure double precisision flop rate.
+Profiling group to measure double precision FLOP rate.
diff --git a/groups/kabini/FLOPS_SP.txt b/groups/kabini/FLOPS_SP.txt
index 1c4dcc3..0fe4e54 100644
--- a/groups/kabini/FLOPS_SP.txt
+++ b/groups/kabini/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
EVENTSET
PMC0 RETIRED_INSTRUCTIONS
@@ -9,15 +9,18 @@ PMC3 RETIRED_FLOPS_SINGLE_ALL
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] PMC1*inverseClock
-MFlops/s 1.0E-06*(PMC3)/time
+SP MFLOP/s 1.0E-06*(PMC3)/time
CPI PMC1/PMC0
CPI (based on uops) PMC1/PMC2
IPC PMC0/PMC1
LONG
Formulas:
-SP MFlops/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
+SP MFLOP/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
+IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
-
-Profiling group to measure single precision flop rate.
+Profiling group to measure single precision FLOP rate.
diff --git a/groups/kabini/FPU_EXCEPTION.txt b/groups/kabini/FPU_EXCEPTION.txt
index 23814da..5ed02c6 100644
--- a/groups/kabini/FPU_EXCEPTION.txt
+++ b/groups/kabini/FPU_EXCEPTION.txt
@@ -15,7 +15,7 @@ Formulas:
Overall FP exception rate = FPU_EXCEPTIONS_ALL / RETIRED_INSTRUCTIONS
FP exception rate = FPU_EXCEPTIONS_ALL / FP_INSTRUCTIONS_RETIRED_ALL
-
-Floating point exceptions occur e.g. on the treatment of Denormals.
+Floating point exceptions occur e.g. on the treatment of denormal numbers.
There might be a large penalty if there are too many floating point
exceptions.
diff --git a/groups/kabini/ICACHE.txt b/groups/kabini/ICACHE.txt
index be5e5f5..62b91d6 100644
--- a/groups/kabini/ICACHE.txt
+++ b/groups/kabini/ICACHE.txt
@@ -8,18 +8,16 @@ PMC3 RETIRED_INSTRUCTIONS
METRICS
Runtime (RDTSC) [s] time
-Instruction cache misses PMC1+PMC2
-Instruction cache request rate PMC0/PMC3
-Instruction cache miss rate (PMC1+PMC2)/PMC3
-Instruction cache miss ratio (PMC1+PMC2)/PMC0
+L1I request rate PMC0/PMC3
+L1I miss rate (PMC1+PMC2)/PMC3
+L1I miss ratio (PMC1+PMC2)/PMC0
LONG
Formulas:
-Instruction cache misses INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS
-Instruction cache request rate INSTRUCTION_CACHE_FETCHES / RETIRED_INSTRUCTIONS
-Instruction cache miss rate (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS
-Instruction cache miss ratio (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/INSTRUCTION_CACHE_FETCHES
+L1I request rate = INSTRUCTION_CACHE_FETCHES / RETIRED_INSTRUCTIONS
+L1I miss rate = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS
+L1I miss ratio = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/INSTRUCTION_CACHE_FETCHES
-
This group measures the locality of your instruction code with regard to the
-L1 I-Cache.
+L1 I-Cache.
diff --git a/groups/kabini/L2.txt b/groups/kabini/L2.txt
index d06d809..3598a54 100644
--- a/groups/kabini/L2.txt
+++ b/groups/kabini/L2.txt
@@ -8,21 +8,25 @@ PMC2 CPU_CLOCKS_UNHALTED
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] PMC2*inverseClock
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
-L2 refill bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 evict [MBytes/s] 1.0E-06*PMC1*64.0/time
LONG
Formulas:
-L2 bandwidth [MBytes/s] 1.0E-06*(DATA_CACHE_REFILLS_ALL+DATA_CACHE_EVICTED_ALL)*64/time
-L2 data volume [GBytes] 1.0E-09*(DATA_CACHE_REFILLS_ALL+DATA_CACHE_EVICTED_ALL)*64
-L2 refill bandwidth [MBytes/s] 1.0E-06*DATA_CACHE_REFILLS_ALL*64/time
-L2 evict [MBytes/s] 1.0E-06*DATA_CACHE_EVICTED_ALL*64/time
+L2D load bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_ALL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*DATA_CACHE_REFILLS_ALL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_EVICTED_ALL*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*DATA_CACHE_EVICTED_ALL*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_ALL+DATA_CACHE_EVICTED_ALL)*64/time
+L2 data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_ALL+DATA_CACHE_EVICTED_ALL)*64
-
Profiling group to measure L2 cache bandwidth. The bandwidth is
-computed by the number of cacheline loaded from L2 to L1 and the
-number of modified cachelines evicted from the L1.
+computed by the number of cache line loaded from L2 to L1 and the
+number of modified cache lines evicted from the L1.
Note that this bandwidth also includes data transfers due to a
write allocate load on a store miss in L1 and copy back transfers if
originated from L2.
diff --git a/groups/kabini/MEM.txt b/groups/kabini/MEM.txt
index 22aa19e..2fa9dfe 100644
--- a/groups/kabini/MEM.txt
+++ b/groups/kabini/MEM.txt
@@ -16,5 +16,5 @@ Memory data volume [GBytes] = 1.0E-09*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1
-
Profiling group to measure memory bandwidth drawn by all cores of a socket.
Note: As this group measures the accesses from all cores it only makes sense
-to measure with one core per socket, similiar as with the Intel Nehalem Uncore events.
+to measure with one core per socket, similar to the Intel Nehalem Uncore events.
diff --git a/groups/kabini/NUMA.txt b/groups/kabini/NUMA.txt
deleted file mode 100644
index d94e735..0000000
--- a/groups/kabini/NUMA.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-SHORT Read/Write Events between the ccNUMA nodes
-
-EVENTSET
-UPMC0 UNC_CPU_TO_DRAM_LOCAL_TO_0
-UPMC1 UNC_CPU_TO_DRAM_LOCAL_TO_1
-UPMC2 UNC_CPU_TO_DRAM_LOCAL_TO_2
-UPMC3 UNC_CPU_TO_DRAM_LOCAL_TO_3
-
-METRICS
-Runtime (RDTSC) [s] time
-DRAM read/write local to 0 [MegaEvents/s] 1.0E-06*UPMC0/time
-DRAM read/write local to 1 [MegaEvents/s] 1.0E-06*UPMC1/time
-DRAM read/write local to 2 [MegaEvents/s] 1.0E-06*UPMC2/time
-DRAM read/write local to 3 [MegaEvents/s] 1.0E-06*UPMC3/time
-
-LONG
-Formulas:
-DRAM read/write local to 0 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time
-DRAM read/write local to 1 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time
-DRAM read/write local to 2 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
-DRAM read/write local to 3 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
--
-Profiling group to measure the traffic from local CPU to the different
-DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded
-code. You must first determine on which memory domains your code is running.
-A code should only have significant traffic to its own memory domain.
-
-
diff --git a/groups/kabini/NUMA2.txt b/groups/kabini/NUMA2.txt
deleted file mode 100644
index b10e6fb..0000000
--- a/groups/kabini/NUMA2.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-SHORT Read/Write Events between the ccNUMA nodes
-
-EVENTSET
-UPMC0 UNC_CPU_TO_DRAM_LOCAL_TO_4
-UPMC1 UNC_CPU_TO_DRAM_LOCAL_TO_5
-UPMC2 UNC_CPU_TO_DRAM_LOCAL_TO_6
-UPMC3 UNC_CPU_TO_DRAM_LOCAL_TO_7
-
-METRICS
-Runtime (RDTSC) [s] time
-DRAM read/write local to 4 [MegaEvents/s] 1.0E-06*UPMC0/time
-DRAM read/write local to 5 [MegaEvents/s] 1.0E-06*UPMC1/time
-DRAM read/write local to 6 [MegaEvents/s] 1.0E-06*UPMC2/time
-DRAM read/write local to 7 [MegaEvents/s] 1.0E-06*UPMC3/time
-
-LONG
-Formulas:
-DRAM read/write local to 4 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_4/time
-DRAM read/write local to 5 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_5/time
-DRAM read/write local to 6 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_6/time
-DRAM read/write local to 7 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_7/time
--
-Profiling group to measure the traffic from local CPU to the different
-DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded
-code. You must first determine on which memory domains your code is running.
-A code should only have significant traffic to its own memory domain.
-
-
diff --git a/groups/kabini/NUMA_0_3.txt b/groups/kabini/NUMA_0_3.txt
new file mode 100644
index 0000000..ed13dbe
--- /dev/null
+++ b/groups/kabini/NUMA_0_3.txt
@@ -0,0 +1,28 @@
+SHORT Read/Write Events between the ccNUMA nodes
+
+EVENTSET
+UPMC0 UNC_CPU_TO_DRAM_LOCAL_TO_0
+UPMC1 UNC_CPU_TO_DRAM_LOCAL_TO_1
+UPMC2 UNC_CPU_TO_DRAM_LOCAL_TO_2
+UPMC3 UNC_CPU_TO_DRAM_LOCAL_TO_3
+
+METRICS
+Runtime (RDTSC) [s] time
+DRAM read/write local to 0 [MegaEvents/s] 1.0E-06*UPMC0/time
+DRAM read/write local to 1 [MegaEvents/s] 1.0E-06*UPMC1/time
+DRAM read/write local to 2 [MegaEvents/s] 1.0E-06*UPMC2/time
+DRAM read/write local to 3 [MegaEvents/s] 1.0E-06*UPMC3/time
+
+LONG
+Formulas:
+DRAM read/write local to 0 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time
+DRAM read/write local to 1 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time
+DRAM read/write local to 2 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
+DRAM read/write local to 3 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
+-
+Profiling group to measure the traffic from local CPU to the different
+DRAM NUMA nodes. This group allows you to detect NUMA problems in threaded
+code. You must first determine on which memory domains your code is running.
+A code should only have significant traffic to its own memory domain.
+
+
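For the NUMA_0_3 group added above, the MegaEvents/s metrics are plain scalings of a
single raw counter: 1.0E-06 times the event count divided by the RDTSC runtime. A
minimal C sketch of that arithmetic, using made-up counter values rather than measured
data:

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical raw counts; in a real run these would come from the
         * UPMC0..UPMC3 uncore counters of the NUMA_0_3 group. */
        double upmc[4] = { 4.2e8, 1.1e6, 0.9e6, 1.0e6 };
        double time = 2.0;               /* RDTSC runtime in seconds */

        for (int node = 0; node < 4; node++) {
            /* METRICS line: 1.0E-06*UPMCn/time */
            double rate = 1.0e-06 * upmc[node] / time;
            printf("DRAM read/write local to %d: %.2f MegaEvents/s\n", node, rate);
        }
        return 0;
    }
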
diff --git a/groups/kabini/NUMA_4_7.txt b/groups/kabini/NUMA_4_7.txt
new file mode 100644
index 0000000..b744881
--- /dev/null
+++ b/groups/kabini/NUMA_4_7.txt
@@ -0,0 +1,28 @@
+SHORT Read/Write Events between the ccNUMA nodes
+
+EVENTSET
+UPMC0 UNC_CPU_TO_DRAM_LOCAL_TO_4
+UPMC1 UNC_CPU_TO_DRAM_LOCAL_TO_5
+UPMC2 UNC_CPU_TO_DRAM_LOCAL_TO_6
+UPMC3 UNC_CPU_TO_DRAM_LOCAL_TO_7
+
+METRICS
+Runtime (RDTSC) [s] time
+DRAM read/write local to 4 [MegaEvents/s] 1.0E-06*UPMC0/time
+DRAM read/write local to 5 [MegaEvents/s] 1.0E-06*UPMC1/time
+DRAM read/write local to 6 [MegaEvents/s] 1.0E-06*UPMC2/time
+DRAM read/write local to 7 [MegaEvents/s] 1.0E-06*UPMC3/time
+
+LONG
+Formulas:
+DRAM read/write local to 4 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_4/time
+DRAM read/write local to 5 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_5/time
+DRAM read/write local to 6 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_6/time
+DRAM read/write local to 7 [MegaEvents/s] 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_7/time
+-
+Profiling group to measure the traffic from local CPU to the different
+DRAM NUMA nodes. This group allows you to detect NUMA problems in threaded
+code. You must first determine on which memory domains your code is running.
+A code should only have significant traffic to its own memory domain.
+
+
diff --git a/groups/kabini/TLB.txt b/groups/kabini/TLB.txt
index 4f170ee..707f888 100644
--- a/groups/kabini/TLB.txt
+++ b/groups/kabini/TLB.txt
@@ -26,8 +26,9 @@ L2 DTLB miss rate DTLB_MISS_ALL / RETIRED_INSTRUCTIONS
L2 DTLB miss ratio DTLB_MISS_ALL / (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)
-
L1 DTLB request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The DTLB miss rate gives a measure how often a TLB miss occured
+or how many data accesses you have on average per instruction.
+The DTLB miss rate gives a measure of how often a TLB miss occurred
per instruction. And finally L1 DTLB miss ratio tells you how many
-of your memory references required caused a TLB miss in average.
-NOTE: The L2 metrics are only relevant if L2 DTLB request rate is equal to the L1 DTLB miss rate!
+of your memory references caused a TLB miss on average.
+NOTE: The L2 metrics are only relevant if L2 DTLB request rate is
+equal to the L1 DTLB miss rate!
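The request rate, miss rate and miss ratio above are simple quotients of the raw
counts. A small C sketch of how they relate, with invented counter values (not
measured data):

    #include <stdio.h>

    int main(void)
    {
        /* Invented raw counts for illustration only. */
        double instr_retired = 1.0e9;   /* retired instructions  */
        double dtlb_requests = 4.0e8;   /* L1 DTLB requests      */
        double dtlb_misses   = 2.0e6;   /* L1 DTLB misses        */

        double request_rate = dtlb_requests / instr_retired;  /* requests per instruction */
        double miss_rate    = dtlb_misses   / instr_retired;  /* misses per instruction   */
        double miss_ratio   = dtlb_misses   / dtlb_requests;  /* misses per request       */

        printf("L1 DTLB request rate: %g\n", request_rate);
        printf("L1 DTLB miss rate:    %g\n", miss_rate);
        printf("L1 DTLB miss ratio:   %g\n", miss_ratio);
        return 0;
    }
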
diff --git a/groups/nehalem/BRANCH.txt b/groups/nehalem/BRANCH.txt
index 3d81416..1ef9f11 100644
--- a/groups/nehalem/BRANCH.txt
+++ b/groups/nehalem/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch FIXC0/PMC0
LONG
Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
-
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
into relation what ratio of all branch instructions were mispredicted.
-Instructions per branch is 1/Branch rate.
+Instructions per branch is 1/branch rate.
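The statement that instructions per branch is the reciprocal of the branch rate
follows directly from the formulas. A short C sketch with invented counts:

    #include <stdio.h>

    int main(void)
    {
        /* Invented example counts, not measured data. */
        double instr_retired = 1.0e9;   /* INSTR_RETIRED_ANY             */
        double branches      = 2.0e8;   /* BR_INST_RETIRED_ALL_BRANCHES  */
        double mispredicted  = 4.0e6;   /* BR_MISP_RETIRED_ALL_BRANCHES  */

        double branch_rate      = branches / instr_retired;
        double mispredict_rate  = mispredicted / instr_retired;
        double mispredict_ratio = mispredicted / branches;
        double instr_per_branch = instr_retired / branches;

        printf("Branch rate:                %g\n", branch_rate);
        printf("Branch misprediction rate:  %g\n", mispredict_rate);
        printf("Branch misprediction ratio: %g\n", mispredict_ratio);
        /* Instructions per branch is the reciprocal of the branch rate. */
        printf("Instructions per branch:    %g (= 1/%g)\n",
               instr_per_branch, branch_rate);
        return 0;
    }
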
diff --git a/groups/nehalem/CACHE.txt b/groups/nehalem/CACHE.txt
index c3e989c..6603171 100644
--- a/groups/nehalem/CACHE.txt
+++ b/groups/nehalem/CACHE.txt
@@ -12,24 +12,25 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Data cache misses PMC0
-Data cache request rate PMC1/FIXC0
-Data cache miss rate PMC0/FIXC0
-Data cache miss ratio PMC0/PMC1
+data cache misses PMC0
+data cache request rate PMC1/FIXC0
+data cache miss rate PMC0/FIXC0
+data cache miss ratio PMC0/PMC1
LONG
Formulas:
-Data cache request rate = L1D_ALL_REF_ANY / INSTR_RETIRED_ANY
-Data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
-Data cache miss ratio = L1D_REPL / L1D_ALL_REF_ANY
+data cache misses = L1D_REPL
+data cache request rate = L1D_ALL_REF_ANY / INSTR_RETIRED_ANY
+data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
+data cache miss ratio = L1D_REPL / L1D_ALL_REF_ANY
-
This group measures the locality of your data accesses with regard to the
-L1 Cache. Data cache request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The Data cache miss rate gives a measure how often it was necessary to get
-cachelines from higher levels of the memory hierarchy. And finally
-Data cache miss ratio tells you how many of your memory references required
-a cacheline to be loaded from a higher level. While the Data cache miss rate
-might be given by your algorithm you should try to get Data cache miss ratio
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure of how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm you should try to get data cache miss ratio
as low as possible by increasing your cache reuse.
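The three cache metrics are ratios of the same raw counts, so the miss rate always
equals the request rate times the miss ratio. A C sketch with assumed counts
(illustrative only):

    #include <stdio.h>

    int main(void)
    {
        /* Assumed raw counts, not measured data. */
        double instr_retired = 1.0e9;   /* FIXC0: INSTR_RETIRED_ANY */
        double l1d_refs      = 4.0e8;   /* PMC1:  L1D_ALL_REF_ANY   */
        double l1d_repl      = 8.0e6;   /* PMC0:  L1D_REPL          */

        double request_rate = l1d_refs / instr_retired;
        double miss_rate    = l1d_repl / instr_retired;
        double miss_ratio   = l1d_repl / l1d_refs;

        printf("data cache request rate: %g\n", request_rate);
        printf("data cache miss rate:    %g\n", miss_rate);
        printf("data cache miss ratio:   %g\n", miss_ratio);
        /* Consistency check: miss rate = request rate * miss ratio. */
        printf("request rate * miss ratio = %g\n", request_rate * miss_ratio);
        return 0;
    }
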
diff --git a/groups/nehalem/DATA.txt b/groups/nehalem/DATA.txt
index a5611bc..31bba51 100644
--- a/groups/nehalem/DATA.txt
+++ b/groups/nehalem/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
LONG
Formulas:
-Load to Store ratio = MEM_INST_RETIRED_LOADS / MEM_INST_RETIRED_STORES
+Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES
-
-This is a simple metric to determine your Load to store ratio.
+This is a simple metric to determine your load to store ratio.
diff --git a/groups/nehalem/FLOPS_DP.txt b/groups/nehalem/FLOPS_DP.txt
index c5ba91c..3e75cad 100644
--- a/groups/nehalem/FLOPS_DP.txt
+++ b/groups/nehalem/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-DP MFlops/s 1.0E-06*(PMC0*2.0+PMC1)/time
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1)/time
Packed MUOPS/s 1.0E-06*PMC0/time
Scalar MUOPS/s 1.0E-06*PMC1/time
SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,10 +22,14 @@ DP MUOPS/s 1.0E-06*PMC3/time
LONG
Formula:
-DP MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
-
-The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
-Therefore both Single as well as Double precision are measured to ensure the correctness
+The Nehalem provides no way to measure MFLOP/s if mixed precision calculations are done.
+Therefore both single and double precision are measured to ensure the correctness
of the measurements. You can check whether your code was vectorized by comparing
-FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
+FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
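The factor 2 in the MFLOP/s formula reflects that one packed SSE uop carries two
double-precision operations while a scalar uop carries one. A C sketch of the
computation with invented counts:

    #include <stdio.h>

    int main(void)
    {
        /* Invented counts for illustration. */
        double packed  = 5.0e8;   /* FP_COMP_OPS_EXE_SSE_FP_PACKED */
        double scalar  = 1.0e8;   /* FP_COMP_OPS_EXE_SSE_FP_SCALAR */
        double runtime = 2.0;     /* seconds */

        /* Each packed SSE uop performs 2 double-precision operations,
         * each scalar uop performs 1, hence the factor 2. */
        double mflops       = 1.0e-06 * (packed * 2.0 + scalar) / runtime;
        double packed_muops = 1.0e-06 * packed / runtime;
        double scalar_muops = 1.0e-06 * scalar / runtime;

        printf("MFLOP/s:        %.1f\n", mflops);
        printf("Packed MUOPS/s: %.1f\n", packed_muops);
        printf("Scalar MUOPS/s: %.1f\n", scalar_muops);
        /* The packed/scalar ratio indicates the degree of vectorization. */
        printf("packed/scalar uop ratio: %.2f\n", packed / scalar);
        return 0;
    }
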
diff --git a/groups/nehalem/FLOPS_SP.txt b/groups/nehalem/FLOPS_SP.txt
index 4478c8f..9768109 100644
--- a/groups/nehalem/FLOPS_SP.txt
+++ b/groups/nehalem/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1)/time
Packed MUOPS/s 1.0E-06*PMC0/time
Scalar MUOPS/s 1.0E-06*PMC1/time
SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,10 +22,14 @@ DP MUOPS/s 1.0E-06*PMC3/time
LONG
Formula:
-SP MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
-
-The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
-Therefore both Single as well as Double precision are measured to ensure the correctness
+The Nehalem provides no way to measure MFLOP/s if mixed precision calculations are done.
+Therefore both single and double precision are measured to ensure the correctness
of the measurements. You can check whether your code was vectorized by comparing
-FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
+FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
diff --git a/groups/nehalem/FLOPS_X87.txt b/groups/nehalem/FLOPS_X87.txt
index 6447b93..a4176f0 100644
--- a/groups/nehalem/FLOPS_X87.txt
+++ b/groups/nehalem/FLOPS_X87.txt
@@ -1,4 +1,4 @@
-SHORT X87 MFlops/s
+SHORT X87 MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -11,8 +11,8 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-X87 MFlops/s 1.0E-06*PMC0/time
+X87 MFLOP/s 1.0E-06*PMC0/time
LONG
-Profiling group to measure X87 flop rate.
+Profiling group to measure X87 FLOP rate.
diff --git a/groups/nehalem/ICACHE.txt b/groups/nehalem/ICACHE.txt
new file mode 100644
index 0000000..49943ff
--- /dev/null
+++ b/groups/nehalem/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1I_READS
+PMC1 L1I_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = L1I_READS / INSTR_RETIRED_ANY
+L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = L1I_MISSES / L1I_READS
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/nehalem/L2.txt b/groups/nehalem/L2.txt
index d193047..e2715cc 100644
--- a/groups/nehalem/L2.txt
+++ b/groups/nehalem/L2.txt
@@ -6,27 +6,35 @@ FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 L1D_REPL
PMC1 L1D_M_EVICT
+PMC2 L1I_MISSES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
LONG
Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPL*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64
-
Profiling group to measure L2 cache bandwidth. The bandwidth is
-computed by the number of cacheline allocated in the L1 and the
-number of modified cachelines evicted from the L1.
+computed by the number of cache lines allocated in the L1 and the
+number of modified cache lines evicted from the L1. Also reports on
+total data volume transferred between L2 and L1 cache.
Note that this bandwidth also includes data transfers due to a
-write allocate load on a store miss in L1.
+write allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
+
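The bandwidth figures above are cache-line counts scaled by 64 bytes and divided by
the runtime; the total now also folds in L1 instruction cache refills. A C sketch
with assumed counts (illustrative only):

    #include <stdio.h>

    int main(void)
    {
        const double line_size = 64.0;   /* bytes per cache line */

        /* Assumed raw counts, not measured data. */
        double l1d_repl   = 3.0e8;   /* PMC0: lines loaded into L1D    */
        double l1d_evict  = 1.0e8;   /* PMC1: modified lines evicted   */
        double l1i_misses = 2.0e6;   /* PMC2: lines loaded into L1I    */
        double time       = 2.0;     /* seconds */

        double load_bw  = 1.0e-06 * l1d_repl  * line_size / time;
        double evict_bw = 1.0e-06 * l1d_evict * line_size / time;
        /* Total L2 traffic also counts instruction cache refills. */
        double total_bw = 1.0e-06 * (l1d_repl + l1d_evict + l1i_misses) * line_size / time;
        double volume   = 1.0e-09 * (l1d_repl + l1d_evict + l1i_misses) * line_size;

        printf("L2D load bandwidth:  %.1f MBytes/s\n", load_bw);
        printf("L2D evict bandwidth: %.1f MBytes/s\n", evict_bw);
        printf("L2 bandwidth:        %.1f MBytes/s\n", total_bw);
        printf("L2 data volume:      %.3f GBytes\n", volume);
        return 0;
    }
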
diff --git a/groups/nehalem/L2CACHE.txt b/groups/nehalem/L2CACHE.txt
index 0fd60da..343b263 100644
--- a/groups/nehalem/L2CACHE.txt
+++ b/groups/nehalem/L2CACHE.txt
@@ -4,7 +4,7 @@ EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
-PMC0 L2_DATA_RQSTS_DEMAND_ANY
+PMC0 L2_RQSTS_REFERENCES
PMC1 L2_RQSTS_MISS
METRICS
@@ -18,17 +18,17 @@ L2 miss ratio PMC1/PMC0
LONG
Formulas:
-L2 request rate = L2_DATA_RQSTS_DEMAND_ANY / INSTR_RETIRED_ANY
-L2 miss rate = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_DATA_RQSTS_DEMAND_MESI
+L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES
-
This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
The L2 miss rate gives a measure of how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
diff --git a/groups/nehalem/L3.txt b/groups/nehalem/L3.txt
index 446afee..70b5f29 100644
--- a/groups/nehalem/L3.txt
+++ b/groups/nehalem/L3.txt
@@ -12,20 +12,24 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L3 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
LONG
Formulas:
-L3 Load [MBytes/s] 1.0E-06*L2_LINES_IN_ANY*64/time
-L3 Evict [MBytes/s] 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ANY*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ANY*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64
-
Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L2 and the number of modified cachelines
+number of cache lines allocated in the L2 and the number of modified cache lines
evicted from the L2. Also reports total data volume between L3 and L2 caches.
Note that this bandwidth also includes data transfers due to a write allocate
load on a store miss in L2.
diff --git a/groups/nehalem/L3CACHE.txt b/groups/nehalem/L3CACHE.txt
index b6ec110..15e00ed 100644
--- a/groups/nehalem/L3CACHE.txt
+++ b/groups/nehalem/L3CACHE.txt
@@ -1,36 +1,34 @@
SHORT L3 cache miss rate/ratio
EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
UPMC0 UNC_L3_HITS_ANY
UPMC1 UNC_L3_MISS_ANY
-UPMC2 UNC_L3_LINES_IN_ANY
-UPMC3 UNC_L3_LINES_OUT_ANY
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L3 request rate UPMC0/FIXC0
+L3 request rate (UPMC0+UPMC1)/FIXC0
L3 miss rate UPMC1/FIXC0
-L3 miss ratio UPMC1/UPMC0
+L3 miss ratio UPMC1/(UPMC0+UPMC1)
LONG
Formulas:
-L3 request rate UNC_L3_HITS_ANY / INSTR_RETIRED_ANY
-L3 miss rate UNC_L3_MISS_ANY / INSTR_RETIRED_ANY
-L3 miss ratio UNC_L3_MISS_ANY / UNC_L3_HITS_ANY
+L3 request rate = (UNC_L3_HITS_ANY+UNC_L3_MISS_ANY)/INSTR_RETIRED_ANY
+L3 miss rate = UNC_L3_MISS_ANY/INSTR_RETIRED_ANY
+L3 miss ratio = UNC_L3_MISS_ANY/(UNC_L3_HITS_ANY+UNC_L3_MISS_ANY)
-
This group measures the locality of your data accesses with regard to the
-L3 Cache. L3 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
The L3 miss rate gives a measure of how often it was necessary to get
-cachelines from memory. And finally L3 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
diff --git a/groups/nehalem/MEM.txt b/groups/nehalem/MEM.txt
index 087b269..d2083f5 100644
--- a/groups/nehalem/MEM.txt
+++ b/groups/nehalem/MEM.txt
@@ -1,36 +1,49 @@
SHORT Main memory bandwidth in MBytes/s
EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
UPMC0 UNC_QMC_NORMAL_READS_ANY
UPMC1 UNC_QMC_WRITES_FULL_ANY
-UPMC2 UNC_QHL_REQUESTS_REMOTE_READS
-UPMC3 UNC_QHL_REQUESTS_LOCAL_READS
-UPMC4 UNC_QHL_REQUESTS_REMOTE_WRITES
+UPMC2 UNC_QHL_REQUESTS_REMOTE_READS
+UPMC3 UNC_QHL_REQUESTS_REMOTE_WRITES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64/time
-Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64
-Remote Read BW [MBytes/s] 1.0E-06*(UPMC2)*64/time
-Remote Write BW [MBytes/s] 1.0E-06*(UPMC4)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UPMC2+UPMC4)*64/time
+Memory read bandwidth [MBytes/s] 1.0E-06*UPMC0*64.0/time
+Memory read data volume [GBytes] 1.0E-09*UPMC0*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*UPMC1*64.0/time
+Memory write data volume [GBytes] 1.0E-09*UPMC1*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0
+Remote memory read bandwidth [MBytes/s] 1.0E-06*UPMC2*64.0/time
+Remote memory read data volume [GBytes] 1.0E-09*UPMC2*64.0
+Remote memory write bandwidth [MBytes/s] 1.0E-06*UPMC3*64.0/time
+Remote memory write data volume [GBytes] 1.0E-09*UPMC3*64.0
+Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time
+Remote memory data volume [GBytes] 1.0E-09*(UPMC2+UPMC3)*64.0
LONG
Formulas:
-Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64/time
-Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64
-Remote Read BW [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS)*64/time;
-Remote Write BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
+Memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_NORMAL_READS_ANY*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*UNC_QMC_NORMAL_READS_ANY*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_WRITES_FULL_ANY*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*UNC_QMC_WRITES_FULL_ANY*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0
+Remote memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_READS*64.0/time
+Remote memory read data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_READS*64.0
+Remote memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0/time
+Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0
+Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time
+Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0
-
Profiling group to measure memory bandwidth drawn by all cores of a socket.
-This group will be measured by one core per socket. The Remote Read BW tells
-you if cachelines are transfered between sockets, meaning that cores access
+This group will be measured by one core per socket. The remote memory read bandwidth tells
+you if cache lines are transferred between sockets, meaning that cores access
data owned by a remote NUMA domain.
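The socket-level bandwidths are again cache-line counts times 64 bytes per line over
the runtime, with the QHL remote counters separated out. A C sketch with assumed
uncore counts (not measured data):

    #include <stdio.h>

    int main(void)
    {
        const double line = 64.0;   /* bytes per cache line */

        /* Assumed uncore counts for one socket, illustrative only. */
        double qmc_reads     = 6.0e8;   /* UNC_QMC_NORMAL_READS_ANY       */
        double qmc_writes    = 2.0e8;   /* UNC_QMC_WRITES_FULL_ANY        */
        double remote_reads  = 1.0e7;   /* UNC_QHL_REQUESTS_REMOTE_READS  */
        double remote_writes = 5.0e6;   /* UNC_QHL_REQUESTS_REMOTE_WRITES */
        double time          = 2.0;     /* seconds */

        printf("Memory read bandwidth:   %.1f MBytes/s\n",
               1.0e-06 * qmc_reads * line / time);
        printf("Memory write bandwidth:  %.1f MBytes/s\n",
               1.0e-06 * qmc_writes * line / time);
        printf("Memory bandwidth:        %.1f MBytes/s\n",
               1.0e-06 * (qmc_reads + qmc_writes) * line / time);
        /* Non-zero remote traffic hints at accesses to the other NUMA domain. */
        printf("Remote memory bandwidth: %.1f MBytes/s\n",
               1.0e-06 * (remote_reads + remote_writes) * line / time);
        return 0;
    }
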
diff --git a/groups/nehalem/SCHEDULER.txt b/groups/nehalem/SCHEDULER.txt
index a7bbe37..0e43cce 100644
--- a/groups/nehalem/SCHEDULER.txt
+++ b/groups/nehalem/SCHEDULER.txt
@@ -13,9 +13,13 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-RATIO Port 1 PMC1/PMC0
-RATIO Port 5 PMC2/PMC0
+Ratio Port 1 PMC1/PMC0
+Ratio Port 5 PMC2/PMC0
LONG
+Formulas:
+Ratio Port 1 = UOPS_EXECUTED_PORT1/UOPS_EXECUTED_PORT0
+Ratio Port 5 = UOPS_EXECUTED_PORT5/UOPS_EXECUTED_PORT0
+-
Measures how many instructions were scheduled on which issue port.
diff --git a/groups/nehalem/TLB.txt b/groups/nehalem/TLB.txt
index 5f93d66..c380851 100644
--- a/groups/nehalem/TLB.txt
+++ b/groups/nehalem/TLB.txt
@@ -22,9 +22,9 @@ L1 DTLB request rate = L1D_ALL_REF_ANY / INSTR_RETIRED_ANY
DTLB miss rate = DTLB_MISSES_ANY / INSTR_RETIRED_ANY
L1 DTLB miss ratio = DTLB_MISSES_ANY / L1D_ALL_REF_ANY
-
-L1 DTLB request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The DTLB miss rate gives a measure how often a TLB miss occured
-per instruction. And finally L1 DTLB miss ratio tells you how many
-of your memory references required caused a TLB miss in average.
+L1 DTLB request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The DTLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. And finally L1 DTLB miss ratio tells you how many
+of your memory references caused a TLB miss on average.
diff --git a/groups/nehalem/VIEW.txt b/groups/nehalem/VIEW.txt
deleted file mode 100644
index 98a856f..0000000
--- a/groups/nehalem/VIEW.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-SHORT Main memory bandwidth in MBytes/s
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED
-PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR
-PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
-PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
-UPMC0 UNC_QMC_NORMAL_READS_ANY
-UPMC1 UNC_QMC_WRITES_FULL_ANY
-UPMC2 UNC_QHL_REQUESTS_REMOTE_READS
-UPMC3 UNC_QHL_REQUESTS_LOCAL_READS
-UPMC4 UNC_QHL_REQUESTS_REMOTE_WRITES
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC1/FIXC0
-DP MFlops/s (DP assumed) 1.0E-06*(PMC0*2.0+PMC1)/time
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
-Packed MUOPS/s 1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
-Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64/time
-Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64
-Remote Read BW [MBytes/s] 1.0E-06*(UPMC2)*64/time
-Remote Write BW [MBytes/s] 1.0E-06*(UPMC4)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UPMC2+UPMC4)*64/time
-
-LONG
-Formulas:
-DP MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-SP MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-Packed MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/time
-Scalar MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/time
-SP MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/time
-DP MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/time
-Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64/time
-Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64
-Remote Read BW [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS)*64/time;
-Remote Write BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
--
-This is a overview group using the capabilities of nehalem to measure multiple events at
-the same time.
-
diff --git a/groups/nehalemEX/BRANCH.txt b/groups/nehalemEX/BRANCH.txt
index 3d81416..1ef9f11 100644
--- a/groups/nehalemEX/BRANCH.txt
+++ b/groups/nehalemEX/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch FIXC0/PMC0
LONG
Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
-
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
into relation what ratio of all branch instructions were mispredicted.
-Instructions per branch is 1/Branch rate.
+Instructions per branch is 1/branch rate.
diff --git a/groups/nehalemEX/CACHE.txt b/groups/nehalemEX/CACHE.txt
index c3e989c..6603171 100644
--- a/groups/nehalemEX/CACHE.txt
+++ b/groups/nehalemEX/CACHE.txt
@@ -12,24 +12,25 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Data cache misses PMC0
-Data cache request rate PMC1/FIXC0
-Data cache miss rate PMC0/FIXC0
-Data cache miss ratio PMC0/PMC1
+data cache misses PMC0
+data cache request rate PMC1/FIXC0
+data cache miss rate PMC0/FIXC0
+data cache miss ratio PMC0/PMC1
LONG
Formulas:
-Data cache request rate = L1D_ALL_REF_ANY / INSTR_RETIRED_ANY
-Data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
-Data cache miss ratio = L1D_REPL / L1D_ALL_REF_ANY
+data cache misses = L1D_REPL
+data cache request rate = L1D_ALL_REF_ANY / INSTR_RETIRED_ANY
+data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
+data cache miss ratio = L1D_REPL / L1D_ALL_REF_ANY
-
This group measures the locality of your data accesses with regard to the
-L1 Cache. Data cache request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The Data cache miss rate gives a measure how often it was necessary to get
-cachelines from higher levels of the memory hierarchy. And finally
-Data cache miss ratio tells you how many of your memory references required
-a cacheline to be loaded from a higher level. While the Data cache miss rate
-might be given by your algorithm you should try to get Data cache miss ratio
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure of how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm you should try to get data cache miss ratio
as low as possible by increasing your cache reuse.
diff --git a/groups/nehalemEX/DATA.txt b/groups/nehalemEX/DATA.txt
index a5611bc..31bba51 100644
--- a/groups/nehalemEX/DATA.txt
+++ b/groups/nehalemEX/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
LONG
Formulas:
-Load to Store ratio = MEM_INST_RETIRED_LOADS / MEM_INST_RETIRED_STORES
+Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES
-
-This is a simple metric to determine your Load to store ratio.
+This is a simple metric to determine your load to store ratio.
diff --git a/groups/nehalemEX/FLOPS_DP.txt b/groups/nehalemEX/FLOPS_DP.txt
index c5ba91c..3e75cad 100644
--- a/groups/nehalemEX/FLOPS_DP.txt
+++ b/groups/nehalemEX/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-DP MFlops/s 1.0E-06*(PMC0*2.0+PMC1)/time
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1)/time
Packed MUOPS/s 1.0E-06*PMC0/time
Scalar MUOPS/s 1.0E-06*PMC1/time
SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,10 +22,14 @@ DP MUOPS/s 1.0E-06*PMC3/time
LONG
Formula:
-DP MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
-
-The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
-Therefore both Single as well as Double precision are measured to ensure the correctness
+The Nehalem provides no way to measure MFLOP/s if mixed precision calculations are done.
+Therefore both single and double precision are measured to ensure the correctness
of the measurements. You can check whether your code was vectorized by comparing
-FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
+FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
diff --git a/groups/nehalemEX/FLOPS_SP.txt b/groups/nehalemEX/FLOPS_SP.txt
index 4478c8f..9768109 100644
--- a/groups/nehalemEX/FLOPS_SP.txt
+++ b/groups/nehalemEX/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1)/time
Packed MUOPS/s 1.0E-06*PMC0/time
Scalar MUOPS/s 1.0E-06*PMC1/time
SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,10 +22,14 @@ DP MUOPS/s 1.0E-06*PMC3/time
LONG
Formula:
-SP MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
-
-The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
-Therefore both Single as well as Double precision are measured to ensure the correctness
+The Nehalem provides no way to measure MFLOP/s if mixed precision calculations are done.
+Therefore both single and double precision are measured to ensure the correctness
of the measurements. You can check whether your code was vectorized by comparing
-FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
+FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
diff --git a/groups/nehalemEX/FLOPS_X87.txt b/groups/nehalemEX/FLOPS_X87.txt
index 6447b93..a4176f0 100644
--- a/groups/nehalemEX/FLOPS_X87.txt
+++ b/groups/nehalemEX/FLOPS_X87.txt
@@ -1,4 +1,4 @@
-SHORT X87 MFlops/s
+SHORT X87 MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -11,8 +11,8 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-X87 MFlops/s 1.0E-06*PMC0/time
+X87 MFLOP/s 1.0E-06*PMC0/time
LONG
-Profiling group to measure X87 flop rate.
+Profiling group to measure X87 FLOP rate.
diff --git a/groups/nehalemEX/ICACHE.txt b/groups/nehalemEX/ICACHE.txt
new file mode 100644
index 0000000..49943ff
--- /dev/null
+++ b/groups/nehalemEX/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1I_READS
+PMC1 L1I_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = L1I_READS / INSTR_RETIRED_ANY
+L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = L1I_MISSES / L1I_READS
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/nehalemEX/L2.txt b/groups/nehalemEX/L2.txt
index 2734c5d..e2715cc 100644
--- a/groups/nehalemEX/L2.txt
+++ b/groups/nehalemEX/L2.txt
@@ -6,28 +6,35 @@ FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 L1D_REPL
PMC1 L1D_M_EVICT
+PMC2 L1I_MISSES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
LONG
Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPL*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64
-
Profiling group to measure L2 cache bandwidth. The bandwidth is
-computed by the number of cacheline allocated in the L1 and the
-number of modified cachelines evicted from the L1. Also reports on
-total data volume transfered between L2 and L1 cache.
+computed by the number of cache lines allocated in the L1 and the
+number of modified cache lines evicted from the L1. Also reports on
+total data volume transferred between L2 and L1 cache.
Note that this bandwidth also includes data transfers due to a
-write allocate load on a store miss in L1.
+write allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
+
diff --git a/groups/nehalemEX/L2CACHE.txt b/groups/nehalemEX/L2CACHE.txt
index 49778be..343b263 100644
--- a/groups/nehalemEX/L2CACHE.txt
+++ b/groups/nehalemEX/L2CACHE.txt
@@ -4,7 +4,7 @@ EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
-PMC0 L2_DATA_RQSTS_DEMAND_ANY
+PMC0 L2_RQSTS_REFERENCES
PMC1 L2_RQSTS_MISS
METRICS
@@ -18,18 +18,17 @@ L2 miss ratio PMC1/PMC0
LONG
Formulas:
-L2 request rate = L2_DATA_RQSTS_DEMAND_ANY / INSTR_RETIRED_ANY
-L2 miss rate = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_DATA_RQSTS_DEMAND_ANY
+L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES
-
This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
The L2 miss rate gives a measure of how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
diff --git a/groups/nehalemEX/L3.txt b/groups/nehalemEX/L3.txt
new file mode 100644
index 0000000..51a0811
--- /dev/null
+++ b/groups/nehalemEX/L3.txt
@@ -0,0 +1,37 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_LINES_IN_ANY
+PMC1 L2_LINES_OUT_DEMAND_DIRTY
+PMC2 L2_LINES_OUT_PREFETCH_DIRTY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC2)*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*(PMC1+PMC2)*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ANY*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ANY*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*(L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. Also reports total data volume between L3 and L2 caches.
+Note that this bandwidth also includes data transfers due to a write allocate
+load on a store miss in L2.
+
diff --git a/groups/nehalemEX/L3CACHE.txt b/groups/nehalemEX/L3CACHE.txt
new file mode 100644
index 0000000..c6b204e
--- /dev/null
+++ b/groups/nehalemEX/L3CACHE.txt
@@ -0,0 +1,48 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+CBOX0C0 LLC_HITS_ALL
+CBOX0C1 LLC_MISSES_ALL
+CBOX1C0 LLC_HITS_ALL
+CBOX1C1 LLC_MISSES_ALL
+CBOX2C0 LLC_HITS_ALL
+CBOX2C1 LLC_MISSES_ALL
+CBOX3C0 LLC_HITS_ALL
+CBOX3C1 LLC_MISSES_ALL
+CBOX4C0 LLC_HITS_ALL
+CBOX4C1 LLC_MISSES_ALL
+CBOX5C0 LLC_HITS_ALL
+CBOX5C1 LLC_MISSES_ALL
+CBOX6C0 LLC_HITS_ALL
+CBOX6C1 LLC_MISSES_ALL
+CBOX7C0 LLC_HITS_ALL
+CBOX7C1 LLC_MISSES_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 request rate (CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1)/FIXC0
+L3 miss rate (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)/FIXC0
+L3 miss ratio (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)/(CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1)
+
+LONG
+Formulas:
+L3 request rate = (SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL))/INSTR_RETIRED_ANY
+L3 miss rate = SUM(LLC_MISSES_ALL)/INSTR_RETIRED_ANY
+L3 miss ratio = SUM(LLC_MISSES_ALL)/(SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL))
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
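The SUM(...) notation in the formulas means the per-CBOX counts are added up before
the ratios are formed. A C sketch with assumed per-box counts (illustrative only):

    #include <stdio.h>

    #define NUM_CBOXES 8

    int main(void)
    {
        /* Assumed per-CBOX counts: CBOXnC0 = LLC_HITS_ALL, CBOXnC1 = LLC_MISSES_ALL. */
        double hits[NUM_CBOXES]   = { 1e7, 2e7, 1e7, 3e7, 2e7, 1e7, 2e7, 1e7 };
        double misses[NUM_CBOXES] = { 1e6, 2e6, 1e6, 1e6, 2e6, 1e6, 1e6, 1e6 };
        double instr_retired = 1.0e9;   /* FIXC0: INSTR_RETIRED_ANY */

        double sum_hits = 0.0, sum_misses = 0.0;
        for (int i = 0; i < NUM_CBOXES; i++) {
            sum_hits   += hits[i];
            sum_misses += misses[i];
        }

        /* SUM(...) in the formulas above is exactly these per-box sums. */
        printf("L3 request rate: %g\n", (sum_hits + sum_misses) / instr_retired);
        printf("L3 miss rate:    %g\n", sum_misses / instr_retired);
        printf("L3 miss ratio:   %g\n", sum_misses / (sum_hits + sum_misses));
        return 0;
    }
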
diff --git a/groups/nehalemEX/MEM.txt b/groups/nehalemEX/MEM.txt
index 86a2e97..510f27b 100644
--- a/groups/nehalemEX/MEM.txt
+++ b/groups/nehalemEX/MEM.txt
@@ -1,39 +1,42 @@
SHORT Main memory bandwidth
EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-WBOX4 UNCORE_CYCLES
-MBOX0C0 FVC_EV0_BBOX_CMDS_READS
-MBOX0C1 FVC_EV0_BBOX_RSP_ACK
-MBOX1C0 FVC_EV0_BBOX_CMDS_READS
-MBOX1C1 FVC_EV0_BBOX_RSP_ACK
-BBOX0C1 IMT_INSERTS_WR
-BBOX1C1 IMT_INSERTS_WR
-RBOX0C0 NEW_PACKETS_RECV_PORT0_IPERF0_ANY_DRS
-RBOX0C1 NEW_PACKETS_RECV_PORT1_IPERF0_ANY_DRS
-RBOX1C0 NEW_PACKETS_RECV_PORT4_IPERF0_ANY_DRS
-RBOX1C1 NEW_PACKETS_RECV_PORT5_IPERF0_ANY_DRS
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+WBOXFIX UNCORE_CLOCKTICKS
+MBOX0C0 FVC_EV0_BBOX_CMDS_READS
+MBOX0C1 DRAM_CMD_CAS_WR_OPN
+MBOX1C0 FVC_EV0_BBOX_CMDS_READS
+MBOX1C1 DRAM_CMD_CAS_WR_OPN
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-Uncore Clock [MHz] 1.E-06*(WBOX4)/time
+Uncore Clock [MHz] 1.E-06*(WBOXFIX)/time
CPI FIXC1/FIXC0
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64/time
-Memory Write BW [MBytes/s] 1.0E-06*(BBOX0C1+BBOX1C1)*64/time
-Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+BBOX0C1+BBOX1C1)*64/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+BBOX0C1+BBOX1C1)*64
-Remote write data traffic Port 0 [MBytes/s] 1.0E-06*(RBOX0C0)*64/time
-Remote write data traffic Port 1 [MBytes/s] 1.0E-06*(RBOX0C1)*64/time
-Remote write data traffic Port 4 [MBytes/s] 1.0E-06*(RBOX1C0)*64/time
-Remote write data traffic Port 5 [MBytes/s] 1.0E-06*(RBOX1C1)*64/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64
LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+-
Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Addional to the bandwidth it also outputs the data volume and the remote
-traffic over QPI links to other sockets.
+On Nehalem EX it is not possible to measure the write operations with the
+FVC_EV0_BBOX_CMDS_WRITES event at the same time as the FVC_EV0_BBOX_CMDS_READS
+because they set contrary bits. The DRAM_CMD_CAS_WR_OPN is an alternative but
+it only measures write operations to open pages, hence writes to closed pages
+are not included here.
diff --git a/groups/nehalemEX/SCHEDULER.txt b/groups/nehalemEX/SCHEDULER.txt
index a7bbe37..237fcb8 100644
--- a/groups/nehalemEX/SCHEDULER.txt
+++ b/groups/nehalemEX/SCHEDULER.txt
@@ -13,9 +13,13 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-RATIO Port 1 PMC1/PMC0
-RATIO Port 5 PMC2/PMC0
+Ratio Port 1 PMC1/PMC0
+Ratio Port 5 PMC2/PMC0
LONG
+Formulas:
+Ratio Port 1 = UOPS_EXECUTED_PORT1/UOPS_EXECUTED_PORT0
+Ratio Port 5 = UOPS_EXECUTED_PORT5/UOPS_EXECUTED_PORT0
+-
Measures how many instructions were scheduled on which issue port.
diff --git a/groups/nehalemEX/TLB.txt b/groups/nehalemEX/TLB.txt
index 5f93d66..0e358b8 100644
--- a/groups/nehalemEX/TLB.txt
+++ b/groups/nehalemEX/TLB.txt
@@ -22,9 +22,9 @@ L1 DTLB request rate = L1D_ALL_REF_ANY / INSTR_RETIRED_ANY
DTLB miss rate = DTLB_MISSES_ANY / INSTR_RETIRED_ANY
L1 DTLB miss ratio = DTLB_MISSES_ANY / L1D_ALL_REF_ANY
-
-L1 DTLB request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The DTLB miss rate gives a measure how often a TLB miss occured
+L1 DTLB request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The DTLB miss rate gives a measure of how often a TLB miss occurred
per instruction. And finally L1 DTLB miss ratio tells you how many
-of your memory references required caused a TLB miss in average.
+of your memory references caused a TLB miss on average.
diff --git a/groups/pentiumm/BRANCH.txt b/groups/pentiumm/BRANCH.txt
new file mode 100644
index 0000000..157c331
--- /dev/null
+++ b/groups/pentiumm/BRANCH.txt
@@ -0,0 +1,17 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+PMC0 BR_INST_EXEC
+PMC1 BR_INST_MISSP_EXEC
+
+METRICS
+Runtime (RDTSC) [s] time
+Branch misprediction ratio PMC1/PMC0
+
+LONG
+Formulas:
+Branch misprediction ratio = BR_INST_MISSP_EXEC / BR_INST_EXEC
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
diff --git a/groups/pentiumm/CPI.txt b/groups/pentiumm/CPI.txt
new file mode 100644
index 0000000..1df7ff8
--- /dev/null
+++ b/groups/pentiumm/CPI.txt
@@ -0,0 +1,22 @@
+SHORT Cycles per instruction
+
+EVENTSET
+PMC0 UOPS_RETIRED
+PMC1 CPU_CLK_UNHALTED
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC1/PMC0
+IPC PMC0/PMC1
+
+LONG
+Formulas:
+CPI = CPU_CLK_UNHALTED/UOPS_RETIRED
+IPC = UOPS_RETIRED/CPU_CLK_UNHALTED
+-
+This group measures how efficiently the processor works with
+regard to instruction throughput. Also important as a standalone
+metric is UOPS_RETIRED as it tells you how many uops
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more instructions for it.
+
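CPI and IPC are reciprocals, and the warning about retiring more uops is easiest to
see with numbers. A C sketch with invented counts for two hypothetical versions of
the same task:

    #include <stdio.h>

    int main(void)
    {
        /* Invented counts for two versions of the same task. */
        double uops_a = 1.0e9, cycles_a = 1.5e9;   /* baseline    */
        double uops_b = 1.6e9, cycles_b = 1.6e9;   /* "optimized" */

        printf("A: CPI = %.2f, IPC = %.2f, cycles = %.2e\n",
               cycles_a / uops_a, uops_a / cycles_a, cycles_a);
        printf("B: CPI = %.2f, IPC = %.2f, cycles = %.2e\n",
               cycles_b / uops_b, uops_b / cycles_b, cycles_b);
        /* B has the better CPI but retires more uops and needs more cycles
         * overall, which is why UOPS_RETIRED matters as a standalone metric. */
        return 0;
    }
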
diff --git a/groups/pentiumm/FLOPS_DP.txt b/groups/pentiumm/FLOPS_DP.txt
new file mode 100644
index 0000000..976c44c
--- /dev/null
+++ b/groups/pentiumm/FLOPS_DP.txt
@@ -0,0 +1,20 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+PMC0 EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP
+PMC1 EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP
+
+METRICS
+Runtime (RDTSC) [s] time
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1)/time
+Packed MUOPS/s 1.0E-06*(PMC0)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP*2+EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP)/runtime
+Packed MUOPS/s = 1.0E-06*(EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP)/time
+Scalar MUOPS/s = 1.0E-06*EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP/time
+-
+SSE scalar and packed double precision FLOP rates.
+
diff --git a/groups/pentiumm/FLOPS_SP.txt b/groups/pentiumm/FLOPS_SP.txt
new file mode 100644
index 0000000..83b73f2
--- /dev/null
+++ b/groups/pentiumm/FLOPS_SP.txt
@@ -0,0 +1,18 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+PMC0 EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP
+PMC1 EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP
+
+METRICS
+Runtime (RDTSC) [s] time
+MFLOP/s 1.0E-06*(PMC0)/time
+Scalar MUOPS/s 1.0E-06*(PMC1)/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP/runtime
+Scalar MUOPS/s = 1.0E-06*EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP/runtime
+-
+SSE scalar and packed single precision FLOP rates.
+
diff --git a/groups/pentiumm/L3.txt b/groups/pentiumm/L3.txt
new file mode 100644
index 0000000..2ed5293
--- /dev/null
+++ b/groups/pentiumm/L3.txt
@@ -0,0 +1,30 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0 L2_LINES_IN_ALL_ALL
+PMC1 L2_LINES_OUT_ALL_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_ALL_ALL*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_ALL_ALL*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL_ALL+L2_LINES_OUT_ALL_ALL)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL_ALL+L2_LINES_OUT_ALL_ALL)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. The group also outputs the total transferred data volume.
+Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L2.
+
diff --git a/groups/phi/CACHE.txt b/groups/phi/CACHE.txt
index d611965..01ac5e4 100644
--- a/groups/phi/CACHE.txt
+++ b/groups/phi/CACHE.txt
@@ -1,4 +1,4 @@
-SHORT Compute to Data Access Ratio
+SHORT L1 compute to data access ratio
EVENTSET
PMC0 VPU_ELEMENTS_ACTIVE
@@ -8,12 +8,15 @@ METRICS
Runtime (RDTSC) [s] time
L1 compute intensity PMC0/PMC1
-LONG
+LONG
+Formulas:
+L1 compute intensity = VPU_ELEMENTS_ACTIVE/DATA_READ_OR_WRITE
+-
This metric is a way to measure the computational density of an
application, or how many computations it is performing on average for each
-piece of data loaded. L1 Compute to Data Access Ratio, should be
+piece of data loaded. L1 compute to data access ratio should be
used to judge suitability of an application for running on the Intel MIC
-Architecture. Applications that will perform well on the Intel� MIC
-Architecture should be vectorized, and ideally be able to perform multiple
-operations on the same pieces of data (or same cachelines).
+architecture. Applications that will perform well on the Intel MIC
+architecture should be vectorized, and ideally be able to perform multiple
+operations on the same pieces of data (or same cache lines).
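The L1 compute intensity is the number of active VPU elements per L1 data access. A
C sketch with invented counts (not measured data):

    #include <stdio.h>

    int main(void)
    {
        /* Invented counts for illustration. */
        double vpu_elements = 8.0e9;   /* VPU_ELEMENTS_ACTIVE */
        double data_refs    = 1.0e9;   /* DATA_READ_OR_WRITE  */

        /* L1 compute intensity: vector element operations per L1 data access.
         * Well-vectorized code that reuses its cache lines pushes this value up. */
        double intensity = vpu_elements / data_refs;
        printf("L1 compute intensity: %.2f\n", intensity);
        return 0;
    }
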
diff --git a/groups/phi/COMPUTE_TO_DATA_RATIO.txt b/groups/phi/COMPUTE_TO_DATA_RATIO.txt
new file mode 100644
index 0000000..6fdd008
--- /dev/null
+++ b/groups/phi/COMPUTE_TO_DATA_RATIO.txt
@@ -0,0 +1,22 @@
+SHORT L2 compute to data access ratio
+
+EVENTSET
+PMC0 VPU_ELEMENTS_ACTIVE
+PMC1 DATA_READ_MISS_OR_WRITE_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+L2 compute intensity PMC0/PMC1
+
+LONG
+Formulas:
+L2 compute intensity = VPU_ELEMENTS_ACTIVE/DATA_READ_MISS_OR_WRITE_MISS
+-
+This metric is a way to measure the computational density of an
+application, or how many computations it is performing on average for each
+piece of data loaded. L2 compute to data access ratio should be
+used to judge suitability of an application for running on the Intel MIC
+architecture. Applications that will perform well on the Intel MIC
+architecture should be vectorized, and ideally be able to perform multiple
+operations on the same pieces of data (or same cache lines).
+
diff --git a/groups/phi/CPI.txt b/groups/phi/CPI.txt
index 8d4cf36..f3d8b4e 100644
--- a/groups/phi/CPI.txt
+++ b/groups/phi/CPI.txt
@@ -11,6 +11,10 @@ CPI PMC1/PMC0
IPC PMC0/PMC1
LONG
+Formulas:
+CPI = CPU_CLK_UNHALTED/INSTRUCTIONS_EXECUTED
+IPC = INSTRUCTIONS_EXECUTED/CPU_CLK_UNHALTED
+-
This group measures how efficiently the processor works with
regard to instruction throughput. Also important as a standalone
metric is INSTRUCTIONS_RETIRED as it tells you how many instruction
diff --git a/groups/phi/L2CACHE.txt b/groups/phi/L2CACHE.txt
deleted file mode 100644
index 228a5ba..0000000
--- a/groups/phi/L2CACHE.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-SHORT L2 Compute to Data Access Ratio
-
-EVENTSET
-PMC0 VPU_ELEMENTS_ACTIVE
-PMC1 DATA_READ_MISS_OR_WRITE_MISS
-
-METRICS
-Runtime (RDTSC) [s] time
-L2 compute intensity PMC0/PMC1
-
-LONG
-These metric is a way to measure the computational density of an
-application, or how many computations it is performing on average for each
-piece of data loaded. L2 Compute to Data Access Ratio, should be
-used to judge suitability of an application for running on the Intel MIC
-Architecture. Applications that will perform well on the Intel� MIC
-Architecture should be vectorized, and ideally be able to perform multiple
-operations on the same pieces of data (or same cachelines).
-
diff --git a/groups/phi/MEM.txt b/groups/phi/MEM.txt
new file mode 100644
index 0000000..8899592
--- /dev/null
+++ b/groups/phi/MEM.txt
@@ -0,0 +1,18 @@
+SHORT Memory bandwidth
+
+EVENTSET
+PMC0 DATA_READ_MISS_OR_WRITE_MISS
+PMC1 DATA_CACHE_LINES_WRITTEN_BACK
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+Memory bandwidth [MBytes/s] = 1.0E-06*(DATA_READ_MISS_OR_WRITE_MISS+DATA_CACHE_LINES_WRITTEN_BACK)*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(DATA_READ_MISS_OR_WRITE_MISS+DATA_CACHE_LINES_WRITTEN_BACK)*64.0
+-
+Total memory bandwidth and data volume.
diff --git a/groups/phi/MEM1.txt b/groups/phi/MEM1.txt
index 16e44e0..c9f7fb6 100644
--- a/groups/phi/MEM1.txt
+++ b/groups/phi/MEM1.txt
@@ -1,13 +1,18 @@
-SHORT L2 Write Misses
+SHORT L2 write misses
EVENTSET
PMC0 L2_DATA_WRITE_MISS_MEM_FILL
METRICS
Runtime (RDTSC) [s] time
-RFO Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-RFO Volume [GBytes] 1.0E-09*PMC0*64.0
+L2 RFO bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 RFO data volume [GBytes] 1.0E-09*PMC0*64.0
LONG
-Bla
+Formulas:
+L2 RFO bandwidth [MBytes/s] = 1.0E-06*L2_DATA_WRITE_MISS_MEM_FILL*64.0/time
+L2 RFO data volume [GBytes] = 1.0E-09*L2_DATA_WRITE_MISS_MEM_FILL*64.0
+-
+Bandwidth and data volume fetched from memory due to an L2 data write miss. These
+fetches are commonly called write-allocate loads or read-for-ownership (RFO).
diff --git a/groups/phi/MEM2.txt b/groups/phi/MEM2.txt
index 9be1f2a..d44a823 100644
--- a/groups/phi/MEM2.txt
+++ b/groups/phi/MEM2.txt
@@ -1,13 +1,17 @@
-SHORT L2 Read Misses
+SHORT L2 read misses
EVENTSET
PMC0 L2_DATA_READ_MISS_MEM_FILL
METRICS
Runtime (RDTSC) [s] time
-Read Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-Read Data Volume [GBytes] 1.0E-09*PMC0*64.0
+L2 read bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 read data volume [GBytes] 1.0E-09*PMC0*64.0
LONG
-Bla
+Formulas:
+L2 read bandwidth [MBytes/s] = 1.0E-06*L2_DATA_READ_MISS_MEM_FILL*64.0/time
+L2 read data volume [GBytes] = 1.0E-09*L2_DATA_READ_MISS_MEM_FILL*64.0
+-
+The data volume and bandwidth caused by read misses in the L2 cache.
diff --git a/groups/phi/MEM3.txt b/groups/phi/MEM3.txt
index 45ce0de..73de570 100644
--- a/groups/phi/MEM3.txt
+++ b/groups/phi/MEM3.txt
@@ -5,9 +5,13 @@ PMC0 HWP_L2MISS
METRICS
Runtime (RDTSC) [s] time
-Prefetch Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-Prefetch Data Volume [GBytes] 1.0E-09*PMC0*64.0
+Prefetch bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+Prefetch data volume [GBytes] 1.0E-09*PMC0*64.0
LONG
-Bla
+Formulas:
+Prefetch bandwidth [MBytes/s] = 1.0E-06*HWP_L2MISS*64.0/time
+Prefetch data volume [GBytes] = 1.0E-09*HWP_L2MISS*64.0
+-
+The bandwidth and data volume caused by L2 misses from the hardware prefetcher.
diff --git a/groups/phi/MEM4.txt b/groups/phi/MEM4.txt
index a861a8b..9e892bd 100644
--- a/groups/phi/MEM4.txt
+++ b/groups/phi/MEM4.txt
@@ -1,13 +1,17 @@
-SHORT L2 Victim requests
+SHORT L2 victim requests
EVENTSET
PMC0 L2_VICTIM_REQ_WITH_DATA
METRICS
Runtime (RDTSC) [s] time
-Victim Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-Victim Data Volume [GBytes] 1.0E-09*PMC0*64.0
+Victim bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+Victim data volume [GBytes] 1.0E-09*PMC0*64.0
LONG
-Bla
+Formulas:
+Victim bandwidth [MBytes/s] = 1.0E-06*L2_VICTIM_REQ_WITH_DATA*64.0/time
+Victim data volume [GBytes] = 1.0E-09*L2_VICTIM_REQ_WITH_DATA*64.0
+-
+Data volume and bandwidth caused by cache line victims.
diff --git a/groups/phi/MEM5.txt b/groups/phi/MEM5.txt
index ade9828..49acb98 100644
--- a/groups/phi/MEM5.txt
+++ b/groups/phi/MEM5.txt
@@ -1,13 +1,19 @@
-SHORT L2 Snoop hits
+SHORT L2 snoop hits
EVENTSET
PMC0 SNP_HITM_L2
METRICS
Runtime (RDTSC) [s] time
-Snoop Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-Snoop Data Volume [GBytes] 1.0E-09*PMC0*64.0
+Snoop bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+Snoop data volume [GBytes] 1.0E-09*PMC0*64.0
LONG
-Bla
+Formulas:
+Snoop bandwidth [MBytes/s] = 1.0E-06*SNP_HITM_L2*64.0/time
+Snoop data volume [GBytes] = 1.0E-09*SNP_HITM_L2*64.0
+-
+Snoop traffic caused by HITM requests. HITM requests are L2 requests that
+are served by another core's L2 cache while the remote cache line is in the
+modified state.
diff --git a/groups/phi/MEM6.txt b/groups/phi/MEM6.txt
index 41be52e..835faf8 100644
--- a/groups/phi/MEM6.txt
+++ b/groups/phi/MEM6.txt
@@ -1,13 +1,17 @@
-SHORT L2 Read Misses
+SHORT L2 read misses
EVENTSET
PMC0 L2_READ_MISS
METRICS
Runtime (RDTSC) [s] time
-L2 Read Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Read Data Volume [GBytes] 1.0E-09*PMC0*64.0
+L2 read bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 read data volume [GBytes] 1.0E-09*PMC0*64.0
LONG
-Bla
+Formulas:
+L2 read bandwidth [MBytes/s] = 1.0E-06*L2_READ_MISS*64.0/time
+L2 read data volume [GBytes] = 1.0E-09*L2_READ_MISS*64.0
+-
+Data volume and bandwidth caused by read misses in the L2 cache.
diff --git a/groups/phi/MEM_READ.txt b/groups/phi/MEM_READ.txt
new file mode 100644
index 0000000..fb107b0
--- /dev/null
+++ b/groups/phi/MEM_READ.txt
@@ -0,0 +1,20 @@
+SHORT Memory read bandwidth
+
+EVENTSET
+PMC0 DATA_READ_MISS
+PMC1 HWP_L2MISS
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Memory read bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(DATA_READ_MISS+HWP_L2MISS)*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(DATA_READ_MISS+HWP_L2MISS)*64.0
+-
+Bandwidth and data volume of read operations from memory to the L2 cache. The
+metric is introduced in the book 'Intel Xeon Phi Coprocessor High-Performance
+Programming' by James Jeffers and James Reinders.
diff --git a/groups/phi/MEM_WRITE.txt b/groups/phi/MEM_WRITE.txt
new file mode 100644
index 0000000..01043fd
--- /dev/null
+++ b/groups/phi/MEM_WRITE.txt
@@ -0,0 +1,20 @@
+SHORT Memory write bandwidth
+
+EVENTSET
+PMC0 L2_VICTIM_REQ_WITH_DATA
+PMC1 SNP_HITM_L2
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Memory write bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+Memory write bandwidth [MBytes/s] = 1.0E-06*(L2_VICTIM_REQ_WITH_DATA+SNP_HITM_L2)*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(L2_VICTIM_REQ_WITH_DATA+SNP_HITM_L2)*64.0
+-
+Bandwidth and data volume of write operations from the L2 cache to memory. The
+metric is introduced in the book 'Intel Xeon Phi Coprocessor High-Performance
+Programming' by James Jeffers and James Reinders.
diff --git a/groups/phi/PAIRING.txt b/groups/phi/PAIRING.txt
index 2e93cc8..ce3627c 100644
--- a/groups/phi/PAIRING.txt
+++ b/groups/phi/PAIRING.txt
@@ -6,8 +6,16 @@ PMC1 INSTRUCTIONS_EXECUTED_V_PIPE
METRICS
Runtime (RDTSC) [s] time
-VPipeRatio PMC1/PMC0
-PairingRatio PMC1/(PMC0-PMC1)
+V-pipe ratio PMC1/PMC0
+Pairing ratio PMC1/(PMC0-PMC1)
LONG
-Pairing ratio
+Formulas:
+V-pipe ratio = INSTRUCTIONS_EXECUTED_V_PIPE/INSTRUCTIONS_EXECUTED
+Pairing ratio = INSTRUCTIONS_EXECUTED_V_PIPE/(INSTRUCTIONS_EXECUTED-INSTRUCTIONS_EXECUTED_V_PIPE)
+-
+Each hardware thread on the Xeon Phi can execute two instructions simultaneously,
+one in the U-pipe and one in the V-pipe, but only if the instructions can be
+paired. Instructions executed in paired fashion are counted by the event
+INSTRUCTIONS_EXECUTED_V_PIPE. The event INSTRUCTIONS_EXECUTED increments for
+each instruction, hence the maximal increase per cycle is 2.
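To make the two ratios above concrete, a tiny sketch with hypothetical counter
values; a pairing ratio of 1 would mean that every U-pipe instruction was paired
with a V-pipe instruction.

    #include <stdio.h>

    int main(void)
    {
        /* hypothetical raw counts for a measured region */
        double executed = 1.0e9;    /* INSTRUCTIONS_EXECUTED        */
        double v_pipe   = 2.5e8;    /* INSTRUCTIONS_EXECUTED_V_PIPE */

        printf("V-pipe ratio  %.2f\n", v_pipe / executed);            /* 0.25 */
        printf("Pairing ratio %.2f\n", v_pipe / (executed - v_pipe)); /* 0.33 */
        return 0;
    }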
diff --git a/groups/phi/READ_MISS_RATIO.txt b/groups/phi/READ_MISS_RATIO.txt
index c98f91b..dbdaad5 100644
--- a/groups/phi/READ_MISS_RATIO.txt
+++ b/groups/phi/READ_MISS_RATIO.txt
@@ -1,4 +1,4 @@
-SHORT Miss ratio for data read
+SHORT Miss ratio for data reads
EVENTSET
PMC0 DATA_READ
@@ -6,7 +6,10 @@ PMC1 DATA_READ_MISS
METRICS
Runtime (RDTSC) [s] time
-Miss ratio PMC1/PMC0
+Read miss ratio PMC1/PMC0
LONG
-Miss ratio for data read
+Formulas:
+Read miss ratio = DATA_READ_MISS/DATA_READ
+--
+Miss ratio for data reads.
diff --git a/groups/phi/TLB.txt b/groups/phi/TLB.txt
new file mode 100644
index 0000000..6f00359
--- /dev/null
+++ b/groups/phi/TLB.txt
@@ -0,0 +1,23 @@
+SHORT TLB Misses
+
+EVENTSET
+PMC0 LONG_DATA_PAGE_WALK
+PMC1 DATA_PAGE_WALK
+
+METRICS
+Runtime (RDTSC) [s] time
+L1 TLB misses [misses/s] PMC1/time
+L2 TLB misses [misses/s] PMC0/time
+L1 TLB misses per L2 TLB miss PMC1/PMC0
+
+LONG
+Formulas:
+L1 TLB misses [misses/s] = DATA_PAGE_WALK/time
+L2 TLB misses [misses/s] = LONG_DATA_PAGE_WALK/time
+L1 TLB misses per L2 TLB miss = DATA_PAGE_WALK/LONG_DATA_PAGE_WALK
+-
+Analysis of the layered TLB of the Intel Xeon Phi. According to the book
+'Intel Xeon Phi Coprocessor High-Performance Programming' by James Jeffers and
+James Reinders, a high L1 TLB misses per L2 TLB miss ratio suggests that your
+working set fits into the L2 TLB but not into the L1 TLB. Using large pages may be
+beneficial.
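One way to act on the large-page suggestion on Linux is an explicit huge-page
mapping; the sketch below assumes huge pages have been reserved beforehand (for
example via /proc/sys/vm/nr_hugepages), and the 1 GiB working-set size is
arbitrary.

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t size = 1UL << 30;   /* 1 GiB working set */

        /* Back the working set with huge pages to reduce TLB pressure. */
        double *a = mmap(NULL, size, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (a == MAP_FAILED) {
            perror("mmap(MAP_HUGETLB)");
            return 1;
        }
        /* ... work on a[] ... */
        munmap(a, size);
        return 0;
    }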
diff --git a/groups/phi/TLB_L1.txt b/groups/phi/TLB_L1.txt
new file mode 100644
index 0000000..d826d04
--- /dev/null
+++ b/groups/phi/TLB_L1.txt
@@ -0,0 +1,23 @@
+SHORT L1 TLB misses
+
+EVENTSET
+PMC0 DATA_PAGE_WALK
+PMC1 DATA_READ_OR_WRITE
+
+METRICS
+Runtime (RDTSC) [s] time
+L1 TLB misses [misses/s] PMC0/time
+L1 TLB miss ratio PMC0/PMC1
+
+LONG
+Formulas:
+L1 TLB misses [misses/s] = DATA_PAGE_WALK/time
+L1 TLB miss ratio = DATA_PAGE_WALK/DATA_READ_OR_WRITE
+-
+This performance group measures the L1 TLB misses. An L1 TLB miss that hits the
+L2 TLB has a penalty of about 25 cycles for 4kB pages. For 2MB pages, the penalty
+for an L1 TLB miss that hits the L2 TLB is about 8 cycles. The minimal L1 TLB miss
+ratio is about 1/64, so a high ratio indicates bad spatial locality: the data of a
+page is only partly accessed. It can also indicate thrashing, because when multiple
+pages are accessed in a loop iteration, the TLB size and associativity are not
+sufficient to hold all pages.
diff --git a/groups/phi/TLB_L2.txt b/groups/phi/TLB_L2.txt
new file mode 100644
index 0000000..9a95125
--- /dev/null
+++ b/groups/phi/TLB_L2.txt
@@ -0,0 +1,21 @@
+SHORT L2 TLB misses
+
+EVENTSET
+PMC0 LONG_DATA_PAGE_WALK
+PMC1 DATA_READ_OR_WRITE
+
+METRICS
+Runtime (RDTSC) [s] time
+L2 TLB misses [misses/s] PMC0/time
+L2 TLB miss ratio PMC0/PMC1
+
+LONG
+Formulas:
+L2 TLB misses [misses/s] = LONG_DATA_PAGE_WALK/time
+L2 TLB miss ratio = LONG_DATA_PAGE_WALK/DATA_READ_OR_WRITE
+-
+This performance group measures the L2 TLB misses. An L2 TLB miss has a penalty
+of at least 100 cycles, hence it is important to avoid them. A high ratio can
+indicate thrashing, because when multiple pages are accessed in a loop iteration,
+the TLB size and associativity are not sufficient to hold all pages. This would
+also result in a bad ratio for the L1 TLB.
diff --git a/groups/phi/VECTOR.txt b/groups/phi/VECTOR.txt
index 1e91bc4..fd2e27f 100644
--- a/groups/phi/VECTOR.txt
+++ b/groups/phi/VECTOR.txt
@@ -1,4 +1,4 @@
-SHORT Vector unit usage
+SHORT Vectorization intensity
EVENTSET
PMC0 VPU_INSTRUCTIONS_EXECUTED
@@ -6,10 +6,16 @@ PMC1 VPU_ELEMENTS_ACTIVE
METRICS
Runtime (RDTSC) [s] time
-Vectorization Intensity PMC1/PMC0
+Vectorization intensity PMC1/PMC0
LONG
+Formula:
+Vectorization intensity = VPU_ELEMENTS_ACTIVE / VPU_INSTRUCTIONS_EXECUTED
+-
Vector instructions include instructions that perform floating-point
operations, instructions that load vector registers from memory and store them
to memory, instructions to manipulate vector mask registers, and other special
purpose instructions such as vector shuffle.
+According to the book 'Intel Xeon Phi Coprocessor High-Performance Programming'
+by James Jeffers and James Reinders, the vectorization intensity should be >=8
+for double precision and >=16 for single precision.
diff --git a/groups/phi/VECTOR2.txt b/groups/phi/VECTOR2.txt
index 487460c..78e6b82 100644
--- a/groups/phi/VECTOR2.txt
+++ b/groups/phi/VECTOR2.txt
@@ -7,11 +7,13 @@ PMC1 VPU_STALL_REG
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] PMC1*inverseClock
+VPU stall ratio [%] 100*(PMC1/PMC0)
LONG
+VPU stall ratio [%] = 100*(VPU_STALL_REG/VPU_INSTRUCTIONS_EXECUTED)
+--
This group measures how efficiently the processor works with
-regard to instruction throughput. Also important as a standalone
-metric is INSTRUCTIONS_RETIRED as it tells you how many instruction
-you need to execute for a task. An optimization might show very
-low CPI values but execute many more instruction for it.
+regard to vectorization instruction throughput. The event VPU_STALL_REG counts
+the VPU stalls due to data dependencies. Dependencies are read-after-write,
+write-after-write and write-after-read.
diff --git a/groups/phi/VPU_FILL_RATIO_DBL.txt b/groups/phi/VPU_FILL_RATIO_DBL.txt
index 50d3835..6e8065c 100644
--- a/groups/phi/VPU_FILL_RATIO_DBL.txt
+++ b/groups/phi/VPU_FILL_RATIO_DBL.txt
@@ -1,4 +1,4 @@
-SHORT VPU filling for Double
+SHORT VPU filling for double precision data
EVENTSET
PMC0 VPU_INSTRUCTIONS_EXECUTED
@@ -6,7 +6,13 @@ PMC1 VPU_ELEMENTS_ACTIVE
METRICS
Runtime (RDTSC) [s] time
-VPUFillRatio PMC0*8/PMC1
+VPU fill ratio PMC0*8/PMC1
LONG
-VPU filling for Double
+Formulas:
+VPU fill ratio = VPU_INSTRUCTIONS_EXECUTED*8/VPU_ELEMENTS_ACTIVE
+--
+This performance group measures the number of vector instructions that are
+performed on each vector loaded to the VPU. It is important to increase the
+ratio to get a high throughput because memory accesses (loading data to the VPU)
+are expensive.
diff --git a/groups/phi/VPU_PAIRING.txt b/groups/phi/VPU_PAIRING.txt
index 998c1d7..024919b 100644
--- a/groups/phi/VPU_PAIRING.txt
+++ b/groups/phi/VPU_PAIRING.txt
@@ -1,4 +1,4 @@
-SHORT VPU Pairing ratio
+SHORT VPU pairing ratio
EVENTSET
PMC0 VPU_INSTRUCTIONS_EXECUTED
@@ -6,8 +6,15 @@ PMC1 VPU_INSTRUCTIONS_EXECUTED_V_PIPE
METRICS
Runtime (RDTSC) [s] time
-VPipeRatio PMC1/PMC0
-PairingRatio PMC1/(PMC0-PMC1)
+V-pipe ratio PMC1/PMC0
+Pairing ratio PMC1/(PMC0-PMC1)
LONG
-VPU Pairing ratio
+Formulas:
+V-pipe ratio = VPU_INSTRUCTIONS_EXECUTED_V_PIPE/VPU_INSTRUCTIONS_EXECUTED
+Pairing ratio = VPU_INSTRUCTIONS_EXECUTED_V_PIPE/(VPU_INSTRUCTIONS_EXECUTED-VPU_INSTRUCTIONS_EXECUTED_V_PIPE)
+--
+This performance group measures the pairing ratio of vector instructions. The
+V-pipe can only execute a subset of all instructions; the main workload is done
+by the U-pipe. A higher throughput can be achieved if the pairing ratio is
+increased.
diff --git a/groups/phi/VPU_READ_MISS_RATIO.txt b/groups/phi/VPU_READ_MISS_RATIO.txt
index 94ec963..502644a 100644
--- a/groups/phi/VPU_READ_MISS_RATIO.txt
+++ b/groups/phi/VPU_READ_MISS_RATIO.txt
@@ -1,4 +1,4 @@
-SHORT Miss ratio for VPU data read
+SHORT Miss ratio for VPU data reads
EVENTSET
PMC0 VPU_DATA_READ
@@ -6,7 +6,11 @@ PMC1 VPU_DATA_READ_MISS
METRICS
Runtime (RDTSC) [s] time
-Miss ratio PMC1/PMC0
+VPU read miss ratio PMC1/PMC0
LONG
-Miss ratio for VPU data read
+Formula:
+VPU read miss ratio = VPU_DATA_READ_MISS/VPU_DATA_READ
+--
+This performance group determines the ratio of data reads issued by the VPU that
+miss the cache to all data reads issued by the VPU.
diff --git a/groups/phi/VPU_WRITE_MISS_RATIO.txt b/groups/phi/VPU_WRITE_MISS_RATIO.txt
index 429ee6d..b098b6f 100644
--- a/groups/phi/VPU_WRITE_MISS_RATIO.txt
+++ b/groups/phi/VPU_WRITE_MISS_RATIO.txt
@@ -1,4 +1,4 @@
-SHORT Miss ratio for VPU data write
+SHORT Miss ratio for VPU data writes
EVENTSET
PMC0 VPU_DATA_WRITE
@@ -6,7 +6,11 @@ PMC1 VPU_DATA_WRITE_MISS
METRICS
Runtime (RDTSC) [s] time
-Miss ratio PMC1/PMC0
+VPU write miss ratio PMC1/PMC0
LONG
-Miss ratio for VPU data write
+Formula:
+VPU write miss ratio = VPU_DATA_WRITE_MISS/VPU_DATA_WRITE
+--
+This performance group determines the ratio of data writes issued by the VPU that
+miss the cache to all data writes issued by the VPU.
diff --git a/groups/phi/WRITE_MISS_RATIO.txt b/groups/phi/WRITE_MISS_RATIO.txt
index 0544b0e..1e92c76 100644
--- a/groups/phi/WRITE_MISS_RATIO.txt
+++ b/groups/phi/WRITE_MISS_RATIO.txt
@@ -1,4 +1,4 @@
-SHORT Miss ratio for data write
+SHORT Miss ratio for data writes
EVENTSET
PMC0 DATA_WRITE
@@ -6,7 +6,10 @@ PMC1 DATA_WRITE_MISS
METRICS
Runtime (RDTSC) [s] time
-Miss ratio PMC1/PMC0
+Write miss ratio PMC1/PMC0
LONG
-Miss ratio for data write
+Formulas:
+Write miss ratio = DATA_WRITE_MISS/DATA_WRITE
+--
+Miss ratio for data writes.
diff --git a/groups/sandybridge/BRANCH.txt b/groups/sandybridge/BRANCH.txt
index cbaf834..b8d41b2 100644
--- a/groups/sandybridge/BRANCH.txt
+++ b/groups/sandybridge/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch FIXC0/PMC0
LONG
Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
-
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
into relation what fraction of all branch instructions were mispredicted.
-Instructions per branch is 1/Branch rate.
+Instructions per branch is 1/branch rate.
diff --git a/groups/sandybridge/CLOCK.txt b/groups/sandybridge/CLOCK.txt
index 0cc92d3..7a5e87d 100644
--- a/groups/sandybridge/CLOCK.txt
+++ b/groups/sandybridge/CLOCK.txt
@@ -8,7 +8,7 @@ PWR0 PWR_PKG_ENERGY
PWR3 PWR_DRAM_ENERGY
METRICS
-Runtime (RDTSC) [s] time
+Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
diff --git a/groups/sandybridge/DATA.txt b/groups/sandybridge/DATA.txt
index 5f04a23..967cbad 100644
--- a/groups/sandybridge/DATA.txt
+++ b/groups/sandybridge/DATA.txt
@@ -4,19 +4,19 @@ EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
-PMC0 MEM_UOP_RETIRED_LOADS
-PMC1 MEM_UOP_RETIRED_STORES
+PMC0 MEM_UOPS_RETIRED_LOADS
+PMC1 MEM_UOPS_RETIRED_STORES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
LONG
Formulas:
-Load to Store ratio = MEM_UOP_RETIRED_LOADS / MEM_UOP_RETIRED_STORES
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
-
This is a metric to determine your load to store ratio.
diff --git a/groups/sandybridge/ENERGY.txt b/groups/sandybridge/ENERGY.txt
index 9261934..2b466c8 100644
--- a/groups/sandybridge/ENERGY.txt
+++ b/groups/sandybridge/ENERGY.txt
@@ -7,10 +7,11 @@ FIXC2 CPU_CLK_UNHALTED_REF
TMP0 TEMP_CORE
PWR0 PWR_PKG_ENERGY
PWR1 PWR_PP0_ENERGY
+PWR2 PWR_PP1_ENERGY
PWR3 PWR_DRAM_ENERGY
METRICS
-Runtime (RDTSC) [s] time
+Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
@@ -18,16 +19,19 @@ Temperature [C] TMP0
Energy [J] PWR0
Power [W] PWR0/time
Energy PP0 [J] PWR1
-Power PP0 [W] PWR1/time
+Power PP0 [W] PWR1/time
+Energy PP1 [J] PWR2
+Power PP1 [W] PWR2/time
Energy DRAM [J] PWR3
-Power DRAM [W] PWR3/time
+Power DRAM [W] PWR3/time
LONG
Formula:
-Power = PWR_PKG_ENERGY / time
+Power = PWR_PKG_ENERGY / time
Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
Power DRAM = PWR_DRAM_ENERGY / time
-
SandyBridge implements the new RAPL interface. This interface makes it possible to
-monitor the consumed energy on the package (socket) and DRAM level.
+monitor the consumed energy on the package (socket) level.
diff --git a/groups/sandybridge/FALSE_SHARE.txt b/groups/sandybridge/FALSE_SHARE.txt
new file mode 100644
index 0000000..a87f7d4
--- /dev/null
+++ b/groups/sandybridge/FALSE_SHARE.txt
@@ -0,0 +1,25 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM
+PMC2 MEM_LOAD_UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory loads as reference.
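A self-contained sketch (not part of this patch) that provokes exactly the kind
of HITM traffic this group counts once the padding is removed; compile with
-pthread and pin the two threads to different cores, for example with likwid-pin.

    #include <pthread.h>
    #include <stdio.h>

    /* Two counters in the same cache line are falsely shared. The padding and
       alignment place each counter in its own 64-byte line; remove them to see
       the LLC false sharing metrics of this group rise. */
    struct counter {
        long value;
        char pad[64 - sizeof(long)];
    } __attribute__((aligned(64)));

    static struct counter counters[2];

    static void *worker(void *arg)
    {
        long id = (long)arg;
        for (long i = 0; i < 100000000L; i++)
            counters[id].value++;
        return NULL;
    }

    int main(void)
    {
        pthread_t t[2];
        for (long id = 0; id < 2; id++)
            pthread_create(&t[id], NULL, worker, (void *)id);
        for (long id = 0; id < 2; id++)
            pthread_join(t[id], NULL);
        printf("%ld %ld\n", counters[0].value, counters[1].value);
        return 0;
    }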
diff --git a/groups/sandybridge/FLOPS_AVX.txt b/groups/sandybridge/FLOPS_AVX.txt
index 6850bca..b4ae4e7 100644
--- a/groups/sandybridge/FLOPS_AVX.txt
+++ b/groups/sandybridge/FLOPS_AVX.txt
@@ -1,4 +1,4 @@
-SHORT Packed AVX MFlops/s
+SHORT Packed AVX MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -12,14 +12,15 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-32b packed SP MFlops/s 1.0E-06*(PMC0*8.0)/time
-32b packed DP MFlops/s 1.0E-06*(PMC1*4.0)/time
+Packed SP MFLOP/s 1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s 1.0E-06*(PMC1*4.0)/time
LONG
Formula:
-32b packed SP MFlops/s = (SIMD_FP_256_PACKED_SINGLE*8)/ runtime
-32b packed DP MFlops/s = (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
+Packed SP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-
-Packed 32b AVX flops rates. Please note that the current flop measurements on SandyBridge are
+Packed 32b AVX FLOP rates.
+Please note that the current FLOP measurements on SandyBridge are
potentially wrong. So you cannot trust these counters at the moment!
diff --git a/groups/sandybridge/FLOPS_DP.txt b/groups/sandybridge/FLOPS_DP.txt
index cda580a..244e5ce 100644
--- a/groups/sandybridge/FLOPS_DP.txt
+++ b/groups/sandybridge/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -13,17 +13,19 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-MFlops/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-32b AVX MFlops/s 1.0E-06*(PMC2*4.0)/time
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*4.0)/time
Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
Scalar MUOPS/s 1.0E-06*PMC1/time
LONG
Formula:
-MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s = (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
-
-SSE scalar and packed double precision flop rates. Please note that the current
-flop measurements on IvyBridge are potentially wrong. So you cannot trust
-these counters at the moment!
+SSE scalar and packed double precision FLOP rates.
+Please note that the current FLOP measurements on SandyBridge are potentially
+wrong. So you cannot trust these counters at the moment!
diff --git a/groups/sandybridge/FLOPS_SP.txt b/groups/sandybridge/FLOPS_SP.txt
index 753ade7..8cd8de2 100644
--- a/groups/sandybridge/FLOPS_SP.txt
+++ b/groups/sandybridge/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -13,17 +13,19 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-MFlops/s 1.0E-06*(PMC0*4.0+PMC1)/time
-AVX MFlops/s 1.0E-06*(PMC2*8.0)/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*8.0)/time
Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
Scalar MUOPS/s 1.0E-06*PMC1/time
LONG
Formula:
-MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s = (SIMD_FP_256_PACKED_SINGLE*8)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
-
-SSE scalar and packed single precision flop rates. Also shows packed AVX 32b
-flop rates. Please note that the current flop measurements on SandyBridge are
-potentially wrong. So you cannot trust these counters at the moment!
+SSE scalar and packed single precision FLOP rates.
+Please note that the current FLOP measurements on SandyBridge are potentially
+wrong. So you cannot trust these counters at the moment!
diff --git a/groups/sandybridge/ICACHE.txt b/groups/sandybridge/ICACHE.txt
new file mode 100644
index 0000000..f1e2335
--- /dev/null
+++ b/groups/sandybridge/ICACHE.txt
@@ -0,0 +1,33 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ICACHE_ACCESSES
+PMC1 ICACHE_MISSES
+PMC2 ICACHE_IFETCH_STALL
+PMC3 ILD_STALL_IQ_FULL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+L1I queue full stalls PMC3
+L1I queue full stall rate PMC3/FIXC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I stalls = ICACHE_IFETCH_STALL
+L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/sandybridge/L2.txt b/groups/sandybridge/L2.txt
index 5345b7a..1feb44c 100644
--- a/groups/sandybridge/L2.txt
+++ b/groups/sandybridge/L2.txt
@@ -6,27 +6,33 @@ FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 L1D_REPLACEMENT
PMC1 L1D_M_EVICT
+PMC2 ICACHE_MISSES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
LONG
Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64
-
Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L1 and the number of modified cachelines
-evicted from the L1. The group also output total data volume transfered between
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. The group also outputs the total data volume transferred between
L2 and L1. Note that this bandwidth also includes data transfers due to a write
-allocate load on a store miss in L1.
+allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
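The metric definitions above reduce to simple cache-line arithmetic; with
made-up counter values the computation looks as follows (64 bytes per cache
line, as in all of these groups).

    #include <stdio.h>

    int main(void)
    {
        /* hypothetical raw counts and runtime for one measurement */
        double l1d_replacement = 4.0e8;   /* cache lines loaded into L1D  */
        double l1d_m_evict     = 1.5e8;   /* modified lines evicted to L2 */
        double icache_misses   = 1.0e6;   /* lines loaded into L1I        */
        double time            = 2.0;     /* seconds                      */

        double bytes = (l1d_replacement + l1d_m_evict + icache_misses) * 64.0;
        printf("L2 bandwidth   %.1f MBytes/s\n", 1.0e-6 * bytes / time);
        printf("L2 data volume %.2f GBytes\n",   1.0e-9 * bytes);
        return 0;
    }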
diff --git a/groups/sandybridge/L2CACHE.txt b/groups/sandybridge/L2CACHE.txt
index 3d7c36e..fbc3745 100644
--- a/groups/sandybridge/L2CACHE.txt
+++ b/groups/sandybridge/L2CACHE.txt
@@ -18,18 +18,17 @@ L2 miss ratio PMC1/PMC0
LONG
Formulas:
-L2 request rate = L2_TRANS_ALL_REQUESTS / INSTR_RETIRED_ANY
-L2 miss rate = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_TRANS_ALL_REQUESTS
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
-
This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
The L2 miss rate gives a measure of how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
diff --git a/groups/sandybridge/L3.txt b/groups/sandybridge/L3.txt
index 9a7c914..f63a918 100644
--- a/groups/sandybridge/L3.txt
+++ b/groups/sandybridge/L3.txt
@@ -5,28 +5,32 @@ FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 L2_LINES_IN_ALL
-PMC1 L2_LINES_OUT_DIRTY_ALL
+PMC1 L2_TRANS_L2_WB
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L3 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
LONG
Formulas:
-L3 Load [MBytes/s] 1.0E-06*L2_LINES_IN_ALL*64/time
-L3 Evict [MBytes/s] 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
-
Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L2 and the number of modified cachelines
-evicted from the L2. This group also outputs data volume transfered between the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs the data volume transferred between the
L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
transfers due to a write allocate load on a store miss in L2.
diff --git a/groups/sandybridge/L3CACHE.txt b/groups/sandybridge/L3CACHE.txt
index d4fd89e..c1cd039 100644
--- a/groups/sandybridge/L3CACHE.txt
+++ b/groups/sandybridge/L3CACHE.txt
@@ -6,30 +6,30 @@ FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL
PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2 UOPS_RETIRED_ALL
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L3 request rate (PMC0)/FIXC0
-L3 miss rate PMC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
L3 miss ratio PMC1/PMC0
LONG
Formulas:
-L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL / INSTR_RETIRED_ANY
-L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS / INSTR_RETIRED_ANY
-L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS / MEM_LOAD_UOPS_RETIRED_L3_ALL
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
-
This group measures the locality of your data accesses with regard to the
-L3 Cache. L3 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
The L3 miss rate gives a measure of how often it was necessary to get
-cachelines from memory. And finally L3 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
diff --git a/groups/sandybridge/MEM.txt b/groups/sandybridge/MEM.txt
deleted file mode 100644
index 1f9ff4a..0000000
--- a/groups/sandybridge/MEM.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-SHORT Main memory bandwidth in MBytes/s
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC1/FIXC0
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Profiling group to measure main memory bandwidth drawn by all cores of
-a socket. Since this group is based on uncore events it is only possible to
-measure on the granularity of a socket. If a thread group contains multiple
-threads only one thread per socket will show the results. Also outputs total
-data volume transfered from main memory.
-
diff --git a/groups/sandybridge/MEM_DP.txt b/groups/sandybridge/MEM_DP.txt
deleted file mode 100644
index 78fbd18..0000000
--- a/groups/sandybridge/MEM_DP.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-SHORT Overview of arithmetic and main memory performance
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-TMP0 TEMP_CORE
-PWR0 PWR_PKG_ENERGY
-PWR3 PWR_DRAM_ENERGY
-PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
-PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
-PMC2 SIMD_FP_256_PACKED_DOUBLE
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC1/FIXC0
-Temperature TMP0
-Energy [J] PWR0
-Power [W] PWR0/time
-Energy DRAM [J] PWR3
-Power DRAM [W] PWR3/time
-MFlops/s 1.0E-06*(PMC0*2.0+PMC1)/time
-32b AVX MFlops/s 1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Formula:
-Power = PWR_PKG_ENERGY / runtime
-Power DRAM = PWR_DRAM_ENERGY / runtime
-MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s = (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
-Memory Read BW [MBytes/s] 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
---
-Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Since this group is based on uncore events it is only possible to measure on
-a per socket base. Also outputs total data volume transfered from main memory,
-SSE scalar and packed double precision flop rates as well as consumed energy and
-temperature. Also reports on packed AVX 32b instructions. Please note that the
-current flop measurements on SandyBridge are potentially wrong. So you cannot trust these counters at the moment!
diff --git a/groups/sandybridge/MEM_SP.txt b/groups/sandybridge/MEM_SP.txt
deleted file mode 100644
index 1ede713..0000000
--- a/groups/sandybridge/MEM_SP.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-SHORT Overview of arithmetic and main memory performance
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-TMP0 TEMP_CORE
-PWR0 PWR_PKG_ENERGY
-PWR3 PWR_DRAM_ENERGY
-PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
-PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
-PMC2 SIMD_FP_256_PACKED_DOUBLE
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC1/FIXC0
-Temperature TMP0
-Energy [J] PWR0
-Power [W] PWR0/time
-Energy DRAM [J] PWR3
-Power DRAM [W] PWR3/time
-MFlops/s 1.0E-06*(PMC0*4.0+PMC1)/time
-32b AVX MFlops/s 1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Formula:
-Power = PWR_PKG_ENERGY / runtime
-Power DRAM = PWR_DRAM_ENERGY / runtime
-MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE * 4 + FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE) / runtime
-AVX MFlops/s = (SIMD_FP_256_PACKED_SINGLE * 8) / runtime
-Memory Read BW [MBytes/s] 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
---
-Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Since this group is based on uncore events it is only possible to measure on
-a per socket base. Also outputs total data volume transfered from main memory.
-SSE scalar and packed single precision flop rates as well as consumed energy and
-temperature. Also reports on packed AVX 32b instructions. Please note that the
-current flop measurements on SandyBridge are potentially wrong. So you cannot
-trust these counters at the moment!
diff --git a/groups/sandybridge/RECOVERY.txt b/groups/sandybridge/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/sandybridge/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 INT_MISC_RECOVERY_CYCLES
+PMC1 INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exceptions, memory
+disambiguation, etc.
diff --git a/groups/sandybridge/TLB_DATA.txt b/groups/sandybridge/TLB_DATA.txt
index 2f59772..8d94e05 100644
--- a/groups/sandybridge/TLB_DATA.txt
+++ b/groups/sandybridge/TLB_DATA.txt
@@ -1,4 +1,4 @@
-SHORT L1 Data TLB miss rate/ratio
+SHORT L2 data TLB miss rate/ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -16,20 +16,20 @@ Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
L1 DTLB load misses PMC0
L1 DTLB load miss rate PMC0/FIXC0
-L1 DTLB load miss duration PMC2
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
L1 DTLB store misses PMC1
L1 DTLB store miss rate PMC1/FIXC0
-L1 DTLB store miss duration PMC3
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
LONG
Formulas:
-L1 DTLB load misses DTLB_LOAD_MISSES_CAUSES_A_WALK
-L1 DTLB load miss rate DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB load miss duration DTLB_LOAD_MISSES_WALK_DURATION
-L1 DTLB store misses DTLB_STORE_MISSES_CAUSES_A_WALK
-L1 DTLB store miss rate DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB store miss duration DTLB_STORE_MISSES_WALK_DURATION
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
-
-The DTLB load and store miss rates gives a measure how often a TLB miss occured
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
per instruction. The duration gives the average number of cycles a page walk took.
diff --git a/groups/sandybridge/TLB_INSTR.txt b/groups/sandybridge/TLB_INSTR.txt
index f95f78a..235d977 100644
--- a/groups/sandybridge/TLB_INSTR.txt
+++ b/groups/sandybridge/TLB_INSTR.txt
@@ -14,15 +14,15 @@ Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
L1 ITLB misses PMC0
L1 ITLB miss rate PMC0/FIXC0
-L1 ITLB miss duration PMC1
+L1 ITLB miss duration [Cyc] PMC1/PMC0
LONG
Formulas:
-L1 ITLB misses ITLB_MISSES_CAUSES_A_WALK
-L1 ITLB miss rate ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 ITLB miss duration ITLB_MISSES_WALK_DURATION
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
-
-The ITLB miss rates gives a measure how often a TLB miss occured
+The ITLB miss rate gives a measure of how often a TLB miss occurred
per instruction. The duration gives the average number of cycles a page walk took.
diff --git a/groups/sandybridge/UOPS.txt b/groups/sandybridge/UOPS.txt
new file mode 100644
index 0000000..178aec5
--- /dev/null
+++ b/groups/sandybridge/UOPS.txt
@@ -0,0 +1,35 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_ANY
+PMC1 UOPS_EXECUTED_THREAD
+PMC2 UOPS_RETIRED_ALL
+PMC3 UOPS_ISSUED_FLAGS_MERGE
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Issued UOPs PMC0
+Merged UOPs PMC3
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed and retired uOPs and returns the number of uOPs which were issued
+but not executed as well as the number of uOPs which were executed but never retired.
+The executed but not retired uOPs commonly come from speculatively executed branches.
+
diff --git a/groups/sandybridge/UOPS_EXEC.txt b/groups/sandybridge/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/groups/sandybridge/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_EXECUTED_USED_CYCLES
+PMC1 UOPS_EXECUTED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
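The EDGEDETECT modifier counts the rising edges of the stall condition, i.e. the
number of distinct stall phases, which is what makes an average stall duration
computable; with made-up counter values:

    #include <stdio.h>

    int main(void)
    {
        /* hypothetical raw counts for a measured region */
        double stall_cycles = 3.0e8;  /* UOPS_EXECUTED_STALL_CYCLES            */
        double stall_phases = 1.5e6;  /* same event with EDGEDETECT: number of */
                                      /* distinct stall phases (rising edges)  */

        printf("Avg stall duration %.0f cycles\n", stall_cycles / stall_phases);
        return 0;
    }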
diff --git a/groups/sandybridge/UOPS_ISSUE.txt b/groups/sandybridge/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/groups/sandybridge/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_USED_CYCLES
+PMC1 UOPS_ISSUED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/sandybridge/UOPS_RETIRE.txt b/groups/sandybridge/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/groups/sandybridge/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_RETIRED_USED_CYCLES
+PMC1 UOPS_RETIRED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/groups/sandybridgeEP/BRANCH.txt b/groups/sandybridgeEP/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/groups/sandybridgeEP/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what fraction of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/groups/sandybridgeEP/CACHES.txt b/groups/sandybridgeEP/CACHES.txt
new file mode 100644
index 0000000..889cca8
--- /dev/null
+++ b/groups/sandybridgeEP/CACHES.txt
@@ -0,0 +1,97 @@
+SHORT Some data from the CBOXes
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L1D_M_EVICT
+PMC2 L2_LINES_IN_ALL
+PMC3 L2_TRANS_L2_WB
+CBOX0C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX1C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX2C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX3C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX4C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX5C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX6C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX7C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX0C1 LLC_VICTIMS_M_STATE
+CBOX1C1 LLC_VICTIMS_M_STATE
+CBOX2C1 LLC_VICTIMS_M_STATE
+CBOX3C1 LLC_VICTIMS_M_STATE
+CBOX4C1 LLC_VICTIMS_M_STATE
+CBOX5C1 LLC_VICTIMS_M_STATE
+CBOX6C1 LLC_VICTIMS_M_STATE
+CBOX7C1 LLC_VICTIMS_M_STATE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time
+L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0
+L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
+L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0
+L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F)*64.0/time
+System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F)*64.0
+L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0/time
+L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0
+L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0/time
+L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+
+
+LONG
+Formulas:
+L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
+L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
+L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64
+L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
+L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
+L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
+L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time
+L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64
+L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F))*64/time
+System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F))*64
+L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M_STATE))*64/time
+L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M_STATE))*64
+L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64/time
+L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
+-
+Group to measure cache transfers between L1 and memory. Please note that the
+L3 to/from system metrics contain any traffic to the system (memory,
+Intel QPI, etc.) but do not seem to capture everything, because the memory read
+bandwidth and the L3 to L2 bandwidth are commonly higher than the memory to L3 bandwidth.
diff --git a/groups/sandybridgeEP/CLOCK.txt b/groups/sandybridgeEP/CLOCK.txt
new file mode 100644
index 0000000..7a5e87d
--- /dev/null
+++ b/groups/sandybridgeEP/CLOCK.txt
@@ -0,0 +1,27 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+SandyBridge implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy on the package (socket) and DRAM level.
+
diff --git a/groups/sandybridgeEP/DATA.txt b/groups/sandybridgeEP/DATA.txt
new file mode 100644
index 0000000..967cbad
--- /dev/null
+++ b/groups/sandybridgeEP/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_UOPS_RETIRED_LOADS
+PMC1 MEM_UOPS_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/sandybridgeEP/ENERGY.txt b/groups/sandybridgeEP/ENERGY.txt
new file mode 100644
index 0000000..e5e2b33
--- /dev/null
+++ b/groups/sandybridgeEP/ENERGY.txt
@@ -0,0 +1,33 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0 TEMP_CORE
+PWR0 PWR_PKG_ENERGY
+PWR1 PWR_PP0_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Temperature [C] TMP0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy PP0 [J] PWR1
+Power PP0 [W] PWR1/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+SandyBridge implements the new RAPL interface. This interface enables monitoring
+of the consumed energy on the package (socket), PP0, and DRAM level.
+
diff --git a/groups/sandybridgeEP/FALSE_SHARE.txt b/groups/sandybridgeEP/FALSE_SHARE.txt
new file mode 100644
index 0000000..be9c66c
--- /dev/null
+++ b/groups/sandybridgeEP/FALSE_SHARE.txt
@@ -0,0 +1,27 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM
+PMC2 MEM_LOAD_UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory loads as reference.
+Intel SandyBridge EP CPUs do not provide the events to measure the false-sharing
+over CPU socket boundaries.
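
The two false-sharing metrics are a traffic volume (HITM loads times the 64-byte line size)
and a rate relative to all retired loads. A tiny worked example in Python with invented counts:

hitm_loads = 3.0e6   # MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM
all_loads  = 8.0e9   # MEM_LOAD_UOPS_RETIRED_ALL

false_share_mb   = 1.0e-06 * hitm_loads * 64   # Local LLC false sharing [MByte]
false_share_rate = hitm_loads / all_loads      # Local LLC false sharing rate
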
diff --git a/groups/sandybridgeEP/FLOPS_AVX.txt b/groups/sandybridgeEP/FLOPS_AVX.txt
new file mode 100644
index 0000000..b4ae4e7
--- /dev/null
+++ b/groups/sandybridgeEP/FLOPS_AVX.txt
@@ -0,0 +1,26 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 SIMD_FP_256_PACKED_SINGLE
+PMC1 SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Packed SP MFLOP/s 1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s 1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+Packed SP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+-
+Packed 32-byte (256-bit) AVX FLOP rates.
+Please note that the current FLOP measurements on SandyBridge are
+potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/groups/sandybridgeEP/FLOPS_DP.txt b/groups/sandybridgeEP/FLOPS_DP.txt
new file mode 100644
index 0000000..244e5ce
--- /dev/null
+++ b/groups/sandybridgeEP/FLOPS_DP.txt
@@ -0,0 +1,31 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2 SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision FLOP rates.
+Please note that the current FLOP measurements on SandyBridge are potentially
+wrong. So you cannot trust these counters at the moment!
+
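
The DP FLOP formula weights each event by the FLOPs a single instruction performs: 2 for
128-bit packed SSE, 1 for scalar, 4 for 256-bit packed AVX. A short Python sketch with
fictitious counts:

sse_packed_dp = 5.0e9   # FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE (2 DP FLOPs each)
sse_scalar_dp = 1.0e9   # FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE (1 DP FLOP each)
avx_packed_dp = 3.0e9   # SIMD_FP_256_PACKED_DOUBLE (4 DP FLOPs each)
time_s        = 4.0     # runtime [s]

mflops     = 1.0e-06 * (sse_packed_dp*2.0 + sse_scalar_dp + avx_packed_dp*4.0) / time_s  # MFLOP/s
avx_mflops = 1.0e-06 * (avx_packed_dp*4.0) / time_s                                      # AVX MFLOP/s
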
diff --git a/groups/sandybridgeEP/FLOPS_SP.txt b/groups/sandybridgeEP/FLOPS_SP.txt
new file mode 100644
index 0000000..8cd8de2
--- /dev/null
+++ b/groups/sandybridgeEP/FLOPS_SP.txt
@@ -0,0 +1,31 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2 SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision FLOP rates.
+Please note that the current FLOP measurements on SandyBridge are potentially
+wrong. So you cannot trust these counters at the moment!
+
diff --git a/groups/sandybridgeEP/ICACHE.txt b/groups/sandybridgeEP/ICACHE.txt
new file mode 100644
index 0000000..f1e2335
--- /dev/null
+++ b/groups/sandybridgeEP/ICACHE.txt
@@ -0,0 +1,33 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ICACHE_ACCESSES
+PMC1 ICACHE_MISSES
+PMC2 ICACHE_IFETCH_STALL
+PMC3 ILD_STALL_IQ_FULL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+L1I queue full stalls PMC3
+L1I queue full stall rate PMC3/FIXC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I stalls = ICACHE_IFETCH_STALL
+L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/sandybridgeEP/L2.txt b/groups/sandybridgeEP/L2.txt
new file mode 100644
index 0000000..1feb44c
--- /dev/null
+++ b/groups/sandybridgeEP/L2.txt
@@ -0,0 +1,38 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L1D_M_EVICT
+PMC2 ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. The group also outputs the total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
+
diff --git a/groups/sandybridgeEP/L2CACHE.txt b/groups/sandybridgeEP/L2CACHE.txt
new file mode 100644
index 0000000..fbc3745
--- /dev/null
+++ b/groups/sandybridgeEP/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_TRANS_ALL_REQUESTS
+PMC1 L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. The L2 request rate tells you how data intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. Finally, the L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be determined by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/sandybridgeEP/L3.txt b/groups/sandybridgeEP/L3.txt
new file mode 100644
index 0000000..f63a918
--- /dev/null
+++ b/groups/sandybridgeEP/L3.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_LINES_IN_ALL
+PMC1 L2_TRANS_L2_WB
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs the data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/sandybridgeEP/L3CACHE.txt b/groups/sandybridgeEP/L3CACHE.txt
new file mode 100644
index 0000000..28766be
--- /dev/null
+++ b/groups/sandybridgeEP/L3CACHE.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2 UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. The L3 request rate tells you how data intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. Finally, the L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level because
+they were not present in the L3 cache.
+While the data cache miss rate might be determined by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/sandybridgeEP/MEM.txt b/groups/sandybridgeEP/MEM.txt
new file mode 100644
index 0000000..0be0645
--- /dev/null
+++ b/groups/sandybridgeEP/MEM.txt
@@ -0,0 +1,40 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+-
+Profiling group to measure the memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events, it can only be measured on a
+per-socket basis. The group also outputs the total data volume transferred from main memory.
+
diff --git a/groups/sandybridgeEP/MEM_DP.txt b/groups/sandybridgeEP/MEM_DP.txt
new file mode 100644
index 0000000..0193575
--- /dev/null
+++ b/groups/sandybridgeEP/MEM_DP.txt
@@ -0,0 +1,59 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2 SIMD_FP_256_PACKED_DOUBLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure the memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events, it can only be measured on
+a per-socket basis. The group also outputs the total data volume transferred from main memory
+as well as SSE scalar and packed double precision FLOP rates, including packed 32-byte (256-bit)
+AVX instructions. Please note that the current FLOP measurements on SandyBridge
+are potentially wrong. So you cannot trust these counters at the moment!
diff --git a/groups/sandybridgeEP/MEM_SP.txt b/groups/sandybridgeEP/MEM_SP.txt
new file mode 100644
index 0000000..9e651fa
--- /dev/null
+++ b/groups/sandybridgeEP/MEM_SP.txt
@@ -0,0 +1,61 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2 SIMD_FP_256_PACKED_SINGLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+
+LONG
+Formula:
+Power [W] = PWR_PKG_ENERGY/runtime
+Power DRAM [W] = PWR_DRAM_ENERGY/runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure the memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events, it can only be measured on
+a per-socket basis. The group also outputs the total data volume transferred from main memory
+as well as SSE scalar and packed single precision FLOP rates, including packed 32-byte (256-bit)
+AVX instructions. Please note that the current FLOP measurements on SandyBridge
+are potentially wrong. So you cannot trust these counters at the moment!
diff --git a/groups/sandybridgeEP/NUMA.txt b/groups/sandybridgeEP/NUMA.txt
new file mode 100644
index 0000000..0c1b8fb
--- /dev/null
+++ b/groups/sandybridgeEP/NUMA.txt
@@ -0,0 +1,33 @@
+SHORT Local and remote memory accesses
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM
+PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local DRAM data volume [GByte] 1.E-09*PMC0*64
+Local DRAM bandwidth [MByte/s] 1.E-06*(PMC0*64)/time
+Remote DRAM data volume [GByte] 1.E-09*PMC1*64
+Remote DRAM bandwidth [MByte/s] 1.E-06*(PMC1*64)/time
+Memory data volume [GByte] 1.E-09*(PMC0+PMC1)*64
+Memory bandwidth [MByte/s] 1.E-06*((PMC0+PMC1)*64)/time
+
+LONG
+Formula:
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
+Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
+Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64
+Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time
+Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64
+Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time
+--
+This performance group measures the data traffic of CPU cores to local and remote
+memory.
diff --git a/groups/sandybridgeEP/QPI.txt b/groups/sandybridgeEP/QPI.txt
new file mode 100644
index 0000000..f09df03
--- /dev/null
+++ b/groups/sandybridgeEP/QPI.txt
@@ -0,0 +1,35 @@
+SHORT QPI traffic between sockets
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+SBOX0C0 DIRECT2CORE_SUCCESS
+SBOX0C1 RXL_FLITS_G1_DRS_DATA
+SBOX0C2 RXL_FLITS_G2_NCB_DATA
+SBOX1C0 DIRECT2CORE_SUCCESS
+SBOX1C1 RXL_FLITS_G1_DRS_DATA
+SBOX1C2 RXL_FLITS_G2_NCB_DATA
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Received bandwidth from QPI [MBytes/s] 1.0E-06*(SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8/time
+Received data volume from QPI [GBytes] 1.0E-09*(SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8
+Bandwidth QPI to LLC [MBytes/s] 1.0E-06*(SBOX0C0+SBOX1C0)*64/time
+Data volume QPI to LLC [GBytes] 1.0E-09*(SBOX0C0+SBOX1C0)*64
+Bandwidth QPI to HA or IIO [MBytes/s] 1.0E-06*(((SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8)-((SBOX0C0+SBOX1C0)*64))/time
+Data volume QPI to HA or IIO [GBytes] 1.0E-09*(((SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8)-((SBOX0C0+SBOX1C0)*64))
+
+LONG
+Formulas:
+Received bandwidth from QPI [MBytes/s] = 1.0E-06*(sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8/time
+Received data volume from QPI [GBytes] = 1.0E-09*(sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8
+Bandwidth QPI to LLC [MBytes/s] = 1.0E-06*(sum(DIRECT2CORE_SUCCESS))*64/time
+Data volume QPI to LLC [GBytes] = 1.0E-09*(sum(DIRECT2CORE_SUCCESS))*64
+Bandwidth QPI to HA or IIO [MBytes/s] = 1.0E-06*(((sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8)-((sum(DIRECT2CORE_SUCCESS))*64))/time
+Data volume QPI to HA or IIO [GBytes] = 1.0E-09*(((sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8)-((sum(DIRECT2CORE_SUCCESS))*64))
+-
+Profiling group to measure traffic on the QPI.
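
The QPI formulas mix two units: received data flits are counted as 8 bytes each, while
Direct2Core transfers count as full 64-byte cache lines, and the "QPI to HA or IIO" share
is what remains after subtracting the Direct2Core bytes from the received bytes. A Python
sketch with assumed per-link counts:

drs_flits = [2.0e9, 1.8e9]   # RXL_FLITS_G1_DRS_DATA per SBOX (QPI link)
ncb_flits = [2.0e8, 3.0e8]   # RXL_FLITS_G2_NCB_DATA per SBOX
d2c_lines = [3.0e8, 2.5e8]   # DIRECT2CORE_SUCCESS per SBOX
time_s    = 3.0              # runtime [s]

rx_bytes  = (sum(drs_flits) + sum(ncb_flits)) * 8.0   # 8 bytes per received data flit
llc_bytes = sum(d2c_lines) * 64.0                     # full cache lines delivered to the LLC

rx_bw_mbs     = 1.0e-06 * rx_bytes / time_s                 # Received bandwidth from QPI [MBytes/s]
llc_bw_mbs    = 1.0e-06 * llc_bytes / time_s                # Bandwidth QPI to LLC [MBytes/s]
ha_iio_bw_mbs = 1.0e-06 * (rx_bytes - llc_bytes) / time_s   # Bandwidth QPI to HA or IIO [MBytes/s]
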
diff --git a/groups/sandybridgeEP/RECOVERY.txt b/groups/sandybridgeEP/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/sandybridgeEP/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 INT_MISC_RECOVERY_CYCLES
+PMC1 INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exceptions, memory
+disambiguation, etc.
diff --git a/groups/sandybridgeEP/TLB_DATA.txt b/groups/sandybridgeEP/TLB_DATA.txt
new file mode 100644
index 0000000..8d94e05
--- /dev/null
+++ b/groups/sandybridgeEP/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2 DTLB_LOAD_MISSES_WALK_DURATION
+PMC3 DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB load misses PMC0
+L1 DTLB load miss rate PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses PMC1
+L1 DTLB store miss rate PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how many cycles a page table walk took.
+
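
The TLB groups report a miss rate per retired instruction and an average page walk duration,
i.e. the accumulated walk cycles divided by the number of walks. A minimal Python sketch with
invented numbers:

load_walks    = 2.0e6   # DTLB_LOAD_MISSES_CAUSES_A_WALK
walk_cycles   = 6.0e7   # DTLB_LOAD_MISSES_WALK_DURATION
instr_retired = 9.0e9   # INSTR_RETIRED_ANY

load_miss_rate  = load_walks / instr_retired   # L1 DTLB load miss rate
avg_walk_cycles = walk_cycles / load_walks     # L1 DTLB load miss duration [Cyc]
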
diff --git a/groups/sandybridgeEP/TLB_INSTR.txt b/groups/sandybridgeEP/TLB_INSTR.txt
new file mode 100644
index 0000000..235d977
--- /dev/null
+++ b/groups/sandybridgeEP/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ITLB_MISSES_CAUSES_A_WALK
+PMC1 ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how many cycles a page table walk took.
+
diff --git a/groups/sandybridgeEP/UOPS.txt b/groups/sandybridgeEP/UOPS.txt
new file mode 100644
index 0000000..178aec5
--- /dev/null
+++ b/groups/sandybridgeEP/UOPS.txt
@@ -0,0 +1,35 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_ANY
+PMC1 UOPS_EXECUTED_THREAD
+PMC2 UOPS_RETIRED_ALL
+PMC3 UOPS_ISSUED_FLAGS_MERGE
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Issued UOPs PMC0
+Merged UOPs PMC3
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed and retired uOPs, from which the number of uOPs that were issued
+but not executed and the number of uOPs that were executed but never retired can be derived.
+The executed but not retired uOPs commonly come from speculatively executed branches.
+
diff --git a/groups/sandybridgeEP/UOPS_EXEC.txt b/groups/sandybridgeEP/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/groups/sandybridgeEP/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_EXECUTED_USED_CYCLES
+PMC1 UOPS_EXECUTED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
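
The :EDGEDETECT modifier counts transitions into the stalled state, i.e. the number of
separate stall periods, so dividing the total stall cycles by the edge count gives the
average length of one stall. A Python sketch with assumed values:

stall_cycles = 1.2e9   # UOPS_EXECUTED_STALL_CYCLES (total stalled cycles)
stall_edges  = 4.0e6   # UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT (number of stall periods)
total_cycles = 5.0e9   # CPU_CLOCK_UNHALTED_TOTAL_CYCLES

unused_ratio_pct   = 100.0 * stall_cycles / total_cycles   # Unused cycles ratio [%]
avg_stall_duration = stall_cycles / stall_edges            # Avg stall duration [cycles]
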
diff --git a/groups/sandybridgeEP/UOPS_ISSUE.txt b/groups/sandybridgeEP/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/groups/sandybridgeEP/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_USED_CYCLES
+PMC1 UOPS_ISSUED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/sandybridgeEP/UOPS_RETIRE.txt b/groups/sandybridgeEP/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/groups/sandybridgeEP/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_RETIRED_USED_CYCLES
+PMC1 UOPS_RETIRED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/groups/silvermont/BRANCH.txt b/groups/silvermont/BRANCH.txt
index cbaf834..b8d41b2 100644
--- a/groups/silvermont/BRANCH.txt
+++ b/groups/silvermont/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch FIXC0/PMC0
LONG
Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
-
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio directly
expresses what fraction of all branch instructions were mispredicted.
-Instructions per branch is 1/Branch rate.
+Instructions per branch is 1/branch rate.
diff --git a/groups/silvermont/CLOCK.txt b/groups/silvermont/CLOCK.txt
new file mode 100644
index 0000000..088a776
--- /dev/null
+++ b/groups/silvermont/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+-
+Silvermont implements the new RAPL interface. This interface enables monitoring
+of the consumed energy on the package (socket) level.
+
diff --git a/groups/silvermont/DATA.txt b/groups/silvermont/DATA.txt
new file mode 100644
index 0000000..61a915b
--- /dev/null
+++ b/groups/silvermont/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_UOPS_RETIRED_ALL_LOADS
+PMC1 MEM_UOPS_RETIRED_ALL_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_ALL_LOADS/MEM_UOPS_RETIRED_ALL_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/silvermont/ENERGY.txt b/groups/silvermont/ENERGY.txt
index 5646a9a..d0996b3 100644
--- a/groups/silvermont/ENERGY.txt
+++ b/groups/silvermont/ENERGY.txt
@@ -6,6 +6,7 @@ FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
TMP0 TEMP_CORE
PWR0 PWR_PKG_ENERGY
+PWR1 PWR_PP0_ENERGY
METRICS
Runtime (RDTSC) [s] time
@@ -15,10 +16,13 @@ CPI FIXC1/FIXC0
Temperature [C] TMP0
Energy [J] PWR0
Power [W] PWR0/time
+Energy PP0 [J] PWR1
+Power PP0 [W] PWR1/time
LONG
Formula:
-Power = PWR_PKG_ENERGY / time
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
-
Silvermont implements the new RAPL interface. This interface enables to
monitor the consumed energy on the package (socket) level.
diff --git a/groups/silvermont/ICACHE.txt b/groups/silvermont/ICACHE.txt
index 6ce3ce8..5f11ad6 100644
--- a/groups/silvermont/ICACHE.txt
+++ b/groups/silvermont/ICACHE.txt
@@ -18,8 +18,8 @@ L1I miss ratio PMC1/PMC0
LONG
Formulas:
-L2 request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
-L2 miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
-L2 miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
-
This group measures some L1 instruction cache metrics.
diff --git a/groups/silvermont/L1TOL2.txt b/groups/silvermont/L1TOL2.txt
deleted file mode 100644
index 225533d..0000000
--- a/groups/silvermont/L1TOL2.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-SHORT L2 load cache bandwidth in MBytes/s
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0 MEM_UOPS_RETIRED_L1_MISS_LOADS
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0)*64.0
-
-LONG
-Formulas:
-L2 Load [MBytes/s] = 1.0E-06*MEM_UOPS_RETIRED_L1_MISS_LOADS*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(MEM_UOPS_RETIRED_L1_MISS_LOADS)*64/time
-L2 data volume [GBytes] = 1.0E-09*(MEM_UOPS_RETIRED_L1_MISS_LOADS)*64
--
-Profiling group to measure L2 load cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L1 cache. Since there is no possibility to retrieve
-the evicted cache lines, this group measures only the load cache bandwidth.
-The group also output totally loaded data volume transfered between L2 and L1.
-
diff --git a/groups/silvermont/L2CACHE.txt b/groups/silvermont/L2CACHE.txt
new file mode 100644
index 0000000..32a1545
--- /dev/null
+++ b/groups/silvermont/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 LONGEST_LAT_CACHE_REFERENCE
+PMC1 LONGEST_LAT_CACHE_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = LONGEST_LAT_CACHE_REFERENCE/INSTR_RETIRED_ANY
+L2 miss rate = LONGEST_LAT_CACHE_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = LONGEST_LAT_CACHE_MISS/LONGEST_LAT_CACHE_REFERENCE
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. The L2 request rate tells you how data intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. Finally, the L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be determined by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache
+reuse.
+
diff --git a/groups/silvermont/L2TOMEM.txt b/groups/silvermont/L2TOMEM.txt
deleted file mode 100644
index bc4cbed..0000000
--- a/groups/silvermont/L2TOMEM.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-SHORT L2 to Mem load cache bandwidth in MBytes/s
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0 MEM_UOPS_RETIRED_L2_MISS_LOADS
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC1/FIXC0
-L2 to MEM load bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
-L2 to MEM load data volume [GBytes] 1.0E-09*(PMC0)*64.0
-
-LONG
-Formulas:
-L2 to MEM load bandwidth [MBytes/s] = 1.0E-06*(MEM_UOPS_RETIRED_L2_MISS_LOADS)*64/time
-L2 to MEM load data volume [GBytes] = 1.0E-09*(MEM_UOPS_RETIRED_L2_MISS_LOADS)*64
--
-Profiling group to measure L2 to MEM load cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L2 cache. Since there is no possibility to retrieve
-the evicted cache lines, this group measures only the load cache bandwidth.
-The group also output totally loaded data volume transfered between memory and L2.
-
diff --git a/groups/silvermont/MEM.txt b/groups/silvermont/MEM.txt
new file mode 100644
index 0000000..de78337
--- /dev/null
+++ b/groups/silvermont/MEM.txt
@@ -0,0 +1,37 @@
+SHORT Memory load bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 LONGEST_LAT_CACHE_MISS
+PMC1 OFFCORE_RESPONSE_1_WB_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(PMC0)*64.0
+Memory writeback bandwidth [MBytes/s] 1.0E-06*(PMC1)*64.0/time
+Memory writeback data volume [GBytes] 1.0E-09*(PMC1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(LONGEST_LAT_CACHE_MISS)*64/time
+Memory read data volume [GBytes] = 1.0E-09*(LONGEST_LAT_CACHE_MISS)*64
+Memory writeback bandwidth [MBytes/s] = 1.0E-06*(OFFCORE_RESPONSE_1_WB_ANY)*64/time
+Memory writeback data volume [GBytes] = 1.0E-09*(OFFCORE_RESPONSE_1_WB_ANY)*64
+Memory bandwidth [MBytes/s] = 1.0E-06*(LONGEST_LAT_CACHE_MISS+OFFCORE_RESPONSE_1_WB_ANY)*64/time
+Memory data volume [GBytes] = 1.0E-09*(LONGEST_LAT_CACHE_MISS+OFFCORE_RESPONSE_1_WB_ANY)*64
+-
+Profiling group to measure the L2 to memory load bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L2 cache. Since there is no possibility to retrieve
+the evicted cache lines, this group measures only the load bandwidth directly. The
+writeback metrics count only modified cache lines that are written back in order to go to
+exclusive state.
+The group also outputs the total load and writeback data volume transferred between memory and L2.
+
diff --git a/groups/silvermont/MEM_LAT.txt b/groups/silvermont/MEM_LAT.txt
new file mode 100644
index 0000000..516b135
--- /dev/null
+++ b/groups/silvermont/MEM_LAT.txt
@@ -0,0 +1,23 @@
+SHORT Average data read latency
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 OFFCORE_RESPONSE_0_DMND_DATA_RD_AVG_LAT
+PMC1 OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Average data read latency [cyc/read] PMC0/PMC1
+
+LONG
+Formulas:
+Average data read latency [cyc/read] = OFFCORE_RESPONSE_0_DMND_DATA_RD_AVG_LAT/OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY
+-
+The Offcore request facility of Intel Silvermont processors can be used to determine
+the average data read latency. It includes all operations performed to read data, such as
+snoops and hits in upper cache levels.
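
The average read latency divides the latency counter by the number of demand data reads it
covers, assuming (as the formula above implies) that the latency event accumulates cycles.
A small Python sketch with invented readings:

lat_cycles = 4.5e9   # OFFCORE_RESPONSE_0_DMND_DATA_RD_AVG_LAT (accumulated latency cycles)
rd_count   = 2.0e7   # OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY (demand data reads)

avg_read_latency = lat_cycles / rd_count   # Average data read latency [cyc/read]
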
diff --git a/groups/silvermont/TLB_DATA.txt b/groups/silvermont/TLB_DATA.txt
new file mode 100644
index 0000000..5f2617f
--- /dev/null
+++ b/groups/silvermont/TLB_DATA.txt
@@ -0,0 +1,27 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 PAGE_WALKS_DTLB_COUNT
+PMC1 PAGE_WALKS_DTLB_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB misses PMC0
+L1 DTLB miss rate PMC0/FIXC0
+L1 DTLB miss duration [Cyc] PMC1/PMC0
+
+LONG
+Formulas:
+L1 DTLB misses = PAGE_WALKS_DTLB_COUNT
+L1 DTLB miss rate = PAGE_WALKS_DTLB_COUNT / INSTR_RETIRED_ANY
+L1 DTLB miss duration [Cyc] = PAGE_WALKS_DTLB_CYCLES / PAGE_WALKS_DTLB_COUNT
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how many cycles a page table walk took.
+
diff --git a/groups/silvermont/TLB_INSTR.txt b/groups/silvermont/TLB_INSTR.txt
new file mode 100644
index 0000000..f3dd3ec
--- /dev/null
+++ b/groups/silvermont/TLB_INSTR.txt
@@ -0,0 +1,27 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 PAGE_WALKS_ITLB_COUNT
+PMC1 PAGE_WALKS_ITLB_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = PAGE_WALKS_ITLB_COUNT
+L1 ITLB miss rate = PAGE_WALKS_ITLB_COUNT / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = PAGE_WALKS_ITLB_CYCLES / PAGE_WALKS_ITLB_COUNT
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how many cycles a page table walk took.
diff --git a/groups/skylake/BRANCH.txt b/groups/skylake/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/groups/skylake/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio directly
+expresses what fraction of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/groups/skylake/CLOCK.txt b/groups/skylake/CLOCK.txt
new file mode 100644
index 0000000..79a4480
--- /dev/null
+++ b/groups/skylake/CLOCK.txt
@@ -0,0 +1,27 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+Skylake implements the RAPL interface. This interface enables monitoring
+of the consumed energy on the package (socket) and DRAM level.
+
diff --git a/groups/skylake/DATA.txt b/groups/skylake/DATA.txt
new file mode 100644
index 0000000..4e6e938
--- /dev/null
+++ b/groups/skylake/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_INST_RETIRED_ALL_LOADS
+PMC1 MEM_INST_RETIRED_ALL_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/skylake/ENERGY.txt b/groups/skylake/ENERGY.txt
new file mode 100644
index 0000000..06baa72
--- /dev/null
+++ b/groups/skylake/ENERGY.txt
@@ -0,0 +1,39 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0 TEMP_CORE
+PWR0 PWR_PKG_ENERGY
+PWR1 PWR_PP0_ENERGY
+PWR2 PWR_PP1_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Temperature [C] TMP0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy PP0 [J] PWR1
+Power PP0 [W] PWR1/time
+Energy PP1 [J] PWR2
+Power PP1 [W] PWR2/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+Skylake implements the RAPL interface. This interface enables monitoring of the
+consumed energy on the package (socket), PP0, PP1, and DRAM level.
+
diff --git a/groups/skylake/FALSE_SHARE.txt b/groups/skylake/FALSE_SHARE.txt
new file mode 100644
index 0000000..626277a
--- /dev/null
+++ b/groups/skylake/FALSE_SHARE.txt
@@ -0,0 +1,25 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_L3_HIT_RETIRED_XSNP_HITM
+PMC2 MEM_INST_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_L3_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_L3_HIT_RETIRED_XSNP_HITM/MEM_INST_RETIRED_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory loads as reference.
diff --git a/groups/skylake/FLOPS_AVX.txt b/groups/skylake/FLOPS_AVX.txt
new file mode 100644
index 0000000..6088bca
--- /dev/null
+++ b/groups/skylake/FLOPS_AVX.txt
@@ -0,0 +1,24 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Packed SP MFLOP/s 1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s 1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+Packed SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+-
+Packed 32-byte (256-bit) AVX FLOP rates.
+
diff --git a/groups/skylake/FLOPS_DP.txt b/groups/skylake/FLOPS_DP.txt
new file mode 100644
index 0000000..c99d2c1
--- /dev/null
+++ b/groups/skylake/FLOPS_DP.txt
@@ -0,0 +1,29 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision FLOP rates.
+
diff --git a/groups/skylake/FLOPS_SP.txt b/groups/skylake/FLOPS_SP.txt
new file mode 100644
index 0000000..a273e84
--- /dev/null
+++ b/groups/skylake/FLOPS_SP.txt
@@ -0,0 +1,29 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s 1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision FLOP rates.
+
diff --git a/groups/skylake/ICACHE.txt b/groups/skylake/ICACHE.txt
new file mode 100644
index 0000000..aab7dac
--- /dev/null
+++ b/groups/skylake/ICACHE.txt
@@ -0,0 +1,30 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ICACHE_64B_IFTAG_ALL
+PMC1 ICACHE_64B_IFTAG_MISS
+PMC2 ICACHE_64B_IFTAG_STALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_64B_IFTAG_ALL / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_64B_IFTAG_MISS / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_64B_IFTAG_MISS / ICACHE_64B_IFTAG_ALL
+L1I stalls = ICACHE_64B_IFTAG_STALL
+L1I stall rate = ICACHE_64B_IFTAG_STALL / INSTR_RETIRED_ANY
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/skylake/L2.txt b/groups/skylake/L2.txt
new file mode 100644
index 0000000..1a92a95
--- /dev/null
+++ b/groups/skylake/L2.txt
@@ -0,0 +1,38 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L1D_M_EVICT
+PMC2 ICACHE_64B_IFTAG_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. The group also outputs the total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
+
diff --git a/groups/skylake/L2CACHE.txt b/groups/skylake/L2CACHE.txt
new file mode 100644
index 0000000..fbc3745
--- /dev/null
+++ b/groups/skylake/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_TRANS_ALL_REQUESTS
+PMC1 L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
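To make the distinction between rate and ratio concrete, a small Python sketch with assumed counter values:

    # Assumed raw counts
    instr_retired = 1.0e10  # INSTR_RETIRED_ANY
    l2_requests   = 4.0e8   # L2_TRANS_ALL_REQUESTS
    l2_misses     = 6.0e7   # L2_RQSTS_MISS

    request_rate = l2_requests / instr_retired   # requests per instruction
    miss_rate    = l2_misses / instr_retired     # misses per instruction
    miss_ratio   = l2_misses / l2_requests       # fraction of requests that miss
    print(request_rate, miss_rate, miss_ratio)
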
diff --git a/groups/skylake/L3.txt b/groups/skylake/L3.txt
new file mode 100644
index 0000000..f63a918
--- /dev/null
+++ b/groups/skylake/L3.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_LINES_IN_ALL
+PMC1 L2_TRANS_L2_WB
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. The group also outputs the data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/skylake/L3CACHE.txt b/groups/skylake/L3CACHE.txt
new file mode 100644
index 0000000..8c91d39
--- /dev/null
+++ b/groups/skylake/L3CACHE.txt
@@ -0,0 +1,35 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_RETIRED_L3_HIT
+PMC1 MEM_LOAD_RETIRED_L3_MISS
+PMC2 UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 request rate (PMC0+PMC1)/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = (MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS)/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_RETIRED_L3_MISS/MEM_LOAD_RETIRED_L3_HIT
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/skylake/RECOVERY.txt b/groups/skylake/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/skylake/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 INT_MISC_RECOVERY_CYCLES
+PMC1 INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after an SSE exception, memory
+disambiguation, etc.
diff --git a/groups/skylake/TLB_DATA.txt b/groups/skylake/TLB_DATA.txt
new file mode 100644
index 0000000..10ee5e1
--- /dev/null
+++ b/groups/skylake/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2 DTLB_LOAD_MISSES_WALK_ACTIVE
+PMC3 DTLB_STORE_MISSES_WALK_ACTIVE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB load misses PMC0
+L1 DTLB load miss rate PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses PMC1
+L1 DTLB store miss rate PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_ACTIVE / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_ACTIVE / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures the time in cycles a page walk took.
+
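A short Python sketch, with assumed counts, of how the miss rate and the per-walk duration are derived:

    # Assumed raw counts for the load side
    load_walks       = 2.0e6   # DTLB_LOAD_MISSES_CAUSES_A_WALK
    load_walk_cycles = 6.0e7   # DTLB_LOAD_MISSES_WALK_ACTIVE
    instr_retired    = 1.0e10  # INSTR_RETIRED_ANY

    load_miss_rate     = load_walks / instr_retired       # misses per instruction
    load_miss_duration = load_walk_cycles / load_walks    # cycles per page walk
    print(load_miss_rate, load_miss_duration)
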
diff --git a/groups/skylake/TLB_INSTR.txt b/groups/skylake/TLB_INSTR.txt
new file mode 100644
index 0000000..9bc65a7
--- /dev/null
+++ b/groups/skylake/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ITLB_MISSES_CAUSES_A_WALK
+PMC1 ITLB_MISSES_WALK_ACTIVE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_ACTIVE / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures the time in cycles a page walk took.
+
diff --git a/groups/skylake/UOPS.txt b/groups/skylake/UOPS.txt
new file mode 100644
index 0000000..fbb01e1
--- /dev/null
+++ b/groups/skylake/UOPS.txt
@@ -0,0 +1,29 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_ANY
+PMC1 UOPS_EXECUTED_THREAD
+PMC2 UOPS_RETIRED_ALL
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Issued UOPs PMC0
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed and retired uOPs.
diff --git a/groups/skylake/UOPS_EXEC.txt b/groups/skylake/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/groups/skylake/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_EXECUTED_USED_CYCLES
+PMC1 UOPS_EXECUTED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
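A brief Python sketch with assumed counts; the EDGEDETECT option counts the transitions into a stall phase, so dividing the total stall cycles by that number yields the average stall length:

    # Assumed raw counts
    stall_cycles = 3.0e8   # UOPS_EXECUTED_STALL_CYCLES
    stall_edges  = 1.5e6   # UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT (stall phases)
    total_cycles = 2.0e9   # CPU_CLOCK_UNHALTED_TOTAL_CYCLES

    unused_ratio_pct   = 100.0 * stall_cycles / total_cycles
    avg_stall_duration = stall_cycles / stall_edges   # cycles per stall phase
    print(unused_ratio_pct, avg_stall_duration)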
diff --git a/groups/skylake/UOPS_ISSUE.txt b/groups/skylake/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/groups/skylake/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_USED_CYCLES
+PMC1 UOPS_ISSUED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/skylake/UOPS_RETIRE.txt b/groups/skylake/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/groups/skylake/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_RETIRED_USED_CYCLES
+PMC1 UOPS_RETIRED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/groups/westmere/BRANCH.txt b/groups/westmere/BRANCH.txt
index 3d81416..b8d41b2 100644
--- a/groups/westmere/BRANCH.txt
+++ b/groups/westmere/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch FIXC0/PMC0
LONG
Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
-
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
-into relation what ration of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate.
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio directly states
+what fraction of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
diff --git a/groups/westmere/CACHE.txt b/groups/westmere/CACHE.txt
index 4ceed06..6a5e4fe 100644
--- a/groups/westmere/CACHE.txt
+++ b/groups/westmere/CACHE.txt
@@ -11,15 +11,16 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Data cache misses PMC0
-Data cache miss rate PMC0/FIXC0
+data cache misses PMC0
+data cache miss rate PMC0/FIXC0
LONG
Formulas:
-Data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
+data cache misses = L1D_REPL
+data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
-
This group measures the locality of your data accesses with regard to the
-L1 Cache.
-The Data cache miss rate gives a measure how often it was necessary to get
-cachelines from higher levels of the memory hierarchy.
+L1 cache.
+The data cache miss rate gives a measure how often it was necessary to get
+cache lines from higher levels of the memory hierarchy.
diff --git a/groups/westmere/CLOCK.txt b/groups/westmere/CLOCK.txt
new file mode 100644
index 0000000..9139668
--- /dev/null
+++ b/groups/westmere/CLOCK.txt
@@ -0,0 +1,18 @@
+SHORT CPU clock information
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
+Formula:
+-
+
+
diff --git a/groups/westmere/DATA.txt b/groups/westmere/DATA.txt
index a5611bc..31bba51 100644
--- a/groups/westmere/DATA.txt
+++ b/groups/westmere/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
LONG
Formulas:
-Load to Store ratio = MEM_INST_RETIRED_LOADS / MEM_INST_RETIRED_STORES
+Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES
-
-This is a simple metric to determine your Load to store ratio.
+This is a simple metric to determine your load to store ratio.
diff --git a/groups/westmere/FLOPS_DP.txt b/groups/westmere/FLOPS_DP.txt
index c5ba91c..2773f06 100644
--- a/groups/westmere/FLOPS_DP.txt
+++ b/groups/westmere/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-DP MFlops/s 1.0E-06*(PMC0*2.0+PMC1)/time
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1)/time
Packed MUOPS/s 1.0E-06*PMC0/time
Scalar MUOPS/s 1.0E-06*PMC1/time
SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,10 +22,14 @@ DP MUOPS/s 1.0E-06*PMC3/time
LONG
Formula:
-DP MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
-
-The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
-Therefore both Single as well as Double precision are measured to ensure the correctness
+The Nehalem has no possibility to measure MFLOPs if mixed precision calculations are done.
+Therefore both single as well as double precision are measured to ensure the correctness
of the measurements. You can check if your code was vectorized on the number of
FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
diff --git a/groups/westmere/FLOPS_SP.txt b/groups/westmere/FLOPS_SP.txt
index 4478c8f..8254fd9 100644
--- a/groups/westmere/FLOPS_SP.txt
+++ b/groups/westmere/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1)/time
Packed MUOPS/s 1.0E-06*PMC0/time
Scalar MUOPS/s 1.0E-06*PMC1/time
SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,10 +22,14 @@ DP MUOPS/s 1.0E-06*PMC3/time
LONG
Formula:
-SP MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
-
-The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
-Therefore both Single as well as Double precision are measured to ensure the correctness
+The Nehalem has no possibility to measure MFLOPs if mixed precision calculations are done.
+Therefore both single as well as double precision are measured to ensure the correctness
of the measurements. You can check if your code was vectorized on the number of
FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
diff --git a/groups/westmere/FLOPS_X87.txt b/groups/westmere/FLOPS_X87.txt
index 6447b93..a4176f0 100644
--- a/groups/westmere/FLOPS_X87.txt
+++ b/groups/westmere/FLOPS_X87.txt
@@ -1,4 +1,4 @@
-SHORT X87 MFlops/s
+SHORT X87 MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -11,8 +11,8 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-X87 MFlops/s 1.0E-06*PMC0/time
+X87 MFLOP/s 1.0E-06*PMC0/time
LONG
-Profiling group to measure X87 flop rate.
+Profiling group to measure X87 FLOP rate.
diff --git a/groups/westmere/ICACHE.txt b/groups/westmere/ICACHE.txt
new file mode 100644
index 0000000..49943ff
--- /dev/null
+++ b/groups/westmere/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1I_READS
+PMC1 L1I_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = L1I_READS / INSTR_RETIRED_ANY
+L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = L1I_MISSES / L1I_READS
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/westmere/L2.txt b/groups/westmere/L2.txt
index 5506f1f..74f7d58 100644
--- a/groups/westmere/L2.txt
+++ b/groups/westmere/L2.txt
@@ -6,27 +6,33 @@ FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 L1D_REPL
PMC1 L1D_M_EVICT
+PMC2 L1I_MISSES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
LONG
Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPL*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64
-
Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L1 and the number of modified cachelines
-evicted from the L1. The group also reports on data volume transfered between
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. The group also reports the data volume transferred between
L2 and L1 cache. Note that this bandwidth also includes data transfers due to a
-write allocate load on a store miss in L1.
+write allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
diff --git a/groups/westmere/L2CACHE.txt b/groups/westmere/L2CACHE.txt
index 49778be..343b263 100644
--- a/groups/westmere/L2CACHE.txt
+++ b/groups/westmere/L2CACHE.txt
@@ -4,7 +4,7 @@ EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
-PMC0 L2_DATA_RQSTS_DEMAND_ANY
+PMC0 L2_RQSTS_REFERENCES
PMC1 L2_RQSTS_MISS
METRICS
@@ -18,18 +18,17 @@ L2 miss ratio PMC1/PMC0
LONG
Formulas:
-L2 request rate = L2_DATA_RQSTS_DEMAND_ANY / INSTR_RETIRED_ANY
-L2 miss rate = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_DATA_RQSTS_DEMAND_ANY
+L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES
-
This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
diff --git a/groups/westmere/L3.txt b/groups/westmere/L3.txt
index 6a58f78..a1d95e3 100644
--- a/groups/westmere/L3.txt
+++ b/groups/westmere/L3.txt
@@ -4,28 +4,33 @@ EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
-PMC0 L2_LINES_IN_ANY
-PMC1 L2_LINES_OUT_ANY
+PMC0 L2_RQSTS_MISS
+PMC1 L2_LINES_OUT_DIRTY_ANY
+
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L3 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*(PMC1)*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*(PMC1)*64.0
L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
LONG
Formulas:
-L3 Load [MBytes/s] 1.0E-06*L2_LINES_IN_ANY*64/time
-L3 Evict [MBytes/s] 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_RQSTS_MISS*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_RQSTS_MISS*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ANY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ANY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_RQSTS_MISS+L2_LINES_OUT_DIRTY_ANY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_RQSTS_MISS+L2_LINES_OUT_DIRTY_ANY)*64
-
Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L2 and the number of modified cachelines
+number of cache lines allocated in the L2 and the number of modified cache lines
evicted from the L2. The group also reports total data volume between L3 and
the measured L2 cache. Note that this bandwidth also includes data transfers
due to a write allocate load on a store miss in L2.
diff --git a/groups/westmere/L3CACHE.txt b/groups/westmere/L3CACHE.txt
index 944bc97..58072c1 100644
--- a/groups/westmere/L3CACHE.txt
+++ b/groups/westmere/L3CACHE.txt
@@ -1,36 +1,34 @@
SHORT L3 cache miss rate/ratio
EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
UPMC0 UNC_L3_HITS_ANY
UPMC1 UNC_L3_MISS_ANY
-UPMC2 UNC_L3_LINES_IN_ANY
-UPMC3 UNC_L3_LINES_OUT_ANY
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L3 request rate UPMC0/FIXC0
+L3 request rate (UPMC0+UPMC1)/FIXC0
L3 miss rate UPMC1/FIXC0
L3 miss ratio UPMC1/(UPMC0+UPMC1)
LONG
Formulas:
-L3 request rate UNC_L3_HITS_ANY / INSTR_RETIRED_ANY
-L3 miss rate UNC_L3_MISS_ANY / INSTR_RETIRED_ANY
-L3 miss ratio UNC_L3_MISS_ANY / (UNC_L3_HITS_ANY + UNC_L3_MISS_ANY)
+L3 request rate = (UNC_L3_HITS_ANY+UNC_L3_MISS_ANY)/INSTR_RETIRED_ANY
+L3 miss rate = UNC_L3_MISS_ANY/INSTR_RETIRED_ANY
+L3 miss ratio = UNC_L3_MISS_ANY/(UNC_L3_HITS_ANY+UNC_L3_MISS_ANY)
-
This group measures the locality of your data accesses with regard to the L3
Cache. L3 request rate tells you how data intensive your code is or how many
-Data accesses you have in average per instruction. The L3 miss rate gives a
-measure how often it was necessary to get cachelines from memory. And finally
-L3 miss ratio tells you how many of your memory references required a cacheline
-to be loaded from a higher level. While the Data cache miss rate might be given
-by your algorithm you should try to get Data cache miss ratio as low as
+data accesses you have on average per instruction. The L3 miss rate gives a
+measure how often it was necessary to get cache lines from memory. And finally
+L3 miss ratio tells you how many of your memory references required a cache line
+to be loaded from a higher level. While the data cache miss rate might be given
+by your algorithm you should try to get data cache miss ratio as low as
possible by increasing your cache reuse.
diff --git a/groups/westmere/MEM.txt b/groups/westmere/MEM.txt
index f9e19ad..513ec60 100644
--- a/groups/westmere/MEM.txt
+++ b/groups/westmere/MEM.txt
@@ -1,37 +1,50 @@
-SHORT Main memory bandwidth
+SHORT Main memory bandwidth in MBytes/s
EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
UPMC0 UNC_QMC_NORMAL_READS_ANY
UPMC1 UNC_QMC_WRITES_FULL_ANY
-UPMC2 UNC_QHL_REQUESTS_REMOTE_READS
-UPMC3 UNC_QHL_REQUESTS_LOCAL_READS
-UPMC4 UNC_QHL_REQUESTS_REMOTE_WRITES
+UPMC2 UNC_QHL_REQUESTS_REMOTE_READS
+UPMC3 UNC_QHL_REQUESTS_REMOTE_WRITES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64/time
-Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64
-Remote Read BW [MBytes/s] 1.0E-06*(UPMC2)*64/time
-Remote Write BW [MBytes/s] 1.0E-06*(UPMC4)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UPMC2+UPMC4)*64/time
+Memory read bandwidth [MBytes/s] 1.0E-06*UPMC0*64.0/time
+Memory data volume [GBytes] 1.0E-09*UPMC0*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*UPMC1*64.0/time
+Memory data volume [GBytes] 1.0E-09*UPMC1*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0
+Remote memory read bandwidth [MBytes/s] 1.0E-06*UPMC2*64.0/time
+Remote memory read data volume [GBytes] 1.0E-09*UPMC2*64.0
+Remote memory write bandwidth [MBytes/s] 1.0E-06*UPMC3*64.0/time
+Remote memory write data volume [GBytes] 1.0E-09*UPMC3*64.0
+Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time
+Remote memory data volume [GBytes] 1.0E-09*(UPMC2+UPMC3)*64.0
LONG
Formulas:
-Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64/time
-Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64
-Remote Read BW [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS)*64/time;
-Remote Write BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
+Memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_NORMAL_READS_ANY*64.0/time
+Memory data volume [GBytes] = 1.0E-09*UNC_QMC_NORMAL_READS_ANY*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_WRITES_FULL_ANY*64.0/time
+Memory data volume [GBytes] = 1.0E-09*UNC_QMC_WRITES_FULL_ANY*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0
+Remote memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_READS*64.0/time
+Remote memory read data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_READS*64.0
+Remote memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0/time
+Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0
+Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time
+Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0
-
Profiling group to measure memory bandwidth drawn by all cores of a socket.
-This group will be measured by one core per socket. The Remote Read BW tells
-you if cachelines are transfered between sockets, meaning that cores access
+This group will be measured by one core per socket. The remote read BW tells
+you if cache lines are transferred between sockets, meaning that cores access
data owned by a remote NUMA domain. The group also reports total data volume
-transfered from main memory.
+transferred from main memory.
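For illustration, a Python sketch with assumed uncore counter values; every counted event corresponds to one 64-byte cache line:

    # Assumed raw counts over the measurement interval
    qmc_reads     = 5.0e8   # UNC_QMC_NORMAL_READS_ANY
    qmc_writes    = 2.0e8   # UNC_QMC_WRITES_FULL_ANY
    remote_reads  = 1.0e7   # UNC_QHL_REQUESTS_REMOTE_READS
    remote_writes = 4.0e6   # UNC_QHL_REQUESTS_REMOTE_WRITES
    time          = 1.0     # seconds

    mem_bw_mbs    = 1.0e-6 * (qmc_reads + qmc_writes) * 64.0 / time
    remote_bw_mbs = 1.0e-6 * (remote_reads + remote_writes) * 64.0 / time
    print(mem_bw_mbs, remote_bw_mbs)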
diff --git a/groups/westmere/TLB.txt b/groups/westmere/TLB.txt
deleted file mode 100644
index 0077350..0000000
--- a/groups/westmere/TLB.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-SHORT TLB miss rate/ratio
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0 DTLB_MISSES_ANY
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC1/FIXC0
-L1 DTLB miss rate PMC0/FIXC0
-
-LONG
-Formulas:
-DTLB miss rate = DTLB_MISSES_ANY / INSTR_RETIRED_ANY
--
-The DTLB miss rate gives a measure how often a TLB miss occured
-per instruction.
-
diff --git a/groups/westmere/TLB_DATA.txt b/groups/westmere/TLB_DATA.txt
new file mode 100644
index 0000000..d256b8c
--- /dev/null
+++ b/groups/westmere/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 DTLB_LOAD_MISSES_ANY
+PMC1 DTLB_MISSES_ANY
+PMC2 DTLB_LOAD_MISSES_WALK_CYCLES
+PMC3 DTLB_MISSES_WALK_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB load misses PMC0
+L1 DTLB load miss rate PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses (PMC1-PMC0)
+L1 DTLB store miss rate (PMC1-PMC0)/FIXC0
+L1 DTLB store miss duration [Cyc] (PMC3-PMC2)/(PMC1-PMC0)
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_ANY
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_ANY / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_CYCLES / DTLB_LOAD_MISSES_ANY
+L1 DTLB store misses = DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY
+L1 DTLB store miss rate = (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY) / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = (DTLB_MISSES_WALK_CYCLES-DTLB_LOAD_MISSES_WALK_CYCLES) / (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY)
+-
+The DTLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The store miss values are derived as all DTLB walks minus the load walks (ALL-LOADS).
+
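A small Python sketch (assumed counts) of the ALL-minus-LOADS derivation used for the store-miss metrics:

    # Assumed raw counts; Westmere has no dedicated store-miss event
    dtlb_all_misses  = 3.0e6   # DTLB_MISSES_ANY
    dtlb_load_misses = 2.2e6   # DTLB_LOAD_MISSES_ANY
    all_walk_cycles  = 9.0e7   # DTLB_MISSES_WALK_CYCLES
    load_walk_cycles = 6.0e7   # DTLB_LOAD_MISSES_WALK_CYCLES

    store_misses        = dtlb_all_misses - dtlb_load_misses
    store_miss_duration = (all_walk_cycles - load_walk_cycles) / store_misses
    print(store_misses, store_miss_duration)
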
diff --git a/groups/westmere/TLB_INSTR.txt b/groups/westmere/TLB_INSTR.txt
new file mode 100644
index 0000000..2f0f90c
--- /dev/null
+++ b/groups/westmere/TLB_INSTR.txt
@@ -0,0 +1,27 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ITLB_MISSES_ANY
+PMC1 ITLB_MISSES_WALK_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_ANY
+L1 ITLB miss rate = ITLB_MISSES_ANY / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_CYCLES / ITLB_MISSES_ANY
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures the time in cycles a page walk took.
+
diff --git a/groups/westmere/UOPS.txt b/groups/westmere/UOPS.txt
new file mode 100644
index 0000000..9d738d0
--- /dev/null
+++ b/groups/westmere/UOPS.txt
@@ -0,0 +1,35 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_ANY
+PMC1 UOPS_EXECUTED_THREAD
+PMC2 UOPS_RETIRED_ALL
+PMC3 UOPS_ISSUED_FUSED
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Issued UOPs PMC0
+Merged UOPs PMC3
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Merged UOPs = UOPS_ISSUED_FUSED
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed and retired uOPs and returns the number of uOPs which were issued
+but not executed as well as the number of uOPs which were executed but never retired.
+The executed but not retired uOPs commonly come from speculatively executed branches.
+
diff --git a/groups/westmere/VIEW.txt b/groups/westmere/VIEW.txt
index a0708f4..76809ed 100644
--- a/groups/westmere/VIEW.txt
+++ b/groups/westmere/VIEW.txt
@@ -11,16 +11,16 @@ PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
UPMC0 UNC_QMC_NORMAL_READS_ANY
UPMC1 UNC_QMC_WRITES_FULL_ANY
UPMC2 UNC_QHL_REQUESTS_REMOTE_READS
-UPMC3 UNC_QHL_REQUESTS_LOCAL_READS
-UPMC4 UNC_QHL_REQUESTS_REMOTE_WRITES
+UPMC3 UNC_QHL_REQUESTS_LOCAL_READS
+UPMC4 UNC_QHL_REQUESTS_REMOTE_WRITES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-DP MFlops/s (DP assumed) 1.0E-06*(PMC0*2.0+PMC1)/time
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+DP MFLOP/s (DP assumed) 1.0E-06*(PMC0*2.0+PMC1)/time
+SP MFLOP/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
Packed MUOPS/s 1.0E-06*PMC0/time
Scalar MUOPS/s 1.0E-06*PMC1/time
SP MUOPS/s 1.0E-06*PMC2/time
@@ -33,8 +33,8 @@ Remote BW [MBytes/s] 1.0E-06*(UPMC2+UPMC4)*64/time
LONG
Formula:
-DP MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-SP MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+DP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+SP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
Packed MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/time
Scalar MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/time
SP MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/time
@@ -45,6 +45,6 @@ Remote Read BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS)*64/time
Remote Write BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
Remote BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
-
-This is a overview group using the capabilities of westmere to measure multiple events at
+This is an overview group using the capabilities of Westmere to measure multiple events at
the same time.
diff --git a/groups/westmereEX/BRANCH.txt b/groups/westmereEX/BRANCH.txt
index 3d81416..b8d41b2 100644
--- a/groups/westmereEX/BRANCH.txt
+++ b/groups/westmereEX/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch FIXC0/PMC0
LONG
Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
-
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
-into relation what ration of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate.
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio directly states
+what fraction of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
diff --git a/groups/westmereEX/CACHE.txt b/groups/westmereEX/CACHE.txt
index 490f982..eb160f6 100644
--- a/groups/westmereEX/CACHE.txt
+++ b/groups/westmereEX/CACHE.txt
@@ -11,14 +11,15 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Data cache misses PMC0
-Data cache miss rate PMC0/FIXC0
+data cache misses PMC0
+data cache miss rate PMC0/FIXC0
LONG
Formulas:
-Data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
+data cache misses = L1D_REPL
+data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
-
This group measures the locality of your data accesses with regard to the L1
-Cache. The Data cache miss rate gives a measure how often it was necessary to
-get cachelines from higher levels of the memory hierarchy.
+cache. The data cache miss rate gives a measure how often it was necessary to
+get cache lines from higher levels of the memory hierarchy.
diff --git a/groups/westmereEX/DATA.txt b/groups/westmereEX/DATA.txt
index a5611bc..31bba51 100644
--- a/groups/westmereEX/DATA.txt
+++ b/groups/westmereEX/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
LONG
Formulas:
-Load to Store ratio = MEM_INST_RETIRED_LOADS / MEM_INST_RETIRED_STORES
+Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES
-
-This is a simple metric to determine your Load to store ratio.
+This is a simple metric to determine your load to store ratio.
diff --git a/groups/westmereEX/FLOPS_DP.txt b/groups/westmereEX/FLOPS_DP.txt
index a62cbe3..3e75cad 100644
--- a/groups/westmereEX/FLOPS_DP.txt
+++ b/groups/westmereEX/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-MFlops/s 1.0E-06*(PMC0*2.0+PMC1)/time
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1)/time
Packed MUOPS/s 1.0E-06*PMC0/time
Scalar MUOPS/s 1.0E-06*PMC1/time
SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,10 +22,14 @@ DP MUOPS/s 1.0E-06*PMC3/time
LONG
Formula:
-DP MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
-
-The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
-Therefore both Single as well as Double precision are measured to ensure the correctness
+The Westmere EX has no possibility to measure MFLOPs if mixed precision calculations are done.
+Therefore both single as well as double precision are measured to ensure the correctness
of the measurements. You can check if your code was vectorized on the number of
-FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
+FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
diff --git a/groups/westmereEX/FLOPS_SP.txt b/groups/westmereEX/FLOPS_SP.txt
index 1485615..601027b 100644
--- a/groups/westmereEX/FLOPS_SP.txt
+++ b/groups/westmereEX/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-MFlops/s 1.0E-06*(PMC0*4.0+PMC1)/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1)/time
Packed MUOPS/s 1.0E-06*PMC0/time
Scalar MUOPS/s 1.0E-06*PMC1/time
SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,10 +22,14 @@ DP MUOPS/s 1.0E-06*PMC3/time
LONG
Formula:
-SP MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
-
-The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
-Therefore both Single as well as Double precision are measured to ensure the correctness
+The Westmere EX has no possibility to measure MFLOPs if mixed precision calculations are done.
+Therefore both single as well as double precision are measured to ensure the correctness
of the measurements. You can check if your code was vectorized on the number of
-FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
+FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
diff --git a/groups/westmereEX/FLOPS_X87.txt b/groups/westmereEX/FLOPS_X87.txt
index 6447b93..a4176f0 100644
--- a/groups/westmereEX/FLOPS_X87.txt
+++ b/groups/westmereEX/FLOPS_X87.txt
@@ -1,4 +1,4 @@
-SHORT X87 MFlops/s
+SHORT X87 MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
@@ -11,8 +11,8 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-X87 MFlops/s 1.0E-06*PMC0/time
+X87 MFLOP/s 1.0E-06*PMC0/time
LONG
-Profiling group to measure X87 flop rate.
+Profiling group to measure X87 FLOP rate.
diff --git a/groups/westmereEX/ICACHE.txt b/groups/westmereEX/ICACHE.txt
new file mode 100644
index 0000000..49943ff
--- /dev/null
+++ b/groups/westmereEX/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1I_READS
+PMC1 L1I_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = L1I_READS / INSTR_RETIRED_ANY
+L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = L1I_MISSES / L1I_READS
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/westmereEX/L2.txt b/groups/westmereEX/L2.txt
index 9201cd0..e950021 100644
--- a/groups/westmereEX/L2.txt
+++ b/groups/westmereEX/L2.txt
@@ -6,27 +6,33 @@ FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 L1D_REPL
PMC1 L1D_M_EVICT
+PMC2 L1I_MISSES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
LONG
Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPL*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64
-
Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L1 and the number of modified cachelines
-evicted from the L1. Also reports on total data volume transfered between L2
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. The group also reports the total data volume transferred between L2
and L1 cache. Note that this bandwidth also includes data transfers due to a
-write allocate load on a store miss in L1.
+write allocate load on a store miss in L1 and traffic caused by misses in the
+instruction cache.
diff --git a/groups/westmereEX/L2CACHE.txt b/groups/westmereEX/L2CACHE.txt
index 49778be..343b263 100644
--- a/groups/westmereEX/L2CACHE.txt
+++ b/groups/westmereEX/L2CACHE.txt
@@ -4,7 +4,7 @@ EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
-PMC0 L2_DATA_RQSTS_DEMAND_ANY
+PMC0 L2_RQSTS_REFERENCES
PMC1 L2_RQSTS_MISS
METRICS
@@ -18,18 +18,17 @@ L2 miss ratio PMC1/PMC0
LONG
Formulas:
-L2 request rate = L2_DATA_RQSTS_DEMAND_ANY / INSTR_RETIRED_ANY
-L2 miss rate = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_DATA_RQSTS_DEMAND_ANY
+L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES
-
This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
diff --git a/groups/westmereEX/L3.txt b/groups/westmereEX/L3.txt
index f80761a..7e5cb04 100644
--- a/groups/westmereEX/L3.txt
+++ b/groups/westmereEX/L3.txt
@@ -12,21 +12,25 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-L3 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
LONG
Formulas:
-L3 Load [MBytes/s] 1.0E-06*L2_LINES_IN_ANY*64/time
-L3 Evict [MBytes/s] 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ANY*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ANY*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64
-
Profiling group to measure L3 cache bandwidth. The bandwidth is
-computed by the number of cacheline allocated in the L2 and the number of
-modified cachelines evicted from the L2. Also reports data volume transfered
+computed by the number of cache lines allocated in the L2 and the number of
+modified cache lines evicted from the L2. It also reports the data volume transferred
between L3 and L2 caches. Note that this bandwidth also includes data transfers
due to a write allocate load on a store miss in L2.
diff --git a/groups/westmereEX/L3CACHE.txt b/groups/westmereEX/L3CACHE.txt
new file mode 100644
index 0000000..262f948
--- /dev/null
+++ b/groups/westmereEX/L3CACHE.txt
@@ -0,0 +1,52 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+CBOX0C0 LLC_HITS_ALL
+CBOX0C1 LLC_MISSES_ALL
+CBOX1C0 LLC_HITS_ALL
+CBOX1C1 LLC_MISSES_ALL
+CBOX2C0 LLC_HITS_ALL
+CBOX2C1 LLC_MISSES_ALL
+CBOX3C0 LLC_HITS_ALL
+CBOX3C1 LLC_MISSES_ALL
+CBOX4C0 LLC_HITS_ALL
+CBOX4C1 LLC_MISSES_ALL
+CBOX5C0 LLC_HITS_ALL
+CBOX5C1 LLC_MISSES_ALL
+CBOX6C0 LLC_HITS_ALL
+CBOX6C1 LLC_MISSES_ALL
+CBOX7C0 LLC_HITS_ALL
+CBOX7C1 LLC_MISSES_ALL
+CBOX8C0 LLC_HITS_ALL
+CBOX8C1 LLC_MISSES_ALL
+CBOX9C0 LLC_HITS_ALL
+CBOX9C1 LLC_MISSES_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 request rate (CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1+CBOX8C0+CBOX8C1+CBOX9C0+CBOX9C1)/FIXC0
+L3 miss rate (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1)/FIXC0
+L3 miss ratio (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1)/(CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1+CBOX8C0+CBOX8C1+CBOX9C0+CBOX9C1)
+
+LONG
+Formulas:
+L3 request rate = (SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL))/INSTR_RETIRED_ANY
+L3 miss rate = SUM(LLC_MISSES_ALL)/INSTR_RETIRED_ANY
+L3 miss ratio = SUM(LLC_MISSES_ALL)/(SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL))
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
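A Python sketch with assumed per-box counts, showing how the CBOX counters are summed before the rates and the ratio are formed:

    # Assumed counts; the LLC is distributed over ten CBOX slices
    llc_hits      = [4.0e6] * 10   # CBOX0..9 counter 0, LLC_HITS_ALL
    llc_misses    = [5.0e5] * 10   # CBOX0..9 counter 1, LLC_MISSES_ALL
    instr_retired = 1.0e10         # INSTR_RETIRED_ANY

    hits, misses = sum(llc_hits), sum(llc_misses)
    request_rate = (hits + misses) / instr_retired
    miss_rate    = misses / instr_retired
    miss_ratio   = misses / (hits + misses)
    print(request_rate, miss_rate, miss_ratio)
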
diff --git a/groups/westmereEX/MEM.txt b/groups/westmereEX/MEM.txt
index defa391..5d4fc62 100644
--- a/groups/westmereEX/MEM.txt
+++ b/groups/westmereEX/MEM.txt
@@ -1,19 +1,15 @@
-SHORT Main memory bandwidth
+SHORT Main memory bandwidth in MBytes/s
EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-MBOX0C0 FVC_EV0_BBOX_CMDS_READS
-MBOX0C1 FVC_EV0_BBOX_RSP_ACK
-MBOX1C0 FVC_EV0_BBOX_CMDS_READS
-MBOX1C1 FVC_EV0_BBOX_RSP_ACK
-BBOX0C1 IMT_INSERTS_WR
-BBOX1C1 IMT_INSERTS_WR
-RBOX0C0 NEW_PACKETS_RECV_PORT0_IPERF0_ANY_DRS
-RBOX0C1 NEW_PACKETS_RECV_PORT1_IPERF0_ANY_DRS
-RBOX1C0 NEW_PACKETS_RECV_PORT4_IPERF0_ANY_DRS
-RBOX1C1 NEW_PACKETS_RECV_PORT5_IPERF0_ANY_DRS
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+MBOX0C0 FVC_EV0_BBOX_CMDS_READS
+MBOX0C1 DRAM_CMD_CAS_WR_OPN
+MBOX0C2 DRAM_MISC_CAS_WR_CLS
+MBOX1C0 FVC_EV0_BBOX_CMDS_READS
+MBOX1C1 DRAM_CMD_CAS_WR_OPN
+MBOX1C2 DRAM_MISC_CAS_WR_CLS
METRICS
@@ -21,17 +17,22 @@ Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64/time
-Memory Write BW [MBytes/s] 1.0E-06*(BBOX0C1+BBOX1C1)*64/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+BBOX0C1+BBOX1C1)*64/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+BBOX0C1+BBOX1C1)*64
-Remote write data traffic Port 0 [MBytes/s] 1.0E-06*(RBOX0C0)*64/time
-Remote write data traffic Port 1 [MBytes/s] 1.0E-06*(RBOX0C1)*64/time
-Remote write data traffic Port 4 [MBytes/s] 1.0E-06*(RBOX1C0)*64/time
-Remote write data traffic Port 5 [MBytes/s] 1.0E-06*(RBOX1C1)*64/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2+MBOX1C1+MBOX1C2)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2+MBOX1C1+MBOX1C2)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1+MBOX0C2+MBOX1C2)*64/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1+MBOX0C2+MBOX1C2)*64
LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1)+SUM(MBOXxC2))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1)+SUM(MBOXxC2))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1)+SUM(MBOXxC2))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1)+SUM(MBOXxC2))*64.0
+-
Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Addional to the bandwidth it also outputs the data volume and the remote
-traffic over QPI links to other sockets.
+In addition to the bandwidth, it also outputs the data volume.
diff --git a/groups/westmereEX/NUMA.txt b/groups/westmereEX/NUMA.txt
new file mode 100644
index 0000000..0c1b8fb
--- /dev/null
+++ b/groups/westmereEX/NUMA.txt
@@ -0,0 +1,33 @@
+SHORT Local and remote memory accesses
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM
+PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local DRAM data volume [GByte] 1.E-09*PMC0*64
+Local DRAM bandwidth [MByte/s] 1.E-06*(PMC0*64)/time
+Remote DRAM data volume [GByte] 1.E-09*PMC1*64
+Remote DRAM bandwidth [MByte/s] 1.E-06*(PMC1*64)/time
+Memory data volume [GByte] 1.E-09*(PMC0+PMC1)*64
+Memory bandwidth [MByte/s] 1.E-06*((PMC0+PMC1)*64)/time
+
+LONG
+Formula:
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
+Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
+Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64
+Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time
+Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64
+Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time
+--
+This performance group measures the data traffic of CPU cores to local and remote
+memory.
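Beyond the metrics listed above, the two offcore counters also allow a rough estimate of how much of the DRAM traffic is served by a remote NUMA node. A small sketch with made-up values:

    local_dram  = 80_000_000   # OFFCORE_RESPONSE_0_LOCAL_DRAM
    remote_dram = 20_000_000   # OFFCORE_RESPONSE_1_REMOTE_DRAM

    remote_share = remote_dram / (local_dram + remote_dram)   # here 0.2, i.e. 20% remote traffic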
diff --git a/groups/westmereEX/TLB.txt b/groups/westmereEX/TLB.txt
deleted file mode 100644
index 0077350..0000000
--- a/groups/westmereEX/TLB.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-SHORT TLB miss rate/ratio
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0 DTLB_MISSES_ANY
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC1/FIXC0
-L1 DTLB miss rate PMC0/FIXC0
-
-LONG
-Formulas:
-DTLB miss rate = DTLB_MISSES_ANY / INSTR_RETIRED_ANY
--
-The DTLB miss rate gives a measure how often a TLB miss occured
-per instruction.
-
diff --git a/groups/westmereEX/TLB_DATA.txt b/groups/westmereEX/TLB_DATA.txt
new file mode 100644
index 0000000..d256b8c
--- /dev/null
+++ b/groups/westmereEX/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 DTLB_LOAD_MISSES_ANY
+PMC1 DTLB_MISSES_ANY
+PMC2 DTLB_LOAD_MISSES_WALK_CYCLES
+PMC3 DTLB_MISSES_WALK_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB load misses PMC0
+L1 DTLB load miss rate PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses (PMC1-PMC0)
+L1 DTLB store miss rate (PMC1-PMC0)/FIXC0
+L1 DTLB store miss duration [Cyc] (PMC3-PMC2)/(PMC1-PMC0)
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_ANY
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_ANY / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_CYCLES / DTLB_LOAD_MISSES_ANY
+L1 DTLB store misses = DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY
+L1 DTLB store miss rate = (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY) / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = (DTLB_MISSES_WALK_CYCLES-DTLB_LOAD_MISSES_WALK_CYCLES) / (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY)
+-
+The DTLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The store miss figures are derived as ALL minus LOADS (for both misses and walk cycles).
+
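In other words, store misses are not counted directly but derived as the difference between all DTLB misses and the load-only counts. A brief Python sketch with made-up counter values:

    dtlb_load_misses = 2_000_000       # DTLB_LOAD_MISSES_ANY (PMC0)
    dtlb_all_misses  = 3_500_000       # DTLB_MISSES_ANY (PMC1)
    load_walk_cycles = 60_000_000      # DTLB_LOAD_MISSES_WALK_CYCLES (PMC2)
    all_walk_cycles  = 110_000_000     # DTLB_MISSES_WALK_CYCLES (PMC3)
    instructions     = 1_000_000_000   # INSTR_RETIRED_ANY (FIXC0)

    store_misses        = dtlb_all_misses - dtlb_load_misses
    store_miss_rate     = store_misses / instructions
    store_miss_duration = (all_walk_cycles - load_walk_cycles) / store_misses  # cycles per store-miss walk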
diff --git a/groups/westmereEX/TLB_INSTR.txt b/groups/westmereEX/TLB_INSTR.txt
new file mode 100644
index 0000000..2f0f90c
--- /dev/null
+++ b/groups/westmereEX/TLB_INSTR.txt
@@ -0,0 +1,27 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ITLB_MISSES_ANY
+PMC1 ITLB_MISSES_WALK_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_ANY
+L1 ITLB miss rate = ITLB_MISSES_ANY / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_CYCLES / ITLB_MISSES_ANY
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long, in cycles, a page table walk took.
+
diff --git a/groups/westmereEX/UOPS.txt b/groups/westmereEX/UOPS.txt
new file mode 100644
index 0000000..9d738d0
--- /dev/null
+++ b/groups/westmereEX/UOPS.txt
@@ -0,0 +1,35 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_ANY
+PMC1 UOPS_EXECUTED_THREAD
+PMC2 UOPS_RETIRED_ALL
+PMC3 UOPS_ISSUED_FUSED
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Issued UOPs PMC0
+Merged UOPs PMC3
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Merged UOPs = UOPS_ISSUED_FUSED
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed and retired uOPs, from which the number of uOPs that were issued
+but not executed, as well as the number that were executed but never retired, can be derived.
+The executed but not retired uOPs commonly come from speculatively executed branches.
+
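A hedged sketch of how the raw counts above can be combined; the derived quantities are not part of the group itself and the values are made up:

    issued   = 1_200_000_000   # UOPS_ISSUED_ANY
    executed = 1_150_000_000   # UOPS_EXECUTED_THREAD
    retired  = 1_050_000_000   # UOPS_RETIRED_ALL

    issued_not_executed  = issued - executed    # issued but never executed
    executed_not_retired = executed - retired   # executed but discarded, typically mispredicted branches
    speculation_waste    = executed_not_retired / executed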
diff --git a/kernel/Makefile b/kernel/Makefile
index fd0ffdf..b9b814a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,4 +9,5 @@ modules_install:
install -m 666 enable_rdpmc.ko /lib/modules/$(shell uname -r)/extra/
clean:
- rm -f *.ko *.o modules.order Module.symvers enable_rdpmc.mod.c
+ rm -f *.ko *.o modules.order Module.symvers enable_rdpmc.mod.c .enable_rdpmc*.cmd
+ rm -rf .tmp_versions
diff --git a/kernel/README b/kernel/README
new file mode 100644
index 0000000..771b217
--- /dev/null
+++ b/kernel/README
@@ -0,0 +1,3 @@
+
+The kernel module enable_rdpmc is deprecated. Please use the sysfs entry
+/sys/devices/cpu/rdpmc to enable or disable the RDPMC instruction.
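For example, a minimal (and hedged) Python snippet that enables RDPMC through this sysfs entry; it must run with sufficient privileges and assumes the kernel exposes the attribute:

    # Write "1" to enable user-space RDPMC, "0" to disable it again.
    with open("/sys/devices/cpu/rdpmc", "w") as f:
        f.write("1\n")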
diff --git a/make/config_checks.mk b/make/config_checks.mk
new file mode 100644
index 0000000..ab266cf
--- /dev/null
+++ b/make/config_checks.mk
@@ -0,0 +1,49 @@
+
+ifneq ($(MAKECMDGOALS),docs)
+# determine kernel Version
+KERNEL_VERSION_MAJOR := $(shell uname -r | awk '{split($$1,a,"."); print a[1]}' | cut -d '-' -f1)
+KERNEL_VERSION := $(shell uname -r | awk '{split($$1,a,"."); print a[2]}' | cut -d '-' -f1)
+KERNEL_VERSION_MINOR := $(shell uname -r | awk '{split($$1,a,"."); print a[3]}' | cut -d '-' -f1)
+
+HAS_MEMPOLICY = $(shell if [ $(KERNEL_VERSION) -lt 7 -a $(KERNEL_VERSION_MAJOR) -lt 3 -a $(KERNEL_VERSION_MINOR) -lt 8 ]; then \
+ echo 0; else echo 1; \
+ fi; )
+HAS_PERFEVENT = $(shell if [ $(KERNEL_VERSION) -lt 6 -a $(KERNEL_VERSION_MAJOR) -lt 2 -a $(KERNEL_VERSION_MINOR) -lt 31 ]; then echo 0; else echo 1; fi; )
+
+# determine glibc Version
+GLIBC_VERSION := $(shell ldd --version | grep ldd | awk '{ print $$NF }' | awk -F. '{ print $$2 }')
+
+HAS_SCHEDAFFINITY = $(shell if [ $(GLIBC_VERSION) -lt 4 ]; then \
+ echo 0; else echo 1; \
+ fi; )
+
+INST_PREFIX := $(INSTALLED_PREFIX)
+ifneq "$(PREFIX)" "$(INST_PREFIX)"
+$(info Info: PREFIX and INSTALLED_PREFIX differ, be aware that you have to move stuff after make install from $(PREFIX) to $(INSTALLED_PREFIX). You can use make move for this.)
+endif
+
+FORTRAN_IF_NAME := likwid.mod
+ifneq ($(FORTRAN_INTERFACE),false)
+HAS_FORTRAN_COMPILER = $(shell $(FC) --version 2>/dev/null || echo 'NOFORTRAN' )
+ifeq ($(HAS_FORTRAN_COMPILER),NOFORTRAN)
+FORTRAN_IF=
+$(info Warning: You have selected the fortran interface in config.mk, but there seems to be no fortran compiler $(FC) - not compiling it!)
+FORTRAN_INSTALL =
+FORTRAN_REMOVE =
+FORTRAN_REMOVE_MOVED =
+else
+FORTRAN_IF := $(FORTRAN_IF_NAME)
+FORTRAN_INSTALL = @echo "===> INSTALL fortran interface to $(PREFIX)/include/"; \
+ cp -f likwid.mod $(PREFIX)/include/$(FORTRAN_IF_NAME)
+FORTRAN_REMOVE = @echo "===> REMOVING fortran interface from $(PREFIX)/include/"; \
+ rm -f $(PREFIX)/include/$(FORTRAN_IF_NAME)
+FORTRAN_REMOVE_MOVED = @echo "===> REMOVING fortran interface from $(INSTALLED_PREFIX)/include/"; \
+ rm -f $(INSTALLED_PREFIX)/include/$(FORTRAN_IF_NAME)
+endif
+else
+FORTRAN_IF =
+FORTRAN_INSTALL =
+FORTRAN_REMOVE =
+FORTRAN_REMOVE_MOVED =
+endif
+endif
diff --git a/make/config_defines.mk b/make/config_defines.mk
new file mode 100644
index 0000000..f2b632c
--- /dev/null
+++ b/make/config_defines.mk
@@ -0,0 +1,117 @@
+DEFINES += -DVERSION=$(VERSION) \
+ -DRELEASE=$(RELEASE) \
+ -DCFGFILE=$(CFG_FILE_PATH) \
+ -DTOPOFILE=$(TOPO_FILE_PATH) \
+ -DINSTALL_PREFIX=$(INSTALLED_PREFIX) \
+ -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) \
+ -DMAX_NUM_NODES=$(MAX_NUM_NODES) \
+ -DACCESSDAEMON=$(INSTALLED_ACCESSDAEMON) \
+ -DGROUPPATH=$(LIKWIDGROUPPATH) \
+ -D_GNU_SOURCE
+
+DYNAMIC_TARGET_LIB := liblikwid.so
+STATIC_TARGET_LIB := liblikwid.a
+
+LUA_FOLDER := ext/lua
+SHARED_LIBLUA := liblikwid-lua.so
+STATIC_LIBLUA := liblikwid-lua.a
+HWLOC_FOLDER := ext/hwloc
+STATIC_LIBHWLOC := liblikwid-hwloc.a
+SHARED_LIBHWLOC := liblikwid-hwloc.so
+
+BENCH_FOLDER := bench
+BENCH_NAME := likwid-bench
+BENCH_TARGET := $(BENCH_FOLDER)/$(BENCH_NAME)
+
+ifneq ($(COLOR),NONE)
+DEFINES += -DCOLOR=$(COLOR)
+endif
+
+ifeq ($(BUILDDAEMON),true)
+ifneq ($(COMPILER),MIC)
+ DAEMON_TARGET = likwid-accessD
+else
+ $(info Info: Compiling for Xeon Phi. Disabling build of likwid-accessD.);
+ DAEMON_TARGET =
+endif
+endif
+
+ifeq ($(BUILDFREQ),true)
+ifneq ($(COMPILER),MIC)
+ FREQ_TARGET = likwid-setFreq
+else
+ $(info Info: Compiling for Xeon Phi. Disabling build of likwid-setFreq.);
+endif
+endif
+
+ifeq ($(HAS_MEMPOLICY),1)
+DEFINES += -DHAS_MEMPOLICY
+else
+$(info Kernel 2.6.$(KERNEL_VERSION) has no mempolicy support!);
+endif
+
+
+ifeq ($(SHARED_LIBRARY),true)
+CFLAGS += $(SHARED_CFLAGS)
+LIBS += -L. -pthread -lm -ldl
+TARGET_LIB := $(DYNAMIC_TARGET_LIB)
+TARGET_HWLOC_LIB=$(HWLOC_FOLDER)/$(SHARED_LIBHWLOC)
+TARGET_LUA_LIB=$(LUA_FOLDER)/$(SHARED_LIBLUA)
+else
+TARGET_LIB := $(STATIC_TARGET_LIB)
+TARGET_HWLOC_LIB=$(HWLOC_FOLDER)/$(STATIC_LIBHWLOC)
+TARGET_LUA_LIB=$(LUA_FOLDER)/$(STATIC_LIBLUA)
+endif
+
+ifeq ($(HAS_SCHEDAFFINITY),1)
+DEFINES += -DHAS_SCHEDAFFINITY
+PINLIB = liblikwidpin.so
+else
+$(info GLIBC version 2.$(GLIBC_VERSION) has no pthread_setaffinity_np support!);
+PINLIB =
+endif
+
+FILTER_HWLOC_OBJ = yes
+LIBHWLOC =
+ifeq ($(USE_HWLOC),true)
+DEFINES += -DLIKWID_USE_HWLOC
+LIBHWLOC_SHARED = -Lext/hwloc/ -lliblikwid-hwloc
+LIBHWLOC_STATIC = ext/hwloc/liblikwid-hwloc.a
+EXT_TARGETS += ./ext/hwloc
+FILTER_HWLOC_OBJ =
+endif
+
+#DEFINES += -DACCESSDAEMON=$(ACCESSDAEMON)
+
+ifeq ($(ACCESSMODE),sysdaemon)
+ifneq ($(COMPILER),MIC)
+DEFINES += -DACCESSMODE=2
+else
+$(info Info: Compiling for Xeon Phi. Changing accessmode to direct.);
+ACCESSMODE = direct
+DEFINES += -DACCESSMODE=0
+endif
+else
+ifeq ($(ACCESSMODE),accessdaemon)
+ifneq ($(COMPILER),MIC)
+ifneq ($(BUILDDAEMON),true)
+$(info Info: Compiling with accessdaemon access mode but without building the access daemon.);
+$(info Info: Make sure an accessdaemon is installed and the paths ACCESSDAEMON and INSTALLED_ACCESSDAEMON point to it);
+endif
+DEFINES += -DACCESSMODE=1
+else
+$(info Info: Compiling for Xeon Phi. Changing accessmode to direct.);
+DEFINES += -DACCESSMODE=0
+ACCESSMODE = direct
+endif
+else
+DEFINES += -DACCESSMODE=0
+endif
+endif
+
+ifeq ($(DEBUG),true)
+DEBUG_FLAGS = -g
+DEFINES += -DDEBUG_LIKWID
+else
+DEBUG_FLAGS =
+endif
diff --git a/make/include_CLANG.mk b/make/include_CLANG.mk
new file mode 100644
index 0000000..4806e01
--- /dev/null
+++ b/make/include_CLANG.mk
@@ -0,0 +1,28 @@
+CC = clang
+FC = ifort
+AS = as
+AR = ar
+PAS = ./perl/AsmGen.pl
+GEN_PAS = ./perl/generatePas.pl
+GEN_GROUPS = ./perl/generateGroups.pl
+GEN_PMHEADER = ./perl/gen_events.pl
+
+ANSI_CFLAGS =
+
+CFLAGS = -O2 -std=c99 -Wno-format -fPIC
+FCFLAGS = -module ./ # ifort
+#FCFLAGS = -J ./ -fsyntax-only #gfortran
+PASFLAGS = x86-64
+ASFLAGS =
+CPPFLAGS =
+LFLAGS = -pthread
+
+SHARED_CFLAGS = -fPIC -fvisibility=hidden
+SHARED_LFLAGS = -shared -fvisibility=hidden
+
+DEFINES = -DPAGE_ALIGNMENT=4096
+DEFINES += -DLIKWID_MONITOR_LOCK
+DEFINES += -DDEBUGLEV=0
+
+INCLUDES =
+LIBS = -lm -lrt
diff --git a/make/include_GCC.mk b/make/include_GCC.mk
index 1ccfd88..72850a1 100644
--- a/make/include_GCC.mk
+++ b/make/include_GCC.mk
@@ -7,28 +7,27 @@ GEN_PAS = ./perl/generatePas.pl
GEN_GROUPS = ./perl/generateGroups.pl
GEN_PMHEADER = ./perl/gen_events.pl
-#ANSI_CFLAGS = -std=c99
+ANSI_CFLAGS =
#ANSI_CFLAGS += -pedantic
#ANSI_CFLAGS += -Wextra
#ANSI_CFLAGS += -Wall
-CFLAGS = -O2 -Wno-format -Wno-nonnull -std=c99
+CFLAGS = -O2 -std=c99 -Wno-format -fPIC
FCFLAGS = -module ./ # ifort
#FCFLAGS = -J ./ -fsyntax-only #gfortran
PASFLAGS = x86-64
-ASFLAGS =
+ASFLAGS =
CPPFLAGS =
LFLAGS = -pthread
-SHARED_CFLAGS = -fpic
-SHARED_LFLAGS = -shared
+SHARED_CFLAGS = -fPIC -fvisibility=hidden
+SHARED_LFLAGS = -shared -fvisibility=hidden
-DEFINES = -D_GNU_SOURCE
-DEFINES += -DPAGE_ALIGNMENT=4096
+DEFINES = -DPAGE_ALIGNMENT=4096
DEFINES += -DLIKWID_MONITOR_LOCK
DEFINES += -DDEBUGLEV=0
INCLUDES =
-LIBS = -lm
+LIBS = -lm -lrt
diff --git a/make/include_GCCX86.mk b/make/include_GCCX86.mk
index 19add95..5ebef9a 100644
--- a/make/include_GCCX86.mk
+++ b/make/include_GCCX86.mk
@@ -1,25 +1,25 @@
CC = gcc
AS = as
AR = ar
-PAS = ./perl/AsmGen.pl
-GEN_PAS = ./perl/generatePas.pl
-GEN_GROUPS = ./perl/generateGroups.pl
-GEN_PMHEADER = ./perl/gen_events.pl
+PAS = ./perl/AsmGen.pl
+GEN_PAS = ./perl/generatePas.pl
+GEN_GROUPS = ./perl/generateGroups.pl
+GEN_PMHEADER = ./perl/gen_events.pl
-#ANSI_CFLAGS = -std=c99
+ANSI_CFLAGS = -std=c99
#ANSI_CFLAGS += -pedantic
#ANSI_CFLAGS += -Wextra
#ANSI_CFLAGS += -Wall
-CFLAGS = -O2 -m32 -Wno-format -std=c99
+CFLAGS = -O2 -g -m32 -Wno-format -fPIC
FCFLAGS = -J ./ -fsyntax-only
PASFLAGS = x86
-ASFLAGS = --32
+ASFLAGS = --32 -g
CPPFLAGS =
-LFLAGS = -m32 -pthread
+LFLAGS = -m32 -g -pthread
-SHARED_CFLAGS = -fpic
-SHARED_LFLAGS = -shared
+SHARED_CFLAGS = -fpic -fvisibility=hidden
+SHARED_LFLAGS = -shared -fvisibility=hidden
DEFINES = -D_GNU_SOURCE
DEFINES += -DPAGE_ALIGNMENT=4096
@@ -27,6 +27,6 @@ DEFINES += -DLIKWID_MONITOR_LOCK
DEFINES += -DDEBUGLEV=0
INCLUDES =
-LIBS = -lm
+LIBS = -lm -lrt
diff --git a/make/include_ICC.mk b/make/include_ICC.mk
index ce49bfe..9dfe66b 100644
--- a/make/include_ICC.mk
+++ b/make/include_ICC.mk
@@ -7,24 +7,22 @@ GEN_PAS = ./perl/generatePas.pl
GEN_GROUPS = ./perl/generateGroups.pl
GEN_PMHEADER = ./perl/gen_events.pl
-ANSI_CFLAGS += -std=c99
+ANSI_CFLAGS = -std=c99 #-strict-ansi
-CFLAGS = -O1 -Wno-format
-FCFLAGS = -module ./
+CFLAGS = -O1 -Wno-format -vec-report=0 -fPIC -pthread
+FCFLAGS = -module ./
ASFLAGS = -gdwarf-2
PASFLAGS = x86-64
CPPFLAGS =
LFLAGS = -pthread
-SHARED_CFLAGS = -fpic
-SHARED_LFLAGS = -shared
+SHARED_CFLAGS = -fPIC -pthread -fvisibility=hidden
+SHARED_LFLAGS = -shared -pthread -fvisibility=hidden
DEFINES = -D_GNU_SOURCE
DEFINES += -DPAGE_ALIGNMENT=4096
-#enable this option to build likwid-bench with marker API for likwid-perfctr
-#DEFINES += -DPERFMON
INCLUDES =
-LIBS =
+LIBS = -lrt
diff --git a/make/include_MIC.mk b/make/include_MIC.mk
index aa3c39a..b63efce 100644
--- a/make/include_MIC.mk
+++ b/make/include_MIC.mk
@@ -1,5 +1,5 @@
CC = icc
-FC = gfortran
+FC = ifort
AS = icc
AR = ar
PAS = ./perl/AsmGen.pl
@@ -7,27 +7,31 @@ GEN_PAS = ./perl/generatePas.pl
GEN_GROUPS = ./perl/generateGroups.pl
GEN_PMHEADER = ./perl/gen_events.pl
-#ANSI_CFLAGS = -std=c99
-#ANSI_CFLAGS += -pedantic
+ANSI_CFLAGS = -std=c99 -fPIC
+ANSI_CFLAGS += -pedantic
#ANSI_CFLAGS += -Wextra
#ANSI_CFLAGS += -Wall
-CFLAGS = -mmic -O2 -Wno-format -std=c99
+CFLAGS = -mmic -O1 -g -Wno-format -fPIC
FCFLAGS = -J ./ -fsyntax-only
#FCFLAGS = -module ./
-ASFLAGS = -mmic -c
+ASFLAGS = -mmic -c -x assembler
PASFLAGS = x86-64
CPPFLAGS =
LFLAGS = -pthread -g -mmic
-SHARED_CFLAGS = -fpic -mmic
-SHARED_LFLAGS = -shared -mmic
+SHARED_CFLAGS = -fpic -mmic -fvisibility=hidden
+SHARED_LFLAGS = -shared -mmic -fvisibility=hidden
DEFINES = -D_GNU_SOURCE
DEFINES += -DPAGE_ALIGNMENT=4096
DEFINES += -DDEBUGLEV=0
INCLUDES =
-LIBS = -lm
-
+LIBS = -lm -lrt
+# colon separated list of paths to search for libs at runtime on Phi file system
+MIC_LIB_RPATHS = /opt/intel/compilers_and_libraries_2016.1.150/linux/compiler/lib/mic
+ifneq ($(strip $(MIC_LIB_RPATHS)),)
+RPATHS += -Wl,-rpath=$(MIC_LIB_RPATHS)
+endif
diff --git a/monitoring/README.agent b/monitoring/README.agent
new file mode 100644
index 0000000..756d015
--- /dev/null
+++ b/monitoring/README.agent
@@ -0,0 +1,66 @@
+The likwid-agent application is a daemon that reads hardware performance
+counters periodically. Which counters can be measured is determined by the
+system's CPU architecture; each architecture has its own set of events and
+corresponding counter registers. The measurements are performed with the likwid
+library, accessed through its Lua interface. The measured values can be exported
+in multiple ways, e.g. to RRD files, to syslog, or via gmetric from the Ganglia Monitoring System.
+
+
+The configuration file needs to be given at startup and has the following
+format:
+GROUPPATH <PATH_TO_GROUPS> # default is set during installation
+EVENTSET <SPACE_SEPARATED_LIST_OF_GROUPS>
+DURATION <TIME_IN_SECONDS_TO_MEASURE_EACH_GROUP>
+ACCESSMODE <0/1> # 0 is direct access, 1 forwards access to the accessDaemon
+LOGPATH <PATH_TO_STORE_LOGFILES> # each monitoring group creates a logfile there named likwid.<GROUP>.log
+LOGSTYLE <log/update> # log appends new lines, update clears the file before writing
+GMETRIC <True/False> # send measured values to Ganglia
+GMETRICPATH <PATH_TO_THE_GMETRIC_EXECUTABLE>
+GMETRICCONFIG <EXTRA_CONFIG_OPTIONS_TO_GMETRIC>
+RRD <True/False> # write measured values to RRD files, one RRD per group
+RRDPATH <PATH_TO_STORE_RRD_FILES>
+SYSLOG <True/False> # write measured values to syslog
+SYSLOGPRIO <prio> # Use priority level <prio> for syslog, default is local0.notice
+
+
+
+The group files cannot lie directly in GROUPPATH; you need to create a folder
+named after the short name of the architecture, like sandybridge or ivybridge. This
+makes it possible to use the same group path across a set of systems with different
+CPU architectures. The format of a group file is the following:
+SHORT <SHORT_NAME_OF_THE GROUP>
+
+EVENTSET // Starts event/counter definitions
+FIXC0 INSTR_RETIRED_ANY // Measure event INSTR_RETIRED_ANY in counter FIXC0
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L2_TRANS_L1D_WB
+PMC2 L2_LINES_IN_ALL
+PMC3 L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS // Starts section of derived metrics and output items
+ONCE Runtime (RDTSC) [s] time # Output runtime only once
+MIN CPI FIXC1/FIXC0 # Output the minimum of the formula FIXC1/FIXC0 named CPI
+AVG CPI FIXC1/FIXC0 # Output the average of the same formula
+MAX L2 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time # Calculate bandwidth and output only the maximum
+MIN L2 load data volume [GBytes] 1.0E-09*PMC0*64.0
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time # Sum up all the values of all CPUs
+SUM L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+
+LONG
+<LONG DESCRIPTION OF THE GROUP>
+
+Possible functions are:
+ONCE: Output only once (CPU core 0), no aggregation is done
+MIN: Output the minimum of all cores
+MAX: Output the maximum of all cores
+AVG: Output the average of all cores
+SUM: Output the sum of all cores' values
+If no function is set, the values of all HW threads are written to the output and
+T<ID> is prepended to the metric name.
+
+Output metric names may be identical; the aggregation function is attached to the output name so the entries can be separated later.
+
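A hedged sketch of what the aggregation functions described above do with a per-core list of metric values; this mirrors the behaviour only and is not the actual likwid-agent implementation:

    per_core_cpi = [1.32, 1.28, 2.10, 1.95]   # e.g. CPI measured on four HW threads

    aggregated = {
        "ONCE": per_core_cpi[0],                       # value of CPU core 0 only
        "MIN":  min(per_core_cpi),
        "MAX":  max(per_core_cpi),
        "AVG":  sum(per_core_cpi) / len(per_core_cpi),
        "SUM":  sum(per_core_cpi),
    }
    # Without a function, each value would be reported separately as T0, T1, ...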
diff --git a/monitoring/groups/atom/BW_MEM.txt b/monitoring/groups/atom/BW_MEM.txt
new file mode 100644
index 0000000..8eb701f
--- /dev/null
+++ b/monitoring/groups/atom/BW_MEM.txt
@@ -0,0 +1,10 @@
+SHORT Memory bandwidth
+
+EVENTSET
+PMC0 BUS_TRANS_MEM_THIS_CORE_THIS_A
+
+METRICS
+SUM Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+
+
+LONG
diff --git a/monitoring/groups/atom/FLOPS_DP.txt b/monitoring/groups/atom/FLOPS_DP.txt
new file mode 100644
index 0000000..14961f0
--- /dev/null
+++ b/monitoring/groups/atom/FLOPS_DP.txt
@@ -0,0 +1,13 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0 SIMD_COMP_INST_RETIRED_PACKED_DOUBLE
+PMC1 SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE
+
+METRICS
+DP MFlops/s 1.0E-06*(PMC0*2.0+PMC1)/time
+
+
+LONG
+Double Precision MFlops/s
+
diff --git a/monitoring/groups/atom/FLOPS_SP.txt b/monitoring/groups/atom/FLOPS_SP.txt
new file mode 100644
index 0000000..d67704f
--- /dev/null
+++ b/monitoring/groups/atom/FLOPS_SP.txt
@@ -0,0 +1,12 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0 SIMD_COMP_INST_RETIRED_PACKED_SINGLE
+PMC1 SIMD_COMP_INST_RETIRED_SCALAR_SINGLE
+
+METRICS
+SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+
+LONG
+Single Precision MFlops/s
+
diff --git a/monitoring/groups/broadwell/BW.txt b/monitoring/groups/broadwell/BW.txt
new file mode 100644
index 0000000..3a2eb90
--- /dev/null
+++ b/monitoring/groups/broadwell/BW.txt
@@ -0,0 +1,13 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0 L1D_REPLACEMENT
+PMC1 L2_TRANS_L1D_WB
+PMC2 L2_LINES_IN_ALL
+PMC3 L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/broadwell/ENERGY.txt b/monitoring/groups/broadwell/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/broadwell/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
diff --git a/monitoring/groups/broadwell/FLOPS_DP.txt b/monitoring/groups/broadwell/FLOPS_DP.txt
new file mode 100644
index 0000000..53b2463
--- /dev/null
+++ b/monitoring/groups/broadwell/FLOPS_DP.txt
@@ -0,0 +1,22 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+MFlops/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s 1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+-
+AVX/SSE scalar and packed double precision flop rates.
+
diff --git a/monitoring/groups/broadwell/FLOPS_SP.txt b/monitoring/groups/broadwell/FLOPS_SP.txt
new file mode 100644
index 0000000..b04f87a
--- /dev/null
+++ b/monitoring/groups/broadwell/FLOPS_SP.txt
@@ -0,0 +1,22 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+
+METRICS
+MFlops/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s 1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+-
+AVX/SSE scalar and packed single precision flop rates.
+
diff --git a/monitoring/groups/broadwellEP/BW.txt b/monitoring/groups/broadwellEP/BW.txt
new file mode 100644
index 0000000..3a2eb90
--- /dev/null
+++ b/monitoring/groups/broadwellEP/BW.txt
@@ -0,0 +1,13 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0 L1D_REPLACEMENT
+PMC1 L2_TRANS_L1D_WB
+PMC2 L2_LINES_IN_ALL
+PMC3 L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/broadwellEP/ENERGY.txt b/monitoring/groups/broadwellEP/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/broadwellEP/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
diff --git a/monitoring/groups/core2/BW_L2.txt b/monitoring/groups/core2/BW_L2.txt
new file mode 100644
index 0000000..6d73bf8
--- /dev/null
+++ b/monitoring/groups/core2/BW_L2.txt
@@ -0,0 +1,11 @@
+SHORT Cache bandwidth
+
+EVENTSET
+PMC0 L1D_REPL
+PMC1 L1D_M_EVICT
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+
+
+LONG
diff --git a/monitoring/groups/core2/BW_MEM.txt b/monitoring/groups/core2/BW_MEM.txt
new file mode 100644
index 0000000..8eb701f
--- /dev/null
+++ b/monitoring/groups/core2/BW_MEM.txt
@@ -0,0 +1,10 @@
+SHORT Memory bandwidth
+
+EVENTSET
+PMC0 BUS_TRANS_MEM_THIS_CORE_THIS_A
+
+METRICS
+SUM Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+
+
+LONG
diff --git a/monitoring/groups/haswell/BW.txt b/monitoring/groups/haswell/BW.txt
new file mode 100644
index 0000000..3a2eb90
--- /dev/null
+++ b/monitoring/groups/haswell/BW.txt
@@ -0,0 +1,13 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0 L1D_REPLACEMENT
+PMC1 L2_TRANS_L1D_WB
+PMC2 L2_LINES_IN_ALL
+PMC3 L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/haswell/ENERGY.txt b/monitoring/groups/haswell/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/haswell/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
diff --git a/monitoring/groups/haswellEP/BW.txt b/monitoring/groups/haswellEP/BW.txt
new file mode 100644
index 0000000..e6f4b73
--- /dev/null
+++ b/monitoring/groups/haswellEP/BW.txt
@@ -0,0 +1,32 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0 L1D_REPLACEMENT
+PMC1 L2_TRANS_L1D_WB
+PMC2 L2_LINES_IN_ALL
+PMC3 L2_LINES_OUT_DEMAND_DIRTY
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+SUM Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/haswellEP/ENERGY.txt b/monitoring/groups/haswellEP/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/haswellEP/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
diff --git a/monitoring/groups/interlagos/BW.txt b/monitoring/groups/interlagos/BW.txt
new file mode 100644
index 0000000..3f465f6
--- /dev/null
+++ b/monitoring/groups/interlagos/BW.txt
@@ -0,0 +1,16 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0 DATA_CACHE_REFILLS_ALL
+PMC1 DATA_CACHE_REFILLS_SYSTEM
+PMC2 L2_FILL_WB_FILL
+PMC3 L2_FILL_WB_WB
+UPMC0 UNC_DRAM_ACCESSES_DCT0_ALL
+UPMC1 UNC_DRAM_ACCESSES_DCT1_ALL
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0-PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/interlagos/CPI.txt b/monitoring/groups/interlagos/CPI.txt
new file mode 100644
index 0000000..d599a34
--- /dev/null
+++ b/monitoring/groups/interlagos/CPI.txt
@@ -0,0 +1,19 @@
+SHORT Cycles per instruction
+
+EVENTSET
+PMC0 RETIRED_INSTRUCTIONS
+PMC1 CPU_CLOCKS_UNHALTED
+PMC2 RETIRED_UOPS
+
+METRICS
+CPI PMC1/PMC0
+Cycles per UOPS PMC1/PMC2
+IPC PMC0/PMC1
+
+LONG
+This group measures how efficiently the processor works with
+regard to instruction throughput. Also important as a standalone
+metric is RETIRED_INSTRUCTIONS, as it tells you how many instructions
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more instructions for it.
+
diff --git a/monitoring/groups/interlagos/FLOPS.txt b/monitoring/groups/interlagos/FLOPS.txt
new file mode 100644
index 0000000..7bfb29a
--- /dev/null
+++ b/monitoring/groups/interlagos/FLOPS.txt
@@ -0,0 +1,18 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0 RETIRED_FLOPS_DOUBLE_ALL
+PMC1 RETIRED_FLOPS_SINGLE_ALL
+
+METRICS
+DP MFlops/s 1.0E-06*(PMC0)/time
+SP MFlops/s 1.0E-06*(PMC1)/time
+
+LONG
+Formulas:
+DP MFlops/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
+SP MFlops/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
+-
+Profiling group to measure double and single precision flop rates.
+
+
diff --git a/monitoring/groups/ivybridge/BW.txt b/monitoring/groups/ivybridge/BW.txt
new file mode 100644
index 0000000..3a2eb90
--- /dev/null
+++ b/monitoring/groups/ivybridge/BW.txt
@@ -0,0 +1,13 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0 L1D_REPLACEMENT
+PMC1 L2_TRANS_L1D_WB
+PMC2 L2_LINES_IN_ALL
+PMC3 L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/ivybridge/ENERGY.txt b/monitoring/groups/ivybridge/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/ivybridge/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
diff --git a/monitoring/groups/ivybridge/FLOPS_DP.txt b/monitoring/groups/ivybridge/FLOPS_DP.txt
new file mode 100644
index 0000000..496b8a5
--- /dev/null
+++ b/monitoring/groups/ivybridge/FLOPS_DP.txt
@@ -0,0 +1,23 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2 SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+MFlops/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s 1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision flop rates. Please note that the current flop measurements on IvyBridge are
+potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/monitoring/groups/ivybridge/FLOPS_SP.txt b/monitoring/groups/ivybridge/FLOPS_SP.txt
new file mode 100644
index 0000000..64edd19
--- /dev/null
+++ b/monitoring/groups/ivybridge/FLOPS_SP.txt
@@ -0,0 +1,24 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2 SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+MFlops/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s 1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision flop rates. Please note that the current
+flop measurements on IvyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/ivybridgeEP/BW.txt b/monitoring/groups/ivybridgeEP/BW.txt
new file mode 100644
index 0000000..e6f4b73
--- /dev/null
+++ b/monitoring/groups/ivybridgeEP/BW.txt
@@ -0,0 +1,32 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0 L1D_REPLACEMENT
+PMC1 L2_TRANS_L1D_WB
+PMC2 L2_LINES_IN_ALL
+PMC3 L2_LINES_OUT_DEMAND_DIRTY
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+SUM Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/ivybridgeEP/ENERGY.txt b/monitoring/groups/ivybridgeEP/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/ivybridgeEP/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
diff --git a/monitoring/groups/ivybridgeEP/FLOPS_DP.txt b/monitoring/groups/ivybridgeEP/FLOPS_DP.txt
new file mode 100644
index 0000000..496b8a5
--- /dev/null
+++ b/monitoring/groups/ivybridgeEP/FLOPS_DP.txt
@@ -0,0 +1,23 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2 SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+MFlops/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s 1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision flop rates. Please note that the current flop measurements on IvyBridge are
+potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/monitoring/groups/ivybridgeEP/FLOPS_SP.txt b/monitoring/groups/ivybridgeEP/FLOPS_SP.txt
new file mode 100644
index 0000000..64edd19
--- /dev/null
+++ b/monitoring/groups/ivybridgeEP/FLOPS_SP.txt
@@ -0,0 +1,24 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2 SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+MFlops/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s 1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision flop rates. Please note that the current
+flop measurements on IvyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/kabini/BW.txt b/monitoring/groups/kabini/BW.txt
new file mode 100644
index 0000000..7e34078
--- /dev/null
+++ b/monitoring/groups/kabini/BW.txt
@@ -0,0 +1,14 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0 DATA_CACHE_REFILLS_ALL
+PMC1 DATA_CACHE_EVICTED_ALL
+UPMC0 UNC_DRAM_ACCESSES_DCT0_ALL
+UPMC1 UNC_DRAM_ACCESSES_DCT1_ALL
+
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/kabini/CPI.txt b/monitoring/groups/kabini/CPI.txt
new file mode 100644
index 0000000..d599a34
--- /dev/null
+++ b/monitoring/groups/kabini/CPI.txt
@@ -0,0 +1,19 @@
+SHORT Cycles per instruction
+
+EVENTSET
+PMC0 RETIRED_INSTRUCTIONS
+PMC1 CPU_CLOCKS_UNHALTED
+PMC2 RETIRED_UOPS
+
+METRICS
+CPI PMC1/PMC0
+Cycles per UOPS PMC1/PMC2
+IPC PMC0/PMC1
+
+LONG
+This group measures how efficiently the processor works with
+regard to instruction throughput. Also important as a standalone
+metric is RETIRED_INSTRUCTIONS, as it tells you how many instructions
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more instructions for it.
+
diff --git a/monitoring/groups/kabini/FLOPS.txt b/monitoring/groups/kabini/FLOPS.txt
new file mode 100644
index 0000000..ccb2f92
--- /dev/null
+++ b/monitoring/groups/kabini/FLOPS.txt
@@ -0,0 +1,14 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0 RETIRED_FLOPS_DOUBLE_ALL
+PMC1 RETIRED_FLOPS_SINGLE_ALL
+
+METRICS
+DP MFlops/s 1.0E-06*(PMC0)/time
+SP MFlops/s 1.0E-06*(PMC1)/time
+
+LONG
+Formulas:
+DP MFlops/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
+SP MFlops/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
diff --git a/monitoring/groups/nehalem/BW.txt b/monitoring/groups/nehalem/BW.txt
new file mode 100644
index 0000000..ddc8c82
--- /dev/null
+++ b/monitoring/groups/nehalem/BW.txt
@@ -0,0 +1,20 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0 L1D_REPL
+PMC1 L1D_M_EVICT
+PMC2 L2_LINES_IN_ANY
+PMC3 L2_LINES_OUT_DEMAND_DIRTY
+UPMC0 UNC_QMC_NORMAL_READS_ANY
+UPMC1 UNC_QMC_WRITES_FULL_ANY
+UPMC2 UNC_QHL_REQUESTS_REMOTE_READS
+UPMC3 UNC_QHL_REQUESTS_REMOTE_WRITES
+
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+SUM Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/nehalem/CPI.txt b/monitoring/groups/nehalem/CPI.txt
new file mode 100644
index 0000000..9852da8
--- /dev/null
+++ b/monitoring/groups/nehalem/CPI.txt
@@ -0,0 +1,14 @@
+SHORT Cycles per instruction
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+
+METRICS
+CPI FIXC1/FIXC0
+IPC FIXC0/FIXC1
+
+
+LONG
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+IPC = INSTR_RETIRED_ANY/CPU_CLK_UNHALTED_CORE
diff --git a/monitoring/groups/nehalem/FLOPS.txt b/monitoring/groups/nehalem/FLOPS.txt
new file mode 100644
index 0000000..e372504
--- /dev/null
+++ b/monitoring/groups/nehalem/FLOPS.txt
@@ -0,0 +1,20 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR
+PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
+PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
+
+METRICS
+Packed MUOPS/s 1.0E-06*PMC0/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+SP MUOPS/s 1.0E-06*PMC2/time
+DP MUOPS/s 1.0E-06*PMC3/time
+
+LONG
+Formulas:
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
diff --git a/monitoring/groups/nehalemEX/BW.txt b/monitoring/groups/nehalemEX/BW.txt
new file mode 100644
index 0000000..473ce76
--- /dev/null
+++ b/monitoring/groups/nehalemEX/BW.txt
@@ -0,0 +1,29 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0 L1D_REPL
+PMC1 L1D_M_EVICT
+PMC2 L2_LINES_IN_ANY
+PMC3 L2_LINES_OUT_DEMAND_DIRTY
+MBOX0C0 FVC_EV0_BBOX_CMDS_READS
+MBOX0C1 DRAM_CMD_CAS_WR_OPN
+MBOX1C0 FVC_EV0_BBOX_CMDS_READS
+MBOX1C1 DRAM_CMD_CAS_WR_OPN
+
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64/time
+
+LONG
+Formula:
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(FVC_EV0_BBOX_CMDS_READS)+SUM(DRAM_CMD_CAS_WR_OPN))*64.0/time
+
+On Nehalem EX it is not possible to measure the write operations with the
+FVC_EV0_BBOX_CMDS_WRITES event at the same time as the FVC_EV0_BBOX_CMDS_READS
+event because they require contrary bit settings. The DRAM_CMD_CAS_WR_OPN event is an alternative, but
+it only measures write operations to open pages, hence writes to closed pages
+are not included here.
diff --git a/monitoring/groups/nehalemEX/CPI.txt b/monitoring/groups/nehalemEX/CPI.txt
new file mode 100644
index 0000000..0e4faa3
--- /dev/null
+++ b/monitoring/groups/nehalemEX/CPI.txt
@@ -0,0 +1,12 @@
+SHORT Cycles per instruction
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+
+METRICS
+CPI FIXC1/FIXC0
+
+
+LONG
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
diff --git a/monitoring/groups/nehalemEX/FLOPS.txt b/monitoring/groups/nehalemEX/FLOPS.txt
new file mode 100644
index 0000000..e372504
--- /dev/null
+++ b/monitoring/groups/nehalemEX/FLOPS.txt
@@ -0,0 +1,20 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR
+PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
+PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
+
+METRICS
+Packed MUOPS/s 1.0E-06*PMC0/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+SP MUOPS/s 1.0E-06*PMC2/time
+DP MUOPS/s 1.0E-06*PMC3/time
+
+LONG
+Formulas:
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
diff --git a/monitoring/groups/pentiumm/BW.txt b/monitoring/groups/pentiumm/BW.txt
new file mode 100644
index 0000000..5877abc
--- /dev/null
+++ b/monitoring/groups/pentiumm/BW.txt
@@ -0,0 +1,12 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0 L2_LINES_IN_ALL_ALL
+PMC1 L2_LINES_OUT_ALL_ALL
+
+METRICS
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+
+LONG
+Formulas:
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL_ALL+L2_LINES_OUT_ALL_ALL)*64/time
diff --git a/monitoring/groups/pentiumm/CPI.txt b/monitoring/groups/pentiumm/CPI.txt
new file mode 100644
index 0000000..fb0d97b
--- /dev/null
+++ b/monitoring/groups/pentiumm/CPI.txt
@@ -0,0 +1,17 @@
+SHORT Cycles per instruction
+
+EVENTSET
+PMC0 UOPS_RETIRED
+PMC1 CPU_CLK_UNHALTED
+
+METRICS
+CPI PMC1/PMC0
+IPC PMC0/PMC1
+
+LONG
+This group measures how efficiently the processor works with
+regard to instruction throughput. Also important as a standalone
+metric is UOPS_RETIRED, as it tells you how many uops
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more instructions for it.
+
diff --git a/monitoring/groups/phi/CPI.txt b/monitoring/groups/phi/CPI.txt
new file mode 100644
index 0000000..0ce61cd
--- /dev/null
+++ b/monitoring/groups/phi/CPI.txt
@@ -0,0 +1,17 @@
+SHORT Cycles per instruction
+
+EVENTSET
+PMC0 INSTRUCTIONS_EXECUTED
+PMC1 CPU_CLK_UNHALTED
+
+METRICS
+CPI PMC1/PMC0
+IPC PMC0/PMC1
+
+LONG
+This group measures how efficiently the processor works with
+regard to instruction throughput. Also important as a standalone
+metric is INSTRUCTIONS_EXECUTED, as it tells you how many instructions
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more instructions for it.
+
diff --git a/monitoring/groups/sandybridge/BW.txt b/monitoring/groups/sandybridge/BW.txt
new file mode 100644
index 0000000..3a2eb90
--- /dev/null
+++ b/monitoring/groups/sandybridge/BW.txt
@@ -0,0 +1,13 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0 L1D_REPLACEMENT
+PMC1 L2_TRANS_L1D_WB
+PMC2 L2_LINES_IN_ALL
+PMC3 L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/sandybridge/ENERGY.txt b/monitoring/groups/sandybridge/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/sandybridge/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
diff --git a/monitoring/groups/sandybridge/FLOPS_DP.txt b/monitoring/groups/sandybridge/FLOPS_DP.txt
new file mode 100644
index 0000000..c004b88
--- /dev/null
+++ b/monitoring/groups/sandybridge/FLOPS_DP.txt
@@ -0,0 +1,24 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2 SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+MFlops/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s 1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision flop rates. Please note that the current
+flop measurements on SandyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/sandybridge/FLOPS_SP.txt b/monitoring/groups/sandybridge/FLOPS_SP.txt
new file mode 100644
index 0000000..f9e6df7
--- /dev/null
+++ b/monitoring/groups/sandybridge/FLOPS_SP.txt
@@ -0,0 +1,24 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2 SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+MFlops/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s 1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision flop rates. Please note that the current
+flop measurements on SandyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/sandybridgeEP/BW.txt b/monitoring/groups/sandybridgeEP/BW.txt
new file mode 100644
index 0000000..18eea4f
--- /dev/null
+++ b/monitoring/groups/sandybridgeEP/BW.txt
@@ -0,0 +1,24 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0 L1D_REPLACEMENT
+PMC1 L2_TRANS_L1D_WB
+PMC2 L2_LINES_IN_ALL
+PMC3 L2_LINES_OUT_DEMAND_DIRTY
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+SUM Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/sandybridgeEP/ENERGY.txt b/monitoring/groups/sandybridgeEP/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/sandybridgeEP/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
diff --git a/monitoring/groups/sandybridgeEP/FLOPS_DP.txt b/monitoring/groups/sandybridgeEP/FLOPS_DP.txt
new file mode 100644
index 0000000..c004b88
--- /dev/null
+++ b/monitoring/groups/sandybridgeEP/FLOPS_DP.txt
@@ -0,0 +1,24 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2 SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+MFlops/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s 1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision flop rates. Please note that the current
+flop measurements on SandyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/sandybridgeEP/FLOPS_SP.txt b/monitoring/groups/sandybridgeEP/FLOPS_SP.txt
new file mode 100644
index 0000000..f9e6df7
--- /dev/null
+++ b/monitoring/groups/sandybridgeEP/FLOPS_SP.txt
@@ -0,0 +1,24 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2 SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+MFlops/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s 1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision flop rates. Please note that the current
+flop measurements on SandyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/silvermont/BW.txt b/monitoring/groups/silvermont/BW.txt
new file mode 100644
index 0000000..952e64a
--- /dev/null
+++ b/monitoring/groups/silvermont/BW.txt
@@ -0,0 +1,12 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0 LONGEST_LAT_CACHE_MISS
+PMC1 OFFCORE_RESPONSE_1_WB_ANY
+
+METRICS
+SUM Memory read bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
+SUM Memory writeback bandwidth [MBytes/s] 1.0E-06*(PMC1)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/silvermont/CPI.txt b/monitoring/groups/silvermont/CPI.txt
new file mode 100644
index 0000000..4eb4d40
--- /dev/null
+++ b/monitoring/groups/silvermont/CPI.txt
@@ -0,0 +1,14 @@
+SHORT Cycles per instruction
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+
+METRICS
+CPI FIXC1/FIXC0
+IPC FIXC0/FIXC1
+
+LONG
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+IPC = INSTR_RETIRED_ANY/CPU_CLK_UNHALTED_CORE
diff --git a/monitoring/groups/silvermont/ENERGY.txt b/monitoring/groups/silvermont/ENERGY.txt
new file mode 100644
index 0000000..3814560
--- /dev/null
+++ b/monitoring/groups/silvermont/ENERGY.txt
@@ -0,0 +1,16 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
diff --git a/monitoring/groups/westmere/BW.txt b/monitoring/groups/westmere/BW.txt
new file mode 100644
index 0000000..4925077
--- /dev/null
+++ b/monitoring/groups/westmere/BW.txt
@@ -0,0 +1,19 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0 L1D_REPL
+PMC1 L1D_M_EVICT
+PMC2 L2_LINES_IN_ANY
+PMC3 L2_LINES_OUT_ANY
+UPMC0 UNC_QMC_NORMAL_READS_ANY
+UPMC1 UNC_QMC_WRITES_FULL_ANY
+UPMC2 UNC_QHL_REQUESTS_REMOTE_READS
+UPMC3 UNC_QHL_REQUESTS_REMOTE_WRITES
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+SUM Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/westmere/CPI.txt b/monitoring/groups/westmere/CPI.txt
new file mode 100644
index 0000000..9852da8
--- /dev/null
+++ b/monitoring/groups/westmere/CPI.txt
@@ -0,0 +1,14 @@
+SHORT Cycles per instruction
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+
+METRICS
+CPI FIXC1/FIXC0
+IPC FIXC0/FIXC1
+
+
+LONG
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+IPC = INSTR_RETIRED_ANY/CPU_CLK_UNHALTED_CORE
diff --git a/monitoring/groups/westmere/FLOPS.txt b/monitoring/groups/westmere/FLOPS.txt
new file mode 100644
index 0000000..e372504
--- /dev/null
+++ b/monitoring/groups/westmere/FLOPS.txt
@@ -0,0 +1,20 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR
+PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
+PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
+
+METRICS
+Packed MUOPS/s 1.0E-06*PMC0/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+SP MUOPS/s 1.0E-06*PMC2/time
+DP MUOPS/s 1.0E-06*PMC3/time
+
+LONG
+Formulas:
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
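
These counters report SSE micro-ops rather than flops, so the metrics only
scale the counts to millions of uops per second; deriving a flop rate
additionally needs the SIMD width (for 128-bit SSE, 2 double-precision or 4
single-precision operations per packed uop). A short sketch of the formulas
above, with assumed counts and an assumed all-double-precision workload:

    # MUOPS/s formulas from the METRICS section (values are made up).
    runtime     = 2.0
    packed_uops = 800_000_000    # PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED
    scalar_uops = 100_000_000    # PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR

    packed_muops_s = 1.0e-06 * packed_uops / runtime   # 400 MUOPS/s
    scalar_muops_s = 1.0e-06 * scalar_uops / runtime   #  50 MUOPS/s

    # Assuming double precision: 2 flops per packed uop, 1 per scalar uop.
    mflops_s = 1.0e-06 * (2 * packed_uops + scalar_uops) / runtime  # 850 MFLOP/s
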
diff --git a/monitoring/groups/westmereEX/BW.txt b/monitoring/groups/westmereEX/BW.txt
new file mode 100644
index 0000000..a960025
--- /dev/null
+++ b/monitoring/groups/westmereEX/BW.txt
@@ -0,0 +1,20 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0 L1D_REPL
+PMC1 L1D_M_EVICT
+PMC2 L2_LINES_IN_ANY
+PMC3 L2_LINES_OUT_ANY
+MBOX0C0 FVC_EV0_BBOX_CMDS_READS
+MBOX0C1 DRAM_CMD_CAS_WR_OPN
+MBOX0C2 DRAM_MISC_CAS_WR_CLS
+MBOX1C0 FVC_EV0_BBOX_CMDS_READS
+MBOX1C1 DRAM_CMD_CAS_WR_OPN
+MBOX1C2 DRAM_MISC_CAS_WR_CLS
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1+MBOX0C2+MBOX1C2)*64/time
+
+LONG
diff --git a/monitoring/groups/westmereEX/CPI.txt b/monitoring/groups/westmereEX/CPI.txt
new file mode 100644
index 0000000..9852da8
--- /dev/null
+++ b/monitoring/groups/westmereEX/CPI.txt
@@ -0,0 +1,14 @@
+SHORT Cycles per instruction
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+
+METRICS
+CPI FIXC1/FIXC0
+IPC FIXC0/FIXC1
+
+
+LONG
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+IPC = INSTR_RETIRED_ANY/CPU_CLK_UNHALTED_CORE
diff --git a/monitoring/groups/westmereEX/FLOPS.txt b/monitoring/groups/westmereEX/FLOPS.txt
new file mode 100644
index 0000000..e372504
--- /dev/null
+++ b/monitoring/groups/westmereEX/FLOPS.txt
@@ -0,0 +1,20 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR
+PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
+PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
+
+METRICS
+Packed MUOPS/s 1.0E-06*PMC0/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+SP MUOPS/s 1.0E-06*PMC2/time
+DP MUOPS/s 1.0E-06*PMC3/time
+
+LONG
+Formulas:
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
diff --git a/monitoring/likwid-agent.conf b/monitoring/likwid-agent.conf
new file mode 100644
index 0000000..7aadbda
--- /dev/null
+++ b/monitoring/likwid-agent.conf
@@ -0,0 +1,52 @@
+### Global section ###
+
+# Set path to monitoring group files. Default is the normal LIKWID group path
+# <INSTALLEDPREFIX>/share/likwid/mongroups
+#GROUPPATH <path_to_mon_groups>
+# List of monitoring groups that should be measured
+#EVENTSET <group1> <group2> ...
+# Define the access mode for LIKWID. If likwid-agent runs as root, use 0 for
+# direct access to the MSR and PCI registers. If you run it as a common user,
+# select 1 to use the accessDaemon of LIKWID. Default is 1.
+#ACCESSMODE <0/1>
+# Define the time in seconds for which each monitoring group should be measured
+#DURATION 1
+
+
+### Output section ###
+
+## Simple logfile output ##
+# Specify the path for the logfiles. A separate logfile is created for each
+# monitoring group, named likwid.<group>.log
+#LOGPATH <path>
+# Specify the logfile writing style. The two possible options are log and
+# update.
+# log appends all new messages to the logfile, while update empties the logfile
+# before performing any writing. The update option is recommended when the
+# output is further parsed by other tools. If LOGPATH is set but no LOGSTYLE is
+# given, the style log is selected.
+#LOGSTYLE <log/update>
+
+## Syslog output ##
+# De/Activate the output to the syslog system using the shell tool logger
+#SYSLOG <True/False>
+# Define the priority value for logger. Default priority is local0.notice.
+#SYSLOGPRIO local0.notice
+
+## RRD output ##
+# Likwid-agent tries to create basic RRD configurations for the selected
+# groups. Each monitoring group gets its own RRD file containing all metrics
+# as data sources. For better printing, RRAs are created to hold the min, max
+# and average values for every 10 minutes in the last hour, every hour for the
+# last day and every day for the last month.
+#RRD <True/False>
+# Store the RRDs in RRDPATH
+#RRDPATH <path>
+
+## GMetric output ##
+# De/Activate the output to the Ganglia Monitoring System using the gmetric tool
+#GMETRIC <True/False>
+# Set path to the executable of gmetric.
+#GMETRICPATH <path_to_gmetric>
+# In some environments a special config file has to be handed over to gmetric.
+#GMETRICCONFIG <path_to_gmetric_config>
diff --git a/perl/AsmGen.pl b/perl/AsmGen.pl
deleted file mode 100755
index dcd7946..0000000
--- a/perl/AsmGen.pl
+++ /dev/null
@@ -1,284 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-no strict "refs";
-use warnings;
-use lib './perl';
-use Parse::RecDescent;
-use Data::Dumper;
-use Getopt::Std;
-use Cwd 'abs_path';
-
-use gas;
-
-my $ROOT = abs_path('./');
-my $DEBUG=0;
-my $VERBOSE=0;
-our $ISA = 'x86';
-our $AS = 'gas';
-my $OPT_STRING = 'hpvda:i:o:';
-my %OPT;
-my $INPUTFILE;
-my $OUTPUTFILE;
-my $CPP_ARGS='';
-
-# Enable warnings within the Parse::RecDescent module.
-$::RD_ERRORS = 1; # Make sure the parser dies when it encounters an error
-#$::RD_WARN = 1; # Enable warnings. This will warn on unused rules &c.
-#$::RD_HINT = 1; # Give out hints to help fix problems.
-#$::RD_TRACE = 1; # if defined, also trace parsers' behaviour
-$::RD_AUTOACTION = q { [@item[0..$#item]] };
-
-sub init
-{
- getopts( "$OPT_STRING", \%OPT ) or usage();
- if ($OPT{h}) { usage(); };
- if ($OPT{v}) { $VERBOSE = 1;}
- if ($OPT{d}) { $DEBUG = 1;}
-
- if (! $ARGV[0]) {
- die "ERROR: Please specify a input file!\n\nCall script with argument -h for help.\n";
- }
-
- $INPUTFILE = $ARGV[0];
- $CPP_ARGS = $ARGV[1] if ($ARGV[1]);
-
- if ($INPUTFILE =~ /.pas$/) {
- $INPUTFILE =~ s/\.pas//;
- } else {
- die "ERROR: Input file must have pas ending!\n";
- }
- if ($OPT{o}) {
- $OUTPUTFILE = $OPT{o};
- }else {
- $OUTPUTFILE = "$INPUTFILE.s";
- }
- if ($OPT{i}) {
- $ISA = $OPT{i};
- print "INFO: Using isa $ISA.\n\n" if ($VERBOSE);
- } else {
- print "INFO: No isa specified.\n Using default $ISA.\n\n" if ($VERBOSE);
- }
- if ($OPT{a}) {
- $AS = $OPT{a};
- print "INFO: Using as $AS.\n\n" if ($VERBOSE);
- } else {
- print "INFO: No as specified.\n Using default $AS.\n\n" if ($VERBOSE);
- }
-
- as::isa_init();
-}
-
-sub usage
-{
- print <<END;
-usage: $0 [-$OPT_STRING] <INFILE>
-
-Required:
-<INFILE> : Input pas file
-
-Optional:
--h : this (help) message
--v : verbose output
--d : debug mode: prints out the parse tree
--p : Print out intermediate preprocessed output
--o <FILE> : Output file
--a <ASM> : Specify different assembler (Default: gas)
--i <ISA> : Specify different isa (Default: x86)
-
-Example:
-$0 -i x86-64 -a masm -o out.s myfile.pas
-
-END
-
-exit(0);
-}
-
-#=======================================
-# GRAMMAR
-#=======================================
-$main::grammar = <<'_EOGRAMMAR_';
-# Terminals
-FUNC : /func/i
-LOOP : /loop/i
-ALLOCATE : /allocate/i
-FACTOR : /factor/i
-DEFINE : /define/i
-USE : /use/i
-STOP : /stop/i
-START : /start/i
-LOCAL : /local/i
-TIMER : /timer/i
-INCREMENT : /increment/i
-ALIGN : /align/i
-INT : /int/i
-SINGLE : /single/i
-DOUBLE : /double/i
-INUMBER : NUMBER
-UNUMBER : NUMBER
-SNUMBER : NUMBER
-FNUMBER : NUMBER
-OFFSET : /([0-9]+\,){15}[0-9]+/
-NUMBER : /[-+]?[0-9]*\.?[0-9]+/
-SYMBOL : /[.A-Z-a-z_][A-Za-z0-9_]*/
-REG : /GPR[0-9]+/i
-SREG : /GPR[0-9]+/i
-COMMENT : /#.*/
-{'skip'}
-
-type: SINGLE
- |DOUBLE
- |INT
-
-align: ALIGN <commit> NUMBER
-{
-{FUNC => 'as::align',
- ARGS => ["$item{NUMBER}[1]"]}
-}
-
-ASMCODE : /[A-Za-z1-9.:]+.*/
-{
-{FUNC => 'as::emit_code',
- ARGS => [$item[1]]}
-}
-
-function: FUNC SYMBOL block
-{[
- {FUNC => 'as::function_entry',
- ARGS => [$item{SYMBOL}[1],0]},
- $item{block},
- {FUNC => 'as::function_exit',
- ARGS => [$item{SYMBOL}[1]]}
-]}
-
-function_allocate: FUNC SYMBOL ALLOCATE NUMBER block
-{[
- {FUNC => 'as::function_entry',
- ARGS => [$item{SYMBOL}[1],$item{NUMBER}[1]]},
- $item{block},
- {FUNC => 'as::function_exit',
- ARGS => [$item{SYMBOL}[1]]}
-]}
-
-loop: LOOP SYMBOL INUMBER SNUMBER block
-{[
-{FUNC => 'as::loop_entry',
- ARGS => [$item{SYMBOL}[1],$item{SNUMBER}[1][1]]},
- $item{block},
-{FUNC => 'as::loop_exit',
- ARGS => [$item{SYMBOL}[1],$item{INUMBER}[1][1]]}
-]}
-| LOOP SYMBOL INUMBER SREG block
-{[
-{FUNC => 'as::loop_entry',
- ARGS => [$item{SYMBOL}[1],$item{SREG}[1]]},
- $item{block},
-{FUNC => 'as::loop_exit',
- ARGS => [$item{SYMBOL}[1],$item{INUMBER}[1][1]]}
-]}
-
-timer: START TIMER
-{
-{FUNC => 'isa::start_timer',
- ARGS => []}
-}
-| STOP TIMER
-{
-{FUNC => 'isa::stop_timer',
- ARGS => []}
-}
-
-mode: START LOCAL
-{
-{FUNC => 'as::mode',
- ARGS => [$item[1][1]]}
-}
-| STOP LOCAL
-{
-{FUNC => 'as::mode',
- ARGS => [$item[1][1]]}
-}
-
-block: '{' expression(s) '}'
-{ $item[2] }
-
-define_data: DEFINE type SYMBOL OFFSET
-{
-{FUNC => 'as::define_offset',
- ARGS => [$item{SYMBOL}[1], $item{type}[1][1],"$item{OFFSET}[1]"]}
-}
-
-define_data: DEFINE type SYMBOL NUMBER
-{
-{FUNC => 'as::define_data',
- ARGS => [$item{SYMBOL}[1], $item{type}[1][1],"$item{NUMBER}[1]"]}
-}
-
-
-expression: align
- |COMMENT
- |loop
- |timer
- |mode
- |ASMCODE
-{ $item[1] }
-
-instruction : define_data
- | align
- | COMMENT
- | mode
- | function
- | function_allocate
-{ $item[1] }
-
-startrule: instruction(s)
-{ $item[1] }
-
-_EOGRAMMAR_
-
-
-#=======================================
-# MAIN
-#=======================================
-init();
-print "INFO: Calling cpp with arguments $CPP_ARGS.\n" if ($VERBOSE);
-my $text = `cpp -x assembler-with-cpp $CPP_ARGS $INPUTFILE.pas`;
-
-if ($OPT{p}) {
- open FILE,">$INPUTFILE.Pas";
- print FILE $text;
- close FILE;
-}
-
-open STDOUT,">$OUTPUTFILE";
-print "$as::AS->{HEADER}\n";
-
-my $parser = new Parse::RecDescent ($main::grammar) or die "ERROR: Bad grammar!\n";
-my $parse_tree = $parser->startrule($text) or print STDERR "ERROR: Syntax Error\n";
-tree_exec($parse_tree);
-
-if ($DEBUG) {
- open FILE,'>parse_tree.txt';
- print FILE Dumper $parse_tree,"\n";
- close FILE;
-}
-
-print "$as::AS->{FOOTER}\n";
-
-sub tree_exec
-{
- my $tree = shift;
-
- foreach my $node (@$tree) {
- if ($node !~ /^skip|^instruction|^expression|^loop/) {
- if (ref($node) eq 'ARRAY') {
- tree_exec($node);
- }else {
- if (ref($node) eq 'HASH') {
- &{$node->{FUNC}}(@{$node->{ARGS}});
- }
- }
- }
- }
-}
-
-
diff --git a/perl/feedGnuplot b/perl/feedGnuplot
index 67aaf37..d379981 100755
--- a/perl/feedGnuplot
+++ b/perl/feedGnuplot
@@ -1,27 +1,36 @@
#!/usr/bin/perl
+
+package feedgnuplot; # for the metacpan indexer
+
use strict;
use warnings;
use Getopt::Long;
-use Time::HiRes qw( usleep );
+use Time::HiRes qw( usleep gettimeofday tv_interval );
use IO::Handle;
use List::Util qw( first );
+use Scalar::Util qw( looks_like_number );
use Text::ParseWords;
use threads;
use threads::shared;
use Thread::Queue;
use Pod::Usage;
+use Time::Piece;
-
-our $VERSION = '1.11';
+my $VERSION = 1.34;
my %options;
-interpretCommandline(\%options);
+interpretCommandline();
+
+# list containing the plot data. Each element is a hashref of parameters.
+# $curve->{datastring} is a string of all the data in this curve that can be
+# sent directly to gnuplot. $curve->{datastring_meta} is a list of hashrefs
+# {domain => ..., offset_start => ...}, one per point. offset_start is the
+# position in the datastring where that data element begins. As the data is culled
+# with --xlen, the offsets are preserved by using $curve->{datastring_offset} to
+# represent the offset IN THE ORIGINAL STRING of the current start of the
+# datastring
-my $gnuplotVersion = getGnuplotVersion();
-# list containing the plot data. Each element is a reference to a list, representing the data for
-# one curve. The first 'point' is a hash describing various curve parameters. The rest are all
-# references to lists of (x,y) tuples
my @curves = ();
# list mapping curve names to their indices in the @curves list
@@ -29,24 +38,32 @@ my %curveIndices = ();
# now start the data acquisition and plotting threads
my $dataQueue;
-my $xwindow;
+
+# Whether any new data has arrived since the last replot
+my $haveNewData;
+
+# when the last replot happened
+my $last_replot_time = [gettimeofday];
+
+# whether the previous replot was timer based
+my $last_replot_is_from_timer = 1;
my $streamingFinished : shared = undef;
+
if($options{stream})
{
- if( $options{hardcopy})
- {
- $options{stream} = undef;
- }
-
$dataQueue = Thread::Queue->new();
my $addThr = threads->create(\&mainThread);
- my $plotThr = threads->create(\&plotThread);
+
+ # spawn the plot updating thread. If I'm replotting from a data trigger, I don't need this
+ my $plotThr = threads->create(\&plotUpdateThread) if $options{stream} > 0;
while(<>)
{
chomp;
+ last if /^exit/;
+
# place every line of input to the queue, so that the plotting thread can process it. if we are
# using an implicit domain (x = line number), then we send it on the data queue also, since
# $. is not meaningful in the plotting thread
@@ -58,8 +75,9 @@ if($options{stream})
}
$streamingFinished = 1;
+ $dataQueue->enqueue(undef);
- $plotThr->join();
+ $plotThr->join() if defined $plotThr;
$addThr->join();
}
else
@@ -81,86 +99,296 @@ sub interpretCommandline
unshift @ARGV, shellwords shift @ARGV;
}
- my $options = shift;
-
# everything off by default:
# do not stream in the data by default
# point plotting by default.
# no monotonicity checks by default
+ # normal histograms by default
$options{ maxcurves } = 100;
+ $options{ histstyle} = 'freq';
+
+ # Previously I was using 'legend=s%' and 'curvestyle=s%' for curve addressing. This had cleaner
+ # syntax, but disregarded the order of the given options. This resulted in arbitrarily ordered
+ # curves. I thus parse these into lists, and then also make hashes, for later use
+
+ # needed for these to be parsed into an array-ref, these default to []
+ $options{legend} = [];
+ $options{curvestyle} = [];
+ $options{style} = [];
+ $options{histogram} = [];
+ $options{y2} = [];
+ $options{extracmds} = [];
+ $options{set} = [];
+ $options{unset} = [];
+
+ $options{curvestyleall} = '';
+ $options{styleall} = '';
+ $options{with} = '';
+
+ $options{rangesize} = [];
+
+ GetOptions(\%options, 'stream:s', 'domain!', 'dataid!', '3d!', 'colormap!', 'lines!', 'points!',
+ 'circles', 'legend=s{2}', 'autolegend!', 'xlabel=s', 'ylabel=s', 'y2label=s', 'zlabel=s',
+ 'title=s', 'xlen=f', 'ymin=f', 'ymax=f', 'xmin=s', 'xmax=s', 'y2min=f', 'y2max=f',
+ 'zmin=f', 'zmax=f', 'y2=s@',
+ 'style=s{2}', 'curvestyle=s{2}', 'curvestyleall=s', 'styleall=s', 'with=s', 'extracmds=s@', 'set=s@', 'unset=s@',
+ 'square!', 'square_xy!', 'hardcopy=s', 'maxcurves=i', 'monotonic!', 'timefmt=s',
+ 'histogram=s@', 'binwidth=f', 'histstyle=s',
+ 'terminal=s',
+ 'rangesize=s{2}', 'rangesizeall=i', 'extraValuesPerPoint=i',
+ 'help', 'dump', 'exit', 'version',
+ 'geometry=s') or pod2usage( -exitval => 1,
+ -verbose => 1, # synopsis and args
+ -output => \*STDERR );
- GetOptions($options, 'stream!', 'domain!', 'dataid!', '3d!', 'colormap!', 'lines!', 'points!',
- 'circles', 'legend=s%', 'autolegend!', 'xlabel=s', 'ylabel=s', 'y2label=s', 'zlabel=s',
- 'title=s', 'xlen=f', 'ymin=f', 'ymax=f', 'xmin=f', 'xmax=f', 'y2min=f', 'y2max=f',
- 'zmin=f', 'zmax=f', 'y2=s@', 'curvestyle=s%', 'curvestyleall=s', 'extracmds=s@',
- 'size=s', 'square!', 'square_xy!', 'hardcopy=s', 'maxcurves=i', 'monotonic!',
- 'extraValuesPerPoint=i', 'help', 'dump') or pod2usage(1);
# handle various cmdline-option errors
- if ( $options->{help} )
- { pod2usage(0); }
+ if ( $options{help} )
+ {
+ pod2usage( -exitval => 0,
+ -verbose => 1, # synopsis and args
+ -output => \*STDOUT );
+ }
+
+ if( $options{version} )
+ {
+ print "feedgnuplot version $VERSION\n";
+ exit 0;
+ }
+
+ # expand options that are given as comma-separated lists
+ for my $listkey (qw(histogram y2))
+ {
+ @{$options{$listkey}} = map split('\s*,\s*', $_), @{$options{$listkey}}
+ if defined $options{$listkey};
+ }
+
+ # --style and --curvestyle are synonyms, as are --styleall and
+ # --curvestyleall, so fill that in
+ if( $options{styleall} )
+ {
+ if($options{curvestyleall} )
+ {
+ $options{curvestyleall} .= " $options{styleall}";
+ }
+ else
+ {
+ $options{curvestyleall} = $options{styleall};
+ }
+ }
+ push @{$options{curvestyle}}, @{$options{style}};
+
+
+ # --legend and --curvestyle options are conceptually hashes, but are parsed as
+ # arrays in order to preserve the ordering. I parse both of these into hashes
+ # because those are useful to have later. After this I can access individual
+ # legends with $options{legend_hash}{curveid}
+ for my $listkey (qw(legend curvestyle rangesize))
+ {
+ $options{"${listkey}_hash"} = {};
+
+ my $n = scalar @{$options{$listkey}}/2;
+ foreach my $idx (0..$n-1)
+ {
+ $options{"${listkey}_hash"}{$options{$listkey}[$idx*2]} = $options{$listkey}[$idx*2 + 1];
+ }
+ }
+
+ if ( defined $options{hardcopy} && defined $options{stream} )
+ {
+ print STDERR "--stream doesn't make sense together with --hardcopy\n";
+ exit -1;
+ }
+
+ if ( defined $options{rangesizeall} && defined $options{extraValuesPerPoint} )
+ {
+ print STDERR "Only one of --rangesizeall and --extraValuesPerPoint may be given\n";
+ exit -1;
+ }
+
+
+ # I now set up the rangesize to always be
+ # $options{rangesize_hash}{$id} // $options{rangesize_default}
+ if ( $options{rangesizeall} )
+ {
+ $options{rangesize_default} = $options{rangesizeall};
+ }
+ else
+ {
+ $options{rangesize_default} = 1;
+
+ $options{rangesize_default} += $options{extraValuesPerPoint} if ($options{extraValuesPerPoint});
+ $options{rangesize_default}++ if ($options{colormap});
+ $options{rangesize_default}++ if ($options{circles} );
+ }
- $options->{curvestyleall} = '' unless defined $options->{curvestyleall};
- if ($options->{colormap})
+ # parse stream option. Allowed only numbers >= 0 or 'trigger'. After this code
+ # $options{stream} is
+ # -1 for triggered replotting
+ # >0 for timed replotting
+ # undef if not streaming
+ if(defined $options{stream})
+ {
+ # if no streaming period is given, default to 1Hz.
+ $options{stream} = 1 if $options{stream} eq '';
+
+ if( !looks_like_number $options{stream} )
+ {
+ if($options{stream} eq 'trigger')
+ {
+ $options{stream} = 0;
+ }
+ else
+ {
+ print STDERR "--stream can only take in values >=0 or 'trigger'\n";
+ exit -1;
+ }
+ }
+
+ if ( $options{stream} == 0 )
+ {
+ $options{stream} = -1;
+ }
+ elsif ( $options{stream} <= 0)
+ {
+ print STDERR "--stream can only take in values >=0 or 'trigger'\n";
+ exit -1;
+ }
+ }
+
+ if( $options{curvestyleall} && $options{with} )
+ {
+ print STDERR "--curvestyleall and --with are mutually exclusive. Please just use one.\n";
+ exit -1;
+ }
+ if( $options{with} )
+ {
+ $options{curvestyleall} = "with $options{with}";
+ $options{with} = '';
+ }
+
+ if ($options{colormap})
{
# colormap styles all curves with palette. Seems like there should be a way to do this with a
# global setting, but I can't get that to work
- $options->{curvestyleall} .= ' palette';
+ $options{curvestyleall} .= ' palette';
}
- if ( $options->{'3d'} )
+ if ( $options{'3d'} )
{
- if ( !$options->{domain} )
+ if ( !$options{domain} )
{
print STDERR "--3d only makes sense with --domain\n";
exit -1;
}
- if ( defined $options->{y2min} || defined $options->{y2max} || defined $options->{y2} )
+ if ( $options{timefmt} )
+ {
+ print STDERR "--3d makes no sense with --timefmt\n";
+ exit -1;
+ }
+
+ if ( defined $options{y2min} || defined $options{y2max} || @{$options{y2}} )
{
print STDERR "--3d does not make sense with --y2...\n";
exit -1;
}
- if ( defined $options->{xlen} )
+ if ( defined $options{xlen} )
{
print STDERR "--3d does not make sense with --xlen\n";
exit -1;
}
- if ( defined $options->{monotonic} )
+ if ( defined $options{monotonic} )
{
print STDERR "--3d does not make sense with --monotonic\n";
exit -1;
}
+
+ if ( defined $options{binwidth} || @{$options{histogram}} )
+ {
+ print STDERR "--3d does not make sense with histograms\n";
+ exit -1;
+ }
+
+ if ( defined $options{circles} )
+ {
+ print STDERR "--3d does not make sense with circles (gnuplot doesn't support this)\n";
+ exit -1;
+ }
}
else
{
- if(!$options->{colormap})
+ if ( $options{timefmt} && !$options{domain} )
+ {
+ print STDERR "--timefmt makes sense only with --domain\n";
+ exit -1;
+ }
+
+ if(!$options{colormap})
{
- if ( defined $options->{zmin} || defined $options->{zmax} || defined $options->{zlabel} )
+ if ( defined $options{zmin} || defined $options{zmax} || defined $options{zlabel} )
{
print STDERR "--zmin/zmax/zlabel only makes sense with --3d or --colormap\n";
exit -1;
}
}
- if ( defined $options->{square_xy} )
+ if ( defined $options{square_xy} )
{
print STDERR "--square_xy only makes sense with --3d\n";
exit -1;
}
}
- if(defined $options{xlen} && !defined $options{stream} )
+ if(defined $options{xlen} && !$options{stream} )
{
print STDERR "--xlen does not make sense without --stream\n";
exit -1;
}
+ if($options{stream} && defined $options{xlen} &&
+ ( defined $options{xmin} || defined $options{xmax}))
+ {
+ print STDERR "With --stream and --xlen the X bounds are set, so neither --xmin nor --xmax make sense\n";
+ exit -1;
+ }
+
# --xlen implies an order to the data, so I force monotonicity
- $options{monotonic} = defined $options{xlen};
+ $options{monotonic} = 1 if defined $options{xlen};
+
+ if( $options{histstyle} !~ /freq|cum|uniq|cnorm/ )
+ {
+ print STDERR "unknown histstyle. Allowed are 'freq...', 'cum...', 'uniq...', 'cnorm...'\n";
+ exit -1;
+ }
+
+ # deal with timefmt
+ if ( $options{timefmt} )
+ {
+ # I need to compute a regex to match the time field and I need to count how
+ # many whitespace-separated fields there are.
+
+ # strip leading and trailing whitespace
+ $options{timefmt} =~ s/^\s*//;
+ $options{timefmt} =~ s/\s*$//;
+
+ my $Nfields = () = split /\s+/, $options{timefmt}, -1;
+ $options{timefmt_Ncols} = $Nfields;
+
+ # make sure --xlen is an integer. With a timefmt xlen goes through strptime
+ # and strftime, and those are integer-only
+ if( defined $options{xlen} )
+ {
+ if( $options{xlen} - int($options{xlen}) )
+ {
+ say STDERR "When streaming --xlen MUST be an integer. Rounding up to the nearest second";
+ $options{xlen} = 1 + int($options{xlen});
+ }
+ }
+ }
}
sub getGnuplotVersion
@@ -177,31 +405,60 @@ sub getGnuplotVersion
return $gnuplotVersion;
}
-sub plotThread
+sub plotUpdateThread
{
while(! $streamingFinished)
{
- sleep(1);
- $dataQueue->enqueue('Plot now');
+ usleep( $options{stream} * 1e6 );
+
+ # indicate that the timer was the replot source
+ $dataQueue->enqueue('replot timertick');
}
+}
- $dataQueue->enqueue(undef);
+sub sendRangeCommand
+{
+ my ($name, $min, $max) = @_;
+
+ return unless defined $min || defined $max;
+
+ if( defined $min )
+ { $min = "\"$min\""; }
+ else
+ { $min = ''; }
+
+ if( defined $max )
+ { $max = "\"$max\""; }
+ else
+ { $max = ''; }
+ my $cmd = "set $name [$min:$max]\n";
+ print PIPE $cmd;
}
-sub mainThread
+sub makeDomainNumeric
{
- my $valuesPerPoint = 1;
- if($options{extraValuesPerPoint}) { $valuesPerPoint += $options{extraValuesPerPoint}; }
- if($options{colormap}) { $valuesPerPoint++; }
- if($options{circles} ) { $valuesPerPoint++; }
+ my ($domain0) = @_;
+
+ if ( $options{timefmt} )
+ {
+ my $timepiece = Time::Piece->strptime( $domain0, $options{timefmt} )
+ or die "Couldn't parse time format. String '$domain0' doesn't fit format '$options{timefmt}'";
+
+ return $timepiece->epoch();
+ }
+
+ return $domain0;
+}
+sub mainThread
+{
local *PIPE;
my $dopersist = '';
- if($gnuplotVersion >= 4.3)
+ if( !$options{stream} && getGnuplotVersion() >= 4.3)
{
- $dopersist = '--persist' if(!$options{stream});
+ $dopersist = '--persist';
}
if(exists $options{dump})
@@ -210,51 +467,43 @@ sub mainThread
}
else
{
- open PIPE, "|gnuplot $dopersist" or die "Can't initialize gnuplot\n";
+ my $geometry = defined $options{geometry} ?
+ "-geometry $options{geometry}" : '';
+ open PIPE, "|gnuplot $geometry $dopersist" or die "Can't initialize gnuplot\n";
}
autoflush PIPE 1;
my $outputfile;
my $outputfileType;
- if( $options{hardcopy})
+ if( defined $options{hardcopy})
{
$outputfile = $options{hardcopy};
- ($outputfileType) = $outputfile =~ /\.(eps|ps|pdf|png)$/;
- if(!$outputfileType) { die("Only .eps, .ps, .pdf and .png supported\n"); }
+ if( $outputfile =~ /^[^|] # starts with anything other than |
+ .* # stuff in the middle
+ \.(eps|ps|pdf|png|svg)$/ix) # ends with a known extension
+ {
+ $outputfileType = lc $1;
+ }
my %terminalOpts =
( eps => 'postscript solid color enhanced eps',
ps => 'postscript solid color landscape 10',
pdf => 'pdfcairo solid color font ",10" size 11in,8.5in',
- png => 'png size 1280,1024' );
-
- print PIPE "set terminal $terminalOpts{$outputfileType}\n";
- print PIPE "set output \"$outputfile\"\n";
- }
- else
- {
- print PIPE "set terminal x11\n";
- }
+ png => 'png size 1280,1024',
+ svg => 'svg');
- # If a bound isn't given I want to set it to the empty string, so I can communicate it simply to
- # gnuplot
- $options{xmin} = '' unless defined $options{xmin};
- $options{xmax} = '' unless defined $options{xmax};
- $options{ymin} = '' unless defined $options{ymin};
- $options{ymax} = '' unless defined $options{ymax};
- $options{y2min} = '' unless defined $options{y2min};
- $options{y2max} = '' unless defined $options{y2max};
- $options{zmin} = '' unless defined $options{zmin};
- $options{zmax} = '' unless defined $options{zmax};
+ if( !defined $options{terminal} &&
+ defined $outputfileType &&
+ $terminalOpts{$outputfileType} )
+ {
+ $options{terminal} = $terminalOpts{$outputfileType};
+ }
- print PIPE "set xtics\n";
- if($options{y2})
- {
- print PIPE "set ytics nomirror\n";
- print PIPE "set y2tics\n";
- # if any of the ranges are given, set the range
- print PIPE "set y2range [". $options{y2min} . ":" . $options{y2max} ."]\n" if length( $options{y2min} . $options{y2max} );
+ die "Asked to plot to file '$outputfile', but I don't know which terminal to use, and no --terminal given"
+ unless $options{terminal};
}
+ print PIPE "set terminal $options{terminal}\n" if $options{terminal};
+ print PIPE "set output \"$outputfile\"\n" if $outputfile;
# set up plotting style
my $style = '';
@@ -265,94 +514,143 @@ sub mainThread
$options{curvestyleall} = "with circles $options{curvestyleall}";
}
- # if any of the ranges are given, set the range
- print PIPE "set xrange [". $options{xmin} . ":" . $options{xmax} ."]\n" if length( $options{xmin} . $options{xmax} );
- print PIPE "set yrange [". $options{ymin} . ":" . $options{ymax} ."]\n" if length( $options{ymin} . $options{ymax} );
- print PIPE "set zrange [". $options{zmin} . ":" . $options{zmax} ."]\n" if length( $options{zmin} . $options{zmax} );
print PIPE "set style data $style\n" if $style;
print PIPE "set grid\n";
- print(PIPE "set xlabel \"" . $options{xlabel } . "\"\n") if defined $options{xlabel};
- print(PIPE "set ylabel \"" . $options{ylabel } . "\"\n") if defined $options{ylabel};
- print(PIPE "set zlabel \"" . $options{zlabel } . "\"\n") if defined $options{zlabel};
- print(PIPE "set y2label \"" . $options{y2label} . "\"\n") if defined $options{y2label};
- print(PIPE "set title \"" . $options{title } . "\"\n") if defined $options{title};
+ print(PIPE "set xlabel \"$options{xlabel }\"\n") if defined $options{xlabel};
+ print(PIPE "set ylabel \"$options{ylabel }\"\n") if defined $options{ylabel};
+ print(PIPE "set zlabel \"$options{zlabel }\"\n") if defined $options{zlabel};
+ print(PIPE "set y2label \"$options{y2label}\"\n") if defined $options{y2label};
+ print(PIPE "set title \"$options{title }\"\n") if defined $options{title};
if($options{square})
{
# set a square aspect ratio. Gnuplot does this differently for 2D and 3D plots
if(! $options{'3d'})
{
- $options{size} = '' unless defined $options{size};
- $options{size} .= ' ratio -1';
+ print(PIPE "set size ratio -1\n");
}
else
{
print(PIPE "set view equal xyz\n");
}
}
- print(PIPE "set size $options{size}\n") if defined $options{size};
if($options{square_xy})
{
print(PIPE "set view equal xy\n");
}
- if($options{colormap})
- {
- print PIPE "set cbrange [". $options{zmin} . ":" . $options{zmax} ."]\n" if length( $options{zmin} . $options{zmax} );
- }
-
# For the specified values, set the legend entries to 'title "blah blah"'
- if($options{legend})
+ if(@{$options{legend}})
{
- foreach my $id (keys %{$options{legend}})
+ # @{$options{legend}} is a list where consecutive pairs are (curveID,
+ # legend). I use $options{legend} here instead of $options{legend_hash}
+ # because I create a new curve when I see a new one, and the hash is
+ # unordered, thus messing up the ordering
+ my $n = scalar @{$options{legend}}/2;
+ foreach my $idx (0..$n-1)
{
- setCurveLabel($id, $options{legend}{$id});
+ setCurveLabel($options{legend}[$idx*2 ],
+ $options{legend}[$idx*2 + 1]);
}
}
# add the extra curve options
- if($options{curvestyle})
+ if(@{$options{curvestyle}})
{
- foreach my $id (keys %{$options{curvestyle}})
+ # @{$options{curvestyle}} is a list where consecutive pairs are (curveID,
+ # style). I use $options{curvestyle} here instead of
+ # $options{curvestyle_hash} because I create a new curve when I see a new
+ # one, and the hash is unordered, thus messing up the ordering
+ my $n = scalar @{$options{curvestyle}}/2;
+ foreach my $idx (0..$n-1)
{
- addCurveOption($id, $options{curvestyle}{$id});
+ addCurveOption($options{curvestyle}[$idx*2 ],
+ $options{curvestyle}[$idx*2 + 1]);
}
}
# For the values requested to be printed on the y2 axis, set that
- foreach (@{$options{y2}})
+ addCurveOption($_, 'axes x1y2') foreach (@{$options{y2}});
+
+# timefmt
+ if( $options{timefmt} )
{
- addCurveOption($_, 'axes x1y2 linewidth 3');
+ print(PIPE "set timefmt '$options{timefmt}'\n");
+ print(PIPE "set xdata time\n");
}
# add the extra global options
- if($options{extracmds})
+ print(PIPE "$_\n") foreach (@{$options{extracmds}});
+ print(PIPE "set $_\n") foreach (@{$options{set}});
+ print(PIPE "unset $_\n") foreach (@{$options{unset}});
+
+# set up histograms
+ $options{binwidth} ||= 1; # if no binwidth given, set it to 1
+ print PIPE
+ "set boxwidth $options{binwidth}\n" .
+ "histbin(x) = $options{binwidth} * floor(0.5 + x/$options{binwidth})\n";
+
+ setCurveAsHistogram( $_ ) foreach (@{$options{histogram}});
+
+# set all the axis ranges
+ # If a bound isn't given I want to set it to the empty string, so I can communicate it simply to
+ # gnuplot
+ print PIPE "set xtics\n";
+
+ if(@{$options{y2}})
{
- foreach (@{$options{extracmds}})
- {
- print(PIPE "$_\n");
- }
+ print PIPE "set ytics nomirror\n";
+ print PIPE "set y2tics\n";
+ # if any of the ranges are given, set the range
+ sendRangeCommand( "y2range", $options{y2min}, $options{y2max} );
}
- # regexp for a possibly floating point, possibly scientific notation number
- my $numRE = '-?\d*\.?\d+(?:[Ee][-+]?\d+)?';
+ # if any of the ranges are given, set the range
+ sendRangeCommand( "xrange", $options{xmin}, $options{xmax} );
+ sendRangeCommand( "yrange", $options{ymin}, $options{ymax} );
+ sendRangeCommand( "zrange", $options{zmin}, $options{zmax} );
+ sendRangeCommand( "cbrange", $options{zmin}, $options{zmax} ) if($options{colormap});
+
- # a point may be preceded by an id
- my $pointRE = $options{dataid} ? '(\w+)\s+' : '()';
- $pointRE .= '(' . join('\s+', ($numRE) x $valuesPerPoint) . ')';
- $pointRE = qr/$pointRE/;
+
+ # latest domain variable present in our data
+ my $latestX;
+
+ # The domain of the current point
my @domain;
- my $haveNewData;
+
+ # The x-axis domain represented as a number. This is exactly the same as
+ # $domain[0] unless the x-axis domain uses a timefmt. Then this is the
+ # number of seconds since the UNIX epoch.
+ my $domain0_numeric;
# I should be using the // operator, but I'd like to be compatible with perl 5.8
while( $_ = (defined $dataQueue ? $dataQueue->dequeue() : <>))
{
next if /^#/o;
- if($_ ne 'Plot now')
+ if( $options{stream} )
+ {
+ if(/^clear/o )
+ {
+ clearCurves();
+ next;
+ }
+
+ if(/^replot/o )
+ {
+ # /timertick/ determines if the timer was the source of the replot
+ replot( $domain0_numeric, /timertick/ );
+ next;
+ }
+
+ # /exit/ is handled in the data-reading thread
+ }
+
+ if(! /^replot/o)
{
# parse the incoming data lines. The format is
# x id0 dat0 id1 dat1 ....
@@ -364,14 +662,49 @@ sub mainThread
# line is used)
# 3d plots require $options{domain}, and dictate "x y" for the domain instead of just "x"
+ my @fields = split;
+
if($options{domain})
{
- /($numRE)/go or next;
- $domain[0] = $1;
- if($options{'3d'})
+ if( $options{timefmt} )
{
- /($numRE)/go or next;
- $domain[1] = $1;
+ # no point in doing anything unless I have at least the domain and
+ # 1 piece of data
+ next if @fields < $options{timefmt_Ncols}+1;
+
+ $domain[0] = join (' ', splice( @fields, 0, $options{timefmt_Ncols}) );
+ $domain0_numeric = makeDomainNumeric( $domain[0] );
+ }
+ elsif(!$options{'3d'})
+ {
+ # no point in doing anything unless I have at least the domain and
+ # 1 piece of data
+ next if @fields < 1+1;
+
+ $domain[0] = $domain0_numeric = shift @fields;
+ }
+ else
+ {
+ # no point in doing anything unless I have at least the domain and
+ # 1 piece of data
+ next if @fields < 2+1;
+
+ @domain = splice(@fields, 0, 2);
+ }
+
+ if( $options{monotonic} )
+ {
+ if( defined $latestX && $domain0_numeric < $latestX )
+ {
+ # the x-coordinate of the new point is in the past, so I wipe out
+ # all the data and start anew. Before I wipe the old data, I
+ # replot the old data
+ replot( $domain0_numeric );
+ clearCurves();
+ $latestX = undef;
+ }
+ else
+ { $latestX = $domain0_numeric; }
}
}
else
@@ -380,53 +713,67 @@ sub mainThread
# $. on the data queue in that case
if(defined $dataQueue)
{
- s/ ([\d]+)$//o;
- $domain[0] = $1;
+ $domain[0] = pop @fields;
}
else
{
$domain[0] = $.;
}
+ $domain0_numeric = makeDomainNumeric( $domain[0] );
}
my $id = -1;
- while (/$pointRE/go)
- {
- if($1 ne '') {$id = $1;}
- else {$id++; }
-
- $haveNewData = 1;
- pushPoint(getCurve($id),
- [@domain, split( /\s+/, $2)]);
- }
- }
-
- elsif($options{stream})
- {
- # only redraw a streaming plot if there's new data to plot
- next unless $haveNewData;
- $haveNewData = undef;
- if( $options{xlen} )
+ while(@fields)
{
- pruneOldData($domain[0] - $options{xlen});
- plotStoredData($domain[0] - $options{xlen}, $domain[0]);
+ if($options{dataid})
+ {
+ $id = shift @fields;
+ }
+ else
+ {
+ $id++;
+ }
+
+ # I'd like to use //, but I guess some people are still on perl 5.8
+ my $rangesize = exists $options{rangesize_hash}{$id} ?
+ $options{rangesize_hash}{$id} :
+ $options{rangesize_default};
+
+ last if @fields < $rangesize;
+
+ pushPoint(getCurve($id),
+ join(' ',
+ @domain,
+ splice( @fields, 0, $rangesize ) ) . "\n",
+ $domain0_numeric);
}
- else
- { plotStoredData(); }
}
}
+ # if we were streaming, we're now done!
+ if( $options{stream} )
+ {
+ return;
+ }
+
# finished reading in all. Plot what we have
plotStoredData();
- if ( $options{hardcopy})
+ if ( defined $options{hardcopy})
{
print PIPE "set output\n";
- # sleep until the plot file exists, and it is closed. Sometimes the output is
- # still being written at this point
- usleep(100_000) until -e $outputfile;
- usleep(100_000) until(system("fuser -s \"$outputfile\""));
+
+ # sleep until the plot file exists, and it is closed. Sometimes the output
+ # is still being written at this point. If the output filename starts with
+ # '|', gnuplot pipes the output to that process, instead of writing to a
+ # file. In that case I don't make sure the file exists, since there IS no
+ # file
+ if( $options{hardcopy} !~ /^\|/ )
+ {
+ usleep(100_000) until -e $outputfile;
+ usleep(100_000) until(system("fuser -s \"$outputfile\""));
+ }
print "Wrote output to $outputfile\n";
return;
@@ -435,46 +782,53 @@ sub mainThread
# we persist gnuplot, so we shouldn't need this sleep. However, once
# gnuplot exits, but the persistent window sticks around, you can no
# longer interactively zoom the plot. So we still sleep
- sleep(100000);
+ sleep(100000) unless $options{dump} || $options{exit};
}
sub pruneOldData
{
my ($oldestx) = @_;
- foreach my $xy (@curves)
+ foreach my $curve (@curves)
{
- if( @$xy > 1 )
+ next unless $curve->{datastring};
+
+ my $meta = $curve->{datastring_meta};
+
+ my $firstInWindow = first {$meta->[$_]{domain} >= $oldestx} 0..$#$meta;
+ if ( !defined $firstInWindow )
{
- if( my $firstInWindow = first {$xy->[$_][0] >= $oldestx} 1..$#$xy )
- { splice( @$xy, 1, $firstInWindow-1 ); }
- else
- { splice( @$xy, 1); }
+ # everything is too old. Clear out all the data
+ $curve->{datastring} = '';
+ $curve->{datastring_meta} = [];
+ $curve->{datastring_offset} = 0;
+ }
+ elsif ( $firstInWindow >= 2 )
+ {
+ # clear out everything that's too old, except for one point. This point
+ # will be off the plot, but if we're plotting lines there will be a
+ # connecting line to it. Some of the line will be visible
+ substr( $curve->{datastring}, 0,
+ $meta->[$firstInWindow-1]{offset_start} - $curve->{datastring_offset},
+ '' );
+ $curve->{datastring_offset} = $meta->[$firstInWindow-1]{offset_start};
}
}
}
sub plotStoredData
{
- my ($xmin, $xmax) = @_;
- print PIPE "set xrange [$xmin:$xmax]\n" if defined $xmin;
+ # get the options for those curves that have any data
+ my @nonemptyCurves = grep { $_->{datastring} } @curves;
+ my @extraopts = map {$_->{options}} @nonemptyCurves;
- # get the options for those curves that have any data
- my @nonemptyCurves = grep {@$_ > 1} @curves;
- my @extraopts = map {$_->[0]{options}} @nonemptyCurves;
-
- my $body = join(', ' , map({ '"-"' . $_} @extraopts) );
+ my $body = join(', ' , map({ "'-' $_" } @extraopts) );
if($options{'3d'}) { print PIPE "splot $body\n"; }
else { print PIPE "plot $body\n"; }
- foreach my $buf (@nonemptyCurves)
+ foreach my $curve (@nonemptyCurves)
{
- # send each point to gnuplot. Ignore the first "point" since it's the
- # curve options
- for my $elem (@{$buf}[1..$#$buf])
- {
- print PIPE "@$elem\n";
- }
+ print PIPE $curve->{datastring};
print PIPE "e\n";
}
}
@@ -486,19 +840,51 @@ sub updateCurveOptions
# case. When no title is specified, gnuplot will still add a legend entry with an unhelpful '-'
# label. Thus I explicitly do 'notitle' for that case
- my ($curveoptions, $id) = @_;
+ my ($curve, $id) = @_;
# use the given title, unless we're generating a legend automatically. Given titles
# override autolegend
my $title;
- if(defined $curveoptions->{title})
- { $title = $curveoptions->{title}; }
+ if(defined $curve->{title})
+ { $title = $curve->{title}; }
elsif( $options{autolegend} )
{ $title = $id; }
my $titleoption = defined $title ? "title \"$title\"" : "notitle";
- my $extraoption = defined $options{curvestyleall} ? $options{curvestyleall} : '';
- $curveoptions->{options} = "$titleoption $curveoptions->{extraoptions} $extraoption";
+
+ my ($curvestyleall);
+ if( defined $options{curvestyle_hash}{$id} )
+ {
+ # I have a curve-specific style set with --curvestyle. This style lives in
+ # $curve->{extraoptions}, and it overrides the global styles
+ $curvestyleall = '';
+ }
+ else
+ {
+ $curvestyleall = $options{curvestyleall};
+ }
+
+ my $histoptions = $curve->{histoptions} || '';
+
+ my $usingoptions = '';
+ if( $options{timefmt} )
+ {
+ # with --timefmt I need an explicit 'using' specification. I specify the
+ # columns as 1:2:3..... I need the right number of columns (this is given
+ # as 1 + rangesize). I also need to start the range at the first column
+ # past the timefmt
+
+ # I'd like to use //, but I guess some people are still on perl 5.8
+ my $rangesize = exists $options{rangesize_hash}{$id} ?
+ $options{rangesize_hash}{$id} :
+ $options{rangesize_default};
+
+ my @rest = map {$_ + $options{timefmt_Ncols}} (1..$rangesize);
+
+ $usingoptions = "using 1:" . join(':', @rest);
+ }
+
+ $curve->{options} = "$histoptions $usingoptions $titleoption $curve->{extraoptions} $curvestyleall";
}
sub getCurve
@@ -510,17 +896,20 @@ sub getCurve
{
print STDERR "Tried to exceed the --maxcurves setting.\n";
print STDERR "Invoke with a higher --maxcurves limit if you really want to do this.\n";
- exit;
+ exit -1;
}
my ($id) = @_;
if( !exists $curveIndices{$id} )
{
- push @curves, [{extraoptions => ' '}]; # push a curve with no data and no options
+ push @curves, {extraoptions => ' ',
+ datastring => '',
+ datastring_meta => [],
+ datastring_offset => 0}; # push a curve with no data and no options
$curveIndices{$id} = $#curves;
- updateCurveOptions($curves[$#curves][0], $id);
+ updateCurveOptions($curves[$#curves], $id);
}
return $curves[$curveIndices{$id}];
}
@@ -530,8 +919,8 @@ sub addCurveOption
my ($id, $str) = @_;
my $curve = getCurve($id);
- $curve->[0]{extraoptions} .= "$str ";
- updateCurveOptions($curve->[0], $id);
+ $curve->{extraoptions} .= "$str ";
+ updateCurveOptions($curve, $id);
}
sub setCurveLabel
@@ -539,37 +928,114 @@ sub setCurveLabel
my ($id, $str) = @_;
my $curve = getCurve($id);
- $curve->[0]{title} = $str;
- updateCurveOptions($curve->[0], $id);
+ $curve->{title} = $str;
+ updateCurveOptions($curve, $id);
}
-# function to add a point to the plot. Assumes that the curve indexed by $idx already exists
-sub pushPoint
+sub setCurveAsHistogram
{
- my ($curve, $xy) = @_;
+ my ($id, $str) = @_;
+
+ my $curve = getCurve($id);
+ $curve->{histoptions} = 'using (histbin($2)):(1.0) smooth ' . $options{histstyle};
+
+ updateCurveOptions($curve, $id);
+}
+
+# remove all the curve data
+sub clearCurves
+{
+ foreach my $curve(@curves)
+ {
+ $curve->{datastring} = '';
+ $curve->{datastring_meta} = [];
+ $curve->{datastring_offset} = 0;
+ }
+}
+
+sub replot
+{
+ return unless $haveNewData;
+ $haveNewData = undef;
+
+ return if !$options{stream};
- if($options{monotonic})
+
+ # The logic involving domain rollover replotting due to --monotonic is a bit
+ # tricky. I want this:
+
+ # if( domain rolls over slowly )
+ # {
+ # should update on a timer;
+ # when the domain rolls over, --monotonic should force a replot
+ # }
+ # if( domain rolls over quickly )
+ # {
+ # should update when the domain rolls over,
+ # at most as quickly as the timer indicates
+ # }
+
+
+ my ($domain0_numeric, $replot_is_from_timer) = @_;
+
+ my $now = [gettimeofday];
+
+ if( # If there is no replot timer at all, replot at any indication
+ $options{stream} < 0 ||
+
+ # if the last replot was timer-based, but this one isn't, force a replot.
+ # This makes sure that a replot happens for a domain rollover shortly
+ # after a timer replot
+ !$replot_is_from_timer && $last_replot_is_from_timer ||
+
+ # if enough time has elapsed since the last replot, it's ok to replot
+ tv_interval ( $last_replot_time, $now ) > 0.8*$options{stream} )
{
- if( @$curve > 1 && $xy->[0] < $curve->[$#{$curve}][0] )
+ # ok, then. We really need to replot
+ if ( defined $options{xlen} )
{
- # the x-coordinate of the new point is in the past, so I wipe out all the data for this curve
- # and start anew
- splice( @$curve, 1, @$curve-1 );
+ # we have an --xlen, so we need to clean out the old data
+ pruneOldData( $domain0_numeric - $options{xlen} );
+
+ my ($xmin, $xmax) = ($domain0_numeric - $options{xlen}, $domain0_numeric);
+ if ( defined $options{timefmt} )
+ {
+ # if we're using a timefmt, I need to convert my xmin range from
+ # seconds-since-the-epoch BACK to the timefmt. Sheesh
+ ($xmin, $xmax) = map {Time::Piece->strptime( $_, '%s' )->strftime( $options{timefmt} ) } ($xmin, $xmax);
+ }
+ sendRangeCommand( "xrange", $xmin, $xmax );
}
+
+ plotStoredData();
+
+
+ # update replot state
+ $last_replot_time = $now;
+ $last_replot_is_from_timer = $replot_is_from_timer;
}
+}
+
+# function to add a point to the plot. Assumes that the curve indexed by $idx already exists
+sub pushPoint
+{
+ my ($curve, $datastring, $domain0_numeric) = @_;
+
+ push @{$curve->{datastring_meta}}, { offset_start => length( $curve->{datastring} ) + $curve->{datastring_offset},
+ domain => $domain0_numeric };
+ $curve->{datastring} .= $datastring;
- push @$curve, $xy;
+ $haveNewData = 1;
}
-__END__
=head1 NAME
-feedGnuplot - A pipe-oriented frontend to Gnuplot
+feedgnuplot - General purpose pipe-oriented plotting tool
=head1 SYNOPSIS
-Simple plotting of stored data:
+Simple plotting of piped data:
$ seq 5 | awk '{print 2*$1, $1*$1}'
2 1
@@ -579,14 +1045,55 @@ Simple plotting of stored data:
10 25
$ seq 5 | awk '{print 2*$1, $1*$1}' |
- feedGnuplot --lines --points --legend 0="data 0" --title "Test plot" --y2 1
+ feedgnuplot --lines --points --legend 0 "data 0" --title "Test plot" --y2 1
+ --terminal 'dumb 80,40' --exit
+
+ Test plot
+
+ 10 ++------+--------+-------+-------+-------+--------+-------+------*A 25
+ + + + + + + + + **#+
+ | : : : : : : data 0+**A*** |
+ | : : : : : : :** # |
+ 9 ++.......................................................**.##....|
+ | : : : : : : ** :# |
+ | : : : : : : ** # |
+ | : : : : : :** ##: ++ 20
+ 8 ++................................................A....#..........|
+ | : : : : : **: # : |
+ | : : : : : ** : ## : |
+ | : : : : : ** :# : |
+ | : : : : :** B : |
+ 7 ++......................................**......##................|
+ | : : : : ** : ## : : ++ 15
+ | : : : : ** : # : : |
+ | : : : :** : ## : : |
+ 6 ++..............................*A.......##.......................|
+ | : : : ** : ##: : : |
+ | : : : ** : # : : : |
+ | : : :** : ## : : : ++ 10
+ 5 ++......................**........##..............................|
+ | : : ** : #B : : : |
+ | : : ** : ## : : : : |
+ | : :** : ## : : : : |
+ 4 ++...............A.......###......................................|
+ | : **: ##: : : : : |
+ | : ** : ## : : : : : ++ 5
+ | : ** : ## : : : : : |
+ | :** ##B# : : : : : |
+ 3 ++.....**..####...................................................|
+ | **#### : : : : : : |
+ | **## : : : : : : : |
+ B** + + + + + + + +
+ 2 A+------+--------+-------+-------+-------+--------+-------+------++ 0
+ 1 1.5 2 2.5 3 3.5 4 4.5 5
+
Simple real-time plotting example: plot how much data is received on the wlan0
network interface in bytes/second (uses bash, awk and Linux):
$ while true; do sleep 1; cat /proc/net/dev; done |
- awk '/wlan0/ {if(b) {print $2-b; fflush()} b=$2}' |
- feedGnuplot --lines --stream --xlen 10 --ylabel 'Bytes/sec' --xlabel seconds
+ gawk '/wlan0/ {if(b) {print $2-b; fflush()} b=$2}' |
+ feedgnuplot --lines --stream --xlen 10 --ylabel 'Bytes/sec' --xlabel seconds
=head1 DESCRIPTION
@@ -595,23 +1102,31 @@ plots from data coming in on STDIN or given in a filename passed on the
commandline. Various data representations are supported, as is hardcopy
output and streaming display of live data. A simple example:
- $ seq 5 | awk '{print 2*$1, $1*$1}' | feedGnuplot
+ $ seq 5 | awk '{print 2*$1, $1*$1}' | feedgnuplot
You should see a plot with two curves. The C<awk> command generates some data to
-plot and the C<feedGnuplot> reads it in from STDIN and generates the plot. The
+plot and the C<feedgnuplot> reads it in from STDIN and generates the plot. The
C<awk> invocation is just an example; more interesting things would be plotted
in normal usage. No commandline-options are required for the most basic
plotting. Input parsing is flexible; every line need not have the same number of
points. New curves will be created as needed.
The most commonly used functionality of gnuplot is supported directly by the
-script. Anything not directly supported can still be done with the
-C<--extracmds> and C<--curvestyle> options. Arbitrary gnuplot commands can be
-passed in with C<--extracmds>. For example, to turn off the grid, pass in
-C<--extracmds 'unset grid'>. As many of these options as needed can be passed
-in. To add arbitrary curve styles, use C<--curvestyle curveID=extrastyle>. Pass
-these more than once to affect more than one curve. To apply an extra style to
-I<all> the curves, pass in C<--curvestyleall extrastyle>.
+script. Anything not directly supported can still be done with options such as
+C<--set>, C<--extracmds>, C<--style>, etc. Arbitrary gnuplot commands can be
+passed in with C<--extracmds>. For example, to turn off the grid, you can pass
+in C<--extracmds 'unset grid'>. The commands C<--set> and C<--unset> exist to
+provide nicer syntax, so this is equivalent to passing C<--unset grid>. As many
+of these options as needed can be passed in. To add arbitrary curve styles, use
+C<--style curveID extrastyle>. Pass these more than once to affect more than one
+curve.
+
+To apply an extra style to I<all> the curves that lack an explicit C<--style>,
+pass in C<--styleall extrastyle>. In the most common case, the extra style is
+C<with something>. To support this more simply, you can pass in C<--with
+something> instead of C<--styleall 'with something'>. C<--styleall> and
+C<--with> are mutually exclusive. Furthermore, any curve-specific C<--style>
+overrides the global C<--styleall> or C<--with> setting.
=head2 Data formats
@@ -627,9 +1142,9 @@ interpreted as the I<X>-value for the rest of the data on that line. Without
C<--domain> the I<X>-value is the line number, and the first value on a line is
a plain data point like the others. Default is C<--nodomain>. Thus the original
example above produces 2 curves, with B<1,2,3,4,5> as the I<X>-values. If we run
-the same command with --domain:
+the same command with C<--domain>:
- $ seq 5 | awk '{print 2*$1, $1*$1}' | feedGnuplot --domain
+ $ seq 5 | awk '{print 2*$1, $1*$1}' | feedgnuplot --domain
we get only 1 curve, with B<2,4,6,8,10> as the I<X>-values. As many points as
desired can appear on a single line, but all points on a line are associated
@@ -642,7 +1157,7 @@ data is to be plotted. With the C<--dataid> option, each point is represented by
2 values: a string identifying the curve, and the value itself. If we add
C<--dataid> to the original example:
- $ seq 5 | awk '{print 2*$1, $1*$1}' | feedGnuplot --dataid --autolegend
+ $ seq 5 | awk '{print 2*$1, $1*$1}' | feedgnuplot --dataid --autolegend
we get 5 different curves with one point in each. The first column, as produced
by C<awk>, is B<2,4,6,8,10>. These are interpreted as the IDs of the curves to
@@ -654,18 +1169,24 @@ conjunction with C<--dataid>.
=head3 Multi-value style support
Depending on how gnuplot is plotting the data, more than one value may be needed
-to represent a single point. For example, the script has support to plot all the
-data with C<--circles>. This requires a radius to be specified for each point in
-addition to the position of the point. Thus, when plotting with C<--circles>, 2
-numbers are read for each data point instead of 1. A similar situation exists
-with C<--colormap> where each point contains the position I<and> the
-color. There are other gnuplot styles that require more data (such as error
-bars), but none of these are directly supported by the script. They can still be
-used, though, by specifying the specific style with C<--curvestyle>, and
-specifying how many extra values are needed for each point with
-C<--extraValuesPerPoint extra>. C<--extraValuesPerPoint> is ONLY needed for the
-styles not explicitly supported; supported styles set that variable
-automatically.
+to represent the range of a single point. Basic 2D plots have 2 numbers
+representing each point: 1 domain and 1 range. But if plotting with
+C<--circles>, for instance, then there's an extra range value: the radius. A
+similar situation exists with C<--colormap> where each point contains the
+position I<and> the color. There are other gnuplot styles that require more data
+(such as error bars), but none of these are directly supported by the script.
+They can still be used, however, by specifying the specific style with
+C<--style>, and specifying how many values are needed for each point with
+C<--rangesizeall> or C<--rangesize> or C<--extraValuesPerPoint>. Those options
+that specify the range size are required I<only> for styles not explicitly
+supported by feedgnuplot; supported styles do the right thing automatically.
+
+More examples: if making a 2d plot of y error bars where gnuplot expects a
+(x,y,ydelta) tuple for each point, you want C<--rangesizeall 2> because you have
+one domain value (x) and 2 range values (y,ydelta). Gnuplot can also plot
+lopsided y errorbars by giving a tuple (x,y,ylow,yhigh). This is similar to the
+previous case, but you want C<--rangesizeall 3> instead.
+
=head3 3D data
@@ -676,21 +1197,96 @@ instead of I<Y> as a function of I<X>). Thus the first 2 values on each line are
interpreted as the domain instead of just 1. The rest of the processing happens
the same way as before.
+=head3 Time/date data
+
+If the input data domain is a time/date, this can be interpreted with
+C<--timefmt>. This option takes a single argument: the format to use to parse
+the data. The format is documented in 'set timefmt' in gnuplot, although the
+common flags that C<strftime> understands are generally supported. The backslash
+sequences in the format are I<not> supported, so if you want a tab, put in a tab
+instead of \t. Whitespace in the format I<is> supported. When this flag is
+given, some other options act a little bit differently:
+
+=over
+
+=item
+
+C<--xlen> is an I<integer> in seconds
+
+=item
+
+C<--xmin> and C<--xmax> I<must> use the format passed in to C<--timefmt>
+
+=back
+
+Using this option changes both the way the input is parsed I<and> the way the
+x-axis tics are labelled. Gnuplot tries to be intelligent in this labelling, but
+it doesn't always do what the user wants. The labelling can be controlled with
+the gnuplot C<set format> command, which takes the same type of format string as
+C<--timefmt>. Example:
+
+ $ sar 1 -1 |
+ awk '$1 ~ /..:..:../ && $8 ~/^[0-9\.]*$/ {print $1,$8; fflush()}' |
+ feedgnuplot --stream --domain
+ --lines --timefmt '%H:%M:%S'
+ --set 'format x "%H:%M:%S"'
+
+This plots the 'idle' CPU consumption against time.
+
+Note that while gnuplot supports the time/date on any axis, I<feedgnuplot>
+currently supports it I<only> as the x-axis domain. This may change in the
+future.
+
=head2 Real-time streaming data
-To plot real-time data, pass in the C<--stream> option. Data will then be
-plotted as it is received, with the refresh rate limited to 1Hz (currently
-hard-coded). To plot only the most recent data (instead of I<all> the data),
-C<--xlen windowsize> can be given. This will create an constantly-updating,
-scrolling view of the recent past. C<windowsize> should be replaced by the
-desired length of the domain window to plot, in domain units (passed-in values
-if C<--domain> or line numbers otherwise).
+To plot real-time data, pass in the C<--stream [refreshperiod]> option. Data
+will then be plotted as it is received. The plot will be updated every
+C<refreshperiod> seconds. If the period isn't specified, a 1Hz refresh rate is
+used. To refresh at specific intervals indicated by the data, set the
+refreshperiod to 0 or to 'trigger'. The plot will then I<only> be refreshed when
+a data line 'replot' is received. This 'replot' command works in both triggered
+and timed modes, but in triggered mode, it's the only way to replot. Look in
+L</"Special data commands"> for more information.
+
+To plot only the most recent data (instead of I<all> the data), C<--xlen
+windowsize> can be given. This will create a constantly-updating, scrolling
+view of the recent past. C<windowsize> should be replaced by the desired length
+of the domain window to plot, in domain units (passed-in values if C<--domain>
+or line numbers otherwise). If the domain is a time/date via C<--timefmt>, then
+C<windowsize> is an I<integer> in seconds.
+
+=head3 Special data commands
+
+If we are reading streaming data, the input stream can contain special commands
+in addition to the raw data. Feedgnuplot looks for these at the start of every
+input line. If a command is detected, the rest of the line is discarded. These
+commands are
+
+=over
+
+=item C<replot>
+
+This command refreshes the plot right now, instead of waiting for the next
+refresh time indicated by the timer. This command works in addition to the timed
+refresh, as indicated by C<--stream [refreshperiod]>.
+
+=item C<clear>
+
+This command clears out the current data in the plot. The plotting process
+continues, however, with any data that follows the C<clear>.
+
+=item C<exit>
+
+This command causes feedgnuplot to exit.
+
+=back
=head2 Hardcopy output
The script is able to produce hardcopy output with C<--hardcopy outputfile>. The
-output type is inferred from the filename with B<.ps>, B<.eps>, B<.pdf> and
-B<.png> currently supported.
+output type can be inferred from the filename, if B<.ps>, B<.eps>, B<.pdf>,
+B<.svg> or B<.png> is requested. If any other file type is requested,
+C<--terminal> I<must> be passed in to tell gnuplot how to make the plot.
=head2 Self-plotting data files
@@ -702,7 +1298,7 @@ doing this: with a shebang (#!) or with inline perl data.
A self-plotting, executable data file C<data> is formatted as
$ cat data
- #!/usr/bin/feedGnuplot --lines --points
+ #!/usr/bin/feedgnuplot --lines --points
2 1
4 4
6 9
@@ -724,10 +1320,10 @@ data file can be plotted simply with
$ ./data
-The caveats here are that on Linux the whole #! line is limited to 127 charaters
-and that the full path to feedGnuplot must be given. The 127 character limit is
-a serious limitation, but this can likely be resolved with a kernel patch. I
-have only tried on Linux 2.6.
+The caveats here are that on Linux the whole #! line is limited to 127
+characters and that the full path to feedgnuplot must be given. The 127
+character limit is a serious limitation, but this can likely be resolved with a
+kernel patch. I have only tried on Linux 2.6.
=head3 Self-plotting data with perl inline data
@@ -739,7 +1335,7 @@ create self-plotting files:
use strict;
use warnings;
- open PLOT, "| feedGnuplot --lines --points" or die "Couldn't open plotting pipe";
+ open PLOT, "| feedgnuplot --lines --points" or die "Couldn't open plotting pipe";
while( <DATA> )
{
my @xy = split;
@@ -763,127 +1359,407 @@ create self-plotting files:
30 225
This is especially useful if the logged data is not in a format directly
-supported by feedGnuplot. Raw data can be stored after the __DATA__ directive,
+supported by feedgnuplot. Raw data can be stored after the __DATA__ directive,
with a small perl script to manipulate the data into a useable format and send
it to the plotter.
=head1 ARGUMENTS
- --[no]domain If enabled, the first element of each line is the
- domain variable. If not, the point index is used
+=over
+
+=item
+
+C<--[no]domain>
+
+If enabled, the first element of each line is the domain variable. If not, the
+point index is used
+
+=item
+
+C<--[no]dataid>
+
+If enabled, each data point is preceded by the ID of the data set that point
+corresponds to. This ID is interpreted as a string, NOT as just a number. If not
+enabled, the order of the point is used.
+
+As an example (a command-line sketch follows this list), if line 3 of the input
+is "0 9 1 20" then
+
+=over
+
+=item
+
+C<--nodomain --nodataid> would parse the 4 numbers as points in 4 different
+curves at x=3
+
+=item
+
+C<--domain --nodataid> would parse the 4 numbers as points in 3 different
+curves at x=0. Here, 0 is the x-variable and 9,1,20 are the data values
+
+=item
+
+C<--nodomain --dataid> would parse the 4 numbers as points in 2 different
+curves at x=3. Here 0 and 1 are the data IDs and 9 and 20 are the
+data values
+
+=item
+
+C<--domain --dataid> would parse the 4 numbers as a single point at
+x=0. Here 9 is the data ID and 1 is the data value. 20 is an extra
+value, so it is ignored. If another value followed 20, we'd get another
+point in curve ID 20
+
+=back
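+
+As a command-line sketch of the C<--dataid> case, the following toy pipeline
+produces lines such as "a 1 b 2", which are parsed as two curves named "a" and
+"b" with the line number as the x-value:
+
+ $ seq 0 4 | awk '{print "a", $1*$1, "b", 2*$1}' |
+     feedgnuplot --dataid --autolegend --lines --points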
+
+=item
+
+C<--[no]3d>
+
+Do [not] plot in 3D. This only makes sense with C<--domain>. Each domain here is
+an (x,y) tuple
+
+=item
+
+C<--timefmt [format]>
+
+Interpret the X data as a time/date, parsed with the given format
+
+=item
+
+C<--colormap>
+
+Show a colormapped xy plot. Requires extra data for the color. zmin/zmax can be
+used to set the extents of the colors. Automatically sets the C<--rangesize>.
+
+=item
+
+C<--stream [period]>
+
+Plot the data as it comes in, in realtime. If period is given, replot every
+period seconds. If no period is given, replot at 1Hz. If the period is given as
+0 or 'trigger', replot I<only> when the incoming data dictates this. See the
+L</"Real-time streaming data"> section of the man page.
+
+=item
+
+C<--[no]lines>
+
+Do [not] draw lines to connect consecutive points
+
+=item
+
+C<--[no]points>
- --[no]dataid If enabled, each data point is preceded by the ID
- of the data set that point corresponds to. This ID is
- interpreted as a string, NOT as just a number. If not
- enabled, the order of the point is used.
+Do [not] draw points
-As an example, if line 3 of the input is "0 9 1 20"
- '--nodomain --nodataid' would parse the 4 numbers as points in 4
- different curves at x=3
-
- '--domain --nodataid' would parse the 4 numbers as points in 3 different
- curves at x=0. Here, 0 is the x-variable and 9,1,20 are the data values
-
- '--nodomain --dataid' would parse the 4 numbers as points in 2 different
- curves at x=3. Here 0 and 1 are the data IDs and 9 and 20 are the
- data values
+=item
- '--domain --dataid' would parse the 4 numbers as a single point at
- x=0. Here 9 is the data ID and 1 is the data value. 20 is an extra
- value, so it is ignored. If another value followed 20, we'd get another
- point in curve ID 20
+C<--circles>
- --[no]3d Do [not] plot in 3D. This only makes sense with --domain.
- Each domain here is an (x,y) tuple
+Plot with circles. This requires a radius be specified for each point.
+Automatically sets the C<--rangesize>. I<Not> supported for 3d plots.
- --colormap Show a colormapped xy plot. Requires extra data for the color.
- zmin/zmax can be used to set the extents of the colors.
- Automatically increments extraValuesPerPoint
+=item
- --[no]stream Do [not] display the data a point at a time, as it
- comes in
+C<--title xxx>
- --[no]lines Do [not] draw lines to connect consecutive points
- --[no]points Do [not] draw points
- --circles Plot with circles. This requires a radius be specified for
- each point. Automatically increments extraValuesPerPoint
+Set the title of the plot
- --xlabel xxx Set x-axis label
- --ylabel xxx Set y-axis label
- --y2label xxx Set y2-axis label. Does not apply to 3d plots
- --zlabel xxx Set y-axis label. Only applies to 3d plots
+=item
- --title xxx Set the title of the plot
+C<--legend curveID legend>
- --legend curveID=legend
- Set the label for a curve plot. Use this option multiple times
- for multiple curves. With --dataid, curveID is the ID. Otherwise,
- it's the index of the curve, starting at 0
+Set the label for a curve plot. Use this option multiple times for multiple
+curves. With C<--dataid>, curveID is the ID. Otherwise, it's the index of the
+curve, starting at 0
- --autolegend Use the curve IDs for the legend. Titles given with --legend
- override these
+=item
- --xlen xxx When using --stream, sets the size of the x-window to plot.
- Omit this or set it to 0 to plot ALL the data. Does not
- make sense with 3d plots. Implies --monotonic
+C<--autolegend>
- --xmin xxx Set the range for the x axis. These are ignored in a
- streaming plot
- --xmax xxx Set the range for the x axis. These are ignored in a
- streaming plot
- --ymin xxx Set the range for the y axis.
- --ymax xxx Set the range for the y axis.
- --y2min xxx Set the range for the y2 axis. Does not apply to 3d plots.
- --y2max xxx Set the range for the y2 axis. Does not apply to 3d plots.
- --zmin xxx Set the range for the z axis. Only applies to 3d plots or colormaps.
- --zmax xxx Set the range for the z axis. Only applies to 3d plots or colormaps.
+Use the curve IDs for the legend. Titles given with C<--legend> override these
- --y2 xxx Plot the data specified by this curve ID on the y2 axis.
- Without --dataid, the ID is just an ordered 0-based index.
- Does not apply to 3d plots.
+=item
- --curvestyle curveID=style
- Additional styles per curve. With --dataid, curveID is the
- ID. Otherwise, it's the index of the curve, starting at 0. Use
- this option multiple times for multiple curves
+C<--xlen xxx>
- --curvestyleall xxx Additional styles for ALL curves.
+When using C<--stream>, sets the size of the x-window to plot. Omit this or set
+it to 0 to plot ALL the data. Does not make sense with 3d plots. Implies
+C<--monotonic>
- --extracmds xxx Additional commands. These could contain extra global styles
- for instance
+=item
- --size xxx Gnuplot size option
+C<--xmin/xmax/ymin/ymax/y2min/y2max/zmin/zmax xxx>
- --square Plot data with aspect ratio 1. For 3D plots, this controls the
- aspect ratio for all 3 axes
+Set the range for the given axis. These x-axis bounds are ignored in a streaming
+plot. The y2-axis bounds do not apply in 3d plots. The z-axis bounds apply
+I<only> to 3d plots or colormaps.
- --square_xy For 3D plots, set square aspect ratio for ONLY the x,y axes
+=item
- --hardcopy xxx If not streaming, output to a file specified here. Format
- inferred from filename
+C<--xlabel/ylabel/y2label/zlabel xxx>
- --maxcurves xxx The maximum allowed number of curves. This is 100 by default,
- but can be reset with this option. This exists purely to
- prevent perl from allocating all of the system's memory when
- reading bogus data
+Label the given axis. The y2-axis label does not apply to 3d plots while the
+z-axis label applies I<only> to 3d plots.
- --monotonic If --domain is given, checks to make sure that the x-
- coordinate in the input data is monotonically increasing.
- If a given x-variable is in the past, all data currently
- cached for this curve is purged. Without --monotonic, all
- data is kept. Does not make sense with 3d plots.
- No --monotonic by default.
-
- --extraValuesPerPoint xxx
- How many extra values are given for each data point. Normally this
- is 0, and does not need to be specified, but sometimes we want
- extra data, like for colors or point sizes or error bars, etc.
- feedGnuplot options that require this (colormap, circles)
- automatically set it. This option is ONLY needed if unknown styles are
- used, with --curvestyleall for instance
-
- --dump Instead of printing to gnuplot, print to STDOUT. For
- debugging.
+=item
+
+C<--y2 xxx>
+
+Plot the data specified by this curve ID on the y2 axis. Without C<--dataid>,
+the ID is just an ordered 0-based index. Does not apply to 3d plots. Can be
+passed multiple times, or passed a comma-separated list. By default the y2-axis
+curves look the same as the y-axis ones. I.e. the viewer of the resulting plot
+has to be told which is which via an axes label, legend, etc. Prior to version
+1.25 of feedgnuplot the curves plotted on the y2 axis were drawn with a thicker
+line. This is no longer the case, but that behavior can be brought back by
+passing something like
+
+ --y2 curveid --style curveid 'linewidth 3'
+
+=item
+
+C<--histogram curveID>
+
+Set up this specific curve to plot a histogram. The bin width is given with the
+C<--binwidth> option (assumed 1.0 if omitted). C<--histogram> does I<not> touch
+the drawing style. It is often desired to plot these with boxes, and this
+I<must> be explicitly requested by C<--with boxes>. This works with C<--domain>
+and/or C<--stream>, but in those cases the x-value is used I<only> to cull old
+data because of C<--xlen> or C<--monotonic>. I.e. the x-values are I<not> drawn
+in any way. Can be passed multiple times, or passed a comma-separated list
+
+=item
+
+C<--binwidth width>
+
+The width of bins when making histograms. This setting applies to ALL histograms
+in the plot. Defaults to 1.0 if not given.
+
+=item
+
+C<--histstyle style>
+
+Normally, histograms are generated with the 'smooth freq' gnuplot style.
+C<--histstyle> can be used to select different 'smooth' settings. Allowed are
+'unique', 'cumulative' and 'cnormal'. 'unique' indicates whether a bin has at
+least one item in it: instead of counting the items, it'll always report 0 or 1.
+'cumulative' is the integral of the "normal" histogram. 'cnormal' is like
+'cumulative', but rescaled to end up at 1.0.
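+
+For instance, a cumulative histogram of file sizes (a variation on the recipe
+at the end of this page) could be requested with:
+
+ $ ls -l | awk '{print $5/1e6}' |
+     feedgnuplot --histogram 0 --histstyle cumulative --with boxes --xlabel 'File size (MB)'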
+
+=item
+
+C<--style curveID style>
+
+Additional styles per curve. With C<--dataid>, curveID is the ID. Otherwise,
+it's the index of the curve, starting at 0. Use this option multiple times for
+multiple curves. C<--styleall> does I<not> apply to curves that have a
+C<--style>
+
+=item
+
+C<--curvestyle curveID>
+
+Synonym for C<--style>
+
+=item
+
+C<--styleall xxx>
+
+Additional styles for all curves that have no C<--style>. This is overridden by
+any applicable C<--style>. Exclusive with C<--with>.
+
+=item
+
+C<--curvestyleall xxx>
+
+Synonym for C<--styleall>
+
+=item
+
+C<--with xxx>
+
+Same as C<--styleall>, but prefixed with "with". Thus
+
+ --with boxes
+
+is equivalent to
+
+ --styleall 'with boxes'
+
+Exclusive with C<--styleall>.
+
+=item
+
+C<--extracmds xxx>
+
+Additional commands to pass on to gnuplot verbatim. These could contain extra
+global styles for instance. Can be passed multiple times.
+
+=item
+
+C<--set xxx>
+
+Additional 'set' commands to pass on to gnuplot verbatim. C<--set 'a b c'> will
+result in gnuplot seeing a C<set a b c> command. Can be passed multiple times.
+
+=item
+
+C<--unset xxx>
+
+Additional 'unset' commands to pass on to gnuplot verbatim. C<--unset 'a b c'>
+will result in gnuplot seeing a C<unset a b c> command. Can be passed multiple
+times.
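+
+For example, to have gnuplot draw a grid and hide the legend key, one could
+pass:
+
+ --set grid --unset key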
+
+=item
+
+C<--square>
+
+Plot data with aspect ratio 1. For 3D plots, this controls the aspect ratio for
+all 3 axes
+
+=item
+
+C<--square_xy>
+
+For 3D plots, set square aspect ratio for ONLY the x,y axes
+
+=item
+
+C<--hardcopy xxx>
+
+If not streaming, output to a file specified here. Format inferred from
+filename, unless specified by C<--terminal>
+
+=item
+
+C<--terminal xxx>
+
+String passed to 'set terminal'. No attempts are made to validate this.
+C<--hardcopy> sets this to a sensible default if the C<--hardcopy> filename
+ends in B<.png>, B<.pdf>, B<.ps>, B<.eps> or B<.svg>. If any other file type is
+desired, use both C<--hardcopy> and C<--terminal>
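+
+As a sketch, combining the two options to get an ASCII-art plot might look like
+this (gnuplot's C<dumb> terminal is used purely as an illustration):
+
+ $ seq 5 | awk '{print $1, 2*$1}' |
+     feedgnuplot --lines --hardcopy plot.txt --terminal dumb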
+
+=item
+
+C<--maxcurves xxx>
+
+The maximum allowed number of curves. This is 100 by default, but can be reset
+with this option. This exists purely to prevent perl from allocating all of the
+system's memory when reading bogus data
+
+=item
+
+C<--monotonic>
+
+If C<--domain> is given, checks to make sure that the x-coordinate in the input
+data is monotonically increasing. If a given x-variable is in the past, all data
+currently cached for this curve is purged. Without C<--monotonic>, all data is
+kept. Does not make sense with 3d plots. No C<--monotonic> by default. The data is
+replotted before being purged
+
+=item
+
+C<--rangesize curveID xxx>
+
+The options C<--rangesizeall>, C<--rangesize> and C<--extraValuesPerPoint> set
+the number of values needed to represent each point being plotted (see
+L</"Multi-value style support"> above). These options are I<only> needed if
+unknown styles are used, with C<--styleall> or C<--with> for instance.
+
+C<--rangesize> is used to set how many values are needed to represent the range
+of a point for a particular curve. This overrides any defaults that may exist
+for this curve only.
+
+=item
+
+C<--rangesizeall xxx>
+
+Like C<--rangesize>, but applies to I<all> the curves.
+
+=item
+
+C<--extraValuesPerPoint xxx>
+
+Like C<--rangesizeall>, but instead of overriding the default, adds to it. For
+example, if plotting symmetric y errorbars gnuplot wants (x,y,ydelta) tuples.
+These can be specified either with C<--rangesizeall 2> (because there are 2
+range values) or C<--extraValuesPerPoint 1> (because there's 1 more value than
+usual).
+values) or C<--extraValuesPerPoint 1> (because there's 1 more value than usual).
+
+This option is I<only> needed if unknown styles are used, with C<--styleall> or
+C<--with> for instance.
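+
+A hypothetical error-bar plot could therefore look like this, with each input
+line carrying x, y and ydelta:
+
+ $ awk 'BEGIN { for(x=1; x<=10; x++) print x, x*x, x }' |
+     feedgnuplot --domain --with yerrorbars --rangesizeall 2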
+
+=item
+
+C<--dump>
+
+Instead of printing to gnuplot, print to STDOUT. Very useful for debugging. It
+is possible to send the output produced this way to gnuplot directly.
+
+=item
+
+C<--exit>
+
+Terminate the feedgnuplot process after passing data to gnuplot. The window will
+persist but will not be interactive. Without this option feedgnuplot keeps
+running and must be killed by the user. Note that this option works only with
+later versions of gnuplot and only with some gnuplot terminals.
+
+=item
+
+C<--geometry>
+
+If using X11, specifies the size and position of the plot window
+
+=item
+
+C<--version>
+
+Print the version and exit
+
+=back
+
+=head1 RECIPES
+
+=head2 Basic plotting of piped data
+
+ $ seq 5 | awk '{print 2*$1, $1*$1}'
+ 2 1
+ 4 4
+ 6 9
+ 8 16
+ 10 25
+
+ $ seq 5 | awk '{print 2*$1, $1*$1}' |
+ feedgnuplot --lines --points --legend 0 "data 0" --title "Test plot" --y2 1
+
+=head2 Realtime plot of network throughput
+
+Looks at wlan0 on Linux.
+
+ $ while true; do sleep 1; cat /proc/net/dev; done |
+ gawk '/wlan0/ {if(b) {print $2-b; fflush()} b=$2}' |
+ feedgnuplot --lines --stream --xlen 10 --ylabel 'Bytes/sec' --xlabel seconds
+
+=head2 Realtime plot of battery charge with respect to time
+
+Uses the result of the C<acpi> command.
+
+ $ while true; do acpi; sleep 15; done |
+ perl -nE 'BEGIN{ $| = 1; } /([0-9]*)%/; say join(" ", time(), $1);' |
+ feedgnuplot --stream --ymin 0 --ymax 100 --lines --domain --xlabel 'Time' --timefmt '%s' --ylabel "Battery charge (%)"
+
+=head2 Realtime plot of temperatures in an IBM Thinkpad
+
+Uses C</proc/acpi/ibm/thermal>, which reports temperatures at various locations
+in a Thinkpad.
+
+ $ while true; do cat /proc/acpi/ibm/thermal | awk '{$1=""; print}' ; sleep 1; done |
+ feedgnuplot --stream --xlen 100 --lines --autolegend --ymax 100 --ymin 20 --ylabel 'Temperature (deg C)'
+
+=head2 Plotting a histogram of file sizes in a directory
+
+ $ ls -l | awk '{print $5/1e6}' |
+ feedgnuplot --histogram 0 --with boxes --ymin 0 --xlabel 'File size (MB)' --ylabel Frequency
=head1 ACKNOWLEDGEMENT
@@ -897,11 +1773,11 @@ L<https://github.com/dkogan/feedgnuplot>
=head1 AUTHOR
-Dima Kogan, C<< <dkogan at cds.caltech.edu> >>
+Dima Kogan, C<< <dima at secretsauce.net> >>
=head1 LICENSE AND COPYRIGHT
-Copyright 2011 Dima Kogan.
+Copyright 2011-2012 Dima Kogan.
This program is free software; you can redistribute it and/or modify it
under the terms of either: the GNU General Public License as published
@@ -910,3 +1786,4 @@ by the Free Software Foundation; or the Artistic License.
See http://dev.perl.org/licenses/ for more information.
=cut
+
diff --git a/perl/gas.pm b/perl/gas.pm
deleted file mode 100644
index 106bee3..0000000
--- a/perl/gas.pm
+++ /dev/null
@@ -1,211 +0,0 @@
-#!/usr/bin/perl
-
-package as;
-use Data::Dumper;
-use isax86;
-use isax86_64;
-
-$AS = { HEADER => '.intel_syntax noprefix',
- FOOTER => ''};
-
-$LOCAL = {};
-$MODE = 'GLOBAL';
-
-my $CURRENT_SECTION='NONE';
-my $WORDLENGTH;
-my $STACKPTR;
-my $BASEPTR;
-my $REG;
-my $ARG;
-
-sub emit_code
-{
- my $code = shift;
- $code =~ s/([GF]PR[0-9]+)/$REG->{$1}/g;
- $code =~ s/(ARG[0-9]+)/$ARG->{$1}/g;
- $code =~ s/(LOCAL[0-9]+)/$LOCAL->{$1}/g;
- print "$code\n";
-}
-
-sub align
-{
- my $number = shift;
- print ".align $number\n";
-
-}
-
-sub mode
-{
- $cmd = shift;
-
- if ($cmd eq 'START') {
- $MODE = 'LOCAL';
- } elsif ($cmd eq 'STOP') {
- $MODE = 'GLOBAL';
- }
-}
-
-sub function_entry
-{
- my $symbolname = shift;
- my $allocate = shift;
- my $distance;
-
- foreach ( (0 .. $allocate) ) {
- $distance = $_ * $WORDLENGTH;
- $LOCAL->{"LOCAL$_"} = "[$BASEPTR-$distance]";
- }
-
- if($CURRENT_SECTION ne 'text') {
- $CURRENT_SECTION = 'text';
- print ".text\n";
- }
-
- print ".globl $symbolname\n";
- print ".type $symbolname, \@function\n";
- print "$symbolname :\n";
-
- if ($main::ISA eq 'x86') {
- print "push ebp\n";
- print "mov ebp, esp\n";
- $distance = $allocate * $WORDLENGTH;
- print "sub esp, $distance\n" if ($allocate);
- print "push ebx\n";
- print "push esi\n";
- print "push edi\n";
- } elsif ($main::ISA eq 'x86-64') {
- print "push rbp\n";
- print "mov rbp, rsp\n";
- $distance = $allocate * $WORDLENGTH;
- print "sub rsp, $distance\n" if ($allocate);
- print "push rbx\n";
- print "push r12\n";
- print "push r13\n";
- print "push r14\n";
- print "push r15\n";
- }
-}
-
-sub function_exit
-{
- my $symbolname = shift;
-
- $LOCAL = {};
-
- if ($main::ISA eq 'x86') {
- print "pop edi\n";
- print "pop esi\n";
- print "pop ebx\n";
- print "mov esp, ebp\n";
- print "pop ebp\n";
- } elsif ($main::ISA eq 'x86-64') {
- print "pop r15\n";
- print "pop r14\n";
- print "pop r13\n";
- print "pop r12\n";
- print "pop rbx\n";
- print "mov rsp, rbp\n";
- print "pop rbp\n";
- }
- print "ret\n";
- print ".size $symbolname, .-$symbolname\n";
- print "\n";
-}
-
-sub define_data
-{
- my $symbolname = shift;
- my $type = shift;
- my $value = shift;
-
- if($CURRENT_SECTION ne 'data') {
- $CURRENT_SECTION = 'data';
- print ".data\n";
- }
- print ".align 64\n";
- print "$symbolname:\n";
- if ($type eq 'DOUBLE') {
- print ".double $value, $value, $value, $value, $value, $value, $value, $value\n"
- } elsif ($type eq 'SINGLE') {
- print ".single $value, $value, $value, $value, $value, $value, $value, $value\n"
- } elsif ($type eq 'INT') {
- print ".int $value, $value\n"
- }
-}
-
-sub define_offset
-{
- my $symbolname = shift;
- my $type = shift;
- my $value = shift;
-
- if($CURRENT_SECTION ne 'data') {
- $CURRENT_SECTION = 'data';
- print ".data\n";
- }
- print ".align 16\n";
- print "$symbolname:\n";
- print ".int $value\n";
-}
-
-
-sub loop_entry
-{
- my $symbolname = shift;
- my $stopping_criterion = shift;
- $stopping_criterion = $REG->{$stopping_criterion} if( exists $REG->{$stopping_criterion});
-
- if ($main::ISA eq 'x86') {
- print "xor eax, eax\n";
- } elsif ($main::ISA eq 'x86-64') {
- print "xor rax, rax\n";
- }
- print ".align 16\n";
- if ($MODE eq 'GLOBAL') {
- print "$symbolname :\n";
- }else {
- print "1:\n";
- }
-
-}
-
-
-sub loop_exit
-{
- my $symbolname = shift;
- my $step = shift;
-
- if ($main::ISA eq 'x86') {
- print "add eax, $step\n";
- print "cmp eax, edi\n";
- } elsif ($main::ISA eq 'x86-64') {
- print "add rax, $step\n";
- print "cmp rax, rdi\n";
- }
- if ($MODE eq 'GLOBAL') {
- print "jl $symbolname\n";
- }else {
- print "jl 1b\n";
- }
- print "\n";
-}
-
-sub isa_init
-{
- if ($main::ISA eq 'x86') {
- $WORDLENGTH = $isax86::WORDLENGTH_X86 ;
- $STACKPTR = $isax86::STACKPTR_X86 ;
- $BASEPTR = $isax86::BASEPTR_X86 ;
- $REG = $isax86::REG_X86;
- $ARG = $isax86::ARG_X86 ;
- } elsif ($main::ISA eq 'x86-64') {
- $WORDLENGTH = $isax86_64::WORDLENGTH_X86_64;
- $STACKPTR = $isax86_64::STACKPTR_X86_64 ;
- $BASEPTR = $isax86_64::BASEPTR_X86_64 ;
- $REG = $isax86_64::REG_X86_64;
- $ARG = $isax86_64::ARG_X86_64 ;
- }
-}
-
-
-1;
diff --git a/perl/gen_events.pl b/perl/gen_events.pl
index f5736ad..4833ccc 100755
--- a/perl/gen_events.pl
+++ b/perl/gen_events.pl
@@ -5,11 +5,16 @@ use warnings;
my $arch;
my $key;
+my $optkey = "";
my $eventId;
+my $eventname;
my $limit;
my $umask;
my $cmask;
my $cfg;
+my $opts = "";
+my $defoptkey = "";
+my $defopts = "";
my $num_events=0;
my @events = ();
@@ -33,31 +38,94 @@ while (<INFILE>) {
if (/^#/) {
# Skip comment
}elsif (/(EVENT_[A-Z0-9_]*)[ ]+(0x[A-F0-9]+)[ ]+([A-Z0-9|]+)/) {
+ $eventname = $1;
$eventId = $2;
$limit = $3;
+ $opts = "EVENT_OPTION_NONE_MASK";
} elsif (/UMASK_([A-Z0-9_]*)[ ]*(0x[A-F0-9]+)[ ]*(0x[A-F0-9]+)[ ]*(0x[A-F0-9]+)/) {
$key = $1;
$umask = $2;
$cfg = $3;
$cmask = $4;
+ my $defaultopts = "{";
+ my $nropts = 0;
+ if ($key ne $optkey or $optkey eq "")
+ {
+ $opts = "EVENT_OPTION_NONE_MASK";
+ }
+ if ($key =~ m/$defoptkey[A-Z0-9_]*/)
+ {
+ my @optlist = split(",", $defopts);
+ foreach my $opt (@optlist)
+ {
+ my @tmplist = split("=", $opt);
+ $defaultopts = $defaultopts."{".$tmplist[0].",".$tmplist[1]."},";
+ $nropts++;
+ }
+ }
+ if (length($defaultopts) > 1)
+ {
+ substr($defaultopts,length($defaultopts)-1,1) = '}';
+ }
+ else
+ {
+ $defaultopts = $defaultopts."}";
+ }
push(@events,{name=>$key,
limit=>$limit,
eventId=>$eventId,
cfg=>$cfg,
cmask=>$cmask,
- mask=>$umask});
+ mask=>$umask,
+ nropts=>$nropts,
+ opts=>$opts,
+ defopts=>$defaultopts});
$num_events++;
} elsif (/UMASK_([A-Z0-9_]*)[ ]*(0x[A-F0-9]+)/) {
$key = $1;
$umask = $2;
+ my $defaultopts = "{";
+ my $nropts = 0;
+ if ($key ne $optkey or $optkey eq "")
+ {
+ $opts = "EVENT_OPTION_NONE_MASK"
+ }
+ if ($key =~ m/$defoptkey[A-Z0-9_]*/)
+ {
+ my @optlist = split(",", $defopts);
+ foreach my $opt (@optlist)
+ {
+ my @tmplist = split("=", $opt);
+ $defaultopts = $defaultopts."{".$tmplist[0].",".$tmplist[1]."},";
+ $nropts++;
+ }
+ }
+ if (length($defaultopts) > 1)
+ {
+ substr($defaultopts,length($defaultopts)-1,1) = '}';
+ }
+ else
+ {
+ $defaultopts = $defaultopts."}";
+ }
push(@events,{name=>$key,
limit=>$limit,
eventId=>$eventId,
cfg=>0x00,
cmask=>0x00,
- mask=>$umask});
+ mask=>$umask,
+ nropts=>$nropts,
+ opts=>$opts,
+ defopts=>$defaultopts});
$num_events++;
}
+ elsif (/DEFAULT_OPTIONS_([A-Z0-9_]*)[ ]*([xA-Z0-9_=,]*)/) {
+ $defoptkey = $1;
+ $defopts = $2;
+ } elsif (/OPTIONS_([A-Z0-9_]*)[ ]*([A-Z0-9_\|]+)/) {
+ $optkey = $1;
+ $opts = $2;
+ }
}
close INFILE;
@@ -72,11 +140,8 @@ print OUTFILE "#define NUM_ARCH_EVENTS_$ucArch $num_events\n\n";
print OUTFILE "static PerfmonEvent ".$arch."_arch_events[NUM_ARCH_EVENTS_$ucArch] = {\n";
foreach my $event (@events) {
-
print OUTFILE <<END;
-$delim {\"$event->{name}\",
- \"$event->{limit}\",
- $event->{eventId},$event->{mask},$event->{cfg},$event->{cmask}}
+$delim {\"$event->{name}\", \"$event->{limit}\", $event->{eventId},$event->{mask},$event->{cfg},$event->{cmask},$event->{nropts},$event->{opts},$event->{defopts}}
END
$delim = ',';
}
diff --git a/perl/generateGroups.pl b/perl/generateGroups.pl
deleted file mode 100755
index bbfb9b9..0000000
--- a/perl/generateGroups.pl
+++ /dev/null
@@ -1,142 +0,0 @@
-#!/usr/bin/perl
-
-use strict;
-use warnings;
-use lib './perl';
-use File::Copy;
-use Cwd 'abs_path';
-use Data::Dumper;
-use Template;
-
-my $name;
-my $shortHelp;
-
-my %groupEnum;
-my $GroupRoot = $ARGV[0];
-my $OutputDirectory = $ARGV[1];
-my $TemplateRoot = $ARGV[2];
-my $DEBUG = 0;
-
-my $tpl = Template->new({
- INCLUDE_PATH => ["$TemplateRoot"]
- })|| die Template->error(), "\n";
-
-# First open the architecture directories
-opendir (DIR, "./$GroupRoot") or die "Cannot open groups directory: $!\n";
-my $rule;
-my $metric;
-
-while (defined(my $arch = readdir(DIR))) {
- if ($arch !~ /^\./) {
- print "SCANNING $arch\n" if ($DEBUG);
- if (-d "$GroupRoot/$arch") {
-
- my $Vars;
- my @groups;
- opendir (ARCHDIR, "$GroupRoot/$arch") or die "Cannot open current directory: $!\n";
-
- while (defined(my $group = readdir(ARCHDIR))) {
-
- next unless ($group !~ /^\./);
- print "SCANNING GROUP $group\n" if ($DEBUG);
- my $eventSet;
- my @metrics;
- my $isUncore = 0;
- $Vars->{groups} = [];
-
- $group =~ /([A-Za-z_0-9]+)\.txt/;
- $name = $1;
-
- open FILE, "<$GroupRoot/$arch/$group";
-
- my $isInSet = 0;
- my $isInMetrics = 0;
- my $isInLong = 0;
- my $msg = '';
-
- while (<FILE>) {
- my $line = $_;
-
- if($line =~ /SHORT[ ]+(.+)/) {
- $shortHelp = $1;
- } elsif ($line =~ /EVENTSET/) {
- $isInSet = 1;
- } elsif ($line =~ /METRICS/) {
- $isInSet = 0;
- $isInMetrics = 1;
- $eventSet =~ s/,$//;
- } elsif ($line =~ /LONG/) {
- $isInSet = 0;
- $isInMetrics = 0;
- $isInLong = 1;
- } else {
- if ($isInSet) {
- if ($line =~ /([A-Z0-9]+)[ ]+([A-Z_0-9]+)/) {
- $eventSet .= "$2:$1,";
- }
- } elsif ($isInMetrics) {
- if ($line =~ /(.+)[ ]+(.+)/) {
- $metric = $1;
- $rule = $2;
- $rule =~ s/(UPMC[0-9]+)/perfmon_getResult(threadId,"$1")/g;
- $rule =~ s/([^U]|^)(PMC[0-9]+)/$1perfmon_getResult(threadId,"$2")/g;
- $rule =~ s/(FIXC[0-9]+)/perfmon_getResult(threadId,"$1")/g;
- $rule =~ s/(WBOX[0-9]+)/perfmon_getResult(threadId,"$1")/g;
- $rule =~ s/(BBOX[C0-9]+)/perfmon_getResult(threadId,"$1")/g;
- $rule =~ s/(MBOX[CC0-9]+)/perfmon_getResult(threadId,"$1")/g;
- $rule =~ s/(SBOX[P0-9]+)/perfmon_getResult(threadId,"$1")/g;
- $rule =~ s/(RBOX[C0-9]+)/perfmon_getResult(threadId,"$1")/g;
- $rule =~ s/(PWR[0-9]+)/perfmon_getResult(threadId,"$1")/g;
- $rule =~ s/(TMP[0-9]+)/perfmon_getResult(threadId,"$1")/g;
- $rule =~ s/(MBOXFIX)/perfmon_getResult(threadId,"$1")/g;
-
- $metric =~ s/(^\s+|\s+$)//g;
- push (@metrics, {label => $metric,
- rule => $rule});
- }
- } elsif ($isInLong) {
- $msg .= $line;
- }
- }
- }
- close FILE;
- $msg =~ s/\n/\\n\\\n/g;
-
- if ($eventSet =~ /WBOX|BBOX|MBOX|SBOX|RBOX/) {
- $isUncore = 1;
- }
-
- push (@groups, {name => $name,
- shortHelp => $shortHelp,
- longHelp => $msg,
- isUncore => $isUncore,
- eventSet => $eventSet,
- numRows => $#metrics+1,
- metrics => \@metrics});
-
- if (not exists($groupEnum{$name})) {
- $groupEnum{$name} = 1;
- }
-
- }
-
- $Vars->{arch} = $arch;
- my @groupsSorted = sort {$a->{name} cmp $b->{name}} @groups;
- $Vars->{groups} = \@groupsSorted;
- $Vars->{numGroups} = $#groupsSorted+1;
-
-
- $tpl->process('group.tt', $Vars, "$OutputDirectory/perfmon_$arch"."_groups.h")|| die $tpl->error(), "\n";
-# print Dumper($Vars);
- closedir ARCHDIR;
- }
- }
-}
-closedir DIR;
-
-my $Vars;
-$Vars->{groups} = \%groupEnum;
-$tpl->process('group_types.tt', $Vars, "$OutputDirectory/perfmon_group_types.h")|| die $tpl->error(), "\n";
-
-
-
diff --git a/perl/generatePas.pl b/perl/generatePas.pl
deleted file mode 100755
index 9c1dcd1..0000000
--- a/perl/generatePas.pl
+++ /dev/null
@@ -1,163 +0,0 @@
-#!/usr/bin/perl
-
-use lib 'util';
-use strict;
-use warnings;
-use lib './perl';
-use File::Copy;
-use Cwd 'abs_path';
-use Data::Dumper;
-use Template;
-
-my @Testcases;
-my $name;
-my $streams;
-my $type;
-my $flops;
-my $bytes;
-my $prolog='';
-my $loop='';
-my $increment;
-my $isLoop=0;
-my $skip=0;
-my $multi=0;
-
-my $BenchRoot = $ARGV[0];
-my $OutputDirectory = $ARGV[1];
-my $TemplateRoot = $ARGV[2];
-my $DEBUG = 0;
-
-my $stream_lookup = {
- STR0 => 'ARG2',
- STR1 => 'ARG3',
- STR2 => 'ARG4',
- STR3 => 'ARG5',
- STR4 => 'ARG6',
- STR5 => '[rbp+16]',
- STR6 => '[rbp+24]',
- STR7 => '[rbp+32]',
- STR8 => '[rbp+40]',
- STR9 => '[rbp+48]',
- STR10 => '[rbp+56]',
- STR11 => '[rbp+64]',
- STR12 => '[rbp+72]',
- STR13 => '[rbp+80]',
- STR14 => '[rbp+88]',
- STR15 => '[rbp+96]',
- STR16 => '[rbp+104]',
- STR17 => '[rbp+112]',
- STR18 => '[rbp+120]',
- STR19 => '[rbp+128]',
- STR20 => '[rbp+136]',
- STR21 => '[rbp+144]',
- STR22 => '[rbp+152]',
- STR23 => '[rbp+160]',
- STR24 => '[rbp+168]',
- STR25 => '[rbp+176]',
- STR26 => '[rbp+184]',
- STR27 => '[rbp+192]',
- STR28 => '[rbp+200]',
- STR29 => '[rbp+208]',
- STR30 => '[rbp+216]',
- STR31 => '[rbp+224]',
- STR32 => '[rbp+232]',
- STR33 => '[rbp+240]',
- STR34 => '[rbp+248]',
- STR35 => '[rbp+256]',
- STR36 => '[rbp+264]',
- STR37 => '[rbp+272]',
- STR38 => '[rbp+280]',
- STR39 => '[rbp+288]',
- STR40 => '[rbp+296]'};
-
-opendir (DIR, "./$BenchRoot") or die "Cannot open bench directory: $!\n";
-my $tpl = Template->new({
- INCLUDE_PATH => ["$TemplateRoot"]
- });
-
-while (defined(my $file = readdir(DIR))) {
- if ($file !~ /^\./) {
- print "SCANNING $file\n" if ($DEBUG);
-
- $file =~ /([A-Za-z_0-9]+)\.ptt/;
- $name = $1;
-
- $isLoop = 0;
- $skip=0;
- $multi=0;
- $prolog='';
- $loop='';
- open FILE, "<$BenchRoot/$file";
- while (<FILE>) {
- my $line = $_;
-
- if($line =~ /STREAMS[ ]+([0-9]+)/) {
- $streams = $1;
- if ($streams > 10) {
- $multi = 1;
- }
- } elsif ($line =~ /TYPE[ ]+(SINGLE|DOUBLE)/) {
- $type = $1;
- } elsif ($line =~ /FLOPS[ ]+([0-9.]+)/) {
- $flops = $1;
- } elsif ($line =~ /BYTES[ ]+([0-9]+)/) {
- $bytes = $1;
- } elsif ($line =~ /INC[ ]+([0-9]+)/) {
- $increment = $1;
- $skip = 1;
- } elsif ($line =~ /LOOP[ ]+([0-9]+)/) {
- $increment = $1;
- $isLoop = 1;
- } else {
- if ($isLoop) {
- if($line =~ /SET[ ]+(STR[0-9]+)[ ]+(GPR[0-9]+)/) {
- $loop .= "#define $1 $2\n";
- $loop .= "mov $2, $stream_lookup->{$1}\n";
- } else {
- $loop .= $line;
- }
- } else {
- $prolog .= $line;
- }
- }
- }
- close FILE;
-
- if (($streams > 5) && ($streams < 10)) {
- my $arg = 7;
- foreach my $stream ( 5 .. $streams ) {
- $prolog .= "mov STR$stream, ARG$arg\n";
- $arg++;
- }
- }
-
- $streams = 'STREAM_'.$streams;
- my $Vars;
- $Vars->{name} = $name;
- $Vars->{prolog} = $prolog;
- $Vars->{increment} = $increment;
- $Vars->{loop} = $loop;
- $Vars->{skip} = $skip;
- $Vars->{multi} = $multi;
-
-#print Dumper($Vars);
-
- $tpl->process('bench.tt', $Vars, "$OutputDirectory/$name.pas");
- push(@Testcases,{name => $name,
- streams => $streams,
- type => $type,
- stride => $increment,
- flops => $flops,
- bytes => $bytes});
- }
-}
-#print Dumper(@Testcases);
-my @TestcasesSorted = sort {$a->{name} cmp $b->{name}} @Testcases;
-
-my $Vars;
-$Vars->{Testcases} = \@TestcasesSorted;
-$Vars->{numKernels} = $#TestcasesSorted+1;
-$Vars->{allTests} = join('\n',map {$_->{name}} @TestcasesSorted);
-$tpl->process('testcases.tt', $Vars, "$OutputDirectory/testcases.h");
-
-
diff --git a/perl/likwid-mpirun b/perl/likwid-mpirun
deleted file mode 100755
index b922359..0000000
--- a/perl/likwid-mpirun
+++ /dev/null
@@ -1,456 +0,0 @@
-#!/usr/bin/perl
-# =======================================================================================
-#
-# Filename: likwid-mpirun
-#
-# Description: Wrapper application to mpi startup mechanisms. Builds on
-# likwid to control affinity and has integrated perfctr support.
-#
-# Version: <VERSION>
-# Released: <DATE>
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
-# Project: likwid
-#
-# Copyright (C) 2014 Jan Treibig
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU General Public License as published by the Free Software
-# Foundation, either version 3 of the License, or (at your option) any later
-# version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT ANY
-# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-# PARTICULAR PURPOSE. See the GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License along with
-# this program. If not, see <http://www.gnu.org/licenses/>.
-#
-# =======================================================================================
-
-use Getopt::Long;
-##############################
-# CONFIGURATION #
-##############################
-my $LIKWIDPIN = '<PREFIX>/bin/likwid-pin';
-my $LIKWIDPERF = '<PREFIX>/bin/likwid-perfctr';
-my $MPIROOT_openmpi = $ENV{'MPIHOME'};
-my $MPIROOT_intelmpi = $ENV{'MPIHOME'};
-my $MPIEXEC_openmpi = "$MPIROOT_openmpi/bin/mpiexec";
-my $MPIEXEC_intelmpi = "$MPIROOT_intelmpi/bin/mpiexec";
-my $MPIEXEC_mvapich2 = "mpirun";
-##############################
-
-my $OMPType = '';
-my $MPIType = '';
-my $WrapperScript = "mpiexec.$$";
-my %Domains;
-my $NP = 0;
-my $PPN = 0;
-my $NperNode = 0;
-my %NodeList;
-my $NumberOfNodes = 0;
-my $NumberOfUsedNodes = 0;
-my $Hostfilename = 0;
-my $Hostfile = '';
-my $PerformanceGroup = '';
-my $LikwidCall = "$LIKWIDPIN -c ";
-my $debug = 0;
-my $marker = '';
-
-sub readHostfile
-{
- open FILE, "<$Hostfilename";
-
- while (<FILE>) {
- chomp;
- if (not exists $NodeList{$host}) {
- $NodeList{$_} = 1;
- }
- }
- close FILE;
-
- $NumberOfNodes = keys %NodeList;
-}
-
-# MPI implementations
-# OpenMPI #<#
-sub generateNodelist_openmpi
-{
- open FILE, ">$Hostfilename-openmpi";
-
- #FIXME Order may be different
- foreach my $node (keys %NodeList) {
- print FILE "$node slots=$PPN\n"
- }
-
- close FILE;
-
- $Hostfile = "-hostfile $Hostfilename-openmpi";
-}
-
-sub setEnvironment_openmpi
-{
-}
-
-sub executeMPI_openmpi
-{
- if ($debug) {
- print "$MPIEXEC_openmpi $Hostfile -np $NP -npernode $NperNode ./$WrapperScript";
- }
-
- system ("$MPIEXEC_openmpi $Hostfile -np $NP -npernode $NperNode ./$WrapperScript");
-}
-#>#
-
-# mvapich2 #<#
-sub generateNodelist_mvapich2
-{
-}
-
-sub setEnvironment_mvapich2
-{
- $ENV{'MV2_ENABLE_AFFINITY'}='0';
-}
-
-#tw
-#mvapich2: pinning aus
-# Hybrid programming options:
-# -ranks-per-proc assign so many ranks to each process
-#
-# Processor topology options:
-# -binding process-to-core binding mode
-# -topolib processor topology library ( hwloc plpa)
-
-sub executeMPI_mvapich2
-{
- if ($debug) {
- print "$MPIEXEC_mvapich2 $Hostfile -np $NP -npernode $NperNode ./$WrapperScript";
- }
-
- system ("$MPIEXEC_mvapich2 $Hostfile -np $NP -ppn $NperNode ./$WrapperScript");
-
-}
-
-#generate wrapper script
-#mpirank
-#mpitype = mvapich
-
-#>#
-
-# Intel MPI #<#
-sub generateNodelist_intelmpi
-{
- open FILE, ">$Hostfilename-intelmpi";
-
- #FIXME Order may be different
- foreach my $node (keys %NodeList) {
- print FILE "$node\:$NperNode\n"
- }
-
- close FILE;
-
- $Hostfile = "-f $Hostfilename-intelmpi";
-}
-
-sub setEnvironment_intelmpi
-{
- $ENV{'I_MPI_PIN'}='off';
- $ENV{'KMP_AFFINITY'}='disabled';
-}
-
-sub executeMPI_intelmpi
-{
- if ($debug) {
- print "$MPIROOT_intelmpi/bin/mpdboot -r ssh -n $NumberOfNodes $Hostfile \n";
- print "$MPIROOT_intelmpi/bin/mpiexec -np $NP $WrapperScript \n";
- print "$MPIROOT_intelmpi/bin/mpdallexit \n";
- }
-
- system ("$MPIROOT_intelmpi/bin/mpdboot -r ssh -n $NumberOfNodes $Hostfile ");
- system ("$MPIROOT_intelmpi/bin/mpiexec -perhost $NperNode -np $NP ./$WrapperScript");
- system ("$MPIROOT_intelmpi/bin/mpdallexit");
-}
-#>#
-
-sub generateHostlist #<#
-{
- $ppnHost = '';
- open FILE, "<$ENV{'PBS_NODEFILE'}";
- my @hostArray = <FILE>;
- close FILE;
-
- $ppnhost = $hostArray[0];
- chomp $ppnhost;
-
- # generate unique host list
- foreach my $host (@hostArray) {
- chomp $host;
- if ($ppnhost eq $host) {
- $PPN++;
- }
- if (not exists $NodeList{$host}) {
- $NodeList{$host} = 1;
- }
- }
-
- $NumberOfNodes = keys %NodeList;
-}
-#>#
-
-sub usage #<#
-{
- print <<END;
-usage: $0 -np <NUMPROC>
-
-Required:
--np <NUMPROC> : number of MPI processes
-
-Optional:
--h : this (help) message
--d : debug run
--hostfile <argument> : Specify nodes if not in in a scheduler
--nperdomain <argument> : Run specified number of processes per domain.
- Supported domains are:
- N Node
- S Socket
- C last level cache group
- M NUMA domain
--pin <argument> : Specify pinning for hybrid execution.
- Processes are separated by underscore.
- The threaded pinning must be a valid likwid-pin list.
--omp <argument> : Enables support for specific hybrid setup. Use only
- together with -pin option. Currently recognized values: intel
--mpi <argument> : Specify which mpi implementation should be used. Current recognized
- values: intelmpi, openmpi, mvapich2
--- : Stop the likwid-mpirun parser. Useful for saving options to
- the MPI application.
-
-You can either use -nperdomain OR -pin for specifying pinning.
-For pure MPI pinning use only the nperdomain option. For hybrid use the pin option.
-
-Example:
-$0 -np 32 ./a.out
-
-$0 will use as many processes per node as available in ppn
-
-Example with pinning:
-$0 -np 32 -nperdomain S:2 ./a.out
-starts 2 processes per socket.
-
-Example for hybrid run:
-$0 -np 32 -pin M0:0-3_M1:0-3
-starts 2 processes per node. Threads of first process are pinned to first four
-cores in NUMA domain 0. Threads of second process are pinned to first four cores
-in NUMA domain 1.
-END
-
-exit(0);
-}
-#>#
-
-sub generateDomains #<#
-{
- my $output = `$LIKWIDPIN -p`;
-
- foreach my $line (split("\n",$output)) {
- if ($line =~ /Tag ([NSCM])[0-9]*: ([0-9 ]+)/) {
- if (exists $Domains{$1}) {
- $Domains{$1}++;
- } else {
- $Domains{$1} = 1;
- }
-
- if ($1 eq 'N') {
- $PPN = split(/ /,$2);
- }
- }
- }
-}
-#>#
-
-sub generateWrapperScript #<#
-{
- my $pinStrings = shift;
- my $mpiType = shift;
- open FILE, ">$WrapperScript";
- my $environment = '';
- my $doRest = '';
-
- if ($mpiType eq 'openmpi') {
- $environment = 'OMPI_COMM_WORLD_RANK';
- } elsif ($mpiType eq 'intelmpi') {
- $environment = 'PMI_RANK';
- } elsif ($mpiType eq 'mvapich2') {
- $environment = 'PMI_RANK'; #tw maybe????
- }
-
- if ($NP % $NperNode) {
- my $rest = $NP-($NP % $NperNode);
- $doRest = "if (\$myRank >= $rest) {\$localId = \$myRank - $rest;}\n";
- }
-
- print FILE <<END;
-#!/usr/bin/perl
-use strict;
-use warnings;
-
-my \$args = join \@ARGV;
-my \$myRank = \$ENV{$environment};
-
-my \$localId = \$myRank \% $NperNode ;
-
-$doRest
-
-if (\$localId == 0) {
- system ("$LikwidCall $pinStrings->[0] $PerformanceGroup $OMPType $cmdline \$args ");
-}
-END
-
- foreach my $process ( 1 .. ($NperNode-1) ) {
- print FILE <<END;
-elsif (\$localId == $process) {
- system ("$LikwidCall $pinStrings->[$process] $PerformanceGroup $OMPType $cmdline \$args ");
-}
-END
- }
-
- close FILE;
-}
-#>#
-
-my $pinString = '';
-my $domain = '';
-my @pinStrings;
-
-GetOptions ('np=i' => \$NP,
- 'nperdomain=s' => \$NperDomain,
- 'hostfile=s' => \$Hostfilename,
- 'pin=s' => \$pinString,
- 'mpi=s' => \$MPIType,
- 'omp=s' => \$OMPType,
- 'perf=s' => \$PerformanceGroup,
- 'debug' => \$debug,
- 'marker' => sub { $marker = ' -m '; },
- 'help' => \&usage);
-
-# MPI implementation switch
-$generateNodelist = "generateNodelist_$MPIType";
-$setEnvironment = "setEnvironment_$MPIType";
-$executeMPI = "executeMPI_$MPIType";
-
-generateDomains();
-
-# check for PBS batch system
-if (not defined ($ENV{'PBS_JOBID'})) {
- readHostfile();
-} else {
- $NumberOfNodes = `uniq \$PBS_NODEFILE | wc -l`;
-}
-
-if ($pinString) {
- @pinStrings = split('_',$pinString);
- $NperNode = ($#pinStrings+1);
-
- if ($MPIType eq 'openmpi') {
- if ($OMPType eq 'intel') {
- $OMPType = '';
- $OMPType = '-s 0xF';
- }
- } elsif ($MPIType eq 'intelmpi') {
- if ($OMPType eq 'intel' and ($NumberOfNodes == 1)) {
- $OMPType = '-t intel';
- } elsif ($OMPType eq 'intel') {
- $OMPType = '-s 0x7';
- }
- }elsif ($MPIType eq 'mvapich2') {
- if ($OMPType eq 'intel' and ($NumberOfNodes == 1)) {
- $OMPType = '-t intel';
- } elsif ($OMPType eq 'intel') {
- $OMPType = '-s 0x7';
- }
- }
-
-} elsif ($NperDomain) {
-
- $OMPType = '';
- if ($NperDomain =~ /([NSCM]):([0-9]+)/) {
- $domain = $1;
- $NperDomain = $2;
- } else {
- die "Parse Error \n";
- }
-
- $NperNode = $NperDomain * $Domains{$domain};
-
- if (not $domain eq 'N') {
- foreach my $currentDomain ( 0 .. ($Domains{$domain}-1)) {
- foreach my $currentProcess ( 0 .. ($NperDomain-1)) {
- push @pinStrings, "$domain"."$currentDomain".":$currentProcess";
- }
- }
- } else {
- foreach my $currentProcess ( 0 .. ($NperDomain-1)) {
- push @pinStrings, "$domain".":$currentProcess";
- }
- }
-} elsif ($NP) {
- print "PPN = $PPN\n";
- $NperNode = $PPN;
- $OMPType = '';
-
- foreach my $currentProcess ( 0 .. ($PPN-1)) {
- push @pinStrings, "N".":$currentProcess";
- }
-} else {
- usage();
-}
-
-if (not defined ($ENV{'PBS_JOBID'})) {
- $Hostfilename .= $$;
- &{$generateNodelist}();
-} else {
- if ($MPIType eq 'intelmpi') {
- $Hostfilename = "pbshosts$$";
- generateHostlist();
- &{$generateNodelist}();
- }
-}
-
-map {$cmdline .= "$_ " ;} @ARGV;
-$NumberOfUsedNodes = $NP / $NperNode;
-
-if ($NumberOfUsedNodes > $NumberOfNodes) {
- die "ERROR: Require $NumberOfUsedNodes nodes, but only $NumberOfNodes available!";
-}
-
-if ($NumberOfUsedNodes < 1) {
- die "ERROR: Requested $NperNode processes per Node with only $NP total processes!";
-}
-
-if ($PerformanceGroup) {
- $LikwidCall = "$LIKWIDPERF -C";
- $PerformanceGroup = ' -g '.$PerformanceGroup ;
- $PerformanceGroup .= " $marker -o perf_%h_%r.txt ";
-} else {
- $PerformanceGroup = ' -q ';
-}
-
-generateWrapperScript(\@pinStrings,$MPIType);
-chmod 0755,$WrapperScript;
-&{$setEnvironment}();
-
-if ($debug) {
- print "Number of nodes: $NumberOfNodes \n";
- $NumberOfUsedNodes = $NP / $NperNode;
- print "Number of used nodes: $NumberOfUsedNodes \n";
- print "Number of processes per node: $NperNode \n";
-}
-&{$executeMPI}();
-
-if (-e $WrapperScript and not $debug) {
- unlink ($WrapperScript);
- unlink ($Hostfilename);
-}
-
-# vim: foldmethod=marker foldmarker=#<#,#>#
diff --git a/perl/likwid-perfscope b/perl/likwid-perfscope
deleted file mode 100755
index 84f99da..0000000
--- a/perl/likwid-perfscope
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/usr/bin/perl
-use strict;
-use warnings;
-
-use Getopt::Long;
-
-sub usage #<#
-{
- print <<END;
-usage: $0 --group <Performance Group> --cores <physical core list>
-
-Required:
--cores <CORELIST> : list of physical cores
-
-Optional:
--h : this (help) message
--freq : frequency of updates, in ms or s (e.g. 500ms), default: 1s
--group <PERFGROUP> : Specify what to plot, default FLOPS_DP
-
-Example:
-$0 -group FLOPS_DP -cores 0-3
-END
-
-exit(0);
-}
-#>#
-
-my $CONFIG = { #<#
- "FLOPS_DP" => {
- "group" => 'FLOPS_DP',
- "expr" => 'DP MFlops/s',
- "title" => 'Double Precision Flop Rate',
- "yaxis" => 'MFlops/s'},
- "FLOPS_SP" => {
- "group" => 'FLOPS_SP',
- "expr" => 'SP MFlops/s',
- "title" => 'Single Precision Flop Rate',
- "yaxis" => 'MFlops/s'},
- "L2" => {
- "group" => 'L2',
- "expr" => 'L2 bandwidth [MBytes/s]',
- "title" => 'L2 cache bandwidth',
- "yaxis" => 'bandwidth [MB/s]'},
- "L3" => {
- "group" => 'L3',
- "expr" => 'L3 bandwidth [MBytes/s]',
- "title" => 'L3 cache bandwidth',
- "yaxis" => 'bandwidth [MB/s]'},
- "CLOCK" => {
- "group" => 'CLOCK',
- "expr" => 'Clock [MHz]',
- "title" => 'Clock rate',
- "yaxis" => 'MHz'},
- "NUMA" => {
- "group" => 'MEM',
- "expr" => 'Remote BW [MBytes/s]',
- "title" => 'Remote NUMA bandwidth',
- "yaxis" => 'bandwidth [MB/s]'},
- "MEM" => {
- "group" => 'MEM',
- "expr" => 'MBytes/s',
- "title" => 'Main memory bandwidth',
- "yaxis" => 'bandwidth [MB/s]'}};
-#>#
-
-my $FREQ = '1s';
-my $CORES = '';
-my $optGroup = 'FLOPS_DP';
-my $optPlot;
-
-GetOptions ('group=s' => \$optGroup, 'freq=s' => \$FREQ, 'cores=s' => \$CORES, 'plot=s' => \$optPlot, 'help' => \&usage);
-
-my $GROUP = $CONFIG->{$optGroup}->{'group'};
-my $yaxis = $CONFIG->{$optGroup}->{'yaxis'};
-my $title = $CONFIG->{$optGroup}->{'title'};
-my $expr = $CONFIG->{$optGroup}->{'expr'};
-my $legend = '';
-
-open (INPUT, "likwid-perfctr -g $GROUP -d $FREQ -c $CORES |");
-
-select((select(INPUT), $| = 1)[0]);
-
-while (<INPUT>) {
- if (/CORES: ([0-9 ]+)/) {
- my @cores = split ' ',$1;
- my $coreNumber = 0;
-
- foreach my $core (@cores) {
- $legend .= " --legend $coreNumber=\"core $core\" ";
- $coreNumber++;
- }
- last;
- }
-}
-
-open (OUTPUT, "| feedGnuplot --lines --domain --stream --xlabel \"seconds\" --ylabel \"$yaxis\" --title \"$title\" $legend");
-
-select((select(OUTPUT), $| = 1)[0]);
-
-while (<INPUT>) {
- if (/$expr/) {
- s/$expr//;
- print OUTPUT;
- }
-}
-close(INPUT);
-close(OUTPUT);
-
-
-# vim: foldmethod=marker foldmarker=#<#,#>#
diff --git a/perl/likwid-setFrequencies b/perl/likwid-setFrequencies
deleted file mode 100755
index 5834441..0000000
--- a/perl/likwid-setFrequencies
+++ /dev/null
@@ -1,185 +0,0 @@
-#!/usr/bin/perl
-# =======================================================================================
-#
-# Filename: likwid-setFrequencies
-#
-# Description: Application allowing to change core frequencies
-#
-# Version: <VERSION>
-# Released: <DATE>
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
-# Project: likwid
-#
-# Copyright (C) 2014 Jan Treibig
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU General Public License as published by the Free Software
-# Foundation, either version 3 of the License, or (at your option) any later
-# version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT ANY
-# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-# PARTICULAR PURPOSE. See the GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License along with
-# this program. If not, see <http://www.gnu.org/licenses/>.
-#
-# =======================================================================================
-
-use Getopt::Std;
-
-my $LIKWIDPIN = '<PREFIX>/bin/likwid-pin';
-my $SYSPATH = '/sys/devices/system/cpu';
-my $SYSCMD = '<PREFIX>/sbin/likwid-setFreq';
-my $domain = 'N';
-my $governor = 'ondemand';
-my @processors;
-my %frequencies;
-my $freq_string;
-use vars qw/ %opt /;
-
-sub init
-{
- my $opt_string = 'g:c:f:lph';
- getopts( $opt_string, \%opt ) or usage();
- usage() if $opt{h};
- if (scalar(keys %opt) == 0)
- {
- usage();
- }
-}
-
-sub usage
-{
- print STDERR << "EOF";
-
-This script allows to switch governors and set fixed
-frequencies on Linux system.
-
-usage: $0 [-hlp] [-g governor] [-c domain] [-f frequency]
--h : this (help) message
--p : print current frequencies
--l : list available frequencies
--c domain : likwid thread domain which to apply settings
- (set to N if omitted)
--g governor : set governor (ondemand, performance, turbo)
- (set to ondemand if omitted)
--f frequency: set fixed frequency, implicitly sets userspace
- governor
-
-example: $0 -c S0 -f 2.7 (set all CPUs on socket 0 to 2.7 GHz)
-EOF
- exit;
-}
-
-sub extractAvailableFrequencies
-{
- my @tmp_keys;
- open FILE, "<$SYSPATH/cpu0/cpufreq/scaling_available_frequencies";
- my $tmp = <FILE>;
- my @list = split(/ /,$tmp);
- close FILE;
- $frequencies{'turbo'} = $list[0];
-
- foreach my $item ( @list ) {
- if( not $item =~ /\n/ ) {
- my $key = $item/1000000.0;
- push @tmp_keys, $key;
- $frequencies{$key} = $item;
- }
- }
-
- $freq_string = join(' ', sort @tmp_keys);
-}
-
-sub extractProcessorList
-{
- my $output = `$LIKWIDPIN -p`;
- my $found = 0;
-
- foreach my $line (split("\n",$output)) {
- if ($line =~ /Tag ([NSCM][0-9]*): ([0-9 ]+)/) {
- if ($domain eq $1) {
- $found = 1;
- @processors = split(/ /,$2);
- last;
- }
- }
- }
-
- if ( not $found ) {
- print "Domain $domain not available!\n";
- exit;
- }
-}
-
-
-init();
-
-if (! -s $SYSCMD) {
- die "ERROR Binary $SYSCMD not existing!\n\n";
-}
-
-if ( defined $opt{c}) {
- $domain = $opt{c};
-}
-
-extractProcessorList();
-extractAvailableFrequencies();
-
-if ($opt{f}) {
- $freq = $opt{f};
-
- if (not exists($frequencies{$freq})) {
- print "Frequency $freq not available!\nPlease select one of $freq_string\n";
- exit;
- }
-
- foreach my $processID (@processors) {
-# print "$SYSCMD $processID $frequencies{$freq}\n";
- system("$SYSCMD $processID $frequencies{$freq}");
- }
-}
-
-if ($opt{p}) {
- foreach my $processID (@processors) {
- open FILE,"<$SYSPATH/cpu".$processID."/cpufreq/scaling_governor";
- my $gov = <FILE>;
- chomp $gov;
- close FILE;
- open FILE,"<$SYSPATH/cpu".$processID."/cpufreq/scaling_cur_freq";
- my $freq = <FILE>;
- chomp $freq;
- close FILE;
- print "CPU $processID: governor $gov frequency $freq\n"
- }
- exit;
-}
-
-if ($opt{l}) {
- print "Available frequencies: $freq_string\n";
- exit;
-}
-
-if ($opt{g} eq 'turbo') {
- foreach my $processID (@processors) {
-# print "$SYSCMD $processID $frequencies{turbo}\n";
- system("$SYSCMD $processID $frequencies{turbo}");
- }
- exit;
-}
-
-if ($opt{g}) {
- $governor = $opt{g};
- if (($governor ne "ondemand") and ($governor ne "performance")) {
- print "Governor $governor not valid\n";
- } else {
- print "Set governor in domain $domain to $governor \n";
- foreach my $processID (@processors) {
- system("$SYSCMD $processID 0 $governor");
- }
- }
-}
-
-# vim: foldmethod=marker foldmarker=#<#,#>#
diff --git a/perl/set_license.pl b/perl/set_license.pl
index f80326d..b14801d 100755
--- a/perl/set_license.pl
+++ b/perl/set_license.pl
@@ -8,15 +8,18 @@ use File::Copy;
my $mc = '#';
my $cc = ' *';
my $fc = '!';
+my $lc = ' *';
#my $VERSION = '<VERSION>';
#my $DATE = '<DATE>';
-my $VERSION = '3.1.3';
-my $DATE = '4.11.2014';
-my $YEAR = '2014';
-my $AUTHOR = 'Jan Treibig';
+my $VERSION = '4.1';
+my $DATE = '19.5.2016';
+my $YEAR = '2016';
+my $AUTHOR = 'RRZE, University Erlangen-Nuremberg';
my $LICENSE = 'gpl';
+my @SKIPLIST = ('ghash.c','ghash.h','loadData.S','bstrlib.c','bstrlib.h', 'calculator_stack.h', 'calculator_stack.c');
+
sub print_copyright
{
my $fh = shift;
@@ -72,108 +75,143 @@ END
}
}
-sub wanted
+sub wanted
{
- my $filename;
-
- if (scalar(@_)) {
- $filename = shift;
- } else {
- $filename = $_;
- }
-
- if (($filename =~ /^\./) or (-d $_)) {
- return;
- }
-
- my $in_copyright = 0;
- my $in_header = 0;
- my $style = $cc;
- my $enter = 0;
- open INFILE, "< $filename";
- open OUTFILE, "> $filename.tmp";
- print "Process $filename\n";
-
- while( <INFILE> ) {
-
- if (/\/\*/ and !$enter) {
- $style = $cc;
- $enter = 1;
- $in_header = 1;
- print OUTFILE "/*\n";
- print OUTFILE "$style =======================================================================================\n";
- next;
- } elsif (/# =/ and !$enter) {
- $style = $mc;
- $enter = 1;
- $in_header = 1;
- print OUTFILE "$style =======================================================================================\n";
- next;
- } elsif (/! =/ and !$enter) {
- $style = $fc;
- $enter = 1;
- $in_header = 1;
- print OUTFILE "$style =======================================================================================\n";
- next;
- } elsif (!$enter) {
- print "Skip $filename: No header found!\n";
- return;
- }
-
- if ($in_header) {
- if(/Filename:[ ]+([A-za-z0-9._\-]+)/) {
- if ($1 ne $filename) {
- print "File name mismatch: $filename header says $1\n";
- }
- print OUTFILE "$_";
- } elsif(/Version:/) {
- print OUTFILE "$style Version: $VERSION\n";
- } elsif(/Released:/) {
- print OUTFILE "$style Released: $DATE\n";
- } elsif(/Company:/) {
- #Skip company from header
- } elsif(/Copyright/) {
- $in_copyright = 1;
-# print OUTFILE "$style\n";
- print_copyright(\*OUTFILE, $style);
- } elsif(/# =/ or /! =/) {
- $in_copyright = 0;
- $in_header = 0;
- } elsif (/\*\//) {
- $in_copyright = 0;
- $in_header = 0;
- print OUTFILE " */\n";
- } elsif (/\* =/) {
- # Skip initial hline
- } else {
- if($in_copyright eq 0) {
- print OUTFILE "$_";
- }
- }
-
- } else {
- print OUTFILE "$_";
- }
- }
-
- close INFILE;
- close OUTFILE;
-
- unlink $filename or die "Failed to delete file $filename\n";
- copy ("$filename.tmp", $filename) or die "Copy failed\n";
- unlink "$filename.tmp" or die "Failed to delete file $filename\n";
+ my $filename;
+
+ if (scalar(@_)) {
+ $filename = shift;
+ } else {
+ $filename = $_;
+ }
+
+ if (($filename =~ /^\./) or (-d $filename)) {
+ return;
+ }
+
+ foreach my $filter ( @SKIPLIST ) {
+ if ( $filename eq $filter ) {
+ print "SKIP $filename\n";
+ return;
+ }
+ }
+
+ my $in_copyright = 0;
+ my $in_header = 0;
+ my $style = $cc;
+ my $enter = 0;
+ open INFILE, "< $filename";
+ open OUTFILE, "> $filename.tmp";
+ print "Process $filename\n";
+
+ while( <INFILE> ) {
+ # Ensure UNIX line ending
+ $_ =~ s/\cM\cJ|\cM|\cJ/\n/g;
+
+ if (/\/\*/ and !$enter) {
+ $style = $cc;
+ $enter = 1;
+ $in_header = 1;
+ print OUTFILE "/*\n";
+ print OUTFILE "$style =======================================================================================\n";
+ next;
+ } elsif (/# =/ and !$enter) {
+ $style = $mc;
+ $enter = 1;
+ $in_header = 1;
+ print OUTFILE "$style =======================================================================================\n";
+ next;
+ } elsif (/! =/ and !$enter) {
+ $style = $fc;
+ $enter = 1;
+ $in_header = 1;
+ print OUTFILE "$style =======================================================================================\n";
+ next;
+ } elsif (/#!/ and !$enter) {
+ $style = $lc;
+ $enter = 1;
+ $in_header = 1;
+ print OUTFILE "$_";
+ print OUTFILE "--[[\n";
+ print OUTFILE "$style =======================================================================================\n";
+ next;
+ } elsif (/\-\-\[\[/ and !$enter) {
+ $style = $lc;
+ $enter = 1;
+ $in_header = 1;
+ print OUTFILE "--[[\n";
+ print OUTFILE "$style =======================================================================================\n";
+ next;
+ } elsif (!$enter) {
+ print "Skip $filename: No header found!\n";
+ unlink "$filename.tmp" or die "Failed to delete file $filename\n";
+ return;
+ }
+
+ if ($in_header) {
+ if(/Filename:[ ]+([A-za-z0-9._\-]+)/) {
+ if ($1 ne $filename) {
+ print "File name mismatch: $filename header says $1\n";
+ }
+ print OUTFILE "$_";
+ } elsif(/Version:/) {
+ print OUTFILE "$style Version: $VERSION\n";
+ } elsif(/Released:/) {
+ print OUTFILE "$style Released: $DATE\n";
+ } elsif(/Copyright/) {
+ $in_copyright = 1;
+ print_copyright(\*OUTFILE, $style);
+ } elsif(/# =/ or /! =/) {
+ $in_copyright = 0;
+ $in_header = 0;
+ } elsif (/\*\//) {
+ $in_copyright = 0;
+ $in_header = 0;
+ print OUTFILE " */\n";
+ } elsif (/\]\]$/) {
+ $in_copyright = 0;
+ $in_header = 0;
+ print OUTFILE "]]\n";
+ } elsif (/\* =/ or /\-\-\[\[/) {
+ # Skip initial hline
+ } else {
+ if($in_copyright eq 0) {
+ print OUTFILE "$_";
+ }
+ }
+ } else {
+ print OUTFILE "$_";
+ }
+ }
+
+ close INFILE;
+ close OUTFILE;
+
+ unlink $filename or die "Failed to delete file $filename\n";
+ copy ("$filename.tmp", $filename) or die "Copy failed\n";
+ unlink "$filename.tmp" or die "Failed to delete file $filename\n";
}
if (defined $ARGV[0]) {
my $filename = $ARGV[0];
wanted($filename);
- exit (0);
+ exit (0);
}
my @directories;
push @directories, 'src';
+push @directories, 'bench/src';
+push @directories, 'bench/includes';
+push @directories, 'examples';
find(\&wanted, @directories);
+# single files
+wanted('Makefile');
+chdir 'bench';
+wanted('Makefile');
+wanted('likwid-bench.c');
+
+
diff --git a/perl/templates/group.tt b/perl/templates/group.tt
deleted file mode 100644
index 2122caf..0000000
--- a/perl/templates/group.tt
+++ /dev/null
@@ -1,208 +0,0 @@
-/* GENERATED FILE: DO NOTE EDIT */
-
-#define NUM_GROUPS_[% arch FILTER upper %] [% numGroups %]
-
-[% FOREACH group IN groups %]
-static const char* group_names_[% arch FILTER ucfirst %]_[% group.name %] [] = {[% FOREACH metric IN group.metrics %] "[% metric.label %]", [% END %] NULL};
-[% END %]
-
-static PerfmonGroupMap [% arch %]_group_map[NUM_GROUPS_[% arch FILTER upper %]] = {
-[% FOREACH group IN groups %]
- {"[% group.name %]",[% group.name %],[% group.isUncore %],"[% group.shortHelp %]","[% group.eventSet %]", 0 [% FOREACH metric IN group.metrics %] +1 [% END %], group_names_[% arch FILTER ucfirst %]_[% group.name %]
- },
-[% END %]
-};
-
-void perfmon_getDerivedCounterValues[% arch FILTER ucfirst %](PerfmonGroup group, float * values, float * out_max, float * out_min){
- double time = rdtscTime;
- double inverseClock = 1.0 /(double) timer_getCpuClock();
-
- values[0] = time;
- out_min[0] = time;
- out_max[0] = time;
-
- switch ( group ) {
- [% FOREACH group IN groups %]
- case [% group.name %]:{
- int threadId;
- int counter = 0;
- double sum,min,max;
-
- [% FOREACH metric IN group.metrics %]
- sum = 0;
- min = 1e300;
- max = 0;
-
- for(threadId=0; threadId < perfmon_numThreads; threadId++)
- {
- double cur = [% metric.rule %];
- cur = isnan(cur) ? 0.0 : cur;
- sum += cur;
- max = max > cur ? max : cur;
- min = min < cur ? min : cur;
- }
-
- values[counter] = (float) sum / perfmon_numThreads;
- out_min[counter] = (float) min;
- out_max[counter] = (float) max;
- counter++;
- [% END %]
- return;
- }
- [% END %]
-
- default:
- fprintf (stderr, "perfmon_getDerivedCounterValues[% arch %]: Unknown group! Exiting!\n" );
- exit (EXIT_FAILURE);
- break;
- }
-}
-
-
-void
-perfmon_printDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup groupId)
-{
- int threadId;
- double time = rdtscTime;
- double inverseClock = 1.0 /(double) timer_getCpuClock();
- PerfmonResultTable tableData;
- int numRows;
- int numColumns = perfmon_numThreads;
- bstring label;
- bstrList* fc;
- double** stat;
- double tmpValue;
- uint64_t cpi_instr = 0;
- uint64_t cpi_cyc = 0;
- int cpi_index = 0;
-
- switch ( groupId )
- {
-[% FOREACH group IN groups %]
- case [% group.name %]:
- numRows = [% group.numRows %];
- stat = (double**) malloc(numRows * sizeof(double*));
- for (int i=0; i<numRows; i++)
- {
- stat[i] = (double*) malloc(4 * sizeof(double));
- stat[i][0] = 0;
- stat[i][1] = 0;
- stat[i][2] = DBL_MAX;
- }
- INIT_BASIC;
-[% FOREACH metric IN group.metrics %]
- bstrListAdd(fc,[% loop.count %],[% metric.label %]);
-[% END %]
- initResultTable(&tableData, fc, numRows, numColumns);
-
- for(threadId=0; threadId < perfmon_numThreads; threadId++)
- {
-[% FOREACH metric IN group.metrics %]
- tmpValue = [% metric.rule %];
- if (!isnan(tmpValue))
- {
- tableData.rows[[% loop.index %]].value[threadId] = tmpValue;
- }
- else
- {
- tableData.rows[[% loop.index %]].value[threadId] = 0.0;
- }
-[% IF metric.label == 'CPI' && arch == 'westmere' %]
- cpi_instr += perfmon_getResult(threadId,"FIXC0");
- cpi_cyc += perfmon_getResult(threadId,"FIXC1");
- cpi_index = [% loop.index %];
-[% ELSE %]
- stat[[% loop.index %]][0] += (double) tableData.rows[[% loop.index %]].value[threadId];
-[% END %]
- stat[[% loop.index %]][1] = MAX(stat[[% loop.index %]][1],(double) tableData.rows[[% loop.index %]].value[threadId]);
- stat[[% loop.index %]][2] = MIN(stat[[% loop.index %]][2],(double) tableData.rows[[% loop.index %]].value[threadId]);
-[% END %]
- }
-
- if (cpi_instr)
- {
- stat[cpi_index][0] = (double) cpi_cyc / (double) cpi_instr;
- }
-
- break;
-[% END %]
-
- default:
- fprintf (stderr, "perfmon_printDerivedMetrics[% arch %]: Unknown group! Exiting!\n" );
- exit (EXIT_FAILURE);
- break;
- }
-
- printResultTable(&tableData);
- freeResultTable(&tableData);
-
- /* for threaded results print sum, max, min and avg */
- if (perfmon_numThreads > 1)
- {
- initStatisticTable(&tableData, fc, numRows);
- for (int i=0; i<numRows; i++)
- {
- stat[i][3] = stat[i][0]/perfmon_numThreads;
- for (int j=0; j<4; j++)
- {
- tableData.rows[i].value[j] = stat[i][j];
- }
- }
- printResultTable(&tableData);
- freeResultTable(&tableData);
- }
-
- for (int i=0; i<numRows; i++)
- {
- free(stat[i]);
- }
- free(stat);
- bstrListDestroy(fc);
-}
-
-void
-perfmon_logDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup group, double time,double timeStamp)
-{
- int threadId;
- double tmpValue;
- double inverseClock = 1.0 /(double) timer_getCpuClock();
-
- switch ( group )
- {
- [% FOREACH group IN groups %]
- case [% group.name %]:
-
- [% FOREACH metric IN group.metrics %]
- printf("[% metric.label %] %e ",timeStamp);
- for(threadId=0; threadId < perfmon_numThreads; threadId++)
- {
- tmpValue = [% metric.rule %];
- if (!isnan(tmpValue))
- {
- printf(" %e ", tmpValue);
- }
- else
- {
- printf(" 0.0 ");
- }
- }
- printf("\n");
- [% END %]
- break;
- [% END %]
-
- default:
- fprintf (stderr, "perfmon_printDerivedMetrics[% arch %]: Unknown group! Exiting!\n" );
- exit (EXIT_FAILURE);
- break;
- }
-}
-
-
-
-static PerfmonGroupHelp [% arch %]_group_help[NUM_GROUPS_[% arch FILTER upper %]] = {
-[% FOREACH group IN groups %]
- {"[% group.name %]","[% group.longHelp %]"},
-[% END %]
-};
-
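
The group.tt template removed here generated, for every metric of a group, the same per-thread reduction: evaluate the metric rule, treat NaN as zero, accumulate sum, minimum and maximum across threads, and report the per-thread average. A stand-alone sketch of that reduction pattern, with made-up input values:

    #include <float.h>
    #include <math.h>
    #include <stdio.h>

    /* reduction the template emitted for each metric of a group */
    static void reduce_metric(const double* per_thread, int nthreads,
                              float* avg, float* min, float* max)
    {
        double sum = 0.0, mn = DBL_MAX, mx = 0.0;
        for (int t = 0; t < nthreads; t++)
        {
            double cur = per_thread[t];
            cur = isnan(cur) ? 0.0 : cur;    /* NaN results count as zero, as in the template */
            sum += cur;
            mx = mx > cur ? mx : cur;
            mn = mn < cur ? mn : cur;
        }
        *avg = (float)(sum / nthreads);
        *min = (float)mn;
        *max = (float)mx;
    }

    int main(void)
    {
        double vals[4] = { 1.0, 2.0, NAN, 3.0 };
        float avg, min, max;
        reduce_metric(vals, 4, &avg, &min, &max);
        printf("avg=%.2f min=%.2f max=%.2f\n", avg, min, max);   /* avg=1.50 min=0.00 max=3.00 */
        return 0;
    }
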
diff --git a/perl/templates/testcases.tt b/perl/templates/testcases.tt
deleted file mode 100644
index 1f03a85..0000000
--- a/perl/templates/testcases.tt
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef TESTCASES_H
-#define TESTCASES_H
-
-#include <test_types.h>
-
-[% FOREACH test IN Testcases %]
-extern void [% test.name %]();
-[% END %]
-
-#define TESTS "[% allTests %]"
-#define NUMKERNELS [% numKernels %]
-
-static const TestCase kernels[NUMKERNELS] = {
- [% FOREACH test IN Testcases %]
- {"[% test.name %]" , [% test.streams %], [% test.type %], [% test.stride %], &[% test.name %], [% test.flops %], [% test.bytes %]},
- [% END %]
-};
-
-#endif /* TESTCASES_H */
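
The testcases.tt template above expands the benchmark kernel list into a C header: one extern declaration per generated assembly kernel, a TESTS string and a static TestCase table. For a single, hypothetical kernel called copy_avx the generated testcases.h would look roughly like the sketch below; the kernel name, the stream count, the DOUBLE type tag and the flop/byte numbers are illustrative assumptions, the real values come from the .ptt benchmark descriptions.

    #ifndef TESTCASES_H
    #define TESTCASES_H

    #include <test_types.h>

    extern void copy_avx();

    #define TESTS "copy_avx"
    #define NUMKERNELS 1

    static const TestCase kernels[NUMKERNELS] = {
        /* name, streams, type, stride, function pointer, flops, bytes per iteration */
        {"copy_avx", 2, DOUBLE, 4, &copy_avx, 0, 16},
    };

    #endif /* TESTCASES_H */
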
diff --git a/src/access-daemon/Makefile b/src/access-daemon/Makefile
index afd751b..5af6941 100644
--- a/src/access-daemon/Makefile
+++ b/src/access-daemon/Makefile
@@ -4,13 +4,13 @@
#
# Description: accessDaemon Makefile
#
-# Version: 3.1.3
-# Released: 4.11.2014
+# Version: 4.1
+# Released: 19.5.2016
#
# Author: Jan Treibig (jt), jan.treibig at gmail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -31,20 +31,22 @@ include ../../make/include_$(COMPILER).mk
DAEMON_TARGET = likwid-accessD
SETFREQ_TARGET = likwid-setFreq
+Q ?= @
-DEFINES = -D_GNU_SOURCE -DMAX_NUM_THREADS=$(MAX_NUM_THREADS)
+DEFINES += -D_GNU_SOURCE -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) -DMAX_NUM_NODES=$(MAX_NUM_NODES)
INCLUDES = -I../includes
-ifeq ($(COMPILER),GCC)
-CFLAGS += -pedantic -Wall -Wextra -std=c99
+CFLAGS += -std=c99 -fPIC -pie -fPIE -fstack-protector
+ifeq ($(COMPILER),GCCX86)
+CFLAGS += -m32
endif
CPPFLAGS := $(DEFINES) $(INCLUDES)
-Q=
all: $(DAEMON_TARGET) $(SETFREQ_TARGET)
$(DAEMON_TARGET): accessDaemon.c
- $(CC) $(ANSI_CFLAGS) $(CFLAGS) $(CPPFLAGS) -o ../../$(DAEMON_TARGET) accessDaemon.c
+ $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -o ../../$(DAEMON_TARGET) accessDaemon.c
$(SETFREQ_TARGET): setFreq.c
- $(CC) $(ANSI_CFLAGS) $(CFLAGS) $(CPPFLAGS) -o ../../$(SETFREQ_TARGET) setFreq.c
+ $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -o ../../$(SETFREQ_TARGET) setFreq.c
+
diff --git a/src/access-daemon/accessDaemon.c b/src/access-daemon/accessDaemon.c
index 5679a92..ee875fb 100644
--- a/src/access-daemon/accessDaemon.c
+++ b/src/access-daemon/accessDaemon.c
@@ -5,14 +5,15 @@
*
* Description: Implementation of access daemon.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
* Authors: Michael Meier, michael.meier at rrze.fau.de
- * Jan Treibig (jt), jan.treibig at gmail.com
+ * Jan Treibig (jt), jan.treibig at gmail.com,
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -46,135 +47,53 @@
#include <sys/fsuid.h>
#include <getopt.h>
-#include <pci_types.h>
+#include <types.h>
+#include <registers.h>
+#include <perfmon_haswellEP_counters.h>
+#include <perfmon_ivybridgeEP_counters.h>
+#include <perfmon_sandybridgeEP_counters.h>
+#include <perfmon_broadwelld_counters.h>
+#include <perfmon_broadwellEP_counters.h>
+#include <topology.h>
+#include <cpuid.h>
#include <lock.h>
-#include <accessClient_types.h>
+
/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
#define SA struct sockaddr
#define str(x) #x
-#define CHECK_ERROR(func, msg) \
- if ((func) < 0) { \
- syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
- }
-
#define CHECK_FILE_ERROR(func, msg) \
- if ((func) == 0) { \
- syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
- }
-
-
-#define EXIT_IF_ERROR(func, msg) \
- if ((func) < 0) { \
- syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
- stop_daemon(); \
- exit(EXIT_FAILURE); \
- }
-
-
-#define CPUID \
- __asm__ volatile ("cpuid" \
- : "=a" (eax), "=b" (ebx) \
- : "0" (eax))
-
-
-/* Intel P6 */
-#define PENTIUM_M_BANIAS 0x09U
-#define PENTIUM_M_DOTHAN 0x0DU
-#define CORE_DUO 0x0EU
-#define CORE2_65 0x0FU
-#define CORE2_45 0x17U
-#define ATOM 0x1CU
-#define ATOM_45 0x26U
-#define ATOM_32 0x36U
-#define ATOM_22 0x27U
-#define ATOM_SILVERMONT 0x4DU
-#define NEHALEM 0x1AU
-#define NEHALEM_BLOOMFIELD 0x1AU
-#define NEHALEM_LYNNFIELD 0x1EU
-#define NEHALEM_LYNNFIELD_M 0x1FU
-#define NEHALEM_WESTMERE 0x2CU
-#define NEHALEM_WESTMERE_M 0x25U
-#define SANDYBRIDGE 0x2AU
-#define SANDYBRIDGE_EP 0x2DU
-#define HASWELL 0x3CU
-#define HASWELL_EX 0x3FU
-#define HASWELL_M1 0x45U
-#define HASWELL_M2 0x46U
-#define IVYBRIDGE 0x3AU
-#define IVYBRIDGE_EP 0x3EU
-#define NEHALEM_EX 0x2EU
-#define WESTMERE_EX 0x2FU
-#define XEON_MP 0x1DU
-
-/* Intel MIC */
-#define XEON_PHI 0x01U
-
-/* AMD K10 */
-#define BARCELONA 0x02U
-#define SHANGHAI 0x04U
-#define ISTANBUL 0x08U
-#define MAGNYCOURS 0x09U
-
-/* AMD K8 */
-#define OPTERON_SC_1MB 0x05U
-#define OPTERON_DC_E 0x21U
-#define OPTERON_DC_F 0x41U
-#define ATHLON64_X2 0x43U
-#define ATHLON64_X2_F 0x4BU
-#define ATHLON64_F1 0x4FU
-#define ATHLON64_F2 0x5FU
-#define ATHLON64_X2_G 0x6BU
-#define ATHLON64_G1 0x6FU
-#define ATHLON64_G2 0x7FU
-
-
-#define P6_FAMILY 0x6U
-#define MIC_FAMILY 0xBU
-#define NETBURST_FAMILY 0xFFU
-#define K15_FAMILY 0x15U
-#define K16_FAMILY 0x16U
-#define K10_FAMILY 0x10U
-#define K8_FAMILY 0xFU
+ if ((func) == 0) { syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); }
+
+
+
+
+
#define PCI_ROOT_PATH "/proc/bus/pci/"
-#define MAX_PATH_LENGTH 60
-#define MAX_NUM_NODES 4
+#define MAX_PATH_LENGTH 80
+//#define MAX_NUM_NODES 4
/* Lock file controlled from outside which prevents likwid to start.
* Can be used to synchronize access to the hardware counters
* with an external monitoring system. */
/* ##### TYPE DEFINITIONS ########### */
-typedef int (*FuncPrototype)(uint32_t);
+typedef int (*AllowedPrototype)(uint32_t);
+typedef int (*AllowedPciPrototype)(PciDeviceType, uint32_t);
/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
static int sockfd = -1;
static int connfd = -1; /* temporary in to make it compile */
static char* filepath;
static const char* ident = "accessD";
-static FuncPrototype allowed = NULL;
+static AllowedPrototype allowed = NULL;
+static AllowedPciPrototype allowedPci = NULL;
static int FD_MSR[MAX_NUM_THREADS];
-static int FD_PCI[MAX_NUM_NODES][MAX_NUM_DEVICES];
+static int FD_PCI[MAX_NUM_NODES][MAX_NUM_PCI_DEVICES];
static int isPCIUncore = 0;
-
-static char* pci_DevicePath[MAX_NUM_DEVICES] = {
- "13.5", /* PCI_R3QPI_DEVICE_LINK_0 */
- "13.6", /* PCI_R3QPI_DEVICE_LINK_1 */
- "13.1", /* PCI_R2PCIE_DEVICE */
- "10.0", /* PCI_IMC_DEVICE_CH_0 */
- "10.1", /* PCI_IMC_DEVICE_CH_1 */
- "10.4", /* PCI_IMC_DEVICE_CH_2 */
- "10.5", /* PCI_IMC_DEVICE_CH_3 */
- "0e.1", /* PCI_HA_DEVICE */
- "08.2", /* PCI_QPI_DEVICE_PORT_0 */
- "09.2", /* PCI_QPI_DEVICE_PORT_1 */
- "08.6", /* PCI_QPI_MASK_DEVICE_PORT_0 */
- "09.6", /* PCI_QPI_MASK_DEVICE_PORT_1 */
- "08.0", /* PCI_QPI_MISC_DEVICE_PORT_0 */
- "09.0" }; /* PCI_QPI_MISC_DEVICE_PORT_1 */
-
+static PciDevice* pci_devices_daemon = NULL;
static char pci_filepath[MAX_PATH_LENGTH];
/* Socket to bus mapping -- will be determined at runtime;
@@ -185,7 +104,7 @@ static char pci_filepath[MAX_PATH_LENGTH];
* 2 0xbf
* 3 0xff
*/
-static char* socket_bus[MAX_NUM_NODES];
+static char* socket_bus[MAX_NUM_NODES] = { [0 ... (MAX_NUM_NODES-1)] = NULL};
/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
@@ -198,12 +117,15 @@ static int allowed_intel(uint32_t reg)
((reg & 0xF00U) == 0xC00U) ||
((reg & 0xF00U) == 0xD00U) ||
((reg & 0xF00U) == 0xE00U) ||
+ ((reg & 0xF00U) == 0xF00U) ||
(reg == 0x1A0) ||
+ (reg == 0x1A4) ||
(reg == 0x0CE) ||
(reg == 0x19C) ||
(reg == 0x1A2) ||
(reg == 0x1AD) ||
- (reg == 0x1A6))
+ (reg == 0x1A6) ||
+ (reg == 0x1A7))
{
return 1;
}
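
The lines added above grow the allowed_intel() whitelist a whole block at a time: the new (reg & 0xF00U) == 0xF00U test admits every MSR address from 0xF00 through 0xFFF with a single comparison, just like the existing 0xC00/0xD00/0xE00 tests, while registers such as 0x1A4 or 0x1A7 that fall outside every block need an explicit entry. A minimal sketch of the mask arithmetic; the register name in the comment is my reading of the Intel MSR map, not something this patch states:

    #include <assert.h>
    #include <stdint.h>

    /* does reg fall into the 0x100-wide address block starting at 'block'? */
    static int in_block(uint32_t reg, uint32_t block)
    {
        return (reg & 0xF00U) == block;
    }

    int main(void)
    {
        assert(in_block(0xF10, 0xF00));    /* any 0xF00-0xFFF register passes the new block test */
        assert(in_block(0x309, 0x300));    /* IA32_FIXED_CTR0 sits in the long-standing 0x300 block */
        assert(!in_block(0x1A4, 0xF00));   /* 0x1A4 matches no block test, hence its explicit entry */
        return 0;
    }
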
@@ -213,58 +135,159 @@ static int allowed_intel(uint32_t reg)
}
}
-static int allowed_silvermont(uint32_t reg)
+static int allowed_sandybridge(uint32_t reg)
{
- if ( ((reg & 0x0F8U) == 0x0C0U) ||
- ((reg & 0xFF0U) == 0x180U) ||
- ((reg & 0xF00U) == 0x300U) ||
- ((reg & 0xF00U) == 0x600U) ||
- ((reg & 0xF00U) == 0xC00U) ||
- ((reg & 0xF00U) == 0xD00U) ||
- (reg == 0x1A0) ||
- (reg == 0x0CE) ||
- (reg == 0x1AD) ||
- (reg == 0x19C) ||
- (reg == 0x1A2) ||
- (reg == 0x1A6) ||
- (reg == 0x1A6) ||
- (reg == 0x1A7))
+ if ((allowed_intel(reg)) ||
+ (((reg & 0xF00U) == 0x600U)))
{
return 1;
}
- else
- {
- return 0;
- }
+ return 0;
}
-static int allowed_westmereEX(uint32_t reg)
+static int allowed_pci_sandybridge(PciDeviceType type, uint32_t reg)
{
- if (allowed_intel(reg) == 1)
+ switch (type)
{
- return 1;
- }
- else if ((reg & 0xF00) == 0xF00)
- {
- return 1;
+ case NODEVTYPE:
+ return 1;
+ break;
+ case R3QPI:
+ if ((reg == PCI_UNC_R3QPI_PMON_BOX_CTL) ||
+ (reg == PCI_UNC_R3QPI_PMON_BOX_STATUS) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTL_0) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTL_1) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTL_2) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTR_0_A) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTR_1_A) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTR_2_A) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTR_0_B) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTR_1_B) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTR_2_B))
+ {
+ return 1;
+ }
+ return 0;
+ break;
+ case R2PCIE:
+ if ((reg == PCI_UNC_R2PCIE_PMON_BOX_CTL) ||
+ (reg == PCI_UNC_R2PCIE_PMON_BOX_STATUS) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTL_0) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTL_1) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTL_2) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTL_3) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTR_0_A) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTR_1_A) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTR_2_A) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTR_3_A) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTR_0_B) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTR_1_B) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTR_2_B) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTR_3_B))
+ {
+ return 1;
+ }
+ return 0;
+ break;
+ case IMC:
+ if ((reg == PCI_UNC_MC_PMON_BOX_CTL) ||
+ (reg == PCI_UNC_MC_PMON_BOX_STATUS) ||
+ (reg == PCI_UNC_MC_PMON_CTL_0) ||
+ (reg == PCI_UNC_MC_PMON_CTL_1) ||
+ (reg == PCI_UNC_MC_PMON_CTL_2) ||
+ (reg == PCI_UNC_MC_PMON_CTL_3) ||
+ (reg == PCI_UNC_MC_PMON_CTR_0_A) ||
+ (reg == PCI_UNC_MC_PMON_CTR_1_A) ||
+ (reg == PCI_UNC_MC_PMON_CTR_2_A) ||
+ (reg == PCI_UNC_MC_PMON_CTR_3_A) ||
+ (reg == PCI_UNC_MC_PMON_CTR_0_B) ||
+ (reg == PCI_UNC_MC_PMON_CTR_1_B) ||
+ (reg == PCI_UNC_MC_PMON_CTR_2_B) ||
+ (reg == PCI_UNC_MC_PMON_CTR_3_B) ||
+ (reg == PCI_UNC_MC_PMON_FIXED_CTL) ||
+ (reg == PCI_UNC_MC_PMON_FIXED_CTR_A) ||
+ (reg == PCI_UNC_MC_PMON_FIXED_CTR_B))
+ {
+ return 1;
+ }
+ return 0;
+ break;
+ case HA:
+ if ((reg == PCI_UNC_HA_PMON_BOX_CTL) ||
+ (reg == PCI_UNC_HA_PMON_BOX_STATUS) ||
+ (reg == PCI_UNC_HA_PMON_CTL_0) ||
+ (reg == PCI_UNC_HA_PMON_CTL_1) ||
+ (reg == PCI_UNC_HA_PMON_CTL_2) ||
+ (reg == PCI_UNC_HA_PMON_CTL_3) ||
+ (reg == PCI_UNC_HA_PMON_CTR_0_A) ||
+ (reg == PCI_UNC_HA_PMON_CTR_1_A) ||
+ (reg == PCI_UNC_HA_PMON_CTR_2_A) ||
+ (reg == PCI_UNC_HA_PMON_CTR_3_A) ||
+ (reg == PCI_UNC_HA_PMON_CTR_0_B) ||
+ (reg == PCI_UNC_HA_PMON_CTR_1_B) ||
+ (reg == PCI_UNC_HA_PMON_CTR_2_B) ||
+ (reg == PCI_UNC_HA_PMON_CTR_3_B) ||
+ (reg == PCI_UNC_HA_PMON_OPCODEMATCH) ||
+ (reg == PCI_UNC_HA_PMON_ADDRMATCH0) ||
+ (reg == PCI_UNC_HA_PMON_ADDRMATCH1))
+ {
+ return 1;
+ }
+ return 0;
+ break;
+ case QPI:
+ if ((reg == PCI_UNC_QPI_PMON_BOX_CTL) ||
+ (reg == PCI_UNC_QPI_PMON_BOX_STATUS) ||
+ (reg == PCI_UNC_QPI_PMON_CTL_0) ||
+ (reg == PCI_UNC_QPI_PMON_CTL_1) ||
+ (reg == PCI_UNC_QPI_PMON_CTL_2) ||
+ (reg == PCI_UNC_QPI_PMON_CTL_3) ||
+ (reg == PCI_UNC_QPI_PMON_CTR_0_A) ||
+ (reg == PCI_UNC_QPI_PMON_CTR_1_A) ||
+ (reg == PCI_UNC_QPI_PMON_CTR_2_A) ||
+ (reg == PCI_UNC_QPI_PMON_CTR_3_A) ||
+ (reg == PCI_UNC_QPI_PMON_CTR_0_B) ||
+ (reg == PCI_UNC_QPI_PMON_CTR_1_B) ||
+ (reg == PCI_UNC_QPI_PMON_CTR_2_B) ||
+ (reg == PCI_UNC_QPI_PMON_CTR_3_B) ||
+ (reg == PCI_UNC_QPI_PMON_MASK_0) ||
+ (reg == PCI_UNC_QPI_PMON_MASK_1) ||
+ (reg == PCI_UNC_QPI_PMON_MATCH_0) ||
+ (reg == PCI_UNC_QPI_PMON_MATCH_1) ||
+ (reg == PCI_UNC_QPI_RATE_STATUS))
+ {
+ return 1;
+ }
+ return 0;
+ break;
+ case IRP:
+ if ((reg == PCI_UNC_IRP_PMON_BOX_STATUS) ||
+ (reg == PCI_UNC_IRP_PMON_BOX_CTL) ||
+ (reg == PCI_UNC_IRP0_PMON_CTL_0) ||
+ (reg == PCI_UNC_IRP0_PMON_CTL_1) ||
+ (reg == PCI_UNC_IRP0_PMON_CTR_0) ||
+ (reg == PCI_UNC_IRP0_PMON_CTR_1) ||
+ (reg == PCI_UNC_IRP1_PMON_CTL_0) ||
+ (reg == PCI_UNC_IRP1_PMON_CTL_1) ||
+ (reg == PCI_UNC_IRP1_PMON_CTR_0) ||
+ (reg == PCI_UNC_IRP1_PMON_CTR_1))
+ {
+ return 1;
+ }
+ return 0;
+ break;
+ default:
+ return 0;
+ break;
}
return 0;
}
-static int allowed_sandybridge(uint32_t reg)
+static int allowed_haswell(uint32_t reg)
{
- if ( ((reg & 0x0F8U) == 0x0C0U) ||
- ((reg & 0xFF0U) == 0x180U) ||
- ((reg & 0xF00U) == 0x300U) ||
- ((reg & 0xF00U) == 0x600U) ||
- ((reg & 0xF00U) == 0xC00U) ||
- ((reg & 0xF00U) == 0xD00U) ||
- (reg == 0x1A0) ||
- (reg == 0x0CE) ||
- (reg == 0x1AD) ||
- (reg == 0x19C) ||
- (reg == 0x1A2) ||
- (reg == 0x1A6))
+ if ((allowed_intel(reg)) ||
+ (allowed_sandybridge(reg)) ||
+ (((reg & 0xF00U) == 0x700U)))
{
return 1;
}
@@ -274,22 +297,150 @@ static int allowed_sandybridge(uint32_t reg)
}
}
-static int allowed_haswell(uint32_t reg)
+static int allowed_pci_haswell(PciDeviceType type, uint32_t reg)
{
+ switch (type)
+ {
+ case NODEVTYPE:
+ return 1;
+ break;
+ case R3QPI:
+ if ((reg == PCI_UNC_R3QPI_PMON_BOX_CTL) ||
+ (reg == PCI_UNC_R3QPI_PMON_BOX_STATUS) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTL_0) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTL_1) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTL_2) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTR_0_A) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTR_1_A) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTR_2_A) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTR_0_B) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTR_1_B) ||
+ (reg == PCI_UNC_R3QPI_PMON_CTR_2_B))
+ {
+ return 1;
+ }
+ return 0;
+ break;
+ case R2PCIE:
+ if ((reg == PCI_UNC_R2PCIE_PMON_BOX_CTL) ||
+ (reg == PCI_UNC_R2PCIE_PMON_BOX_STATUS) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTL_0) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTL_1) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTL_2) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTL_3) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTR_0_A) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTR_1_A) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTR_2_A) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTR_3_A) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTR_0_B) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTR_1_B) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTR_2_B) ||
+ (reg == PCI_UNC_R2PCIE_PMON_CTR_3_B))
+ {
+ return 1;
+ }
+ return 0;
+ break;
+ case IMC:
+ if ((reg == PCI_UNC_MC_PMON_BOX_CTL) ||
+ (reg == PCI_UNC_MC_PMON_BOX_STATUS) ||
+ (reg == PCI_UNC_MC_PMON_CTL_0) ||
+ (reg == PCI_UNC_MC_PMON_CTL_1) ||
+ (reg == PCI_UNC_MC_PMON_CTL_2) ||
+ (reg == PCI_UNC_MC_PMON_CTL_3) ||
+ (reg == PCI_UNC_MC_PMON_CTR_0_A) ||
+ (reg == PCI_UNC_MC_PMON_CTR_1_A) ||
+ (reg == PCI_UNC_MC_PMON_CTR_2_A) ||
+ (reg == PCI_UNC_MC_PMON_CTR_3_A) ||
+ (reg == PCI_UNC_MC_PMON_CTR_0_B) ||
+ (reg == PCI_UNC_MC_PMON_CTR_1_B) ||
+ (reg == PCI_UNC_MC_PMON_CTR_2_B) ||
+ (reg == PCI_UNC_MC_PMON_CTR_3_B) ||
+ (reg == PCI_UNC_MC_PMON_FIXED_CTL) ||
+ (reg == PCI_UNC_MC_PMON_FIXED_CTR_A) ||
+ (reg == PCI_UNC_MC_PMON_FIXED_CTR_B))
+ {
+ return 1;
+ }
+ return 0;
+ break;
+ case HA:
+ if ((reg == PCI_UNC_HA_PMON_BOX_CTL) ||
+ (reg == PCI_UNC_HA_PMON_BOX_STATUS) ||
+ (reg == PCI_UNC_HA_PMON_CTL_0) ||
+ (reg == PCI_UNC_HA_PMON_CTL_1) ||
+ (reg == PCI_UNC_HA_PMON_CTL_2) ||
+ (reg == PCI_UNC_HA_PMON_CTL_3) ||
+ (reg == PCI_UNC_HA_PMON_CTR_0_A) ||
+ (reg == PCI_UNC_HA_PMON_CTR_1_A) ||
+ (reg == PCI_UNC_HA_PMON_CTR_2_A) ||
+ (reg == PCI_UNC_HA_PMON_CTR_3_A) ||
+ (reg == PCI_UNC_HA_PMON_CTR_0_B) ||
+ (reg == PCI_UNC_HA_PMON_CTR_1_B) ||
+ (reg == PCI_UNC_HA_PMON_CTR_2_B) ||
+ (reg == PCI_UNC_HA_PMON_CTR_3_B) ||
+ (reg == PCI_UNC_HA_PMON_OPCODEMATCH) ||
+ (reg == PCI_UNC_HA_PMON_ADDRMATCH0) ||
+ (reg == PCI_UNC_HA_PMON_ADDRMATCH1))
+ {
+ return 1;
+ }
+ return 0;
+ break;
+ case QPI:
+ if ((reg == PCI_UNC_V3_QPI_PMON_BOX_CTL) ||
+ (reg == PCI_UNC_V3_QPI_PMON_BOX_STATUS) ||
+ (reg == PCI_UNC_V3_QPI_PMON_CTL_0) ||
+ (reg == PCI_UNC_V3_QPI_PMON_CTL_1) ||
+ (reg == PCI_UNC_V3_QPI_PMON_CTL_2) ||
+ (reg == PCI_UNC_V3_QPI_PMON_CTL_3) ||
+ (reg == PCI_UNC_V3_QPI_PMON_CTR_0_A) ||
+ (reg == PCI_UNC_V3_QPI_PMON_CTR_1_A) ||
+ (reg == PCI_UNC_V3_QPI_PMON_CTR_2_A) ||
+ (reg == PCI_UNC_V3_QPI_PMON_CTR_3_A) ||
+ (reg == PCI_UNC_V3_QPI_PMON_CTR_0_B) ||
+ (reg == PCI_UNC_V3_QPI_PMON_CTR_1_B) ||
+ (reg == PCI_UNC_V3_QPI_PMON_CTR_2_B) ||
+ (reg == PCI_UNC_V3_QPI_PMON_CTR_3_B) ||
+ (reg == PCI_UNC_V3_QPI_PMON_RX_MASK_0) ||
+ (reg == PCI_UNC_V3_QPI_PMON_RX_MASK_1) ||
+ (reg == PCI_UNC_V3_QPI_PMON_RX_MATCH_0) ||
+ (reg == PCI_UNC_V3_QPI_PMON_RX_MATCH_1) ||
+ (reg == PCI_UNC_V3_QPI_PMON_TX_MASK_0) ||
+ (reg == PCI_UNC_V3_QPI_PMON_TX_MASK_1) ||
+ (reg == PCI_UNC_V3_QPI_PMON_TX_MATCH_0) ||
+ (reg == PCI_UNC_V3_QPI_PMON_TX_MATCH_1) ||
+ (reg == PCI_UNC_V3_QPI_RATE_STATUS) ||
+ (reg == PCI_UNC_V3_QPI_LINK_LLR) ||
+ (reg == PCI_UNC_V3_QPI_LINK_IDLE))
+ {
+ return 1;
+ }
+ return 0;
+ break;
+ default:
+ return 0;
+ break;
+ }
+ return 0;
+}
+
+static int allowed_silvermont(uint32_t reg)
+{
+
if ( ((reg & 0x0F8U) == 0x0C0U) ||
((reg & 0xFF0U) == 0x180U) ||
((reg & 0xF00U) == 0x300U) ||
+ ((reg & 0xF00U) == 0x600U) ||
((reg & 0xF00U) == 0xC00U) ||
((reg & 0xF00U) == 0xD00U) ||
- ((reg & 0xF00U) == 0xE00U) ||
- ((reg & 0xF00U) == 0x600U) ||
- ((reg & 0xF00U) == 0x700U) ||
(reg == 0x1A0) ||
(reg == 0x0CE) ||
+ (reg == 0x1AD) ||
(reg == 0x19C) ||
(reg == 0x1A2) ||
- (reg == 0x1AD) ||
- (reg == 0x1A6))
+ (reg == 0x1A6) ||
+ (reg == 0x1A7))
{
return 1;
}
@@ -347,25 +498,25 @@ static void msr_read(AccessDataRecord * dRecord)
dRecord->errorcode = ERR_NOERROR;
dRecord->data = 0;
- if (FD_MSR[cpu] == -2)
+ if (FD_MSR[cpu] <= 0)
{
dRecord->errorcode = ERR_NODEV;
return;
}
+
if (!allowed(reg))
{
- syslog(LOG_ERR, "attempt to read from restricted register 0x%x", reg);
dRecord->errorcode = ERR_RESTREG;
return;
}
if (pread(FD_MSR[cpu], &data, sizeof(data), reg) != sizeof(data))
{
- syslog(LOG_ERR, "Failed to read data from msr device file on core %u", cpu);
+ syslog(LOG_ERR, "Failed to read data to register 0x%x on core %u", reg, cpu);
+ syslog(LOG_ERR, "%s", strerror(errno));
dRecord->errorcode = ERR_RWFAIL;
return;
}
-
dRecord->data = data;
}
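
msr_read() above (and msr_write() below) rely on the Linux msr driver convention that the register address is the file offset: an 8-byte pread() at offset reg on /dev/cpu/<N>/msr returns the 64-bit register value. A stand-alone sketch of the same access pattern in direct mode, without the daemon; it needs root (or the corresponding capability), and 0x1A0, IA32_MISC_ENABLE, is used purely as an example of a whitelisted register:

    #include <fcntl.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        const uint32_t reg = 0x1A0;                     /* IA32_MISC_ENABLE, see whitelist above */
        uint64_t data = 0;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);
        if (fd < 0) { perror("open /dev/cpu/0/msr"); return 1; }
        /* the MSR address doubles as the file offset */
        if (pread(fd, &data, sizeof(data), reg) != sizeof(data)) {
            perror("pread"); close(fd); return 1;
        }
        printf("MSR 0x%" PRIx32 " on core 0: 0x%" PRIx64 "\n", reg, data);
        close(fd);
        return 0;
    }
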
@@ -376,8 +527,8 @@ static void msr_write(AccessDataRecord * dRecord)
uint64_t data = dRecord->data;
dRecord->errorcode = ERR_NOERROR;
-
- if (FD_MSR[cpu] == -2)
+
+ if (FD_MSR[cpu] <= 0)
{
dRecord->errorcode = ERR_NODEV;
return;
@@ -385,19 +536,33 @@ static void msr_write(AccessDataRecord * dRecord)
if (!allowed(reg))
{
- syslog(LOG_ERR, "attempt to write to restricted register %x", reg);
+ syslog(LOG_ERR, "Attempt to write to restricted register 0x%x on core %u", reg, cpu);
dRecord->errorcode = ERR_RESTREG;
return;
}
if (pwrite(FD_MSR[cpu], &data, sizeof(data), reg) != sizeof(data))
{
- syslog(LOG_ERR, "Failed to write data to msr device file on core %u", cpu);
+ syslog(LOG_ERR, "Failed to write data to register 0x%x on core %u", reg, cpu);
+ syslog(LOG_ERR, "%s", strerror(errno));
dRecord->errorcode = ERR_RWFAIL;
return;
}
}
+static void msr_check(AccessDataRecord * dRecord)
+{
+ uint32_t cpu = dRecord->cpu;
+ dRecord->errorcode = ERR_NOERROR;
+
+ if (FD_MSR[cpu] < 0)
+ {
+ dRecord->errorcode = ERR_NODEV;
+ return;
+ }
+ return;
+}
+
static void pci_read(AccessDataRecord* dRecord)
{
uint32_t socketId = dRecord->cpu;
@@ -413,26 +578,35 @@ static void pci_read(AccessDataRecord* dRecord)
dRecord->errorcode = ERR_NODEV;
return;
}
- else if ( !FD_PCI[socketId][device] )
- {
- strncpy(pci_filepath, PCI_ROOT_PATH, 30);
- strncat(pci_filepath, socket_bus[socketId], 10);
- strncat(pci_filepath, pci_DevicePath[device], 20);
+ if (allowedPci)
+ {
+ if (!allowedPci(pci_devices_daemon[device].type, reg))
+ {
+ dRecord->errorcode = ERR_RESTREG;
+ return;
+ }
+ }
+ if ( !FD_PCI[socketId][device] )
+ {
+ snprintf(pci_filepath, MAX_PATH_LENGTH-1, "%s%s%s", PCI_ROOT_PATH, socket_bus[socketId], pci_devices_daemon[device].path);
FD_PCI[socketId][device] = open( pci_filepath, O_RDWR);
if ( FD_PCI[socketId][device] < 0)
{
- syslog(LOG_ERR, "Failed to open device file %s on socket %u", pci_filepath, socketId);
+ syslog(LOG_ERR, "Failed to open device file %s for device %s (%s) on socket %u", pci_filepath,
+ pci_types[pci_devices_daemon[device].type].name, pci_devices_daemon[device].name, socketId);
dRecord->errorcode = ERR_OPENFAIL;
return;
}
+ syslog(LOG_ERR, "Open device file %s for device %s (%s) on socket %u", pci_filepath,
+ pci_types[pci_devices_daemon[device].type].name, pci_devices_daemon[device].name, socketId);
}
- if ( pread(FD_PCI[socketId][device], &data, sizeof(data), reg) != sizeof(data))
+ if (FD_PCI[socketId][device] > 0 && pread(FD_PCI[socketId][device], &data, sizeof(data), reg) != sizeof(data))
{
- syslog(LOG_ERR, "Failed to read data from pci device file on socket %u device %u",
- socketId, device);
+ syslog(LOG_ERR, "Failed to read data from pci device file %s for device %s (%s) on socket %u",
+ pci_filepath,pci_types[pci_devices_daemon[device].type].name, pci_devices_daemon[device].name,socketId);
dRecord->errorcode = ERR_RWFAIL;
return;
}
@@ -450,40 +624,65 @@ static void pci_write(AccessDataRecord* dRecord)
uint32_t data = (uint32_t) dRecord->data;
dRecord->errorcode = ERR_NOERROR;
+
if (FD_PCI[socketId][device] == -2)
{
dRecord->errorcode = ERR_NODEV;
return;
}
- else if ( !FD_PCI[socketId][device] )
+
+ if (allowedPci)
+ {
+ if (!allowedPci(pci_devices_daemon[device].type, reg))
+ {
+ dRecord->errorcode = ERR_RESTREG;
+ return;
+ }
+ }
+
+ if ( !FD_PCI[socketId][device] )
{
- strncpy(pci_filepath, PCI_ROOT_PATH, 30);
- strncat(pci_filepath, socket_bus[socketId], 10);
- strncat(pci_filepath, pci_DevicePath[device], 20);
+ snprintf(pci_filepath, MAX_PATH_LENGTH-1, "%s%s%s", PCI_ROOT_PATH, socket_bus[socketId], pci_devices_daemon[device].path);
FD_PCI[socketId][device] = open( pci_filepath, O_RDWR);
if ( FD_PCI[socketId][device] < 0)
{
- syslog(LOG_ERR, "Failed to open device file %s on socket %u", pci_filepath, socketId);
+ syslog(LOG_ERR, "Failed to open device file %s for device %s (%s) on socket %u", pci_filepath,
+ pci_types[pci_devices_daemon[device].type].name, pci_devices_daemon[device].name, socketId);
dRecord->errorcode = ERR_OPENFAIL;
return;
}
+ syslog(LOG_ERR, "Open device file %s for device %s (%s) on socket %u", pci_filepath,
+ pci_types[pci_devices_daemon[device].type].name, pci_devices_daemon[device].name, socketId);
}
- if (pwrite(FD_PCI[socketId][device], &data, sizeof data, reg) != sizeof data)
+ if (FD_PCI[socketId][device] > 0 && pwrite(FD_PCI[socketId][device], &data, sizeof data, reg) != sizeof data)
{
- syslog(LOG_ERR, "Failed to write data to pci device file on socket %u", socketId);
+ syslog(LOG_ERR, "Failed to write data to pci device file %s for device %s (%s) on socket %u",pci_filepath,
+ pci_types[pci_devices_daemon[device].type].name, pci_devices_daemon[device].name, socketId);
dRecord->errorcode = ERR_RWFAIL;
return;
}
}
-static void kill_client(void)
+static void pci_check(AccessDataRecord* dRecord)
{
- syslog(LOG_NOTICE, "daemon dropped client");
+ uint32_t socketId = dRecord->cpu;
+ uint32_t device = dRecord->device;
+ dRecord->errorcode = ERR_NOERROR;
+ if (FD_PCI[socketId][device] == -2)
+ {
+ dRecord->errorcode = ERR_NODEV;
+ return;
+ }
+ return;
+}
+
+static void kill_client(void)
+{
if (connfd != -1)
{
CHECK_ERROR(close(connfd), socket close failed);
@@ -495,7 +694,13 @@ static void kill_client(void)
static void stop_daemon(void)
{
kill_client();
- syslog(LOG_NOTICE, "daemon exiting");
+ for (int i=0;i<MAX_NUM_NODES;i++)
+ {
+ if (socket_bus[i] != NULL)
+ {
+ free(socket_bus[i]);
+ }
+ }
if (sockfd != -1)
{
@@ -507,6 +712,41 @@ static void stop_daemon(void)
exit(EXIT_SUCCESS);
}
+int getBusFromSocket(const uint32_t socket)
+{
+ int cur_bus = 0;
+ uint32_t cur_socket = 0;
+ char pci_filepath[1024];
+ int fp;
+ int ret = 0;
+ while(cur_socket <= socket)
+ {
+ snprintf(pci_filepath, MAX_PATH_LENGTH-1, "%s%02x/05.0", PCI_ROOT_PATH, cur_bus);
+ fp = open(pci_filepath, O_RDONLY);
+ if (fp < 0)
+ {
+ return -1;
+ }
+ uint32_t cpubusno = 0;
+ ret = pread(fp, &cpubusno, sizeof(uint32_t), 0x108);
+ if (ret != sizeof(uint32_t))
+ {
+ close(fp);
+ return -1;
+ }
+ cur_bus = (cpubusno >> 8) & 0x0ff;
+ close(fp);
+ if(socket == cur_socket)
+ return cur_bus;
+ ++cur_socket;
+ ++cur_bus;
+ if(cur_bus > 0x0ff)
+ return -1;
+ }
+
+ return -1;
+}
+
static void Signal_Handler(int sig)
{
if (sig == SIGPIPE)
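
The new getBusFromSocket() helper above probes /proc/bus/pci/<bus>/05.0, reads the 32-bit word at config-space offset 0x108 and takes bits 8 to 15 as the bus number hosting the next socket's uncore devices. The resulting "<bus>/" string is later glued to PCI_ROOT_PATH and a device path from the per-model device table to open an uncore counter device, as pci_read()/pci_write() do. A stand-alone sketch of that second step; the bus "3f", the device function "10.0" (an IMC channel in the old device list above) and the register offset 0xA0 are placeholders, real values come from getBusFromSocket() and the device table:

    #include <fcntl.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    #define PCI_ROOT_PATH "/proc/bus/pci/"

    int main(void)
    {
        char path[80];
        uint32_t data = 0;
        const uint32_t reg = 0xA0;   /* placeholder counter register offset */

        snprintf(path, sizeof(path), "%s%s%s", PCI_ROOT_PATH, "3f/", "10.0");
        int fd = open(path, O_RDONLY);
        if (fd < 0) { perror(path); return 1; }
        /* as in pci_read(): 32-bit wide access, register offset equals file offset */
        if (pread(fd, &data, sizeof(data), reg) != sizeof(data)) {
            perror("pread"); close(fd); return 1;
        }
        printf("%s + 0x%" PRIx32 " = 0x%" PRIx32 "\n", path, reg, data);
        close(fd);
        return 0;
    }
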
@@ -516,7 +756,7 @@ static void Signal_Handler(int sig)
}
/* For SIGALRM we just return - we're just here to create a EINTR */
- if ((sig == SIGTERM))
+ if (sig == SIGTERM)
{
stop_daemon();
}
@@ -543,7 +783,7 @@ static void daemonize(int* parentPid)
/* If we got a good PID, then we can exit the parent process. */
if (pid > 0)
{
- exit(ERR_NOERROR);
+ exit(EXIT_SUCCESS);
}
/* At this point we are executing as the child process */
@@ -585,90 +825,107 @@ int main(void)
mode_t oldumask;
uint32_t numHWThreads = sysconf(_SC_NPROCESSORS_CONF);
uint32_t model;
- int isIntel = 1;
-
- if (!lock_check())
- {
- fprintf(stderr,"Access to performance counters is locked. Exiting!\n");
- exit(EXIT_FAILURE);
- }
-
- for ( uint32_t i=0; i < numHWThreads; i++ )
+ for (int i=0;i<MAX_NUM_THREADS;i++)
{
FD_MSR[i] = -1;
}
- uint32_t eax = 0x00;
- uint32_t ebx = 0x00;
-
- CPUID;
- if (ebx == 0x68747541U)
+ openlog(ident, 0, LOG_USER);
+
+ if (!lock_check())
{
- isIntel = 0;
+ syslog(LOG_ERR,"Access to performance counters is locked.\n");
+ stop_daemon();
}
- eax = 0x01;
- CPUID;
- uint32_t family = ((eax >> 8) & 0xFU) + ((eax >> 20) & 0xFFU);
- model = (((eax >> 16) & 0xFU) << 4) + ((eax >> 4) & 0xFU);
+ daemonize(&pid);
- switch (family)
{
- case P6_FAMILY:
- allowed = allowed_intel;
+ uint32_t eax = 0x00;
+ uint32_t ebx = 0x00;
+ uint32_t ecx = 0x00;
+ uint32_t edx = 0x00;
+ /*int isIntel = 1;
+ CPUID(eax, ebx, ecx, edx);
+ if (ebx == 0x68747541U)
+ {
+ isIntel = 0;
+ }*/
- if (isIntel && ((model == SANDYBRIDGE) ||
- (model == SANDYBRIDGE_EP) ||
- (model == IVYBRIDGE) ||
- (model == IVYBRIDGE_EP) ))
- {
- allowed = allowed_sandybridge;
- isPCIUncore = 1;
- }
- else if (isIntel && ((model == HASWELL) ||
- (model == HASWELL_M1) ||
- (model == HASWELL_M2) ||
- (model == HASWELL_EX)))
- {
- allowed = allowed_haswell;
- }
- else if (isIntel && (model == ATOM_SILVERMONT))
- {
- allowed = allowed_silvermont;
- }
- else if (isIntel && (model == WESTMERE_EX))
- {
- allowed = allowed_westmereEX;
- }
- break;
- case K8_FAMILY:
- case K10_FAMILY:
- if (!isIntel)
- {
+ eax = 0x01;
+ CPUID(eax, ebx, ecx, edx);
+ uint32_t family = ((eax >> 8) & 0xFU) + ((eax >> 20) & 0xFFU);
+ model = (((eax >> 16) & 0xFU) << 4) + ((eax >> 4) & 0xFU);
+
+ switch (family)
+ {
+ case P6_FAMILY:
+ allowed = allowed_intel;
+
+ if ((model == SANDYBRIDGE) || (model == IVYBRIDGE))
+ {
+ allowed = allowed_sandybridge;
+ }
+ else if ((model == SANDYBRIDGE_EP) || (model == IVYBRIDGE_EP))
+ {
+ allowed = allowed_sandybridge;
+ allowedPci = allowed_pci_sandybridge;
+ isPCIUncore = 1;
+ }
+ else if ((model == HASWELL) ||
+ (model == HASWELL_M1) ||
+ (model == HASWELL_M2) ||
+ (model == BROADWELL) ||
+ (model == SKYLAKE1) ||
+ (model == SKYLAKE2))
+ {
+ allowed = allowed_haswell;
+ }
+ else if (model == BROADWELL_D)
+ {
+ allowed = allowed_haswell;
+ isPCIUncore = 1;
+ allowedPci = allowed_pci_haswell;
+ }
+ else if (model == HASWELL_EP)
+ {
+ isPCIUncore = 1;
+ allowed = allowed_haswell;
+ allowedPci = allowed_pci_haswell;
+ }
+ else if (model == BROADWELL_E)
+ {
+ isPCIUncore = 1;
+ allowed = allowed_haswell;
+ allowedPci = allowed_pci_haswell;
+ }
+ else if ((model == ATOM_SILVERMONT_C) ||
+ (model == ATOM_SILVERMONT_E) ||
+ (model == ATOM_SILVERMONT_Z1) ||
+ (model == ATOM_SILVERMONT_Z2) ||
+ (model == ATOM_SILVERMONT_F) ||
+ (model == ATOM_SILVERMONT_AIR))
+ {
+ allowed = allowed_silvermont;
+ }
+ break;
+ case K8_FAMILY:
+ case K10_FAMILY:
allowed = allowed_amd;
- }
- break;
- case K15_FAMILY:
- if (!isIntel)
- {
+ break;
+ case K15_FAMILY:
allowed = allowed_amd15;
- }
- break;
- case K16_FAMILY:
- if (!isIntel)
- {
+ break;
+ case K16_FAMILY:
allowed = allowed_amd16;
- }
break;
- default:
- fprintf(stderr, "ERROR - [%s:%d] - Unsupported processor. Exiting!\n",
- __FILE__, __LINE__);
- exit(EXIT_FAILURE);
+ default:
+ syslog(LOG_ERR, "ERROR - [%s:%d] - Unsupported processor. Exiting! \n",
+ __FILE__, __LINE__);
+ exit(EXIT_FAILURE);
+ }
}
- openlog(ident, 0, LOG_USER);
- daemonize(&pid);
-
/* setup filename for socket */
filepath = (char*) calloc(sizeof(addr1.sun_path), 1);
snprintf(filepath, sizeof(addr1.sun_path), "/tmp/likwid-%d", pid);
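
Family and model are now derived inside main() from CPUID leaf 0x01: the family is bits 8 to 11 plus the extended family in bits 20 to 27, the model is the extended model in bits 16 to 19 shifted left by four plus bits 4 to 7. A worked example with a representative EAX value; 0x000306F2 is what a Haswell-EP core typically reports, and the stepping digit in the low nibble does not matter here:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t eax = 0x000306F2;   /* assumed CPUID.01H:EAX of a Haswell-EP core */
        uint32_t family = ((eax >> 8) & 0xFU) + ((eax >> 20) & 0xFFU);
        uint32_t model  = (((eax >> 16) & 0xFU) << 4) + ((eax >> 4) & 0xFU);
        assert(family == 0x6);    /* P6_FAMILY: the Intel branch of the switch is taken */
        assert(model == 0x3F);    /* HASWELL_EP: allowed_haswell plus PCI uncore support */
        return 0;
    }
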
@@ -691,10 +948,6 @@ int main(void)
EXIT_IF_ERROR(listen(sockfd, 1), listen failed);
EXIT_IF_ERROR(chmod(filepath, S_IRUSR|S_IWUSR), chmod failed);
- /* Restore the old umask and fs ids. */
- (void) umask(oldumask);
- CHECK_ERROR(setfsuid(geteuid()), setfsuid failed);
-
socklen = sizeof(addr1);
{ /* Init signal handler */
@@ -707,8 +960,6 @@ int main(void)
sigaction(SIGTERM, &sia, NULL);
}
- syslog(LOG_NOTICE, "daemon started");
-
/* setup an alarm to stop the daemon if there is no connect.*/
alarm(15U);
@@ -728,7 +979,10 @@ int main(void)
alarm(0);
CHECK_ERROR(unlink(filepath), unlink of socket failed);
- syslog(LOG_NOTICE, "daemon accepted client");
+
+ /* Restore the old umask and fs ids. */
+ (void) umask(oldumask);
+ CHECK_ERROR(setfsuid(geteuid()), setfsuid failed);
{
char* msr_file_name = (char*) malloc(MAX_PATH_LENGTH * sizeof(char));
@@ -737,75 +991,77 @@ int main(void)
* NOTICE: This assumes consecutive processor Ids! */
for ( uint32_t i=0; i < numHWThreads; i++ )
{
-#ifdef __MIC
- sprintf(msr_file_name,"/dev/msr%d",i);
- if (access(msr_file_name, F_OK))
- {
- sprintf(msr_file_name,"/dev/cpu/%d/msr",i);
- }
-#else
sprintf(msr_file_name,"/dev/cpu/%d/msr",i);
-#endif
FD_MSR[i] = open(msr_file_name, O_RDWR);
if ( FD_MSR[i] < 0 )
{
- syslog(LOG_ERR, "Failed to open device file %s.",msr_file_name);
- FD_MSR[i] = -2;
+ syslog(LOG_ERR, "Failed to open device file %s: %s, trying /dev/msr%d", msr_file_name, strerror(errno), i);
+ sprintf(msr_file_name,"/dev/msr%d",i);
+ FD_MSR[i] = open(msr_file_name, O_RDWR);
+ if ( FD_MSR[i] < 0 )
+ {
+ syslog(LOG_ERR, "Failed to open device file %s: %s.", msr_file_name, strerror(errno));
+ }
}
}
free(msr_file_name);
-
if (isPCIUncore)
{
- for (int j=0; j<MAX_NUM_NODES; j++)
- {
- socket_bus[j] = "N-A";
- for (int i=0; i<MAX_NUM_DEVICES; i++)
- {
- FD_PCI[j][i] = -2;
- }
- }
-
- /* determine PCI-BUSID mapping ... */
- FILE *fptr;
- char buf[1024];
- uint32_t testDevice;
- uint32_t sbus, sdevfn, svend;
int cntr = 0;
int socket_count = 0;
-
if (model == SANDYBRIDGE_EP)
{
- testDevice = 0x80863c44;
+ //testDevice = 0x80863c44;
+ pci_devices_daemon = sandybridgeEP_pci_devices;
}
else if (model == IVYBRIDGE_EP)
{
- testDevice = 0x80860e36;
+ //testDevice = 0x80860e36;
+ pci_devices_daemon = ivybridgeEP_pci_devices;
}
- else
+ else if (model == HASWELL_EP)
{
- testDevice = 0;
- syslog(LOG_NOTICE, "PCI Uncore not supported on this system");
+ //testDevice = 0x80862f30;
+ pci_devices_daemon = haswellEP_pci_devices;
}
-
- if ( ((fptr = fopen("/proc/bus/pci/devices", "r")) == NULL) || !testDevice)
+ else if (model == BROADWELL_D)
+ {
+ //testDevice = 0x80862f30;
+ pci_devices_daemon = broadwelld_pci_devices;
+ }
+ else if (model == BROADWELL_E)
{
- syslog(LOG_NOTICE, "Unable to open /proc/bus/pci/devices");
+ //testDevice = 0x80862f30;
+ pci_devices_daemon = broadwellEP_pci_devices;
}
else
{
- while( fgets(buf, sizeof(buf)-1, fptr) )
+ //testDevice = 0;
+ syslog(LOG_NOTICE, "PCI Uncore not supported on this system");
+ goto LOOP;
+ }
+
+ for (int j=0; j<MAX_NUM_NODES; j++)
+ {
+ socket_bus[j] = (char*)malloc(4);
+ sprintf(socket_bus[j], "N-A");
+ for (int i=0; i<MAX_NUM_PCI_DEVICES; i++)
{
- if ( sscanf(buf, "%2x%2x %8x", &sbus, &sdevfn, &svend) == 3 &&
- svend == testDevice )
- {
- socket_bus[cntr] = (char*)malloc(4);
- sprintf(socket_bus[cntr++], "%02x/", sbus);
- }
+ FD_PCI[j][i] = -2;
}
- fclose(fptr);
+ }
+
+ /* determine PCI-BUSID mapping ... */
+ int sbus = -1;
+ cntr = 0;
+ sbus = getBusFromSocket(cntr);
+ while (sbus != -1)
+ {
+ sprintf(socket_bus[cntr], "%02x/", sbus);
+ cntr++;
+ sbus = getBusFromSocket(cntr);
}
if ( cntr == 0 )
@@ -815,38 +1071,41 @@ int main(void)
else
{
socket_count = cntr;
-
+ int fd;
for (int j=0; j<socket_count; j++)
{
- for (int i=0; i<MAX_NUM_DEVICES; i++)
+ for (int i=1; i<MAX_NUM_PCI_DEVICES; i++)
{
- sprintf(pci_filepath, "%s%s%s",PCI_ROOT_PATH,socket_bus[j],pci_DevicePath[i]);
-
- if (!access(pci_filepath,F_OK))
- {
- FD_PCI[j][i] = 0;
- }
- else
+ if (pci_devices_daemon[i].path)
{
- syslog(LOG_NOTICE, "Device %s not found, excluded it from device list\n",pci_filepath);
+ sprintf(pci_filepath, "%s%s%s", PCI_ROOT_PATH, socket_bus[j], pci_devices_daemon[i].path);
+ fd = open(pci_filepath, O_RDWR);
+ if (fd > 0)
+ {
+ FD_PCI[j][i] = 0;
+ pci_devices_daemon[i].online = 1;
+ close(fd);
+ }
+ else if (j==0)
+ {
+ syslog(LOG_NOTICE, "Device %s for socket %d not found at path %s, excluded it from device list: %s\n",pci_devices_daemon[i].name,j, pci_filepath, strerror(errno));
+ }
}
}
}
}
}
}
-
+LOOP:
while (1)
{
ret = read(connfd, (void*) &dRecord, sizeof(AccessDataRecord));
if (ret < 0)
{
- syslog(LOG_ERR, "ERROR - [%s:%d] read from client failed - %s \n",
- __FILE__, __LINE__, strerror(errno));
stop_daemon();
}
- else if (ret == 0)
+ else if ((ret == 0) && (dRecord.type != DAEMON_EXIT))
{
syslog(LOG_ERR, "ERROR - [%s:%d] zero read", __FILE__, __LINE__);
stop_daemon();
@@ -860,7 +1119,7 @@ int main(void)
if (dRecord.type == DAEMON_READ)
{
- if (dRecord.device == DAEMON_AD_MSR)
+ if (dRecord.device == MSR_DEV)
{
msr_read(&dRecord);
}
@@ -871,7 +1130,7 @@ int main(void)
}
else if (dRecord.type == DAEMON_WRITE)
{
- if (dRecord.device == DAEMON_AD_MSR)
+ if (dRecord.device == MSR_DEV)
{
msr_write(&dRecord);
dRecord.data = 0x0ULL;
@@ -882,6 +1141,17 @@ int main(void)
dRecord.data = 0x0ULL;
}
}
+ else if (dRecord.type == DAEMON_CHECK)
+ {
+ if (dRecord.device == MSR_DEV)
+ {
+ msr_check(&dRecord);
+ }
+ else
+ {
+ pci_check(&dRecord);
+ }
+ }
else if (dRecord.type == DAEMON_EXIT)
{
stop_daemon();
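
The request loop above dispatches on dRecord.type, and the new DAEMON_CHECK request lets a client ask whether the daemon managed to open the MSR or PCI device file for a core before issuing any reads. A rough client-side sketch of one such round trip over the already connected UNIX socket; the AccessDataRecord layout and the DAEMON_*/MSR_DEV/ERR_* constants come from the likwid headers, only the field names visible in the daemon code are used, and the daemon's reply write sits in unchanged code outside these hunks (the old accessClient.c further down shows the same write-then-read pattern):

    #include <stdint.h>
    #include <unistd.h>
    /* assumes the likwid headers defining AccessDataRecord, DAEMON_CHECK, MSR_DEV, ERR_NOERROR */

    static int daemon_check_msr(int socket_fd, uint32_t cpu)
    {
        AccessDataRecord rec = {0};
        rec.type   = DAEMON_CHECK;   /* new request type handled above */
        rec.device = MSR_DEV;        /* probe the MSR device rather than a PCI uncore device */
        rec.cpu    = cpu;
        if (write(socket_fd, &rec, sizeof(rec)) != sizeof(rec)) return -1;
        if (read(socket_fd, &rec, sizeof(rec)) != sizeof(rec)) return -1;
        return rec.errorcode;        /* ERR_NOERROR if /dev/cpu/<cpu>/msr could be opened */
    }
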
diff --git a/src/access-daemon/setFreq.c b/src/access-daemon/setFreq.c
index 967dbbf..6802449 100644
--- a/src/access-daemon/setFreq.c
+++ b/src/access-daemon/setFreq.c
@@ -1,18 +1,18 @@
/*
* =======================================================================================
- *
- * Filename: setFreq.c
- *
- * Description: Wrapper for accessing setfreq kernel FS files
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Authors: Michael Meier, michael.meier at rrze.fau.de
- * Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
+ *
+ * Filename: setFreq.c
+ *
+ * Description: Implementation of frequency daemon
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Authors: Jan Treibig (jt), jan.treibig at gmail.com,
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -28,103 +28,168 @@
*
* =======================================================================================
*/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-static int get_numCPUs()
-{
- int cpucount = 0;
- char line[1024];
- FILE* fp = fopen("/proc/cpuinfo","r");
- if (fp != NULL)
- {
- while( fgets(line,1024,fp) )
- {
- if (strncmp(line, "processor", 9) == 0)
- {
- cpucount++;
- }
- }
- }
- return cpucount;
-}
-
-int main (int argn, char** argv)
-{
- int cpuid;
- int freq;
- int numCPUs = 0;
- char* gov;
- char* gpath = malloc(100);
- char* fpath = malloc(100);
- FILE* f;
-
- if (argn < 3 || argn > 4)
- {
- fprintf(stderr, "Usage: %s <processorID> <frequency> [<governor>] \n",argv[0]);
- exit(EXIT_FAILURE);
- }
-
- cpuid = atoi(argv[1]);
- numCPUs = get_numCPUs();
- if (cpuid < 0 || cpuid > numCPUs)
- {
- fprintf(stderr, "CPU %d not a valid CPU ID. Range from 0 to %d.\n",cpuid,numCPUs);
- exit(EXIT_FAILURE);
- }
- freq = atoi(argv[2]);
- if (freq < 0)
- {
- fprintf(stderr, "Frequency must be greater than 0.\n");
- exit(EXIT_FAILURE);
- }
- snprintf(gpath, 60, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", cpuid);
- snprintf(fpath, 60, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_setspeed", cpuid);
-
- if (argn == 4)
- {
- gov = argv[3];
-
- if ((strncmp(gov,"ondemand",12)) && (strncmp(gov,"performance",12)))
- {
- fprintf(stderr, "Invalid governor %s!\n",gov);
- return (EXIT_FAILURE);
- }
-
- f = fopen(gpath, "w");
- if (f == NULL)
- {
- fprintf(stderr, "Unable to open path for writing\n");
- return (EXIT_FAILURE);
- }
- fprintf(f,"%s",gov);
- fclose(f);
- return(EXIT_SUCCESS);
- }
- else
- {
- f = fopen(gpath, "w");
- if (f == NULL)
- {
- fprintf(stderr, "Unable to open path for writing\n");
- return (EXIT_FAILURE);
- }
- fprintf(f,"userspace");
- fclose(f);
- }
-
- f = fopen(fpath, "w");
- if (f == NULL)
- {
- fprintf(stderr, "Unable to open path for writing\n");
- return (EXIT_FAILURE);
- }
- fprintf(f,"%d",freq);
- fclose(f);
-
- return(EXIT_SUCCESS);
-}
-
-
+/* ##### HEADER FILE INCLUDES ######################################### */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+char setfiles[3][100] = {"scaling_min_freq", "scaling_max_freq", "scaling_setspeed"};
+char getfiles[3][100] = {"cpuinfo_min_freq", "cpuinfo_max_freq", "cpuinfo_cur_freq"};
+
+/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
+static int get_numCPUs()
+{
+ int cpucount = 0;
+ char line[1024];
+ FILE* fp = fopen("/proc/cpuinfo","r");
+ if (fp != NULL)
+ {
+ while( fgets(line,1024,fp) )
+ {
+ if (strncmp(line, "processor", 9) == 0)
+ {
+ cpucount++;
+ }
+ }
+ }
+ return cpucount;
+}
+
+/* ##### MAIN FUNCTION DEFINITION ################## */
+int main (int argn, char** argv)
+{
+ int i = 0;
+ int tmp;
+ int cpuid;
+ int freq = 0;
+ int numCPUs = 0;
+ char* gov;
+ char* gpath = malloc(100);
+ char* fpath = malloc(100);
+
+ if (argn < 3 || argn > 4)
+ {
+ fprintf(stderr, "Usage: %s <processorID> <frequency> [<governor>] \n",argv[0]);
+ free(gpath);
+ free(fpath);
+ exit(EXIT_FAILURE);
+ }
+
+ cpuid = atoi(argv[1]);
+ numCPUs = get_numCPUs();
+ if (cpuid < 0 || cpuid > numCPUs)
+ {
+ fprintf(stderr, "CPU %d not a valid CPU ID. Range from 0 to %d.\n", cpuid, numCPUs);
+ free(gpath);
+ free(fpath);
+ exit(EXIT_FAILURE);
+ }
+ freq = atoi(argv[2]);
+ if (freq <= 0)
+ {
+ fprintf(stderr, "Frequency must be greater than 0.\n");
+ free(gpath);
+ free(fpath);
+ exit(EXIT_FAILURE);
+ }
+
+ if (argn == 4)
+ {
+ FILE* f;
+ gov = argv[3];
+
+ if ((strncmp(gov,"ondemand",8) != 0) &&
+ (strncmp(gov,"performance",11) != 0) &&
+ (strncmp(gov,"conservative",12) != 0) &&
+ (strncmp(gov,"powersave",9) != 0)) {
+ fprintf(stderr, "Invalid governor %s!\n",gov);
+ free(gpath);
+ free(fpath);
+ return (EXIT_FAILURE);
+ }
+
+ for (i=0; i<2; i++)
+ {
+ snprintf(fpath, 99, "/sys/devices/system/cpu/cpu%d/cpufreq/%s", cpuid, getfiles[i]);
+ f = fopen(fpath, "r");
+ if (f == NULL) {
+ fprintf(stderr, "Unable to open path %s for writing\n", fpath);
+ free(gpath);
+ free(fpath);
+ return (EXIT_FAILURE);
+ }
+ tmp = fread(fpath, 100, sizeof(char), f);
+ freq = atoi(fpath);
+ fclose(f);
+ snprintf(fpath, 99, "/sys/devices/system/cpu/cpu%d/cpufreq/%s", cpuid, setfiles[i]);
+ f = fopen(fpath, "w");
+ if (f == NULL) {
+ fprintf(stderr, "Unable to open path %s for writing\n",fpath);
+ free(gpath);
+ free(fpath);
+ return (EXIT_FAILURE);
+ }
+ fprintf(f,"%d",freq);
+ fclose(f);
+
+ }
+ snprintf(gpath, 99, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", cpuid);
+
+ f = fopen(gpath, "w");
+ if (f == NULL) {
+ fprintf(stderr, "Unable to open path %s for writing\n", gpath);
+ free(gpath);
+ free(fpath);
+ return (EXIT_FAILURE);
+ }
+ fprintf(f,"%s",gov);
+ fclose(f);
+ free(gpath);
+ free(fpath);
+ return(EXIT_SUCCESS);
+ }
+
+ snprintf(gpath, 99, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", cpuid);
+
+ FILE* f = fopen(gpath, "w");
+ if (f == NULL) {
+ fprintf(stderr, "Unable to open path %s for writing\n", gpath);
+ free(gpath);
+ free(fpath);
+ return (EXIT_FAILURE);
+ }
+ if ((argn == 4) &&
+ ((strncmp(argv[3],"ondemand",8) == 0) ||
+ (strncmp(argv[3],"performance",11) == 0) ||
+ (strncmp(argv[3],"conservative",12) == 0) ||
+ (strncmp(argv[3],"powersave",9) == 0)))
+ {
+ fprintf(f, "%s", argv[3]);
+ tmp = 1;
+ }
+ else
+ {
+ fprintf(f, "%s", "userspace");
+ tmp = 3;
+ }
+ fclose(f);
+
+ for (i=0;i<tmp;i++)
+ {
+ snprintf(fpath, 99, "/sys/devices/system/cpu/cpu%d/cpufreq/%s", cpuid, setfiles[i]);
+ f = fopen(fpath, "w");
+ if (f == NULL) {
+ fprintf(stderr, "Unable to open path %s for writing\n",fpath);
+ free(gpath);
+ free(fpath);
+ return (EXIT_FAILURE);
+ }
+ fprintf(f,"%d",freq);
+ fclose(f);
+ }
+ free(gpath);
+ free(fpath);
+ return(EXIT_SUCCESS);
+}
+
+
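
The rewritten likwid-setFreq helper now works on three writable cpufreq files per core (scaling_min_freq, scaling_max_freq, scaling_setspeed) plus scaling_governor, and when a governor is requested it first copies the hardware limits from cpuinfo_min_freq and cpuinfo_max_freq into the scaling limits before switching the governor. A reduced sketch of the underlying write pattern; the path and the 2500000 kHz value are placeholders:

    #include <stdio.h>

    /* write one integer into a cpufreq sysfs file, as the loops above do per entry of setfiles[] */
    static int write_sysfs_int(const char* path, int value)
    {
        FILE* f = fopen(path, "w");
        if (f == NULL) {
            fprintf(stderr, "Unable to open path %s for writing\n", path);
            return -1;
        }
        fprintf(f, "%d", value);
        fclose(f);
        return 0;
    }

    int main(void)
    {
        /* e.g. pin core 3 to 2.5 GHz through the userspace governor's setspeed knob */
        return write_sysfs_int("/sys/devices/system/cpu/cpu3/cpufreq/scaling_setspeed", 2500000);
    }
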
diff --git a/src/access.c b/src/access.c
new file mode 100644
index 0000000..1102909
--- /dev/null
+++ b/src/access.c
@@ -0,0 +1,221 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: access.c
+ *
+ * Description: Interface for the different register access modules.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <pthread.h>
+
+#include <types.h>
+#include <error.h>
+#include <topology.h>
+#include <configuration.h>
+#include <perfmon.h>
+#include <registers.h>
+#include <access.h>
+#include <access_client.h>
+#include <access_x86.h>
+
+
+
+static int registeredCpus = 0;
+static int registeredCpuList[MAX_NUM_THREADS] = { [0 ... (MAX_NUM_THREADS-1)] = 0 };
+
+
+static int (*access_read)(PciDeviceIndex dev, const int cpu, uint32_t reg, uint64_t *data) = NULL;
+static int (*access_write)(PciDeviceIndex dev, const int cpu, uint32_t reg, uint64_t data) = NULL;
+static int (*access_init) (int cpu_id) = NULL;
+static void (*access_finalize) (int cpu_id) = NULL;
+static int (*access_check) (PciDeviceIndex dev, int cpu_id) = NULL;
+
+void HPMmode(int mode)
+{
+ if ((mode == ACCESSMODE_DIRECT) || (mode == ACCESSMODE_DAEMON))
+ {
+ config.daemonMode = mode;
+ }
+}
+
+int HPMinit(void)
+{
+ int ret = 0;
+ if (access_init == NULL)
+ {
+#if defined(__x86_64__) || defined(__i386__)
+ if (config.daemonMode == -1)
+ {
+ config.daemonMode = ACCESSMODE_DAEMON;
+ }
+ if (config.daemonMode == ACCESSMODE_DAEMON)
+ {
+ DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, Adjusting functions for x86 architecture in daemon mode);
+ access_init = &access_client_init;
+ access_read = &access_client_read;
+ access_write = &access_client_write;
+ access_finalize = &access_client_finalize;
+ access_check = &access_client_check;
+ }
+ else if (config.daemonMode == ACCESSMODE_DIRECT)
+ {
+ DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, Adjusting functions for x86 architecture in direct mode);
+ access_init = &access_x86_init;
+ access_read = &access_x86_read;
+ access_write = &access_x86_write;
+ access_finalize = &access_x86_finalize;
+ access_check = &access_x86_check;
+ }
+#endif
+ }
+
+ return 0;
+}
+
+
+int HPMinitialized(void)
+{
+ return registeredCpus;
+}
+
+int HPMaddThread(int cpu_id)
+{
+ int ret;
+ if (registeredCpuList[cpu_id] == 0)
+ {
+ if (access_init != NULL)
+ {
+ ret = access_init(cpu_id);
+ if (ret == 0)
+ {
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Adding CPU %d to access module, cpu_id);
+ registeredCpus++;
+ registeredCpuList[cpu_id] = 1;
+ }
+ else
+ {
+ return ret;
+ }
+ }
+ else
+ {
+ return -ENODEV;
+ }
+ }
+ return 0;
+}
+
+void HPMfinalize()
+{
+ if (registeredCpus != 0)
+ {
+ for (int i=0; i<cpuid_topology.numHWThreads; i++)
+ {
+ if (i >= cpuid_topology.numHWThreads)
+ {
+ break;
+ }
+ if (registeredCpuList[i] == 1)
+ {
+ access_finalize(i);
+ registeredCpus--;
+ registeredCpuList[i] = 0;
+ }
+ }
+ }
+ if (access_init != NULL)
+ access_init = NULL;
+ if (access_finalize != NULL)
+ access_finalize = NULL;
+ if (access_read != NULL)
+ access_read = NULL;
+ if (access_write != NULL)
+ access_write = NULL;
+ if (access_check != NULL)
+ access_check = NULL;
+ return;
+}
+
+int HPMread(int cpu_id, PciDeviceIndex dev, uint32_t reg, uint64_t* data)
+{
+ uint64_t tmp = 0x0ULL;
+ *data = 0x0ULL;
+ int err = 0;
+ if ((dev >= MAX_NUM_PCI_DEVICES) || (data == NULL))
+ {
+ return -EFAULT;
+ }
+ if ((cpu_id < 0) || (cpu_id >= cpuid_topology.numHWThreads))
+ {
+ return -ERANGE;
+ }
+ if (registeredCpuList[cpu_id] == 0)
+ {
+ return -ENODEV;
+ }
+ err = access_read(dev, cpu_id, reg, &tmp);
+ *data = tmp;
+ return err;
+}
+
+int HPMwrite(int cpu_id, PciDeviceIndex dev, uint32_t reg, uint64_t data)
+{
+ int err = 0;
+ if (dev >= MAX_NUM_PCI_DEVICES)
+ {
+ return -EFAULT;
+ }
+ if ((cpu_id < 0) || (cpu_id >= cpuid_topology.numHWThreads))
+ {
+ ERROR_PRINT(MSR WRITE CPU %d OUT OF RANGE, cpu_id);
+ return -ERANGE;
+ }
+ if (registeredCpuList[cpu_id] == 0)
+ {
+ return -ENODEV;
+ }
+ err = access_write(dev, cpu_id, reg, data);
+ return err;
+}
+
+int HPMcheck(PciDeviceIndex dev, int cpu_id)
+{
+ if (registeredCpuList[cpu_id] == 0)
+ {
+ return -ENODEV;
+ }
+ return access_check(dev, cpu_id);
+}
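
The new access.c is a thin dispatch layer: HPMmode() selects direct or daemon access, HPMinit() wires up the access_x86_* or access_client_* function pointers, HPMaddThread() registers a core with the chosen backend, and HPMread()/HPMwrite() route every register access through it. A rough usage sketch; the ACCESSMODE_* and MSR_DEV constants and the declarations come from the likwid headers, 0x1A0 is only an example MSR, and error handling is reduced to return codes:

    #include <stdint.h>
    #include <stdio.h>
    /* assumes the likwid-internal headers declaring the HPM* functions, MSR_DEV and ACCESSMODE_* */

    int read_one_msr(void)
    {
        uint64_t data = 0;

        HPMmode(ACCESSMODE_DIRECT);           /* or ACCESSMODE_DAEMON to go through likwid-accessD */
        if (HPMinit() != 0) return -1;        /* picks the access_x86_* or access_client_* backend */
        if (HPMaddThread(0) != 0) return -1;  /* register core 0 with the selected backend */

        if (HPMread(0, MSR_DEV, 0x1A0, &data) == 0)
            printf("MSR 0x1A0 on core 0: 0x%llx\n", (unsigned long long)data);

        HPMfinalize();
        return 0;
    }
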
diff --git a/src/accessClient.c b/src/accessClient.c
deleted file mode 100644
index ba4cb59..0000000
--- a/src/accessClient.c
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: accessClient.c
- *
- * Description: Implementation of client to the access daemon.
- * Provides API to read and write values to MSR or
- * PCI Cfg Adresses. This module is used by the
- * msr and pci modules.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-
-/* ##### HEADER FILE INCLUDES ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <fcntl.h>
-#include <string.h>
-#include <unistd.h>
-#include <signal.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-
-#include <types.h>
-#include <error.h>
-#include <cpuid.h>
-#include <accessClient.h>
-
-int accessClient_mode = ACCESSMODE;
-
-/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
-#define STRINGIFY(x) #x
-#define TOSTRING(x) STRINGIFY(x)
-
-/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
-
-/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
-static char* accessClient_strerror(AccessErrorType det)
-{
- switch (det)
- {
- case ERR_NOERROR: return "No error";
- case ERR_UNKNOWN: return "unknown command";
- case ERR_RESTREG: return "access to this register is not allowed";
- case ERR_OPENFAIL: return "failed to open device file";
- case ERR_RWFAIL: return "failed to read/write register";
- case ERR_DAEMONBUSY: return "daemon already has a same/higher priority client";
- case ERR_LOCKED: return "access to HPM is locked";
- case ERR_UNSUPPORTED: return "unsupported processor";
- case ERR_NODEV: return "no such device";
- default: return "UNKNOWN errorcode";
- }
-}
-
-static int startDaemon(void)
-{
- /* Check the function of the daemon here */
- char* filepath;
- char *newargv[] = { NULL };
- char *newenv[] = { NULL };
- char *exeprog = TOSTRING(ACCESSDAEMON);
- struct sockaddr_un address;
- size_t address_length;
- int ret;
- pid_t pid;
- int timeout = 1000;
- int socket_fd = -1;
-
- if (accessClient_mode == DAEMON_AM_ACCESS_D)
- {
- if (access(exeprog, F_OK))
- {
- fprintf(stderr, "Daemon '%s' cannot be found\n", exeprog);
- exit(EXIT_FAILURE);
- }
- if (access(exeprog, X_OK))
- {
- fprintf(stderr, "Daemon '%s' not executable\n", exeprog);
- exit(EXIT_FAILURE);
- }
- pid = fork();
-
- if (pid == 0)
- {
- ret = execve (exeprog, newargv, newenv);
- ERRNO_PRINT;
- fprintf(stderr, "Failed to execute the daemon '%s' (see error above)\n", exeprog);
- exit(EXIT_FAILURE);
- }
- else if (pid < 0)
- {
- ERROR_PLAIN_PRINT(Failed to fork);
- }
- }
-
- EXIT_IF_ERROR(socket_fd = socket(AF_LOCAL, SOCK_STREAM, 0), socket() failed);
-
- address.sun_family = AF_LOCAL;
- address_length = sizeof(address);
- snprintf(address.sun_path, sizeof(address.sun_path), "/tmp/likwid-%d", pid);
- filepath = strdup(address.sun_path);
- DEBUG_PRINT(0, "%ssocket pathname is %s\n",
- ((accessClient_mode == DAEMON_AM_ACCESS_D) ? "Generated " : ""),
- filepath);
-
- while (timeout > 0)
- {
- int res;
- usleep(1000);
- res = connect(socket_fd, (struct sockaddr *) &address, address_length);
-
- if (res == 0)
- {
- break;
- }
-
- timeout--;
- DEBUG_PRINT(1, "%s\n", "Still waiting for socket...");
- }
-
- if (timeout <= 0)
- {
- ERRNO_PRINT; /* should hopefully still work, as we make no syscalls in between. */
- fprintf(stderr, "Exiting due to timeout: The socket file at '%s' \
- could not be opened within 10 seconds.\n", filepath);
- fprintf(stderr, "Consult the error message above this to find out why.\n");
- fprintf(stderr, "If the error is 'no such file or directoy', \
- it usually means that likwid-accessD just failed to start.\n");
- fprintf(stderr, "In case the daemon itself output an error', \
- ignore this.\n");
- exit(EXIT_FAILURE);
- }
-
- DEBUG_PRINT(0, "%s\n", "Successfully opened socket to daemon.");
- free(filepath);
-
- return socket_fd;
-}
-
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-
-void accessClient_setaccessmode(int mode)
-{
- if ((accessClient_mode > DAEMON_AM_ACCESS_D) || (accessClient_mode < DAEMON_AM_DIRECT))
- {
- fprintf(stderr, "Invalid accessmode %d\n", accessClient_mode);
- exit(EXIT_FAILURE);
- }
-
- accessClient_mode = mode;
-}
-
-void accessClient_init(int* socket_fd)
-{
- if ((accessClient_mode == DAEMON_AM_ACCESS_D))
- {
- (*socket_fd) = startDaemon();
- }
-}
-
-void accessClient_finalize(int socket_fd)
-{
- if ( socket_fd != -1 )
- { /* Only if a socket is actually open */
- AccessDataRecord data;
- data.type = DAEMON_EXIT;
- CHECK_ERROR(write(socket_fd, &data, sizeof(AccessDataRecord)),socket write failed);
- CHECK_ERROR(close(socket_fd),socket close failed);
- }
-}
-
-
-uint64_t accessClient_read(
- int socket_fd,
- const int cpu,
- const int device,
- uint32_t reg)
-{
- AccessDataRecord data;
-
- data.cpu = cpu;
- data.reg = reg;
- data.data = 0x00;
- data.type = DAEMON_READ;
- data.device = device;
-
- CHECK_ERROR(write(socket_fd, &data, sizeof(AccessDataRecord)), socket write failed);
- CHECK_ERROR(read(socket_fd, &data, sizeof(AccessDataRecord)), socket read failed);
-
- if (data.errorcode != ERR_NOERROR)
- {
- fprintf(stderr, "Failed to read data through daemon: "
- "daemon returned error %d '%s' for cpu %d reg 0x%x\n",
- data.errorcode, accessClient_strerror(data.errorcode), cpu, reg);
- //exit(EXIT_FAILURE);
- }
-
- return data.data;
-}
-
-void accessClient_write(
- int socket_fd,
- const int cpu,
- const int device,
- uint32_t reg,
- uint64_t sdata)
-{
- AccessDataRecord data;
-
- data.cpu = cpu;
- data.reg = reg;
- data.data = sdata;
- data.type = DAEMON_WRITE;
- data.device = device;
- CHECK_ERROR(write(socket_fd, &data, sizeof(AccessDataRecord)), socket write failed);
- CHECK_ERROR(read(socket_fd, &data, sizeof(AccessDataRecord)), socket read failed);
-
- if (data.errorcode != ERR_NOERROR)
- {
- fprintf(stderr, "Failed to write data through daemon: "
- "daemon returned error %d '%s' for cpu %d reg 0x%x\n",
- data.errorcode, accessClient_strerror(data.errorcode), cpu, reg);
- //exit(EXIT_FAILURE);
- }
-
- if (data.data != 0x00ULL)
- {
- ERROR_PLAIN_PRINT(daemon write failed);
- }
-}
-
-
diff --git a/src/access_client.c b/src/access_client.c
new file mode 100644
index 0000000..93623f0
--- /dev/null
+++ b/src/access_client.c
@@ -0,0 +1,343 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <pthread.h>
+
+#include <types.h>
+#include <error.h>
+#include <topology.h>
+#include <access.h>
+#include <access_client.h>
+#include <configuration.h>
+#include <affinity.h>
+
+/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
+static int globalSocket = -1;
+static int cpuSockets_open = 0;
+static int cpuSockets[MAX_NUM_THREADS] = { [0 ... MAX_NUM_THREADS-1] = -1};
+static pthread_mutex_t globalLock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t cpuLocks[MAX_NUM_THREADS] = { [0 ... MAX_NUM_THREADS-1] = PTHREAD_MUTEX_INITIALIZER };
+
+/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
+static char*
+access_client_strerror(AccessErrorType det)
+{
+ switch (det)
+ {
+ case ERR_NOERROR: return "No error";
+ case ERR_UNKNOWN: return "unknown command";
+ case ERR_RESTREG: return "access to this register is not allowed";
+ case ERR_OPENFAIL: return "failed to open device file";
+ case ERR_RWFAIL: return "failed to read/write register";
+ case ERR_DAEMONBUSY: return "daemon already has a same/higher priority client";
+ case ERR_NODEV: return "no such pci device";
+ default: return "UNKNOWN errorcode";
+ }
+}
+
+static int
+access_client_errno(AccessErrorType det)
+{
+ switch (det)
+ {
+ case ERR_NOERROR: return 0;
+ case ERR_UNKNOWN: return -EFAULT;
+ case ERR_RESTREG: return -EPERM;
+ case ERR_OPENFAIL: return -ENXIO;
+ case ERR_RWFAIL: return -EIO;
+ case ERR_DAEMONBUSY: return -EBUSY;
+ case ERR_NODEV: return -ENODEV;
+ default: return -EFAULT;
+ }
+}
+
+static int
+access_client_startDaemon(int cpu_id)
+{
+ /* Check the function of the daemon here */
+ char* filepath;
+ char *newargv[] = { NULL };
+ char *newenv[] = { NULL };
+ char *safeexeprog = TOSTRING(ACCESSDAEMON);
+ char exeprog[1024];
+ struct sockaddr_un address;
+ size_t address_length;
+ int ret;
+ pid_t pid;
+ int timeout = 1000;
+ int socket_fd = -1;
+
+ if (config.daemonPath != NULL)
+ {
+ strcpy(exeprog, config.daemonPath);
+ }
+ else
+ {
+ strcpy(exeprog, safeexeprog);
+ }
+
+ if (access(exeprog, X_OK))
+ {
+ ERROR_PRINT(Failed to find the daemon '%s'\n, exeprog);
+ exit(EXIT_FAILURE);
+ }
+
+ pid = fork();
+
+ if (pid == 0)
+ {
+ if (cpu_id >= 0)
+ {
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+ CPU_SET(cpu_id, &cpuset);
+ sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
+ }
+ ret = execve (exeprog, newargv, newenv);
+
+ if (ret < 0)
+ {
+ //ERRNO_PRINT;
+ ERROR_PRINT(Failed to execute the daemon '%s'\n, exeprog);
+ exit(EXIT_FAILURE);
+ }
+ }
+ else if (pid < 0)
+ {
+ ERROR_PLAIN_PRINT(Failed to fork);
+ }
+
+ EXIT_IF_ERROR(socket_fd = socket(AF_LOCAL, SOCK_STREAM, 0), socket() failed);
+
+ address.sun_family = AF_LOCAL;
+ address_length = sizeof(address);
+ snprintf(address.sun_path, sizeof(address.sun_path), "/tmp/likwid-%d", pid);
+ filepath = strdup(address.sun_path);
+
+ while (timeout > 0)
+ {
+ int res;
+ usleep(1000);
+ res = connect(socket_fd, (struct sockaddr *) &address, address_length);
+
+ if (res == 0)
+ {
+ break;
+ }
+
+ timeout--;
+ DEBUG_PRINT(DEBUGLEV_INFO, Still waiting for socket %s ..., filepath);
+ }
+
+ if (timeout <= 0)
+ {
+ ERRNO_PRINT; /* should hopefully still work, as we make no syscalls in between. */
+ fprintf(stderr, "Exiting due to timeout: The socket file at '%s' \
+ could not be opened within 10 seconds.\n", filepath);
+ fprintf(stderr, "Consult the error message above this to find out why.\n");
+ fprintf(stderr, "If the error is 'no such file or directoy', \
+ it usually means that likwid-accessD just failed to start.\n");
+ exit(EXIT_FAILURE);
+ }
+ DEBUG_PRINT(DEBUGLEV_INFO, Successfully opened socket %s to daemon for CPU %d, filepath, cpu_id);
+ free(filepath);
+
+ return socket_fd;
+}
+
+/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
+
+int access_client_init(int cpu_id)
+{
+ int ret = 0;
+ if (cpuSockets[cpu_id] < 0)
+ {
+ pthread_mutex_lock(&cpuLocks[cpu_id]);
+ cpuSockets[cpu_id] = access_client_startDaemon(cpu_id);
+ cpuSockets_open++;
+ pthread_mutex_unlock(&cpuLocks[cpu_id]);
+ if (globalSocket == -1)
+ {
+ pthread_mutex_lock(&globalLock);
+ globalSocket = cpuSockets[cpu_id];
+ pthread_mutex_unlock(&globalLock);
+ }
+ }
+ return ret;
+}
+
+int access_client_read(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t *data)
+{
+ int ret;
+ int socket = globalSocket;
+ pthread_mutex_t* lockptr = &globalLock;
+ AccessDataRecord record;
+ record.cpu = cpu_id;
+ record.device = MSR_DEV;
+
+ if (cpuSockets_open == 0)
+ {
+ return -ENOENT;
+ }
+
+ if ((cpuSockets[cpu_id] >= 0) && (cpuSockets[cpu_id] != globalSocket))
+ {
+ socket = cpuSockets[cpu_id];
+ lockptr = &cpuLocks[cpu_id];
+ }
+
+ if (dev != MSR_DEV)
+ {
+ record.cpu = affinity_core2node_lookup[cpu_id];
+ record.device = dev;
+ }
+ if (socket != -1)
+ {
+ record.reg = reg;
+ record.data = 0x00;
+ record.type = DAEMON_READ;
+
+ pthread_mutex_lock(lockptr);
+ CHECK_ERROR(write(socket, &record, sizeof(AccessDataRecord)), socket write failed);
+ CHECK_ERROR(read(socket, &record, sizeof(AccessDataRecord)), socket read failed);
+ *data = record.data;
+ pthread_mutex_unlock(lockptr);
+
+ if (record.errorcode != ERR_NOERROR)
+ {
+ if (dev == MSR_DEV)
+ {
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Got error '%s' from access daemon reading reg 0x%X at CPU %d,
+ access_client_strerror(record.errorcode), reg, cpu_id);
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Got error '%s' from access daemon reading reg 0x%X on socket %d,
+ access_client_strerror(record.errorcode), reg, cpu_id);
+ }
+ *data = 0;
+ return access_client_errno(record.errorcode);
+ }
+ }
+ else
+ {
+ *data = 0;
+ return -EBADFD;
+ }
+ return 0;
+}
+
+int access_client_write(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t data)
+{
+ int socket = globalSocket;
+ int ret;
+ AccessDataRecord record;
+ record.cpu = cpu_id;
+ record.device = MSR_DEV;
+ pthread_mutex_t* lockptr = &globalLock;
+
+ if (cpuSockets_open == 0)
+ {
+ return -ENOENT;
+ }
+
+ if ((cpuSockets[cpu_id] >= 0) && (cpuSockets[cpu_id] != socket))
+ {
+ socket = cpuSockets[cpu_id];
+ lockptr = &cpuLocks[cpu_id];
+ }
+
+ if (dev != MSR_DEV)
+ {
+ record.cpu = affinity_core2node_lookup[cpu_id];
+ record.device = dev;
+ }
+ if (socket != -1)
+ {
+ record.reg = reg;
+ record.data = data;
+ record.type = DAEMON_WRITE;
+
+ pthread_mutex_lock(lockptr);
+ CHECK_ERROR(write(socket, &record, sizeof(AccessDataRecord)), socket write failed);
+ CHECK_ERROR(read(socket, &record, sizeof(AccessDataRecord)), socket read failed);
+ pthread_mutex_unlock(lockptr);
+
+ if (record.errorcode != ERR_NOERROR)
+ {
+ if (dev == MSR_DEV)
+ {
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Got error '%s' from access daemon writing reg 0x%X at CPU %d,
+ access_client_strerror(record.errorcode), reg, cpu_id);
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Got error '%s' from access daemon writing reg 0x%X on socket %d,
+ access_client_strerror(record.errorcode), reg, cpu_id);
+ }
+ return access_client_errno(record.errorcode);
+ }
+ }
+ else
+ {
+ return -EBADFD;
+ }
+ return 0;
+}
+
+void access_client_finalize(int cpu_id)
+{
+ AccessDataRecord record;
+ if (cpuSockets[cpu_id] > 0)
+ {
+ record.type = DAEMON_EXIT;
+ CHECK_ERROR(write(cpuSockets[cpu_id], &record, sizeof(AccessDataRecord)),socket write failed);
+ CHECK_ERROR(close(cpuSockets[cpu_id]),socket close failed);
+ cpuSockets[cpu_id] = -1;
+ cpuSockets_open--;
+ }
+ if (cpuSockets_open == 0)
+ {
+ globalSocket = -1;
+ }
+}
+
+int access_client_check(PciDeviceIndex dev, int cpu_id)
+{
+ int socket = globalSocket;
+ pthread_mutex_t* lockptr = &globalLock;
+
+ AccessDataRecord record;
+ record.cpu = cpu_id;
+ record.device = dev;
+ record.type = DAEMON_CHECK;
+ if (dev != MSR_DEV)
+ {
+ record.cpu = affinity_core2node_lookup[cpu_id];
+ }
+ if ((cpuSockets[cpu_id] > 0) && (cpuSockets[cpu_id] != globalSocket))
+ {
+ socket = cpuSockets[cpu_id];
+ lockptr = &cpuLocks[cpu_id];
+ }
+ if ((cpuSockets[cpu_id] > 0) || ((cpuSockets_open == 1) && (globalSocket > 0)))
+ {
+ pthread_mutex_lock(lockptr);
+ CHECK_ERROR(write(socket, &record, sizeof(AccessDataRecord)), socket write failed);
+ CHECK_ERROR(read(socket, &record, sizeof(AccessDataRecord)), socket read failed);
+ pthread_mutex_unlock(lockptr);
+ if (record.errorcode == ERR_NOERROR )
+ {
+ return 1;
+ }
+ }
+ return 0;
+}
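
A note for readers tracing the new client above: the connect logic boils down to creating an AF_LOCAL stream socket and polling /tmp/likwid-<pid> until the freshly forked daemon starts listening. The following is a minimal, self-contained sketch of just that pattern; connect_with_retry and its demo main are invented for illustration and do not reproduce likwid's AccessDataRecord protocol.

/* Minimal sketch (not the real likwid client): connect to the UNIX domain
 * socket of a freshly started daemon, retrying until it appears. Only the
 * /tmp/likwid-<pid> path template and the 1 ms polling interval mirror the
 * code above; everything else is simplified for illustration. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>

static int connect_with_retry(pid_t daemon_pid, int max_tries)
{
    struct sockaddr_un addr;
    int fd = socket(AF_LOCAL, SOCK_STREAM, 0);
    if (fd < 0)
        return -1;

    memset(&addr, 0, sizeof(addr));
    addr.sun_family = AF_LOCAL;
    snprintf(addr.sun_path, sizeof(addr.sun_path), "/tmp/likwid-%d", (int)daemon_pid);

    while (max_tries-- > 0)
    {
        if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) == 0)
            return fd;          /* daemon is up and listening */
        usleep(1000);           /* 1 ms between attempts, as in the loop above */
    }
    close(fd);
    return -1;                  /* timed out waiting for the socket file */
}

int main(void)
{
    /* Demo only: there is normally no daemon socket for our own PID, so
     * this is expected to time out quickly and report failure. */
    int fd = connect_with_retry(getpid(), 10);
    if (fd >= 0)
    {
        printf("connected\n");
        close(fd);
    }
    else
    {
        fprintf(stderr, "no daemon socket found (expected in this demo)\n");
    }
    return 0;
}
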
diff --git a/src/access_x86.c b/src/access_x86.c
new file mode 100644
index 0000000..4cda3a7
--- /dev/null
+++ b/src/access_x86.c
@@ -0,0 +1,91 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+
+
+#include <types.h>
+#include <error.h>
+#include <topology.h>
+#include <access.h>
+#include <access_x86.h>
+#include <access_x86_msr.h>
+#include <access_x86_pci.h>
+#include <affinity.h>
+
+
+
+int access_x86_init(int cpu_id)
+{
+ int ret = access_x86_msr_init(cpu_id);
+ if (ret == 0)
+ {
+ if (cpuid_info.supportUncore)
+ {
+ ret = access_x86_pci_init(affinity_core2node_lookup[cpu_id]);
+ }
+ }
+ return ret;
+}
+
+int access_x86_read(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t *data)
+{
+ int err;
+ uint64_t tmp = 0x0ULL;
+ if (dev == MSR_DEV)
+ {
+ err = access_x86_msr_read(cpu_id, reg, &tmp);
+ *data = tmp;
+ }
+ else
+ {
+ if (access_x86_pci_check(dev, affinity_core2node_lookup[cpu_id]))
+ {
+ err = access_x86_pci_read(dev, affinity_core2node_lookup[cpu_id], reg, &tmp);
+ *data = tmp;
+ }
+ }
+ return err;
+}
+
+int access_x86_write(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t data)
+{
+ int err;
+ if (dev == MSR_DEV)
+ {
+ err = access_x86_msr_write(cpu_id, reg, data);
+ }
+ else
+ {
+ if (access_x86_pci_check(dev, affinity_core2node_lookup[cpu_id]))
+ {
+ err = access_x86_pci_write(dev, affinity_core2node_lookup[cpu_id], reg, data);
+ }
+ }
+ return err;
+}
+
+void access_x86_finalize(int cpu_id)
+{
+ access_x86_msr_finalize(cpu_id);
+ if (cpuid_info.supportUncore)
+ {
+ access_x86_pci_finalize(affinity_core2node_lookup[cpu_id]);
+ }
+}
+
+int access_x86_check(PciDeviceIndex dev, int cpu_id)
+{
+ if (dev == MSR_DEV)
+ {
+ return access_x86_msr_check(dev, cpu_id);
+ }
+ else
+ {
+ return access_x86_pci_check(dev, affinity_core2node_lookup[cpu_id]);
+ }
+ return 0;
+}
diff --git a/src/access_x86_msr.c b/src/access_x86_msr.c
new file mode 100644
index 0000000..08a082d
--- /dev/null
+++ b/src/access_x86_msr.c
@@ -0,0 +1,288 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: access_x86_msr.c
+ *
+ * Description: Implementation of msr module.
+ * Provides API to read and write values to the model
+ * specific registers on x86 processors using the msr
+ * sys interface of the Linux 2.6 kernel. This module
+ * is based on the msr-util tools.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com.
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+/* ##### HEADER FILE INCLUDES ######################################### */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include <types.h>
+#include <error.h>
+#include <topology.h>
+#include <access_x86_msr.h>
+#include <registers.h>
+#ifdef LIKWID_PROFILE_COUNTER_READ
+#include <timer.h>
+#endif
+/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
+#define MAX_LENGTH_MSR_DEV_NAME 20
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+
+/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
+static int FD[MAX_NUM_THREADS] = { [0 ... MAX_NUM_THREADS-1] = -1 };
+static int rdpmc_works_pmc = -1;
+static int rdpmc_works_fixed = -1;
+
+/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
+
+static inline int __rdpmc(int cpu_id, int counter, uint64_t* value)
+{
+ unsigned low, high;
+ cpu_set_t cpuset, current;
+ sched_getaffinity(0, sizeof(cpu_set_t), &current);
+ CPU_ZERO(&cpuset);
+ CPU_SET(cpu_id, &cpuset);
+ sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
+ __asm__ volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
+ *value = ((low) | ((uint64_t )(high) << 32));
+ sched_setaffinity(0, sizeof(cpu_set_t), &current);
+ return 0;
+}
+
+//Needed for rdpmc check
+void segfault_sigaction(int signal, siginfo_t *si, void *arg)
+{
+ exit(1);
+}
+
+int test_rdpmc(int cpu_id, uint64_t value, int flag)
+{
+ int ret;
+ int pid;
+
+
+ pid = fork();
+
+ if (pid < 0)
+ {
+ return -1;
+ }
+ if (!pid)
+ {
+ uint64_t tmp;
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(struct sigaction));
+ sigemptyset(&sa.sa_mask);
+ sa.sa_sigaction = segfault_sigaction;
+ sa.sa_flags = SA_SIGINFO;
+ sigaction(SIGSEGV, &sa, NULL);
+ if (flag == 0)
+ {
+ __rdpmc(cpu_id, value, &tmp);
+ usleep(100);
+ }
+ exit(0);
+ } else {
+ int status = 0;
+ int waiting = 0;
+ waiting = waitpid(pid, &status, 0);
+ if ((waiting < 0) || (WEXITSTATUS(status) != 0))
+ {
+ ret = 0;
+ } else
+ {
+ ret = 1;
+ }
+ }
+ return ret;
+}
+
+/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
+
+
+int
+access_x86_msr_init(const int cpu_id)
+{
+ int fd = 0;
+ int i = 0;
+
+ char* msr_file_name;
+ if (FD[cpu_id] > 0)
+ {
+ return 0;
+ }
+ msr_file_name = (char*) malloc(MAX_LENGTH_MSR_DEV_NAME * sizeof(char));
+ if (!msr_file_name)
+ {
+ return -ENOMEM;
+ }
+
+ sprintf(msr_file_name,"/dev/msr%d", cpu_id);
+ fd = open(msr_file_name, O_RDWR);
+ if (fd < 0)
+ {
+ sprintf(msr_file_name,"/dev/cpu/%d/msr", cpu_id);
+ }
+ else
+ {
+ close(fd);
+ }
+ fd = open(msr_file_name, O_RDWR);
+ if (fd < 0)
+ {
+ ERROR_PRINT(Cannot access MSR device file %s: %s.,msr_file_name , strerror(errno))
+ ERROR_PLAIN_PRINT(Please check if 'msr' module is loaded and device files have correct permissions);
+ ERROR_PLAIN_PRINT(Alternatively you might want to look into (sys)daemonmode);
+ free(msr_file_name);
+ return -EPERM;
+ }
+ else
+ {
+ close(fd);
+ }
+ if (rdpmc_works_pmc < 0)
+ {
+ rdpmc_works_pmc = test_rdpmc(cpu_id, 0, 0);
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Test for RDPMC for PMC counters returned %d, rdpmc_works_pmc);
+ }
+ if (rdpmc_works_fixed < 0)
+ {
+ rdpmc_works_fixed = test_rdpmc(cpu_id, (1<<30), 0);
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Test for RDPMC for FIXED counters returned %d, rdpmc_works_fixed);
+ }
+
+ sprintf(msr_file_name,"/dev/msr%d",cpu_id);
+ fd = open(msr_file_name, O_RDWR);
+ if (fd < 0)
+ {
+ sprintf(msr_file_name,"/dev/cpu/%d/msr",cpu_id);
+ }
+ else
+ {
+ close(fd);
+ }
+ FD[cpu_id] = open(msr_file_name, O_RDWR);
+ if ( FD[cpu_id] < 0 )
+ {
+ ERROR_PRINT(Cannot access MSR device file %s in direct mode, msr_file_name);
+ free(msr_file_name);
+ return -EPERM;
+ }
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Opened MSR device %s for CPU %d,msr_file_name, cpu_id);
+ free(msr_file_name);
+
+ return 0;
+}
+
+void
+access_x86_msr_finalize(const int cpu_id)
+{
+ int i = 0;
+
+ if (FD[cpu_id] > 0)
+ {
+ close(FD[cpu_id]);
+ FD[cpu_id] = 0;
+ }
+}
+
+
+int
+access_x86_msr_read( const int cpu_id, uint32_t reg, uint64_t *data)
+{
+ int ret;
+
+ if ((rdpmc_works_pmc == 1) && (reg >= MSR_PMC0) && (reg <=MSR_PMC7))
+ {
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Read PMC counter with RDPMC instruction with index %d, reg - MSR_PMC0);
+ if (__rdpmc(cpu_id, reg - MSR_PMC0, data) )
+ {
+ rdpmc_works_pmc = 0;
+ goto fallback;
+ }
+ }
+ else if ((rdpmc_works_fixed == 1) && (reg >= MSR_PERF_FIXED_CTR0) && (reg <= MSR_PERF_FIXED_CTR2))
+ {
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Read FIXED counter with RDPMC instruction with index %d, (1<<30) + (reg - MSR_PERF_FIXED_CTR0));
+ if (__rdpmc(cpu_id, (1<<30) + (reg - MSR_PERF_FIXED_CTR0), data) )
+ {
+ rdpmc_works_fixed = 0;
+ goto fallback;
+ }
+ }
+ else
+ {
+fallback:
+ if (FD[cpu_id] > 0)
+ {
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Read MSR counter 0x%X with RDMSR instruction on CPU %d, reg, cpu_id);
+ ret = pread(FD[cpu_id], data, sizeof(*data), reg);
+ if ( ret != sizeof(*data) )
+ {
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+int
+access_x86_msr_write( const int cpu_id, uint32_t reg, uint64_t data)
+{
+ int ret;
+ if (FD[cpu_id] > 0)
+ {
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Write MSR counter 0x%X with WRMSR instruction on CPU %d data 0x%X, reg, cpu_id, data);
+ ret = pwrite(FD[cpu_id], &data, sizeof(data), reg);
+ if (ret != sizeof(data))
+ {
+ return ret;
+ }
+ }
+ return 0;
+}
+
+int access_x86_msr_check(PciDeviceIndex dev, int cpu_id)
+{
+ if (dev == MSR_DEV)
+ {
+ if (FD[cpu_id] > 0)
+ {
+ return 1;
+ }
+ }
+ return 0;
+}
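
For readers unfamiliar with the msr interface that access_x86_msr_read() falls back to: a register read through /dev/cpu/<N>/msr is simply a pread() whose file offset is the register number. The sketch below shows only that kernel interface; read_msr and the example register 0x10 (IA32_TIME_STAMP_COUNTER) are chosen for illustration and are not likwid code.

/* Minimal sketch: read one MSR via the Linux msr driver, assuming the
 * 'msr' kernel module is loaded and the caller may open the device file.
 * read_msr() and the example register are illustrative only. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static int read_msr(int cpu, uint32_t reg, uint64_t *value)
{
    char path[32];
    snprintf(path, sizeof(path), "/dev/cpu/%d/msr", cpu);
    int fd = open(path, O_RDONLY);
    if (fd < 0)
        return -1;                       /* module missing or no permission */
    ssize_t n = pread(fd, value, sizeof(*value), reg);  /* offset = register */
    close(fd);
    return (n == (ssize_t)sizeof(*value)) ? 0 : -1;
}

int main(void)
{
    uint64_t tsc = 0;
    if (read_msr(0, 0x10, &tsc) == 0)    /* 0x10 is IA32_TIME_STAMP_COUNTER */
        printf("TSC on CPU 0: %llu\n", (unsigned long long)tsc);
    else
        fprintf(stderr, "MSR read failed (needs root and the msr module)\n");
    return 0;
}
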
diff --git a/src/access_x86_pci.c b/src/access_x86_pci.c
new file mode 100644
index 0000000..c96f775
--- /dev/null
+++ b/src/access_x86_pci.c
@@ -0,0 +1,313 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: access_x86_pci.c
+ *
+ * Description: Implementation of pci module.
+ * Provides API to read and write values to the hardware
+ * performance monitoring registers in PCI Cfg space
+ * for Intel Sandy Bridge Processors.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com,
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+/* ##### HEADER FILE INCLUDES ######################################### */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include <types.h>
+#include <bstrlib.h>
+#include <error.h>
+#include <topology.h>
+
+#include <access_x86_pci.h>
+
+#ifdef LIKWID_USE_HWLOC
+#include <pci_hwloc.h>
+#else
+#include <pci_proc.h>
+#endif
+
+/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+
+#define PCI_ROOT_PATH "/proc/bus/pci/"
+#define PCM_PCI_CLASS 0x1101
+
+/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
+
+static int FD[MAX_NUM_NODES][MAX_NUM_PCI_DEVICES];
+static int access_x86_initialized = 0;
+static int nr_sockets = 0;
+
+/* Socket to bus mapping -- will be determined at runtime;
+ * typical mappings are:
+ * Socket Bus (2S) Bus (4s)
+ * 0 0xff 0x3f
+ * 1 0x7f 0x7f
+ * 2 0xbf
+ * 3 0xff
+ */
+static char* socket_bus[MAX_NUM_NODES] = { [0 ... (MAX_NUM_NODES-1)] = "N-A"};
+
+/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
+/* Dirty hack to avoid nonull warnings */
+int (*ownaccess)(const char*, int);
+int (*ownopen)(const char*, int, ...);
+
+/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
+
+int
+access_x86_pci_init(const int socket)
+{
+ int ret = 0;
+
+
+ if (access_x86_initialized == 0)
+ {
+ uint16_t testDevice;
+ ownaccess = &access;
+ ownopen = &open;
+
+ /* PCI is only provided by Intel systems */
+ if (!cpuid_info.isIntel)
+ {
+ DEBUG_PLAIN_PRINT(DEBUGLEV_DETAIL, PCI based Uncore performance monitoring only supported on Intel systems);
+ return -ENODEV;
+ }
+ switch (cpuid_info.model)
+ {
+ case SANDYBRIDGE_EP:
+ testDevice = 0x3c44;
+ break;
+ case IVYBRIDGE_EP:
+ testDevice = 0x0e36;
+ break;
+ case HASWELL_EP:
+ testDevice = 0x2f30;
+ break;
+ case BROADWELL_D:
+ testDevice = 0x6f30;
+ break;
+ default:
+ DEBUG_PRINT(DEBUGLEV_INFO,CPU model %s does not support PCI based Uncore performance monitoring, cpuid_info.name);
+ return -ENODEV;
+ break;
+ }
+ if(geteuid() != 0)
+ {
+ fprintf(stderr, "WARNING\n");
+ fprintf(stderr, "Direct access to the PCI Cfg Adressspace is only allowed for uid root!\n");
+ fprintf(stderr, "This means you can use performance groups as MEM only as root in direct mode.\n");
+ fprintf(stderr, "Alternatively you might want to look into (sys)daemonmode.\n\n");
+ return -EPERM;
+ }
+
+ for(int i=0; i<MAX_NUM_NODES; i++)
+ {
+ for(int j=1;j<MAX_NUM_PCI_DEVICES;j++)
+ {
+ FD[i][j] = -2;
+ }
+ }
+
+#ifdef LIKWID_USE_HWLOC
+ DEBUG_PLAIN_PRINT(DEBUGLEV_DETAIL, Using hwloc to find pci devices);
+ ret = hwloc_pci_init(testDevice, socket_bus, &nr_sockets);
+ if (ret)
+ {
+ ERROR_PLAIN_PRINT(Using hwloc to find pci devices failed);
+ return -ENODEV;
+ }
+#else
+ DEBUG_PLAIN_PRINT(DEBUGLEV_DETAIL, Using procfs to find pci devices);
+ ret = proc_pci_init(testDevice, socket_bus, &nr_sockets);
+ if (ret)
+ {
+ ERROR_PLAIN_PRINT(Using procfs to find pci devices failed);
+ return -ENODEV;
+ }
+#endif
+ }
+
+
+ for(int j=1;j<MAX_NUM_PCI_DEVICES;j++)
+ {
+ if ((pci_devices[j].path != NULL) && (FD[socket][j] == -2))
+ {
+ bstring filepath = bformat("%s%s%s",PCI_ROOT_PATH,
+ socket_bus[socket],
+ pci_devices[j].path);
+ if (!ownaccess(bdata(filepath),F_OK))
+ {
+ FD[socket][j] = 0;
+ pci_devices[j].online = 1;
+ if (access_x86_initialized == 0)
+ {
+ DEBUG_PRINT(DEBUGLEV_DETAIL, PCI device %s (%d) online for socket %d at path %s, pci_devices[j].name,j, socket,bdata(filepath));
+ if (ownaccess(bdata(filepath),R_OK|W_OK))
+ {
+ ERROR_PRINT(PCI device %s (%d) online for socket %d at path %s but not accessible, pci_devices[j].name,j, socket,bdata(filepath));
+ }
+ }
+ }
+ else
+ {
+ pci_devices[j].online = 0;
+ }
+ }
+ }
+
+ access_x86_initialized = 1;
+ return 0;
+}
+
+
+void
+access_x86_pci_finalize(const int socket)
+{
+ for (int j=1; j<MAX_NUM_PCI_DEVICES; j++)
+ {
+ if (FD[socket][j] > 0)
+ {
+ close(FD[socket][j]);
+ }
+ }
+}
+
+
+int
+access_x86_pci_read(PciDeviceIndex dev, const int socket, uint32_t reg, uint64_t *data)
+{
+ bstring filepath = NULL;
+ uint32_t tmp;
+ int err;
+
+ if (dev == MSR_DEV)
+ {
+ return -ENODEV;
+ }
+
+ if (FD[socket][dev] < 0)
+ {
+ *data = 0ULL;
+ return -ENODEV;
+ }
+ else if ( !FD[socket][dev] )
+ {
+ filepath = bfromcstr ( PCI_ROOT_PATH );
+ bcatcstr(filepath, socket_bus[socket]);
+ bcatcstr(filepath, pci_devices[dev].path);
+ FD[socket][dev] = ownopen( bdata(filepath), O_RDWR);
+
+ if ( FD[socket][dev] < 0)
+ {
+ ERROR_PRINT(Failed to open PCI device %s at path %s\n,
+ pci_devices[dev].name,
+ bdata(filepath));
+ *data = 0ULL;
+ return -EACCES;
+ }
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Opened PCI device %s, pci_devices[dev].name);
+ }
+
+ if ( FD[socket][dev] > 0 &&
+ pread(FD[socket][dev], &tmp, sizeof(tmp), reg) != sizeof(tmp) )
+ {
+ ERROR_PRINT(Read from PCI device %s at register 0x%x failed, pci_devices[dev].name, reg);
+ *data = 0ULL;
+ return -EIO;
+ }
+ *data = (uint64_t)tmp;
+ return 0;
+}
+
+
+
+int
+access_x86_pci_write(PciDeviceIndex dev, const int socket, uint32_t reg, uint64_t data)
+{
+ bstring filepath = NULL;
+ int err;
+ uint32_t tmp = (uint32_t)data;
+
+ if (dev == MSR_DEV)
+ {
+ return -ENODEV;
+ }
+ if (FD[socket][dev] < 0)
+ {
+ return -ENODEV;
+ }
+ else if ( !FD[socket][dev] )
+ {
+ filepath = bfromcstr ( PCI_ROOT_PATH );
+ bcatcstr(filepath, socket_bus[socket]);
+ bcatcstr(filepath, pci_devices[dev].path );
+
+ FD[socket][dev] = ownopen( bdata(filepath), O_RDWR);
+
+ if ( FD[socket][dev] < 0)
+ {
+ ERROR_PRINT(Failed to open PCI device %s at path %s\n,
+ pci_devices[dev].name,
+ bdata(filepath));
+ return -EACCES;
+ }
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Opened PCI device %s, pci_devices[dev].name);
+ }
+
+ if ( FD[socket][dev] > 0 &&
+ pwrite(FD[socket][dev], &tmp, sizeof tmp, reg) != sizeof tmp)
+ {
+ ERROR_PRINT(Write to PCI device %s at register 0x%x failed, pci_devices[dev].name, reg);
+ return -EIO;
+ }
+ return 0;
+}
+
+int access_x86_pci_check(PciDeviceIndex dev, int socket)
+{
+ if (dev == MSR_DEV)
+ {
+ return 1;
+ }
+ else if ((pci_devices[dev].online == 1) || (FD[socket][dev] >= 0))
+ {
+ return 1;
+ }
+ return 0;
+}
+
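
For context on the /proc/bus/pci handling above: each file under /proc/bus/pci/<bus>/ exposes a device's configuration space, and a 32-bit counter register is read with a pread() at the register offset, which requires root. The helper below is a hedged sketch of that mechanism only; the bus/device string "ff/1e.0" and the offset 0xA0 are placeholders, not likwid's uncore devices or registers.

/* Minimal sketch: read a 32-bit register from PCI configuration space via
 * procfs, assuming root privileges. The path suffix and offset are
 * placeholders for illustration. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static int pci_cfg_read32(const char *bus_dev, uint32_t reg, uint32_t *value)
{
    char path[64];
    snprintf(path, sizeof(path), "/proc/bus/pci/%s", bus_dev);
    int fd = open(path, O_RDONLY);
    if (fd < 0)
        return -1;                                  /* device absent or no access */
    ssize_t n = pread(fd, value, sizeof(*value), reg);
    close(fd);
    return (n == (ssize_t)sizeof(*value)) ? 0 : -1;
}

int main(void)
{
    uint32_t val = 0;
    if (pci_cfg_read32("ff/1e.0", 0xA0, &val) == 0) /* placeholder device/offset */
        printf("register value: 0x%08x\n", val);
    else
        fprintf(stderr, "PCI config read failed\n");
    return 0;
}
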
diff --git a/src/affinity.c b/src/affinity.c
index 59b05da..40f9e83 100644
--- a/src/affinity.c
+++ b/src/affinity.c
@@ -5,13 +5,14 @@
*
* Description: Implementation of affinity module.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com,
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -33,7 +34,6 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
-#include <math.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <sys/time.h>
@@ -42,13 +42,15 @@
#include <sched.h>
#include <time.h>
#include <pthread.h>
+#include <math.h>
-#include <error.h>
#include <types.h>
+#include <error.h>
+#include <likwid.h>
#include <numa.h>
#include <affinity.h>
-#include <cpuid.h>
#include <tree.h>
+#include <topology.h>
/* ##### EXPORTED VARIABLES ########################################### */
@@ -63,6 +65,9 @@ int affinity_core2node_lookup[MAX_NUM_THREADS];
static int affinity_numberOfDomains = 0;
static AffinityDomain* domains;
+static int affinity_initialized = 0;
+
+AffinityDomains affinityDomains;
/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
@@ -81,7 +86,7 @@ getProcessorID(cpu_set_t* cpu_set)
return processorId;
}
-static void
+static int
treeFillNextEntries(
TreeNode* tree,
int* processorIds,
@@ -101,8 +106,7 @@ treeFillNextEntries(
if ( node == NULL )
{
- printf("ERROR: Socket %d not existing!",i);
- exit(EXIT_FAILURE);
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Cannot find socket %d in topology tree, i);
}
}
@@ -114,10 +118,10 @@ treeFillNextEntries(
if ( node == NULL )
{
- printf("ERROR: Core %d on socket %d not existing!",i,socketId);
- exit(EXIT_FAILURE);
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Cannot find core %d in topology tree, i);
}
}
+
/* Traverse horizontal */
while ( node != NULL )
{
@@ -127,12 +131,20 @@ treeFillNextEntries(
while ( thread != NULL )
{
- processorIds[numberOfEntries-counter] = thread->id;
- thread = tree_getNextNode(thread);
- counter--;
+ if (cpuid_topology.threadPool[thread->id].inCpuSet)
+ {
+ processorIds[numberOfEntries-counter] = thread->id;
+ thread = tree_getNextNode(thread);
+ counter--;
+ }
+ else
+ {
+ thread = tree_getNextNode(thread);
+ }
}
node = tree_getNextNode(node);
}
+ return numberOfEntries-counter;
}
/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
@@ -144,149 +156,250 @@ affinity_init()
int currentDomain;
int subCounter = 0;
int offset = 0;
- int numberOfSocketDomains = cpuid_topology.numSockets;;
+ int tmp;
+ if (affinity_initialized == 1)
+ {
+ return;
+ }
+ topology_init();
+ int numberOfSocketDomains = cpuid_topology.numSockets;
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: Socket domains %d, numberOfSocketDomains);
+ numa_init();
int numberOfNumaDomains = numa_info.numberOfNodes;
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: NUMA domains %d, numberOfNumaDomains);
int numberOfProcessorsPerSocket =
cpuid_topology.numCoresPerSocket * cpuid_topology.numThreadsPerCore;
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: CPUs per socket %d, numberOfProcessorsPerSocket);
int numberOfCacheDomains;
int numberOfCoresPerCache =
cpuid_topology.cacheLevels[cpuid_topology.numCacheLevels-1].threads/
cpuid_topology.numThreadsPerCore;
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: CPU cores per LLC %d, numberOfCoresPerCache);
int numberOfProcessorsPerCache =
cpuid_topology.cacheLevels[cpuid_topology.numCacheLevels-1].threads;
-
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: CPUs per LLC %d, numberOfProcessorsPerCache);
/* for the cache domain take only into account last level cache and assume
* all sockets to be uniform. */
/* determine how many last level shared caches exist per socket */
numberOfCacheDomains = cpuid_topology.numSockets *
(cpuid_topology.numCoresPerSocket/numberOfCoresPerCache);
-
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: Cache domains %d, numberOfCacheDomains);
/* determine total number of domains */
- if ( numberOfNumaDomains > 1 )
- {
- numberOfDomains += numberOfSocketDomains + numberOfCacheDomains + numberOfNumaDomains;
- }
- else
- {
- numberOfDomains += numberOfSocketDomains + numberOfCacheDomains;
- }
+ numberOfDomains += numberOfSocketDomains + numberOfCacheDomains + numberOfNumaDomains;
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: All domains %d, numberOfDomains);
domains = (AffinityDomain*) malloc(numberOfDomains * sizeof(AffinityDomain));
if (!domains)
{
- fprintf(stderr, "Cannot allocate affinity domain memory\n");
+ fprintf(stderr,"No more memory for %ld bytes for array of affinity domains\n",numberOfDomains * sizeof(AffinityDomain));
return;
}
/* Node domain */
- domains[0].numberOfProcessors = cpuid_topology.numHWThreads;
+ domains[0].numberOfProcessors = cpuid_topology.activeHWThreads;
domains[0].numberOfCores = cpuid_topology.numSockets * cpuid_topology.numCoresPerSocket;
- domains[0].processorList = (int*) malloc(cpuid_topology.numHWThreads*sizeof(int));
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity domain N: %d HW threads on %d cores, domains[0].numberOfProcessors, domains[0].numberOfCores);
domains[0].tag = bformat("N");
+ domains[0].processorList = (int*) malloc(cpuid_topology.numHWThreads*sizeof(int));
+ if (!domains[0].processorList)
+ {
+ fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n",
+ cpuid_topology.numHWThreads*sizeof(int),
+ bdata(domains[0].tag));
+ return;
+ }
offset = 0;
- for (int i=0; i<numberOfSocketDomains; i++)
+ if (numberOfSocketDomains > 1)
{
- treeFillNextEntries(
- cpuid_topology.topologyTree,
- domains[0].processorList + offset,
- i, 0, numberOfProcessorsPerSocket);
-
- offset += numberOfProcessorsPerSocket;
+ for (int i=0; i<numberOfSocketDomains; i++)
+ {
+ tmp = treeFillNextEntries(cpuid_topology.topologyTree,
+ domains[0].processorList + offset,
+ i, 0, numberOfProcessorsPerSocket);
+ offset += tmp;
+ }
+ }
+ else
+ {
+ tmp = treeFillNextEntries(cpuid_topology.topologyTree,
+ domains[0].processorList,
+ 0, 0, domains[0].numberOfProcessors);
+ domains[0].numberOfProcessors = tmp;
}
/* Socket domains */
currentDomain = 1;
-
for (int i=0; i < numberOfSocketDomains; i++ )
{
- domains[currentDomain + i].numberOfProcessors = numberOfProcessorsPerSocket;
- domains[currentDomain + i].numberOfCores = cpuid_topology.numCoresPerSocket;
- domains[currentDomain + i].processorList = (int*) malloc( domains[currentDomain + i].numberOfProcessors * sizeof(int));
- domains[currentDomain + i].tag = bformat("S%d", i);
-
- treeFillNextEntries(
- cpuid_topology.topologyTree,
- domains[currentDomain + i].processorList,
- i, 0, domains[currentDomain + i].numberOfProcessors);
+ domains[currentDomain + i].numberOfProcessors = numberOfProcessorsPerSocket;
+ domains[currentDomain + i].numberOfCores = cpuid_topology.numCoresPerSocket;
+ domains[currentDomain + i].tag = bformat("S%d", i);
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity domain S%d: %d HW threads on %d cores, i, domains[currentDomain + i].numberOfProcessors, domains[currentDomain + i].numberOfCores);
+ domains[currentDomain + i].processorList = (int*) malloc( domains[currentDomain + i].numberOfProcessors * sizeof(int));
+ if (!domains[currentDomain + i].processorList)
+ {
+ fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n",
+ domains[currentDomain + i].numberOfProcessors * sizeof(int),
+ bdata(domains[currentDomain + i].tag));
+ return;
+ }
+
+ tmp = treeFillNextEntries(cpuid_topology.topologyTree,
+ domains[currentDomain + i].processorList,
+ i, 0, domains[currentDomain + i].numberOfProcessors);
+ tmp = MIN(tmp, domains[currentDomain + i].numberOfProcessors);
+ for ( int j = 0; j < tmp; j++ )
+ {
+ affinity_core2node_lookup[domains[currentDomain + i].processorList[j]] = i;
+ }
+ domains[currentDomain + i].numberOfProcessors = tmp;
}
/* Cache domains */
currentDomain += numberOfSocketDomains;
subCounter = 0;
-
for (int i=0; i < numberOfSocketDomains; i++ )
{
- offset = 0;
-
- for ( int j=0; j < (numberOfCacheDomains/numberOfSocketDomains); j++ )
- {
- domains[currentDomain + subCounter].numberOfProcessors = numberOfProcessorsPerCache;
- domains[currentDomain + subCounter].numberOfCores = numberOfCoresPerCache;
- domains[currentDomain + subCounter].processorList = (int*) malloc(numberOfProcessorsPerCache*sizeof(int));
- domains[currentDomain + subCounter].tag = bformat("C%d", subCounter);
-
- treeFillNextEntries(
- cpuid_topology.topologyTree,
- domains[currentDomain + subCounter].processorList,
- i, offset, domains[currentDomain + subCounter].numberOfProcessors);
-
- offset += numberOfCoresPerCache;
- subCounter++;
- }
- }
+ offset = 0;
- if ( numberOfNumaDomains > 1 )
- {
- /* Memory domains */
- currentDomain += numberOfCacheDomains;
- subCounter = 0;
+ for ( int j=0; j < (numberOfCacheDomains/numberOfSocketDomains); j++ )
+ {
+ domains[currentDomain + subCounter].numberOfProcessors = numberOfProcessorsPerCache;
+ domains[currentDomain + subCounter].numberOfCores = numberOfCoresPerCache;
+ domains[currentDomain + subCounter].tag = bformat("C%d", subCounter);
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity domain C%d: %d HW threads on %d cores, subCounter, domains[currentDomain + subCounter].numberOfProcessors, domains[currentDomain + subCounter].numberOfCores);
+ domains[currentDomain + subCounter].processorList = (int*) malloc(numberOfProcessorsPerCache*sizeof(int));
+ if (!domains[currentDomain + subCounter].processorList)
+ {
+ fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n",
+ numberOfProcessorsPerCache*sizeof(int),
+ bdata(domains[currentDomain + subCounter].tag));
+ return;
+ }
+ tmp = treeFillNextEntries(cpuid_topology.topologyTree,
+ domains[currentDomain + subCounter].processorList,
+ i, offset,
+ domains[currentDomain + subCounter].numberOfProcessors);
+ domains[currentDomain + subCounter].numberOfProcessors = tmp;
+ offset += (tmp < numberOfCoresPerCache ? tmp : numberOfCoresPerCache);
+ subCounter++;
+ }
+ }
+ /* Memory domains */
+ currentDomain += numberOfCacheDomains;
+ subCounter = 0;
+ if ((numberOfNumaDomains >= numberOfSocketDomains) && (numberOfNumaDomains > 1))
+ {
for (int i=0; i < numberOfSocketDomains; i++ )
{
offset = 0;
- for ( int j=0; j < (int)ceil((double)numberOfNumaDomains/numberOfSocketDomains); j++ )
+ for ( int j=0; j < (int)ceil((double)(numberOfNumaDomains)/numberOfSocketDomains); j++ )
{
- domains[currentDomain + subCounter].numberOfProcessors = numa_info.nodes[subCounter].numberOfProcessors;
- domains[currentDomain + subCounter].numberOfCores = numberOfCoresPerCache;
- domains[currentDomain + subCounter].processorList = (int*) malloc(numa_info.nodes[subCounter].numberOfProcessors*sizeof(int));
+ domains[currentDomain + subCounter].numberOfProcessors =
+ numa_info.nodes[subCounter].numberOfProcessors;
+ domains[currentDomain + subCounter].numberOfCores =
+ numa_info.nodes[subCounter].numberOfProcessors/cpuid_topology.numThreadsPerCore;
domains[currentDomain + subCounter].tag = bformat("M%d", subCounter);
-
- treeFillNextEntries(
- cpuid_topology.topologyTree,
- domains[currentDomain + subCounter].processorList,
- i, offset, domains[currentDomain + subCounter].numberOfProcessors);
-
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity domain M%d: %d HW threads on %d cores, subCounter, domains[currentDomain + subCounter].numberOfProcessors, domains[currentDomain + subCounter].numberOfCores);
+ domains[currentDomain + subCounter].processorList =
+ (int*) malloc(numa_info.nodes[subCounter].numberOfProcessors*sizeof(int));
+ if (!domains[currentDomain + subCounter].processorList)
+ {
+ fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n",
+ numa_info.nodes[subCounter].numberOfProcessors*sizeof(int),
+ bdata(domains[currentDomain + subCounter].tag));
+ return;
+ }
+
+ tmp = treeFillNextEntries(cpuid_topology.topologyTree,
+ domains[currentDomain + subCounter].processorList,
+ i, offset,
+ domains[currentDomain + subCounter].numberOfProcessors);
+ domains[currentDomain + subCounter].numberOfProcessors = tmp;
offset += domains[currentDomain + subCounter].numberOfCores;
-
subCounter++;
}
}
-
- /* This is redundant ;-). Create thread to node lookup */
- for ( uint32_t i = 0; i < numa_info.numberOfNodes; i++ )
+ }
+ else
+ {
+ offset = 0;
+ int NUMAthreads = numberOfProcessorsPerSocket * numberOfSocketDomains;
+ domains[currentDomain + subCounter].numberOfProcessors = NUMAthreads;
+ domains[currentDomain + subCounter].numberOfCores = NUMAthreads/cpuid_topology.numThreadsPerCore;
+ domains[currentDomain + subCounter].tag = bformat("M%d", subCounter);
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity domain M%d: %d HW threads on %d cores, subCounter, domains[currentDomain + subCounter].numberOfProcessors, domains[currentDomain + subCounter].numberOfCores);
+ domains[currentDomain + subCounter].processorList = (int*) malloc(NUMAthreads*sizeof(int));
+ if (!domains[currentDomain + subCounter].processorList)
{
- for ( int j = 0; j < numa_info.nodes[i].numberOfProcessors; j++ )
- {
- affinity_core2node_lookup[numa_info.nodes[i].processors[j]] = i;
- }
+ fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n",
+ NUMAthreads*sizeof(int),
+ bdata(domains[currentDomain + subCounter].tag));
+ return;
+ }
+ tmp = 0;
+ for (int i=0; i < numberOfSocketDomains; i++ )
+ {
+ tmp += treeFillNextEntries(
+ cpuid_topology.topologyTree,
+ &(domains[currentDomain + subCounter].processorList[offset]),
+ i, 0, numberOfProcessorsPerSocket);
+ offset += numberOfProcessorsPerSocket;
}
+ domains[currentDomain + subCounter].numberOfProcessors = tmp;
}
affinity_numberOfDomains = numberOfDomains;
+ affinityDomains.numberOfAffinityDomains = numberOfDomains;
+ affinityDomains.numberOfSocketDomains = numberOfSocketDomains;
+ affinityDomains.numberOfNumaDomains = numberOfNumaDomains;
+ affinityDomains.numberOfProcessorsPerSocket = numberOfProcessorsPerSocket;
+ affinityDomains.numberOfCacheDomains = numberOfCacheDomains;
+ affinityDomains.numberOfCoresPerCache = numberOfCoresPerCache;
+ affinityDomains.numberOfProcessorsPerCache = numberOfProcessorsPerCache;
+ affinityDomains.domains = domains;
+ affinity_initialized = 1;
}
void
affinity_finalize()
{
- for ( int i=0; i < affinity_numberOfDomains; i++ )
+ if (affinity_initialized == 0)
+ {
+ return;
+ }
+ if (!affinityDomains.domains)
+ {
+ return;
+ }
+ for ( int i=0; i < affinityDomains.numberOfAffinityDomains; i++ )
{
- free(domains[i].processorList);
+ bdestroy(affinityDomains.domains[i].tag);
+ if (affinityDomains.domains[i].processorList != NULL)
+ {
+ free(affinityDomains.domains[i].processorList);
+ }
+ affinityDomains.domains[i].processorList = NULL;
+ }
+ if (affinityDomains.domains != NULL)
+ {
+ free(affinityDomains.domains);
}
- free(domains);
+ affinityDomains.domains = NULL;
+ affinity_numberOfDomains = 0;
+ affinityDomains.numberOfAffinityDomains = 0;
+ affinityDomains.numberOfSocketDomains = 0;
+ affinityDomains.numberOfNumaDomains = 0;
+ affinityDomains.numberOfProcessorsPerSocket = 0;
+ affinityDomains.numberOfCacheDomains = 0;
+ affinityDomains.numberOfCoresPerCache = 0;
+ affinityDomains.numberOfProcessorsPerCache = 0;
+ affinity_initialized = 0;
}
@@ -347,6 +460,20 @@ affinity_pinProcess(int processorId)
sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
}
+void
+affinity_pinProcesses(int cpu_count, int* processorIds)
+{
+ int i;
+ cpu_set_t cpuset;
+
+ CPU_ZERO(&cpuset);
+ for(i=0;i<cpu_count;i++)
+ {
+ CPU_SET(processorIds[i], &cpuset);
+ }
+ sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
+}
+
const AffinityDomain*
affinity_getDomain(bstring domain)
@@ -364,22 +491,24 @@ affinity_getDomain(bstring domain)
}
void
-affinity_printDomains(FILE* OUTSTREAM)
+affinity_printDomains()
{
- if (OUTSTREAM)
+ for ( int i=0; i < affinity_numberOfDomains; i++ )
{
- for ( int i=0; i < affinity_numberOfDomains; i++ )
- {
- fprintf(OUTSTREAM, "Domain %d:\n", i);
- fprintf(OUTSTREAM, "\tTag %s:", bdata(domains[i].tag));
+ printf("Domain %d:\n",i);
+ printf("\tTag %s:",bdata(domains[i].tag));
- for ( uint32_t j=0; j < domains[i].numberOfProcessors; j++ )
- {
- fprintf(OUTSTREAM, " %d", domains[i].processorList[j]);
- }
- fprintf(OUTSTREAM, "\n");
- fflush(OUTSTREAM);
+ for ( uint32_t j=0; j < domains[i].numberOfProcessors; j++ )
+ {
+ printf(" %d",domains[i].processorList[j]);
}
+ printf("\n");
}
}
+AffinityDomains_t
+get_affinityDomains(void)
+{
+ return &affinityDomains;
+}
+
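
The new affinity_pinProcesses() above is a thin wrapper around the standard Linux cpu_set_t API; a standalone sketch of the same idea follows. pin_to_cpus is an illustrative name, not a likwid function.

/* Minimal sketch: pin the calling process to a list of CPUs with
 * sched_setaffinity(), mirroring the pattern of affinity_pinProcesses(). */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

static int pin_to_cpus(const int *cpus, int count)
{
    cpu_set_t set;
    CPU_ZERO(&set);
    for (int i = 0; i < count; i++)
        CPU_SET(cpus[i], &set);
    /* 0 = current process; returns 0 on success, -1 with errno on failure */
    return sched_setaffinity(0, sizeof(cpu_set_t), &set);
}

int main(void)
{
    int cpus[] = { 0, 1 };
    if (pin_to_cpus(cpus, 2) != 0)
        perror("sched_setaffinity");
    else
        printf("pinned to CPUs 0 and 1\n");
    return 0;
}
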
diff --git a/src/allocator.c b/src/allocator.c
deleted file mode 100644
index 83e8164..0000000
--- a/src/allocator.c
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: allocator.c
- *
- * Description: Implementation of allocator module.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* ##### HEADER FILE INCLUDES ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#include <error.h>
-#include <types.h>
-#include <allocator.h>
-#include <affinity.h>
-
-/* ##### EXPORTED VARIABLES ########################################### */
-
-
-/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
-
-
-
-/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
-
-static int numberOfAllocatedVectors = 0;
-static void** allocations;
-
-
-/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
-
-
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-
-void
-allocator_init(int numVectors)
-{
- allocations = (void**) malloc(numVectors * sizeof(void*));
-}
-
-
-void
-allocator_finalize()
-{
- int i;
-
- for (i=0; i<numberOfAllocatedVectors; i++)
- {
- free(allocations[i]);
- }
-}
-
-void
-allocator_allocateVector(
- FILE* OUTSTREAM,
- void** ptr,
- int alignment,
- uint64_t size,
- int offset,
- DataType type,
- bstring domainString)
-{
- size_t bytesize = 0;
- const AffinityDomain* domain;
- int errorCode;
-
- switch ( type )
- {
- case SINGLE:
- case SINGLE_RAND:
- bytesize = (size+offset) * sizeof(float);
- break;
-
- case DOUBLE_RAND:
- case DOUBLE:
- bytesize = (size+offset) * sizeof(double);
- break;
- }
-
- errorCode = posix_memalign(ptr, alignment, bytesize);
-
- if (errorCode)
- {
- if (errorCode == EINVAL)
- {
- fprintf(stderr,
- "Alignment parameter is not a power of two\n");
- exit(EXIT_FAILURE);
- }
- if (errorCode == ENOMEM)
- {
- fprintf(stderr,
- "Insufficient memory to fulfill the request\n");
- exit(EXIT_FAILURE);
- }
- }
-
- if ((*ptr) == NULL)
- {
- fprintf(stderr, "posix_memalign failed!\n");
- exit(EXIT_FAILURE);
-
- }
-
- allocations[numberOfAllocatedVectors] = *ptr;
- numberOfAllocatedVectors++;
- domain = affinity_getDomain(domainString);
- affinity_pinProcess(domain->processorList[0]);
-
- if (OUTSTREAM)
- {
- fprintf(OUTSTREAM, "Allocate: Process running on core %d - Vector length %llu Offset %d\n",
- affinity_processGetProcessorId(),
- LLU_CAST size,
- offset);
- }
-
- switch ( type )
- {
- case SINGLE:
- {
- float* sptr = (float*) (*ptr);
- sptr += offset;
-
- for ( uint64_t i=0; i < size; i++ )
- {
- sptr[i] = 1.0;
- }
- *ptr = (void*) sptr;
-
- }
- break;
-
- case DOUBLE:
- {
- double* dptr = (double*) (*ptr);
- dptr += offset;
-
- for ( uint64_t i=0; i < size; i++ )
- {
- dptr[i] = 1.0;
- }
- *ptr = (void*) dptr;
- }
- break;
- case SINGLE_RAND:
- {
- srand((uint64_t)ptr);
- float* sptr = (float*) (*ptr);
- sptr += offset;
-
- for ( uint64_t i=0; i < size; i++ )
- {
- sptr[i] = rand()/((float)RAND_MAX)*2.0-1.0;
- }
- *ptr = (void*) sptr;
- }
- break;
- case DOUBLE_RAND:
- {
- srand((uint64_t)ptr);
- double* dptr = (double*) (*ptr);
- dptr += offset;
-
- for ( uint64_t i=0; i < size; i++ )
- {
- dptr[i] = rand()/((double)RAND_MAX)*2.0-1.0;
- }
- *ptr = (void*) dptr;
- }
- break;
-
- }
-}
-
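
The allocator removed here was built around posix_memalign for aligned benchmark vectors. Below is a minimal sketch of that call with arbitrary example sizes, for readers who have not used it; it does not reproduce the removed allocator_allocateVector() interface.

/* Minimal sketch: allocate a 64-byte aligned double vector and initialize it.
 * The length and alignment are arbitrary examples. */
#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    double *vec = NULL;
    size_t n = 1024;                                   /* arbitrary length */
    int err = posix_memalign((void **)&vec, 64, n * sizeof(double));
    if (err != 0 || vec == NULL)                       /* EINVAL or ENOMEM */
    {
        fprintf(stderr, "posix_memalign failed (%d)\n", err);
        return 1;
    }
    for (size_t i = 0; i < n; i++)
        vec[i] = 1.0;
    printf("first element: %f\n", vec[0]);
    free(vec);
    return 0;
}
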
diff --git a/src/applications/likwid-agent.lua b/src/applications/likwid-agent.lua
new file mode 100644
index 0000000..3f3e59a
--- /dev/null
+++ b/src/applications/likwid-agent.lua
@@ -0,0 +1,559 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ * Filename: likwid-agent.lua
+ *
+ * Description: A monitoring daemon for hardware performance counters.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+local likwid = require("likwid")
+
+local base_groupPath = "<INSTALLED_PREFIX>/share/likwid/mongroups"
+dconfig = {}
+dconfig["groupStrings"] ={}
+dconfig["groupData"] ={}
+dconfig["accessmode"] = 1
+dconfig["duration"] = 1
+dconfig["groupPath"] = ""
+dconfig["logPath"] = nil
+dconfig["logStyle"] = "log"
+dconfig["gmetric"] = false
+dconfig["gmetricPath"] = "gmetric"
+dconfig["gmetricConfig"] = nil
+dconfig["gmetricHasUnit"] = false
+dconfig["gmetricHasGroup"] = false
+dconfig["rrd"] = false
+dconfig["rrdPath"] = "."
+dconfig["syslog"] = false
+dconfig["syslogPrio"] = "local0.notice"
+dconfig["stdout"] = false
+
+rrdconfig = {}
+
+
+local function read_daemon_config(filename)
+ if filename == nil or filename == "" then
+ print("Not a valid config filename")
+ os.exit(1)
+ end
+ local f = io.open(filename, "r")
+ if f == nil then
+ print("Cannot open config file "..filename)
+ os.exit(1)
+ end
+ local t = f:read("*all")
+ f:close()
+
+ for i, line in pairs(likwid.stringsplit(t,"\n")) do
+
+ if not line:match("^#") then
+ if line:match("^GROUPPATH%a*") ~= nil then
+ local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+ table.remove(linelist, 1)
+ dconfig["groupPath"] = linelist[1]
+ end
+
+ if line:match("^EVENTSET%a*") ~= nil then
+ local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+ table.remove(linelist, 1)
+ for i=#linelist,0,-1 do
+ if linelist[i] == "" then
+ table.remove(linelist, i)
+ else
+ table.insert(dconfig["groupStrings"], linelist[i])
+ end
+ end
+ end
+
+ if line:match("^DURATION%a*") ~= nil then
+ local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+ table.remove(linelist, 1)
+ dconfig["duration"] = tonumber(linelist[1])
+ end
+
+ if line:match("^ACCESSMODE%a*") ~= nil then
+ local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+ table.remove(linelist, 1)
+ dconfig["accessmode"] = tonumber(linelist[1])
+ end
+
+ if line:match("^LOGPATH%a*") ~= nil then
+ local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+ table.remove(linelist, 1)
+ dconfig["logPath"] = linelist[1]
+ end
+
+ if line:match("^LOGSTYLE%a*") ~= nil then
+ local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+ table.remove(linelist, 1)
+ if linelist[1] ~= "log" and linelist[1] ~= "update" then
+ print("LOGSTYLE argument not valid, available are log and update. Fallback to log.")
+ else
+ dconfig["logStyle"] = linelist[1]
+ end
+ end
+
+ if line:match("^GMETRIC%s%a*") ~= nil then
+ local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+ table.remove(linelist, 1)
+ if linelist[1] == "True" then
+ dconfig["gmetric"] = true
+ end
+ end
+
+ if line:match("^GMETRICPATH%a*") ~= nil then
+ local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+ table.remove(linelist, 1)
+ dconfig["gmetricPath"] = linelist[1]
+ end
+
+ if line:match("^GMETRICCONFIG%a*") ~= nil then
+ local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+ table.remove(linelist, 1)
+ dconfig["gmetricConfig"] = linelist[1]
+ end
+
+ if line:match("^RRD%a*") ~= nil then
+ local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+ table.remove(linelist, 1)
+ if linelist[1] == "True" then
+ dconfig["rrd"] = true
+ end
+ end
+
+ if line:match("^RRDPATH%a*") ~= nil then
+ local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+ table.remove(linelist, 1)
+ dconfig["rrdPath"] = linelist[1]
+ end
+
+ if line:match("^SYSLOG%s%a*") ~= nil then
+ local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+ table.remove(linelist, 1)
+ if linelist[1] == "True" then
+ dconfig["syslog"] = true
+ end
+ end
+
+ if line:match("^SYSLOGPRIO%a*") ~= nil then
+ local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+ table.remove(linelist, 1)
+ dconfig["syslogPrio"] = linelist[1]
+ end
+ end
+ end
+end
+
+local function calc_sum(key, results)
+ local sum = 0.0
+ local numThreads = likwid.getNumberOfThreads()
+ for thread=1, numThreads do
+ sum = sum + results[thread][key]
+ end
+ return sum
+end
+
+local function calc_avg(key, results)
+ local sum = 0.0
+ local numThreads = likwid.getNumberOfThreads()
+ for thread=1, numThreads do
+ sum = sum + results[thread][key]
+ end
+ return sum/numThreads
+end
+
+local function calc_min(key, results)
+ local min = math.huge
+ local numThreads = likwid.getNumberOfThreads()
+ for thread=1, numThreads do
+ if results[thread][key] < min then
+ min = results[thread][key]
+ end
+ end
+ return min
+end
+
+local function calc_max(key, results)
+ local max = 0
+ local numThreads = likwid.getNumberOfThreads()
+ for thread=1, numThreads do
+ if results[thread][key] > max then
+ max = results[thread][key]
+ end
+ end
+ return max
+end
+
+local function check_logfile()
+ local g = os.execute("cd "..dconfig["logPath"], "r")
+ if g == false then
+ print("Logfile path".. dconfig["logPath"].. " does not exist.")
+ return false
+ end
+ return true
+end
+
+local function logfile(groupID, results)
+ open_function = "a"
+ if dconfig["logStyle"] == "update" then
+ open_function = "w"
+ end
+ filename = "likwid."..tostring(groupID)..".log"
+ local s,e = dconfig["groupData"][groupID]["GroupString"]:find(":")
+ if not s then
+ filename = "likwid."..dconfig["groupData"][groupID]["GroupString"]..".log"
+ end
+ local f = io.open(dconfig["logPath"].."/"..filename, open_function)
+ if f == nil then
+ print("Cannot open logfile ".. dconfig["logPath"].."/"..filename)
+ return
+ end
+ local timestamp = results["Timestamp"]
+ for k,v in pairs(results) do
+ if k ~= "Timestamp" then
+ f:write(timestamp..","..k:gsub("%(",""):gsub("%)","").. ","..v.."\n")
+ end
+ end
+ f:close()
+end
+
+local function check_logger()
+ cmd = "which logger"
+ local f = io.popen(cmd)
+ if f == nil then
+ return false
+ end
+ f:close()
+ return true
+end
+
+local function logger(results)
+ cmd = "logger -t LIKWID "
+ if dconfig["syslogPrio"] ~= nil then
+ cm = cmd .."-p "..dconfig["syslogPrio"].." "
+ end
+ local timestamp = results["Timestamp"]
+ for k,v in pairs(results) do
+ if k ~= "Timestamp" then
+ local resultcmd = cmd .. k:gsub("%(",""):gsub("%)","") .. " " ..v
+ local f = io.popen(resultcmd)
+ if f == nil then
+ print("Cannot use logger, maybe not in $PATH")
+ return
+ end
+ f:close()
+ end
+ end
+
+end
+
+local function check_gmetric()
+ if dconfig["gmetricPath"] == nil then
+ return false
+ end
+ local f = io.popen(dconfig["gmetricPath"].." -h","r")
+ if f == nil then
+ return false
+ end
+ local msg = f:read("*a")
+ if msg:match("units=") then
+ dconfig["gmetricHasUnit"] = true
+ end
+ if msg:match("group=") then
+ dconfig["gmetricHasGroup"] = true
+ end
+ f:close()
+ return true
+end
+
+local function gmetric(gdata, results)
+ execList = {}
+ if dconfig["gmetricPath"] == nil then
+ return
+ end
+ table.insert(execList, dconfig["gmetricPath"])
+ if dconfig["gmetricConfig"] ~= nil then
+ table.insert(execList, "-c")
+ table.insert(execList, dconfig["gmetricConfig"])
+ end
+ if dconfig["gmetricHasGroup"] and gdata["GroupString"] ~= gdata["EventString"] then
+ table.insert(execList, "-g")
+ table.insert(execList, gdata["GroupString"])
+ end
+ for k,v in pairs(results) do
+ local execStr = table.concat(execList, " ")
+ if k ~= "Timestamp" then
+ execStr = execStr .. " -t double "
+
+ local name = k
+ local unit = nil
+ local s,e = k:find("%[")
+ if s ~= nil then
+ name = k:sub(0,s-2):gsub("^%s*(.-)%s*$", "%1")
+ unit = k:sub(s+1,k:len()-1):gsub("^%s*(.-)%s*$", "%1")
+ end
+ execStr = execStr .. " --name=\"" .. name .."\""
+ if dconfig["gmetricHasUnit"] and unit ~= nil then
+ execStr = execStr .. " --units=\"" .. unit .."\""
+ end
+ local value = tonumber(v)
+ if v ~= nil and value ~= nil then
+ execStr = execStr .. " --value=\"" .. string.format("%f", value) .."\""
+ elseif v ~= nil then
+ execStr = execStr .. " --value=\"" .. tostring(v) .."\""
+ else
+ execStr = execStr .. " --value=\"0\""
+ end
+ os.execute(execStr)
+ end
+ end
+end
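+-- For a metric key like "Memory bandwidth [MBytes/s]" the assembled command is
+-- roughly: <gmetricPath> [-c <config>] [-g <group>] -t double
+--   --name="Memory bandwidth" --units="MBytes/s" --value="1234.560000"
+-- where the units flag is only added if the gmetric binary supports it.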
+
+local function normalize_rrd_string(str)
+ str = str:gsub(" ","_")
+ str = str:gsub("%(","")
+ str = str:gsub("%)","")
+ str = str:gsub("%[","")
+ str = str:gsub("%]","")
+ str = str:gsub("%/","")
+ str = str:sub(1,19)
+ return str
+end
+
+local function check_rrd()
+ local f = io.popen("rrdtool")
+ if f == nil then
+ return false
+ end
+ f:close()
+ return true
+end
+
+local function create_rrd(numGroups, duration, groupData)
+ local rrdname = dconfig["rrdPath"].."/".. groupData["GroupString"] .. ".rrd"
+ local rrdstring = "rrdtool create "..rrdname.." --step ".. tostring(numGroups*duration)
+ if rrdconfig[groupData["GroupString"]] == nil then
+ rrdconfig[groupData["GroupString"]] = {}
+ end
+    for i, metric in pairs(groupData["Metrics"]) do
+ rrdstring = rrdstring .. " DS"..":" .. normalize_rrd_string(metric["description"]) ..":GAUGE:"
+ rrdstring = rrdstring ..tostring(numGroups*duration) ..":0:U"
+ table.insert(rrdconfig[groupData["GroupString"]], metric["description"])
+ end
+ rrdstring = rrdstring .." RRA:AVERAGE:0.5:" .. tostring(60/duration)..":10"
+ rrdstring = rrdstring .." RRA:MIN:0.5:" .. tostring(60/duration)..":10"
+ rrdstring = rrdstring .." RRA:MAX:0.5:" .. tostring(60/duration)..":10"
+ --Average, min and max of hours of last day
+ rrdstring = rrdstring .." RRA:AVERAGE:0.5:" .. tostring(3600/duration)..":24"
+ rrdstring = rrdstring .." RRA:MIN:0.5:" .. tostring(3600/duration)..":24"
+ rrdstring = rrdstring .." RRA:MAX:0.5:" .. tostring(3600/duration)..":24"
+ --Average, min and max of day of last month
+ rrdstring = rrdstring .." RRA:AVERAGE:0.5:" .. tostring(86400/duration)..":31"
+ rrdstring = rrdstring .." RRA:MIN:0.5:" .. tostring(86400/duration)..":31"
+ rrdstring = rrdstring .." RRA:MAX:0.5:" .. tostring(86400/duration)..":31"
+ os.execute(rrdstring)
+end
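+-- The created RRD gets one GAUGE data source per metric (names shortened to
+-- 19 characters by normalize_rrd_string) and AVERAGE/MIN/MAX archives that
+-- roughly cover the last 10 minutes, the last 24 hours and the last 31 days.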
+
+local function rrd(groupData, results)
+ local rrdname = dconfig["rrdPath"].."/".. groupData["GroupString"] .. ".rrd"
+ local rrdstring = "rrdtool update "..rrdname.." N"
+ for i, id in pairs(rrdconfig[groupData["GroupString"]]) do
+ rrdstring = rrdstring .. ":" .. tostring(results[id])
+ end
+ os.execute(rrdstring)
+end
+
+-- Read commandline arguments
+if #arg ~= 1 then
+ print("Usage:")
+ print(arg[0] .. " <configFile>")
+ os.exit(1)
+end
+
+-- Get architectural information for the current system
+local cpuinfo = likwid.getCpuInfo()
+local cputopo = likwid.getCpuTopology()
+local affinity = likwid.getAffinityInfo()
+-- Read LIKWID configuration file, mainly to avoid topology lookup
+local config = likwid.getConfiguration()
+-- Read LIKWID daemon configuration file
+read_daemon_config(arg[1])
+
+-- Set force mode, we are monitoring exclusively
+likwid.setenv("LIKWID_FORCE","1")
+
+if dconfig["groupPath"] ~= "" then
+ likwid.setGroupPath(dconfig["groupPath"])
+else
+ dconfig["groupPath"] = base_groupPath
+end
+
+if #dconfig["groupStrings"] == 0 then
+ print("No monitoring groups defined, exiting...")
+ os.exit(1)
+end
+if dconfig["duration"] == 0 then
+ print("Invalid value 0 for duration. Sanitizing to 1 second.")
+ dconfig["duration"] = 1
+end
+
+if dconfig["syslog"] then
+ if check_logger() == false then
+ print("Cannot find tool logger, disabling syslog output.")
+ dconfig["syslog"] = false
+ end
+end
+if dconfig["logPath"] then
+ if check_logfile() == false then
+ print("Cannot create logfile path "..dconfig["logPath"]..". Deactivating logfile output.")
+ dconfig["logPath"] = nil
+ end
+end
+if dconfig["gmetric"] then
+ if check_gmetric() == false then
+ print("Cannot find gmetric using path "..dconfig["gmetricPath"]..". Deactivating gmetric output.")
+ dconfig["gmetric"] = false
+ end
+end
+if dconfig["rrd"] then
+ if check_rrd() == false then
+ print("Cannot find rrdtool. Deactivating rrd output.")
+ dconfig["rrd"] = false
+ end
+end
+
+-- Activate output to stdout only if no other backend is set
+if dconfig["logPath"] == nil and dconfig["rrd"] == false and dconfig["gmetric"] == false and dconfig["syslog"] == false then
+ dconfig["stdout"] = true
+end
+
+-- Add all cpus to the cpulist
+local cpulist = {}
+for i=0, cputopo["numHWThreads"]-1 do
+ table.insert(cpulist, cputopo["threadPool"][i]["apicId"])
+end
+
+-- Select access mode to msr devices, try configuration file first
+access_mode = dconfig["accessmode"]
+if access_mode < 0 or access_mode > 1 then
+ access_mode = 1
+end
+if likwid.setAccessClientMode(access_mode) ~= 0 then
+ os.exit(1)
+end
+
+-- Select group directory for monitoring
+likwid.groupfolder = dconfig["groupPath"]
+
+power = likwid.getPowerInfo()
+-- Initialize likwid perfctr
+likwid.init(cputopo["numHWThreads"], cpulist)
+for k,v in pairs(dconfig["groupStrings"]) do
+ local groupID = likwid.addEventSet(v)
+ table.insert(dconfig["groupData"], groupID, v)
+ if dconfig["rrd"] then
+ create_rrd(#dconfig["groupStrings"], dconfig["duration"], v)
+ end
+end
+
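+-- Main monitoring loop: each group is measured in turn for 'duration' seconds
+-- (setupCounters/startCounters/sleep/stopCounters), the metrics of the last
+-- interval are reduced according to their AVG/SUM/MIN/MAX/ONCE prefix and
+-- passed to the enabled backends (logfile, syslog, gmetric, rrd, stdout).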
+likwid.catchSignal()
+while likwid.getSignalState() == 0 do
+
+ for groupID,gname in pairs(dconfig["groupData"]) do
+        local old_mtime = likwid.getRuntimeOfGroup(groupID)
+ local cur_time = os.time()
+ likwid.setupCounters(groupID)
+
+ -- Perform the measurement
+ likwid.startCounters()
+ likwid.sleep(dconfig["duration"] * 1E6)
+ likwid.stopCounters()
+
+
+ if likwid.getNumberOfMetrics(groupID) > 0 then
+ local threadOutput = {}
+ for i=1, likwid.getNumberOfMetrics(groupID) do
+ local metricdesc = likwid.getNameOfMetric(groupID, i)
+ for thread=1, likwid.getNumberOfThreads() do
+ if threadOutput[thread] == nil then
+ threadOutput[thread] = {}
+ end
+ --local result = likwid.calculate_metric(metric["formula"], threadResults[thread])
+ threadOutput[thread][metricdesc] = likwid.getLastMetric(groupID, i, thread)
+ end
+ end
+ output = {}
+ output["Timestamp"] = os.date("%m/%d/%Y_%X",cur_time)
+ for i=1, likwid.getNumberOfMetrics(groupID) do
+ local metricdesc = likwid.getNameOfMetric(groupID, i)
+ itemlist = likwid.stringsplit(metricdesc, "%s+", nil, "%s+")
+ func = itemlist[1]
+ table.remove(itemlist, 1)
+ desc = table.concat(itemlist," ")
+ if func == "AVG" then
+ output[metricdesc:gsub(" ","_")] = calc_avg(metricdesc, threadOutput)
+ elseif func == "SUM" then
+ output[metricdesc:gsub(" ","_")] = calc_sum(metricdesc, threadOutput)
+ elseif func == "MIN" then
+ output[metricdesc:gsub(" ","_")] = calc_min(metricdesc, threadOutput)
+ elseif func == "MAX" then
+ output[metricdesc:gsub(" ","_")] = calc_max(metricdesc, threadOutput)
+ elseif func == "ONCE" then
+ output[metricdesc:gsub(" ","_")] = threadOutput[1][metricdesc]
+ else
+ for thread=1, likwid.getNumberOfThreads() do
+ output["T"..cpulist[thread] .. "_" .. metricdesc] = threadOutput[thread][metricdesc]
+ end
+ end
+ end
+ if dconfig["logPath"] ~= nil then
+ logfile(groupID, output)
+ end
+ if dconfig["syslog"] then
+ logger(output)
+ end
+            if dconfig["gmetric"] then
+                gmetric(dconfig["groupData"][groupID], output)
+            end
+            if dconfig["rrd"] then
+                rrd(dconfig["groupData"][groupID], output)
+            end
+ if dconfig["stdout"] then
+ for i,o in pairs(output) do
+ print(i,o)
+ end
+ print(likwid.hline)
+ end
+ end
+ end
+end
+
+-- Finalize likwid perfctr
+likwid.catchSignal()
+likwid.finalize()
+likwid.putConfiguration()
+likwid.putTopology()
diff --git a/src/applications/likwid-bench.c b/src/applications/likwid-bench.c
deleted file mode 100644
index 15f6f0d..0000000
--- a/src/applications/likwid-bench.c
+++ /dev/null
@@ -1,536 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: likwid-bench.c
- *
- * Description: A flexible and extensible benchmarking toolbox
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <errno.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <bstrlib.h>
-#include <types.h>
-#include <error.h>
-#include <cpuid.h>
-#include <numa.h>
-#include <affinity.h>
-#include <timer.h>
-#include <threads.h>
-#include <barrier.h>
-#include <testcases.h>
-#include <strUtil.h>
-#include <allocator.h>
-
-#include <likwid.h>
-#ifdef PAPI
-#include <papi.h>
-#include <omp.h>
-#endif
-
-extern void* runTest(void* arg);
-
-/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
-
-#define HELP_MSG \
- fprintf(stdout, "Threaded Memory Hierarchy Benchmark -- Version %d.%d \n\n",VERSION,RELEASE); \
- fprintf(stdout, "\n"); \
- fprintf(stdout, "Supported Options:\n"); \
- fprintf(stdout, "-h\t Help message\n"); \
- fprintf(stdout, "-v\t Version information\n"); \
- fprintf(stdout, "-q\t Silent without output\n"); \
- fprintf(stdout, "-a\t list available benchmarks \n"); \
- fprintf(stdout, "-p\t list available thread domains\n"); \
- fprintf(stdout, "-l <TEST>\t list properties of benchmark \n"); \
- fprintf(stdout, "-i <INT>\t number of iterations \n"); \
- fprintf(stdout, "-g <INT>\t number of workgroups (mandatory)\n"); \
- fprintf(stdout, "-t <TEST>\t type of test \n"); \
- fprintf(stdout, "-w\t <thread_domain>:<size>[:<num_threads>[:<chunk size>:<stride>][-<streamId>:<domain_id>[:<offset>]], size in kB, MB or GB (mandatory)\n"); \
- fprintf(stdout, "Processors are in compact ordering. Optionally every stream can be placed. Either no stream or all streams must be placed. Multiple streams are separated by commas.\n"); \
- fprintf(stdout, "Usage: likwid-bench -t copy -i 1000 -g 1 -w S0:100kB:10:1:2 \n"); \
- fprintf(stdout, "\tRun 10 threads on socket 0 using physical cores only (presuming SMT2 system).\n"); \
- fprintf(stdout, "Example with data placement: likwid-bench -t copy -i 1000 -g 1 -w S0:100kB:20-0:S1,1:S1 \n"); \
- fprintf(stdout, "\tRun 20 threads on socket 0 and place both arrays of the copy test case on socket 1.\n"); \
- fflush(stdout);
-
-#define VERSION_MSG \
- fprintf(stdout, "likwid-bench %d.%d \n\n",VERSION,RELEASE); \
- fflush(stdout);
-
-/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ############ */
-
-void copyThreadData(ThreadUserData* src,ThreadUserData* dst)
-{
- uint32_t i;
-
- *dst = *src;
- dst->processors = (int*) malloc(src->numberOfThreads*sizeof(int));
- dst->streams = (void**) malloc(src->test->streams*sizeof(void*));
-
- for (i=0; i< src->test->streams; i++)
- {
- dst->streams[i] = src->streams[i];
- }
-
- for (i=0; i<src->numberOfThreads; i++)
- {
- dst->processors[i] = src->processors[i];
- }
-}
-
-
-
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-
-int main(int argc, char** argv)
-{
- int iter = 100;
- uint32_t i;
- uint32_t j;
- int globalNumberOfThreads = 0;
- int optPrintDomains = 0;
- int c;
- ThreadUserData myData;
- bstring testcase = bfromcstr("none");
- uint32_t numberOfWorkgroups = 0;
- int tmp = 0;
- double time;
- const TestCase* test = NULL;
- Workgroup* currentWorkgroup = NULL;
- Workgroup* groups = NULL;
- FILE* OUTSTREAM = stdout;
-
- if (cpuid_init() == EXIT_FAILURE)
- {
- ERROR_PLAIN_PRINT(Unsupported processor!);
- }
- numa_init();
- affinity_init();
-
- /* Handling of command line options */
- if (argc == 1)
- {
- HELP_MSG;
- affinity_finalize();
- exit(EXIT_SUCCESS);
- }
- opterr = 0;
- while ((c = getopt (argc, argv, "g:w:t:i:l:aphvq")) != -1) {
- switch (c)
- {
- case 'h':
- HELP_MSG;
- affinity_finalize();
- if (groups)
- {
- free(groups);
- }
- exit (EXIT_SUCCESS);
- case 'v':
- VERSION_MSG;
- affinity_finalize();
- if (groups)
- {
- free(groups);
- }
- exit (EXIT_SUCCESS);
- case 'a':
- if (OUTSTREAM)
- {
- fprintf(OUTSTREAM, TESTS"\n");
- fflush(OUTSTREAM);
- }
- affinity_finalize();
- if (groups)
- {
- free(groups);
- }
- exit (EXIT_SUCCESS);
- case 'q':
- OUTSTREAM = NULL;
- break;
- case 'w':
- tmp--;
-
- if (tmp == -1)
- {
- fprintf (stderr, "More workgroups configured than allocated!\n"
- "Did you forget to set the number of workgroups with -g?\n");
- affinity_finalize();
- if (groups)
- {
- free(groups);
- }
- return EXIT_FAILURE;
- }
- if (!test)
- {
- fprintf (stderr, "You need to specify a test case first!\n");
- affinity_finalize();
- if (groups)
- {
- free(groups);
- }
- return EXIT_FAILURE;
- }
- testcase = bfromcstr(optarg);
- currentWorkgroup = groups+tmp; /*FIXME*/
- bstr_to_workgroup(currentWorkgroup, testcase, test->type, test->streams);
- bdestroy(testcase);
-
- for (i=0; i< test->streams; i++)
- {
- if (currentWorkgroup->streams[i].offset%test->stride)
- {
- fprintf (stderr, "Stream %d: offset is not a multiple of stride!\n",i);
- affinity_finalize();
- if (groups)
- {
- free(groups);
- }
- return EXIT_FAILURE;
- }
-
- allocator_allocateVector(OUTSTREAM,
- &(currentWorkgroup->streams[i].ptr),
- PAGE_ALIGNMENT,
- currentWorkgroup->size,
- currentWorkgroup->streams[i].offset,
- test->type,
- currentWorkgroup->streams[i].domain);
- }
-
- break;
- case 'i':
- iter = atoi(optarg);
- if (iter <= 0)
- {
- fprintf(stderr, "Iterations must be greater than 0.\n");
- exit(EXIT_FAILURE);
- }
- break;
- case 'l':
- testcase = bfromcstr(optarg);
- for (i=0; i<NUMKERNELS; i++)
- {
- if (biseqcstr(testcase, kernels[i].name))
- {
- test = kernels+i;
- break;
- }
- }
-
- if (biseqcstr(testcase,"none") || !test)
- {
- fprintf (stderr, "Unknown test case %s\n",optarg);
- if (OUTSTREAM)
- {
- fprintf(OUTSTREAM, "Available test cases:\n");
- fprintf(OUTSTREAM, TESTS"\n");
- fflush(OUTSTREAM);
- }
- affinity_finalize();
- if (groups)
- {
- free(groups);
- }
- return EXIT_FAILURE;
- }
- else
- {
- if (OUTSTREAM)
- {
- fprintf(OUTSTREAM, "Name: %s\n",test->name);
- fprintf(OUTSTREAM, "Number of streams: %d\n",test->streams);
- fprintf(OUTSTREAM, "Loop stride: %d\n",test->stride);
- fprintf(OUTSTREAM, "Flops: %d\n", (int) test->flops);
- fprintf(OUTSTREAM, "Bytes: %d\n",test->bytes);
- switch (test->type)
- {
- case SINGLE:
- fprintf(OUTSTREAM, "Data Type: Single precision float\n");
- break;
- case DOUBLE:
- fprintf(OUTSTREAM, "Data Type: Double precision float\n");
- break;
- }
- fflush(OUTSTREAM);
- }
- }
- bdestroy(testcase);
- affinity_finalize();
- if (groups)
- {
- free(groups);
- }
- exit (EXIT_SUCCESS);
-
- break;
- case 'p':
- optPrintDomains = 1;
- break;
- case 'g':
- numberOfWorkgroups = atoi(optarg);
- if (numberOfWorkgroups <= 0)
- {
- fprintf(stderr, "Number of Workgroups must be 1 or greater.\n");
- exit(EXIT_FAILURE);
- }
- allocator_init(numberOfWorkgroups * MAX_STREAMS);
- tmp = numberOfWorkgroups;
- groups = (Workgroup*) malloc(numberOfWorkgroups*sizeof(Workgroup));
- break;
- case 't':
- testcase = bfromcstr(optarg);
-
- for (i=0; i<NUMKERNELS; i++)
- {
- if (biseqcstr(testcase, kernels[i].name))
- {
- test = kernels+i;
- break;
- }
- }
- if (biseqcstr(testcase,"none"))
- {
- fprintf (stderr, "Unknown test case %s\n",optarg);
- affinity_finalize();
- if (groups)
- {
- free(groups);
- }
- return EXIT_FAILURE;
- }
- bdestroy(testcase);
- break;
- case '?':
- if (optopt == 'l' || optopt == 'g' || optopt == 'w' ||
- optopt == 't' || optopt == 'i')
- fprintf (stderr, "Option `-%c' requires an argument.\n", optopt);
- else if (isprint (optopt))
- fprintf (stderr, "Unknown option `-%c'.\n", optopt);
- else
- fprintf (stderr,
- "Unknown option character `\\x%x'.\n",
- optopt);
- affinity_finalize();
- if (groups)
- {
- free(groups);
- }
- return EXIT_FAILURE;
- default:
- HELP_MSG;
- }
- }
-
- if (numberOfWorkgroups == 0 && !optPrintDomains)
- {
- fprintf(stderr, "Number of Workgroups must be 1 or greater.\n");
- affinity_finalize();
- allocator_finalize();
- if (groups)
- {
- free(groups);
- }
- exit(EXIT_FAILURE);
- }
- if (tmp > 0 && iter > 0)
- {
- fprintf(stderr, "%d workgroups requested but only %d given on commandline\n",numberOfWorkgroups,numberOfWorkgroups-tmp);
- affinity_finalize();
- allocator_finalize();
- if (groups)
- {
- free(groups);
- }
- exit(EXIT_FAILURE);
- }
- if (iter <= 0)
- {
- fprintf(stderr,"Iterations must be greater than 0\n");
- affinity_finalize();
- allocator_finalize();
- if (groups)
- {
- free(groups);
- }
- exit(EXIT_FAILURE);
- }
- if (test && !(currentWorkgroup || groups))
- {
- fprintf(stderr, "Workgroups must be set on commandline\n");
- affinity_finalize();
- allocator_finalize();
- if (groups)
- {
- free(groups);
- }
- exit(EXIT_FAILURE);
- }
-
- if (optPrintDomains)
- {
- affinity_printDomains(OUTSTREAM);
- affinity_finalize();
- allocator_finalize();
- if (groups)
- {
- free(groups);
- }
- exit (EXIT_SUCCESS);
- }
- timer_init();
-
- /* :WARNING:05/04/2010 08:58:05 AM:jt: At the moment the thread
- * module only allows equally sized thread groups*/
- for (i=0; i<numberOfWorkgroups; i++)
- {
- globalNumberOfThreads += groups[i].numberOfThreads;
- }
-
- threads_init(OUTSTREAM, globalNumberOfThreads);
- threads_createGroups(numberOfWorkgroups);
-
- /* we configure global barriers only */
- barrier_init(1);
- barrier_registerGroup(globalNumberOfThreads);
-
-#ifdef PERFMON
- if (OUTSTREAM)
- {
- fprintf(OUTSTREAM, "Using likwid\n");
- fflush(OUTSTREAM);
- }
- likwid_markerInit();
-#endif
-#ifdef PAPI
- if (OUTSTREAM)
- {
- fprintf(OUTSTREAM, "Using PAPI\n");
- }
- PAPI_library_init (PAPI_VER_CURRENT);
- PAPI_thread_init((unsigned long (*)(void))(omp_get_thread_num));
-#endif
-
-
- /* initialize data structures for threads */
- for (i=0; i<numberOfWorkgroups; i++)
- {
- myData.iter = iter;
- myData.size = groups[i].size;
- myData.test = test;
- myData.numberOfThreads = groups[i].numberOfThreads;
- myData.processors = (int*) malloc(myData.numberOfThreads * sizeof(int));
- myData.streams = (void**) malloc(test->streams * sizeof(void*));
-
- for (j=0; j<groups[i].numberOfThreads; j++)
- {
- myData.processors[j] = groups[i].processorIds[j];
- }
-
- for (j=0; j< test->streams; j++)
- {
- myData.streams[j] = groups[i].streams[j].ptr;
- }
- threads_registerDataGroup(i, &myData, copyThreadData);
-
- free(myData.processors);
- free(myData.streams);
- }
-
- if (OUTSTREAM)
- {
- fprintf(OUTSTREAM, HLINE);
- fprintf(OUTSTREAM, "LIKWID MICRO BENCHMARK\n");
- fprintf(OUTSTREAM, "Test: %s\n",test->name);
- fprintf(OUTSTREAM, HLINE);
- fprintf(OUTSTREAM, "Using %d work groups\n",numberOfWorkgroups);
- fprintf(OUTSTREAM, "Using %d threads\n",globalNumberOfThreads);
- fprintf(OUTSTREAM, HLINE);
- fflush(OUTSTREAM);
- }
-
- threads_create(runTest);
- threads_join();
- allocator_finalize();
-
- uint32_t realSize = 0;
- uint64_t realCycles = 0;
- int current_id = 0;
-
- if (OUTSTREAM)
- {
- fprintf(OUTSTREAM, HLINE);
- for(j=0;j<numberOfWorkgroups;j++)
- {
- current_id = j*groups[j].numberOfThreads;
- realCycles += threads_data[current_id].cycles;
- realSize += groups[j].numberOfThreads * threads_data[current_id].data.size;
- }
- time = (double) realCycles / (double) timer_getCpuClock();
- fprintf(OUTSTREAM, "Cycles: %llu \n", LLU_CAST realCycles);
- fprintf(OUTSTREAM, "Iterations: %llu \n", LLU_CAST iter);
- fprintf(OUTSTREAM, "Size %d \n", realSize );
- fprintf(OUTSTREAM, "Vectorlength: %llu \n", LLU_CAST threads_data[current_id].data.size);
- fprintf(OUTSTREAM, "Time: %e sec\n", time);
- fprintf(OUTSTREAM, "Number of Flops: %llu \n", LLU_CAST (iter * realSize * test->flops));
- fprintf(OUTSTREAM, "MFlops/s: %.2f\n",
- 1.0E-06 * ((double) iter * realSize * test->flops/ time));
- fprintf(OUTSTREAM, "MByte/s: %.2f\n",
- 1.0E-06 * ( (double) iter * realSize * test->bytes/ time));
- fprintf(OUTSTREAM, "Cycles per update: %f\n",
- ((double) realCycles / (double) (iter * numberOfWorkgroups * threads_data[current_id].numberOfThreads * threads_data[current_id].data.size)));
-
- switch ( test->type )
- {
- case SINGLE:
- fprintf(OUTSTREAM, "Cycles per cacheline: %f\n",
- (16.0 * (double) realCycles / (double) (iter * realSize)));
- break;
- case DOUBLE:
- fprintf(OUTSTREAM, "Cycles per cacheline: %f\n",
- (8.0 * (double) realCycles / (double) (iter * realSize)));
- break;
- }
-
- fprintf(OUTSTREAM, HLINE);
- fflush(OUTSTREAM);
- }
- threads_destroy(numberOfWorkgroups);
- barrier_destroy();
-
- affinity_finalize();
-#ifdef PERFMON
- likwid_markerClose();
-#endif
-
- return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-features.c b/src/applications/likwid-features.c
deleted file mode 100644
index 6fe5477..0000000
--- a/src/applications/likwid-features.c
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: likwid-features.c
- *
- * Description: An application to read out and set the feature flag
- * register on Intel Core 2 processors.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <types.h>
-#include <error.h>
-#include <strUtil.h>
-#include <accessClient.h>
-#include <msr.h>
-#include <cpuid.h>
-#include <cpuFeatures.h>
-
-#define HELP_MSG \
- fprintf(stdout, "\nlikwid-features -- Version %d.%d \n\n",VERSION,RELEASE); \
- fprintf(stdout, "A tool to print and toggle the feature flag msr on Intel CPUS.\n"); \
- fprintf(stdout, "Supported Features: HW_PREFETCHER, CL_PREFETCHER, DCU_PREFETCHER, IP_PREFETCHER.\n\n"); \
- fprintf(stdout, "Options:\n"); \
- fprintf(stdout, "-h\t Help message\n"); \
- fprintf(stdout, "-v\t Version information\n"); \
- fprintf(stdout, "-s <FEATURE>\t set cpu feature \n"); \
- fprintf(stdout, "-u <FEATURE>\t unset cpu feature \n"); \
- fprintf(stdout, "-c <ID>\t core id\n\n"); \
- fflush(stdout);
-
-#define VERSION_MSG \
- fprintf(stdout, "likwid-features %d.%d \n\n",VERSION,RELEASE); \
- fflush(stdout);
-
-int main (int argc, char** argv)
-{
- int socket_fd = -1;
- int optSetFeature = 0;
- int cpuId = 0;
- int c;
- bstring argString;
- CpuFeature feature = HW_PREFETCHER ;
-
- while ((c = getopt (argc, argv, "c:s:u:hv")) != -1)
- {
- switch (c)
- {
- case 'h':
- HELP_MSG;
- exit (EXIT_SUCCESS);
- case 'v':
- VERSION_MSG;
- exit (EXIT_SUCCESS);
- case 'u':
- optSetFeature = 2;
- case 's':
- if (! (argString = bSecureInput(40,optarg)))
- {
- fprintf(stderr,"Failed to read argument string!\n");
- exit(EXIT_FAILURE);
- }
-
- if (biseqcstr(argString,"HW_PREFETCHER"))
- {
- feature = HW_PREFETCHER;
- }
- else if (biseqcstr(argString,"CL_PREFETCHER"))
- {
- feature = CL_PREFETCHER;
- }
- else if (biseqcstr(argString,"DCU_PREFETCHER"))
- {
- feature = DCU_PREFETCHER;
- }
- else if (biseqcstr(argString,"IP_PREFETCHER"))
- {
- feature = IP_PREFETCHER;
- }
- else
- {
- fprintf(stderr,"Feature not supported!\n");
- exit(EXIT_FAILURE);
- }
-
-
- if (!optSetFeature)
- {
- optSetFeature = 1;
- }
- break;
- case 'c':
- if (! (argString = bSecureInput(20,optarg)))
- {
- fprintf(stderr,"Failed to read argument string!\n");
- exit(EXIT_FAILURE);
- }
-
- cpuId = str2int((char*) argString->data);
-
- break;
- case '?':
- if (isprint (optopt))
- {
- fprintf (stderr, "Unknown option `-%c'.\n", optopt);
- }
- else
- {
- fprintf (stderr,
- "Unknown option character `\\x%x'.\n",
- optopt);
- }
- return EXIT_FAILURE;
- default:
- HELP_MSG;
- exit (EXIT_SUCCESS);
- }
- }
-
- if (cpuid_init() == EXIT_FAILURE)
- {
- ERROR_PLAIN_PRINT(Unsupported processor!);
- }
-
- fprintf(stdout, HLINE);
- fprintf(stdout, "CPU name:\t%s \n",cpuid_info.name);
- fprintf(stdout, "CPU core id:\t%d \n", cpuId);
- fflush(stdout);
-
- if (cpuid_info.family != P6_FAMILY)
- {
- fprintf (stderr, "likwid-features only supports Intel P6 based processors!\n");
- exit(EXIT_FAILURE);
- }
-
- if (cpuId >= (int) cpuid_topology.numHWThreads)
- {
- fprintf (stderr, "This processor has only %d HWthreads! \n",cpuid_topology.numHWThreads);
- exit(EXIT_FAILURE);
- }
-
- accessClient_init(&socket_fd);
- msr_init(socket_fd);
- cpuFeatures_init(cpuId);
- cpuFeatures_print(cpuId);
-
- if (optSetFeature == 1)
- {
- fprintf(stdout, SLINE);
- cpuFeatures_enable(cpuId, feature);
- fprintf(stdout, SLINE);
- }
- else if (optSetFeature == 2)
- {
- fprintf(stdout, SLINE);
- cpuFeatures_disable(cpuId, feature);
- fprintf(stdout, SLINE);
- }
- fflush(stdout);
-
- msr_finalize();
- return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-features.lua b/src/applications/likwid-features.lua
new file mode 100644
index 0000000..37d765d
--- /dev/null
+++ b/src/applications/likwid-features.lua
@@ -0,0 +1,191 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ * Filename: likwid-features.lua
+ *
+ *      Description:  An application to retrieve and manipulate CPU features.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+function version()
+ print(string.format("likwid-features -- Version %d.%d",likwid.version,likwid.release))
+end
+
+function usage()
+ version()
+    print("A tool to list and modify the state of CPU features.\n")
+ print("Options:")
+ print("-h, --help\t\t Help message")
+ print("-v, --version\t\t Version information")
+ print("-a, --all\t\t List all available features")
+ print("-l, --list\t\t List features and state for given CPUs")
+ print("-c, --cpus <list>\t Perform operations on given CPUs")
+ print("-e, --enable <list>\t List of features that should be enabled")
+ print("-d, --disable <list>\t List of features that should be disabled")
+ print()
+ print("Currently modifiable features:")
+ print("HW_PREFETCHER, CL_PREFETCHER, DCU_PREFETCHER, IP_PREFETCHER")
+end
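+-- Typical invocations (illustrative):
+--   likwid-features -l -c 0-3              list the feature state of CPUs 0-3
+--   likwid-features -e HW_PREFETCHER -c 0  enable the hardware prefetcher on CPU 0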
+
+if #arg == 0 then
+ usage()
+ os.exit(0)
+end
+
+listFeatures = false
+num_cpus = 0
+cpulist = {}
+enableList = {}
+disableList = {}
+skipList = {}
+
+for opt,arg in likwid.getopt(arg, {"h","v","l","c:","e:","d:","a","help","version","list", "enable:", "disable:","all", "cpus:"}) do
+ if (type(arg) == "string") then
+ local s,e = arg:find("-");
+ if s == 1 then
+            print(string.format("Argument %s to option -%s starts with an invalid character -.", arg, opt))
+ print("Did you forget an argument to an option?")
+ os.exit(1)
+ end
+ end
+ if opt == "h" or opt == "help" then
+ usage()
+ os.exit(0)
+ elseif opt == "v" or opt == "version" then
+ version()
+ os.exit(0)
+ elseif opt == "c" or opt == "cpus"then
+ num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+ elseif opt == "l" or opt == "list" then
+ listFeatures = true
+ elseif opt == "a" or opt == "all" then
+ print("Available features:")
+ for i=0,likwid.tablelength(likwid.cpuFeatures)-1 do
+ if likwid.cpuFeatures[i]:match("PREFETCHER") then
+ print(string.format("\t%s*",likwid.cpuFeatures[i]))
+ else
+ print(string.format("\t%s",likwid.cpuFeatures[i]))
+ end
+ end
+ print("Modifiable features are marked with *")
+ os.exit(0)
+ elseif opt == "e" or opt == "enable" then
+ local tmp = likwid.stringsplit(arg, ",")
+ for i, f in pairs(tmp) do
+ for i=0,likwid.tablelength(likwid.cpuFeatures)-1 do
+ if likwid.cpuFeatures[i] == f then
+ table.insert(enableList, i)
+ end
+ end
+ end
+ elseif opt == "d" or opt == "disable" then
+ local tmp = likwid.stringsplit(arg, ",")
+ for i, f in pairs(tmp) do
+ for i=0,likwid.tablelength(likwid.cpuFeatures)-1 do
+ if likwid.cpuFeatures[i] == f then
+ table.insert(disableList, i)
+ end
+ end
+ end
+ elseif opt == "?" then
+ print("Invalid commandline option -"..arg)
+ os.exit(1)
+ elseif opt == "!" then
+ print("Option requires an argument")
+ os.exit(1)
+ end
+end
+
+likwid.initCpuFeatures()
+
+if listFeatures and #cpulist > 0 then
+ local str = "Feature"..string.rep(" ",string.len("BRANCH_TRACE_STORAGE")-string.len("Feature")+2)
+ for j, c in pairs(cpulist) do
+ str = str..string.format("CPU %d\t",c)
+ end
+ print(str)
+ str = ""
+ for i=0,likwid.tablelength(likwid.cpuFeatures)-1 do
+ str = likwid.cpuFeatures[i]..string.rep(" ",string.len("BRANCH_TRACE_STORAGE")-string.len(likwid.cpuFeatures[i])+2)
+ for j, c in pairs(cpulist) do
+ if (likwid.getCpuFeatures(c, i) == 1) then
+ str = str .. "on\t"
+ else
+ str = str .. "off\t"
+ end
+ end
+ print(str)
+ end
+elseif #cpulist == 0 then
+    print("No CPUs given, specify CPUs with -c to list or modify feature states")
+ os.exit(1)
+end
+
+if #enableList > 0 and #disableList > 0 then
+ for i,e in pairs(enableList) do
+ for j, d in pairs(disableList) do
+ if (e == d) then
+                print(string.format("Feature %s is in both the enable and disable list, skipping it", likwid.cpuFeatures[e]))
+ table.insert(skipList, e)
+ end
+ end
+ end
+ for i, s in pairs(skipList) do
+ for j, e in pairs(enableList) do
+ if (s == e) then table.remove(enableList, j) end
+ end
+ for j, e in pairs(disableList) do
+ if (s == e) then table.remove(disableList, j) end
+ end
+ end
+end
+
+if #enableList > 0 then
+ for i, c in pairs(cpulist) do
+ for j, f in pairs(enableList) do
+ local ret = likwid.enableCpuFeatures(c, f, 1)
+ if ret == 0 then
+ print(string.format("Enabled %s for CPU %d", likwid.cpuFeatures[f], c))
+ else
+ print(string.format("Failed %s for CPU %d", likwid.cpuFeatures[f], c))
+ end
+ end
+ end
+end
+if #disableList > 0 then
+ for i, c in pairs(cpulist) do
+ for j, f in pairs(disableList) do
+ local ret = likwid.disableCpuFeatures(c, f, 1)
+ if ret == 0 then
+ print(string.format("Disabled %s for CPU %d", likwid.cpuFeatures[f], c))
+ else
+ print(string.format("Failed %s for CPU %d", likwid.cpuFeatures[f], c))
+ end
+ end
+ end
+end
diff --git a/src/applications/likwid-genCfg.c b/src/applications/likwid-genCfg.c
deleted file mode 100644
index 97147fd..0000000
--- a/src/applications/likwid-genCfg.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: likwid-genCfg.c
- *
- * Description: An application to dump the cpu topology information to
- * a config file.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <types.h>
-#include <error.h>
-#include <cpuid.h>
-
-/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
-
-#define HELP_MSG \
- fprintf(stdout, "\nlikwid-genCfg -- Version %d.%d \n\n",VERSION,RELEASE); \
- fprintf(stdout, "A tool to dump node topology information into a file.\n"); \
- fprintf(stdout, "Options:\n"); \
- fprintf(stdout, "-h\t Help message\n"); \
- fprintf(stdout, "-v\t Version information\n"); \
- fprintf(stdout, "-o\t output file path (optional)\n\n"); \
- fflush(stdout);
-
-#define VERSION_MSG \
- fprintf(stdout, "likwid-genCfg %d.%d \n\n",VERSION,RELEASE); \
- fflush(stdout);
-
-
-int main (int argc, char** argv)
-{
- FILE *file;
- char *filepath = TOSTRING(CFGFILE);
- size_t size;
- int c;
-
- while ((c = getopt (argc, argv, "ho:v")) != -1)
- {
- switch (c)
- {
- case 'h':
- HELP_MSG;
- exit (EXIT_SUCCESS);
- case 'o':
- filepath = optarg;
- break;
- case 'v':
- VERSION_MSG;
- exit (EXIT_SUCCESS);
- case '?':
- if (isprint (optopt))
- {
- fprintf (stderr, "Unknown option `-%c'.\n", optopt);
- }
- else
- {
- fprintf (stderr,
- "Unknown option character `\\x%x'.\n",
- optopt);
- }
- return EXIT_FAILURE;
- default:
- HELP_MSG;
- exit (EXIT_SUCCESS);
- }
- }
-
- cpuid_init();
- fprintf(stdout, HLINE);
- fprintf(stdout, "CPU name:\t%s \n",cpuid_info.name);
- fflush(stdout);
-
- if ((file = fopen(filepath, "wb")) != NULL)
- {
- size = fwrite((void*) &cpuid_topology, sizeof(CpuTopology), 1, file);
-
- size = fwrite((void*) cpuid_topology.threadPool,
- sizeof(HWThread), cpuid_topology.numHWThreads, file);
-
- size = fwrite((void*) cpuid_topology.cacheLevels,
- sizeof(CacheLevel), cpuid_topology.numCacheLevels, file);
-
- fclose(file);
- }
- else
- {
- fprintf(stderr,"Cfg file could not be written to %s\n", filepath);
- ERROR;
- }
-
- return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-genTopoCfg.lua b/src/applications/likwid-genTopoCfg.lua
new file mode 100644
index 0000000..fdd4d69
--- /dev/null
+++ b/src/applications/likwid-genTopoCfg.lua
@@ -0,0 +1,153 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ * Filename: likwid-genTopoCfg.lua
+ *
+ *      Description:  An application to create a file of the underlying system configuration
+ *                    that is used by likwid to avoid reading the system's architecture at
+ *                    each start.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+local filename = "<INSTALLED_PREFIX>/etc/likwid_topo.cfg"
+
+function version()
+ print(string.format("likwid-genTopoCfg -- Version %d.%d",likwid.version,likwid.release))
+end
+
+function usage()
+ version()
+ print("A tool to store the system's architecture to a config file for LIKWID.\n")
+ print("Options:")
+ print("-h, --help\t\t Help message")
+ print("-v, --version\t\t Version information")
+ print("-o, --output <file>\t Use <file> instead of default "..filename)
+    print("\t\t\t By default, Likwid searches at startup:")
+    print("\t\t\t /etc/likwid_topo.cfg and <INSTALLED_PREFIX>/etc/likwid_topo.cfg")
+    print("\t\t\t Another location can be configured in the configuration file /etc/likwid.cfg,")
+    print("\t\t\t <INSTALLED_PREFIX>/etc/likwid.cfg or the path defined at build time of Likwid.")
+end
+
+for opt,arg in likwid.getopt(arg, {"h","v","help","version", "o:", "output:"}) do
+ if opt == "h" or opt == "help" then
+ usage()
+ os.exit(0)
+ elseif opt == "v" or opt == "version" then
+ version()
+ os.exit(0)
+ elseif opt == "o" or opt == "output" then
+ filename = arg
+ elseif opt == "?" then
+ print("Invalid commandline option -"..arg)
+ os.exit(1)
+ elseif opt == "!" then
+ print("Option requires an argument")
+ os.exit(1)
+ end
+end
+local file = io.open(filename, "r")
+if file ~= nil then
+ print("File "..filename.." exists, please delete it first.")
+ file:close()
+ os.exit(1)
+end
+file = io.open(filename, "w")
+if file == nil then
+ print("Cannot open file "..filename.." for writing")
+ os.exit(1)
+end
+
+
+local cpuinfo = likwid.getCpuInfo()
+local cputopo = likwid.getCpuTopology()
+local numainfo = likwid.getNumaInfo()
+local affinity = likwid.getAffinityInfo()
+if cpuinfo == nil or cputopo == nil or numainfo == nil or affinity == nil then
+ print("Cannot initialize topology module of LIKWID")
+ os.exit(1)
+end
+print(string.format("Writing new topology file %s", filename))
+cpuinfo["clock"] = likwid.getCpuClock()
+
+local threadPool_order = {"threadId", "coreId", "packageId", "apicId"}
+local cacheLevels_order = {"type", "associativity", "sets", "lineSize", "size", "threads", "inclusive"}
+
+for field, value in pairs(cpuinfo) do
+ file:write("cpuid_info " .. field .. " = " .. tostring(value).."\n")
+end
+
+for field, value in pairs(cputopo) do
+ if (field ~= "threadPool" and field ~= "cacheLevels" and field ~= "topologyTree") then
+ if field ~= "activeHWThreads" then
+ file:write("cpuid_topology " .. field .. " = " .. tostring(value).."\n")
+ end
+ elseif (field == "threadPool") then
+ --file:write("cpuid_topology threadPool count = "..tostring(likwid.tablelength(cputopo["threadPool"])).."\n")
+ for id, tab in pairs(cputopo["threadPool"]) do
+ str = "cpuid_topology threadPool "..tostring(id).." "
+ for k,v in pairs(threadPool_order) do
+ file:write(str..tostring(v).." = "..tostring(tab[v]).."\n")
+ end
+
+ end
+ elseif (field == "cacheLevels") then
+ for id, tab in pairs(cputopo["cacheLevels"]) do
+ str = "cpuid_topology cacheLevels "..tostring(id).." "
+ for k,v in pairs(cacheLevels_order) do
+ file:write(str..tostring(v).." = "..tostring(tab[v]).."\n")
+ end
+
+ end
+ end
+end
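+-- The topology file consists of flat "key = value" lines, e.g. (values are
+-- only an example):
+--   cpuid_topology threadPool 0 coreId = 0
+--   cpuid_topology cacheLevels 1 size = 32768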
+
+file:write("numa_info numberOfNodes = "..tostring(numainfo["numberOfNodes"]).."\n")
+for field, value in pairs(numainfo["nodes"]) do
+ for id, tab in pairs(value) do
+ if id ~= "processors" and id ~= "distances" then
+ file:write("numa_info nodes "..tostring(field).." "..tostring(id).." = "..tostring(tab).."\n")
+ elseif id == "processors" then
+ for k,v in pairs(tab) do
+ file:write("numa_info nodes "..tostring(field).." "..tostring(id).." "..tostring(k).." = "..tostring(v).."\n")
+ end
+ elseif id == "distances" then
+ for k,v in pairs(tab) do
+ for k1,v1 in pairs(v) do
+ file:write("numa_info nodes "..tostring(field).." "..tostring(id).." "..tostring(k1).." = "..tostring(v1).."\n")
+ end
+ end
+ end
+ end
+end
+
+file:close()
+likwid.putAffinityInfo()
+likwid.putNumaInfo()
+likwid.putTopology()
+
diff --git a/src/applications/likwid-memsweeper.c b/src/applications/likwid-memsweeper.c
deleted file mode 100644
index 4806763..0000000
--- a/src/applications/likwid-memsweeper.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: likwid-memsweeper.c
- *
- * Description: An application to clean up NUMA memory domains.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <types.h>
-#include <strUtil.h>
-#include <error.h>
-#include <cpuid.h>
-#include <numa.h>
-#include <affinity.h>
-#include <memsweep.h>
-
-/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
-
-#define HELP_MSG \
- fprintf(stdout, "\nlikwid-memsweeper -- Version %d.%d \n\n",VERSION,RELEASE); \
- fprintf(stdout, "A tool clean up NUMA memory domains and last level caches.\n"); \
- fprintf(stdout, "Options:\n"); \
- fprintf(stdout, "-h\t Help message\n"); \
- fprintf(stdout, "-v\t Version information\n"); \
- fprintf(stdout, "-q\t Silent without output\n"); \
- fprintf(stdout, "-c\t Specify NUMA domain ID to clean up\n"); \
- fprintf(stdout, "\t If no specific domain is set, all domains are swept.\n"); \
- fprintf(stdout, "Usage:\n"); \
- fprintf(stdout, "To clean specific domain: likwid-memsweeper -c 2 \n"); \
- fflush(stdout);
-
-#define VERSION_MSG \
- fprintf(stdout, "likwid-memsweeper %d.%d \n\n",VERSION,RELEASE); \
- fflush(stdout);
-
-
-int main (int argc, char** argv)
-{
- int domainId = -1;
- int c;
- int optSilent = 0;
- bstring argString;
- FILE* OUTSTREAM = stdout;
-
- while ((c = getopt (argc, argv, "+c:hvq")) != -1)
- {
- switch (c)
- {
- case 'h':
- HELP_MSG;
- exit (EXIT_SUCCESS);
- case 'v':
- VERSION_MSG;
- exit (EXIT_SUCCESS);
- case 'q':
- optSilent = 1;
- OUTSTREAM = NULL;
- break;
- case 'c':
- if (! (argString = bSecureInput(10,optarg)))
- {
- fprintf(stderr,"Failed to read argument string!\n");
- exit(EXIT_FAILURE);
- }
-
- domainId = str2int((char*) argString->data);
-
- break;
- case '?':
- if (isprint (optopt))
- {
- fprintf (stderr, "Unknown option `-%c'.\n", optopt);
- }
- else
- {
- fprintf (stderr,
- "Unknown option character `\\x%x'.\n",
- optopt);
- }
- return EXIT_FAILURE;
- default:
- HELP_MSG;
- exit (EXIT_SUCCESS);
- }
- }
-
- if (cpuid_init() == EXIT_FAILURE)
- {
- ERROR_PLAIN_PRINT(Unsupported processor!);
- }
- numa_init();
-
- if (domainId < 0)
- {
- memsweep_node(OUTSTREAM);
- }
- else if (domainId < numa_info.numberOfNodes)
- {
- memsweep_domain(OUTSTREAM, domainId);
- }
- else
- {
- fprintf(stderr, "Unknown NUMA domain %d\n", domainId);
- exit(EXIT_FAILURE);
- }
-
- return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-memsweeper.lua b/src/applications/likwid-memsweeper.lua
new file mode 100644
index 0000000..d3315ac
--- /dev/null
+++ b/src/applications/likwid-memsweeper.lua
@@ -0,0 +1,89 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ * Filename: likwid-memsweeper.lua
+ *
+ * Description: An application to clean up NUMA memory domains.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+local likwid = require("likwid")
+
+local function version()
+ print(string.format("likwid-memsweeper -- Version %d.%d",likwid.version,likwid.release))
+end
+
+local function examples()
+ print("Examples:")
+ print("To clean specific domain:")
+ print("likwid-memsweeper -c 2")
+ print("To clean a range of domains:")
+ print("likwid-memsweeper -c 1-2")
+ print("To clean specific domains:")
+ print("likwid-memsweeper -c 0,1-2")
+
+end
+
+local function usage()
+ version()
+    print("A tool to clean up NUMA memory domains.\n")
+ print("Options:")
+ print("-h\t\t Help message")
+ print("-v\t\t Version information")
+ print("-c <list>\t Specify NUMA domain ID to clean up")
+ print("")
+ examples()
+end
+
+numainfo = likwid.getNumaInfo()
+nodes = {}
+for i,_ in pairs(numainfo["nodes"]) do
+ if tonumber(numainfo["nodes"][i]["id"]) ~= nil then
+ table.insert(nodes,numainfo["nodes"][i]["id"])
+ end
+end
+
+for opt,arg in likwid.getopt(arg, {"c:", "h", "v", "help", "version"}) do
+ if opt == "h" or opt == "help" then
+ usage()
+ os.exit(0)
+ elseif opt == "v" or opt == "version" then
+ version()
+ os.exit(0)
+ elseif (opt == "c") then
+ num_nodes, nodes = likwid.nodestr_to_nodelist(arg)
+ elseif opt == "?" then
+ print("Invalid commandline option -"..arg)
+ os.exit(1)
+ elseif opt == "!" then
+ print("Option requires an argument")
+ os.exit(1)
+ end
+end
+
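+-- Sweep every selected NUMA domain: likwid.memSweepDomain cleans up the
+-- domain's memory and its last level cache.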
+for i, node in pairs(nodes) do
+    likwid.memSweepDomain(node)
+end
+likwid.putNumaInfo()
diff --git a/src/applications/likwid-mpirun.lua b/src/applications/likwid-mpirun.lua
new file mode 100644
index 0000000..07d6dc4
--- /dev/null
+++ b/src/applications/likwid-mpirun.lua
@@ -0,0 +1,1967 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ * Filename: likwid-mpirun.lua
+ *
+ * Description: A wrapper script to pin threads spawned by MPI processes and
+ * measure hardware performance counters
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+local function version()
+ print(string.format("likwid-mpirun -- Version %d.%d",likwid.version,likwid.release))
+end
+
+local function examples()
+ print("Examples:")
+ print("Run 32 processes on hosts in hostlist")
+ print("likwid-mpirun -np 32 ./a.out")
+ print("")
+ print("Run 1 MPI process on each socket")
+ print("likwid-mpirun -nperdomain S:1 ./a.out")
+    print("The total number of MPI processes is calculated from the number of hosts in the hostfile")
+ print("")
+ print("For hybrid MPI/OpenMP jobs you need to set the -pin option")
+ print("Starts 2 MPI processes on each host, one on socket 0 and one on socket 1")
+    print("Each MPI process may start 2 OpenMP threads pinned to the first two CPUs on each socket")
+ print("likwid-mpirun -pin S0:0-1_S1:0-1 ./a.out")
+ print("")
+ print("Run 2 processes on each socket and measure the MEM performance group")
+ print("likwid-mpirun -nperdomain S:2 -g MEM ./a.out")
+    print("Only one process per socket measures the Uncore/RAPL counters, the other one(s) measure only core-local counters")
+ print("")
+end
+
+local function usage()
+ version()
+ print("A wrapper script to pin threads spawned by MPI processes and measure hardware performance counters.\n")
+ print("Options:")
+ print("-h, --help\t\t Help message")
+ print("-v, --version\t\t Version information")
+ print("-d, --debug\t\t Debugging output")
+ print("-n/-np <count>\t\t Set the number of processes")
+ print("-nperdomain <domain>\t Set the number of processes per node by giving an affinity domain and count")
+ print("-pin <list>\t\t Specify pinning of threads. CPU expressions like likwid-pin separated with '_'")
+ print("-s, --skip <hex>\t Bitmask with threads to skip")
+ print("-mpi <id>\t\t Specify which MPI should be used. Possible values: openmpi, intelmpi and mvapich2")
+ print("\t\t\t If not set, module system is checked")
+ print("-omp <id>\t\t Specify which OpenMP should be used. Possible values: gnu and intel")
+ print("\t\t\t Only required for statically linked executables.")
+ print("-hostfile\t\t Use custom hostfile instead of searching the environment")
+ print("-g/-group <perf>\t Set a likwid-perfctr conform event set for measuring on nodes")
+ print("-m/-marker\t\t Activate marker API mode")
+ print("-O\t\t\t Output easily parseable CSV instead of fancy tables")
+ print("-f\t\t\t Force overwrite of registers if they are in use. You can also use environment variable LIKWID_FORCE")
+ print("")
+ print("Processes are pinned to physical CPU cores first. For syntax questions see likwid-pin")
+ print("")
+ examples()
+end
+
+local np = 0
+local ppn = 0
+local nperdomain = nil
+local npernode = 0
+local cpuexprs = {}
+local perfexprs = {}
+local hostfile = nil
+local hosts = {}
+local perf = {}
+local mpitype = nil
+local omptype = nil
+local skipStr = ""
+local executable = {}
+local debug = false
+local use_marker = false
+local use_csv = false
+local force = false
+if os.getenv("LIKWID_FORCE") ~= nil then
+ force = true
+end
+
+local LIKWID_PIN="<INSTALLED_PREFIX>/bin/likwid-pin"
+local LIKWID_PERFCTR="<INSTALLED_PREFIX>/bin/likwid-perfctr"
+
+local readHostfile = nil
+local writeHostfile = nil
+local getEnvironment = nil
+local executeCommand = nil
+local mpiexecutable = nil
+
+
+local function readHostfileOpenMPI(filename)
+ local hostlist = {}
+ if filename == nil or filename == "" then
+ return {}
+ end
+ local f = io.open(filename, "r")
+ if f == nil then
+ print("ERROR: Cannot open hostfile "..filename)
+ os.exit(1)
+ end
+ if debug then
+ print("DEBUG: Reading hostfile in openmpi style")
+ end
+ local t = f:read("*all")
+ f:close()
+ for i, line in pairs(likwid.stringsplit(t,"\n")) do
+ if line:match("^#") == nil and line:match("^%s*$") == nil then
+ hostname, slots, maxslots = line:match("^([%.%a%d]+)%s+slots=(%d*)%s+max%-slots=(%d*)")
+ if not hostname then
+ hostname, slots = line:match("^([%.%a%d]+)%s+slots=(%d*)")
+ if not hostname then
+ hostname = line:match("^([%.%a%d]+)")
+ slots = 1
+ maxslots = 1
+ end
+ end
+ local found = false
+ for i, host in pairs(hostlist) do
+ if host["hostname"] == hostname then
+ if slots and host["slots"] then
+ host["slots"] = host["slots"] + tonumber(slots)
+ end
+ if maxslots and host["maxslots"] then
+ host["maxslots"] = host["maxslots"] + tonumber(maxslots)
+ end
+                    found = true
+                    break
+ end
+ end
+ if not found then
+ table.insert(hostlist, {hostname=hostname, slots=tonumber(slots), maxslots=tonumber(maxslots)})
+ end
+ end
+ end
+ local topo = likwid.getCpuTopology()
+ for i,host in pairs(hostlist) do
+ if host["slots"] == nil or host["slots"] == 0 then
+ host["slots"] = topo.numHWThreads
+ end
+ if host["maxslots"] == nil or host["maxslots"] == 0 then
+ host["maxslots"] = topo.numHWThreads
+ end
+ if debug then
+ print(string.format("DEBUG: Read host %s with %d slots and %d slots maximally", host["hostname"], host["slots"], host["maxslots"]))
+ end
+ end
+ return hostlist
+end
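+-- Accepted OpenMPI hostfile line formats (host names are illustrative):
+--   node01 slots=4 max-slots=8
+--   node02 slots=4
+--   node03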
+
+local function writeHostfileOpenMPI(hostlist, filename)
+ if filename == nil or filename == "" then
+ return
+ end
+
+ local f = io.open(filename, "w")
+ if f == nil then
+ print("ERROR: Cannot open hostfile "..filename)
+ os.exit(1)
+ end
+ for i, hostcontent in pairs(hostlist) do
+ str = hostcontent["hostname"]
+ if hostcontent["slots"] then
+ str = str .. string.format(" slots=%d", hostcontent["slots"])
+ end
+ if hostcontent["maxslots"] then
+ str = str .. string.format(" max-slots=%d", hostcontent["maxslots"])
+ end
+ f:write(str .. "\n")
+ end
+ f:close()
+end
+
+local function getEnvironmentOpenMPI()
+ return {}
+end
+
+local function executeOpenMPI(wrapperscript, hostfile, env, nrNodes)
+ local bindstr = ""
+    if wrapperscript:sub(1,1) ~= "/" then
+ wrapperscript = os.getenv("PWD").."/"..wrapperscript
+ end
+
+ local f = io.popen(string.format("%s -V 2>&1", mpiexecutable), "r")
+ if f ~= nil then
+ local input = f:read("*a")
+ ver1,ver2,ver3 = input:match("(%d+)%.(%d+)%.(%d+)")
+ if ver1 == "1" then
+ if ver2 == "7" then
+ bindstr = "--bind-to none"
+ elseif ver2 == "6" then
+ bindstr = "--bind-to-none"
+ end
+ end
+ f:close()
+ end
+
+ local cmd = string.format("%s -hostfile %s %s -np %d -npernode %d %s",
+ mpiexecutable, hostfile, bindstr,
+ np, ppn, wrapperscript)
+ if debug then
+ print("EXEC: "..cmd)
+ end
+ os.execute(cmd)
+end
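+-- The resulting launch line looks like (program name and counts are examples):
+--   mpiexec -hostfile <hostfile> --bind-to none -np 32 -npernode 4 <wrapperscript>
+-- where the bind flag depends on the detected Open MPI version
+-- (1.7.x: "--bind-to none", 1.6.x: "--bind-to-none", otherwise none).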
+
+local function readHostfileIntelMPI(filename)
+ local hostlist = {}
+ if filename == nil or filename == "" then
+ return {}
+ end
+ local f = io.open(filename, "r")
+ if f == nil then
+ print("ERROR: Cannot open hostfile "..filename)
+ os.exit(1)
+ end
+ if debug then
+ print("DEBUG: Reading hostfile in intelmpi style")
+ end
+ local topo = likwid.getCpuTopology()
+ local t = f:read("*all")
+ f:close()
+ for i, line in pairs(likwid.stringsplit(t,"\n")) do
+ if line:match("^#") == nil and line:match("^%s*$") == nil then
+ hostname, slots = line:match("^([%.%a%d]+):(%d+)")
+ if not hostname then
+ hostname = line:match("^([%.%a%d]+)")
+ slots = topo["numHWThreads"]
+ end
+ table.insert(hostlist, {hostname=hostname, slots=slots, maxslots=slots})
+ end
+ end
+ if debug then
+ for i, host in pairs(hostlist) do
+ print(string.format("DEBUG: Read host %s with %d slots and %d slots maximally", host["hostname"], host["slots"], host["maxslots"]))
+ end
+ end
+ return hostlist
+end
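+-- Intel MPI hostfile lines are either "hostname" or "hostname:slots", e.g.
+--   node01:16
+-- Hosts without a slot count get the number of hardware threads of the
+-- machine running likwid-mpirun.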
+
+local function writeHostfileIntelMPI(hostlist, filename)
+ if filename == nil or filename == "" then
+ return
+ end
+
+ local f = io.open(filename, "w")
+ if f == nil then
+ print("ERROR: Cannot open hostfile "..filename)
+ os.exit(1)
+ end
+ for i, hostcontent in pairs(hostlist) do
+ str = hostcontent["hostname"]
+ if hostcontent["slots"] then
+ str = str .. string.format(":%d", hostcontent["slots"])
+ end
+ f:write(str .. "\n")
+ end
+ f:close()
+end
+
+local function getEnvironmentIntelMPI()
+ local env = {}
+ env['I_MPI_PIN']='off'
+ env['KMP_AFFINITY']='disabled'
+ return env
+end
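+ -- Note: disabling I_MPI_PIN and KMP_AFFINITY keeps Intel MPI and the Intel
+ -- OpenMP runtime from pinning processes and threads themselves; the wrapper
+ -- script performs the pinning via likwid-pin/likwid-perfctr instead.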
+
+local function executeIntelMPI(wrapperscript, hostfile, env, nrNodes)
+ local use_hydra = true
+ if wrapperscript:sub(1,1) ~= "/" then
+ wrapperscript = os.getenv("PWD").."/"..wrapperscript
+ end
+ if hostfile:sub(1,1) ~= "/" then
+ hostfile = os.getenv("PWD").."/"..hostfile
+ end
+ local path = ""
+ local f = io.popen(string.format("dirname %s", mpiexecutable))
+ if f ~= nil then
+ path = f:read("*line")
+ f:close()
+ end
+ if likwid.access(string.format("%s/mpdboot", path), "x") == 0 then
+ use_hydra = false
+ end
+ for i, env in pairs({"MPIHOME", "MPI_HOME", "MPI_ROOT", "MPI_BASE"}) do
+ if likwid.access(string.format("%s/bin/mpdboot", os.getenv(env)), "x") == 0 then
+ use_hydra = false
+ path = string.format("%s/bin",os.getenv(env))
+ break
+ end
+ end
+
+ local envstr = ""
+ for i, e in pairs(env) do
+ if use_hydra then
+ envstr = envstr .. string.format("-genv %s %s ", i, e)
+ else
+ envstr = envstr .. string.format("-env %s %s ", i, e)
+ end
+ end
+
+ if debug then
+ if use_hydra == false then
+ print(string.format("EXEC: %s/mpdboot -r ssh -n %d -f %s", path, nrNodes, hostfile))
+ print(string.format("EXEC: %s/mpiexec -perhost %d %s -np %d %s", path, ppn, envstr, np, wrapperscript))
+ print(string.format("EXEC: %s/mpdallexit", path))
+ else
+ print(string.format("%s %s -f %s -np %d -perhost %d %s",mpiexecutable, envstr, hostfile, np, ppn, wrapperscript))
+ end
+ end
+
+ --os.execute(string.format("%s -genv I_MPI_PIN 0 -f %s -np %d -perhost %d %s",mpiexecutable, hostfile, np, ppn, wrapperscript))
+ if use_hydra == false then
+ os.execute(string.format("%s/mpdboot -r ssh -n %d -f %s", path, nrNodes, hostfile))
+ os.execute(string.format("%s/mpiexec -perhost %d %s -np %d %s", path, ppn, envstr, np, wrapperscript))
+ os.execute(string.format("%s/mpdallexit", path))
+ else
+ os.execute(string.format("%s %s -f %s -np %d -perhost %d %s",mpiexecutable, envstr, hostfile, np, ppn, wrapperscript))
+ end
+end
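+ -- Note: Intel MPI is launched either via the legacy MPD ring
+ -- (mpdboot/mpiexec/mpdallexit) when an mpdboot binary is found next to mpiexec
+ -- or below one of the MPIHOME/MPI_HOME/MPI_ROOT/MPI_BASE directories, or via
+ -- the Hydra process manager otherwise; -genv/-env is chosen accordingly to
+ -- export the environment.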
+
+local function readHostfileMvapich2(filename)
+ local hostlist = {}
+ if filename == nil or filename == "" then
+ return {}
+ end
+ local f = io.open(filename, "r")
+ if f == nil then
+ print("ERROR: Cannot open hostfile "..filename)
+ os.exit(1)
+ end
+ if debug then
+ print("DEBUG: Reading hostfile in mvapich2 style")
+ end
+ local t = f:read("*all")
+ f:close()
+ for i, line in pairs(likwid.stringsplit(t,"\n")) do
+ if line:match("^#") == nil and line:match("^%s*$") == nil then
+ hostname, slots, interface = line:match("^([%.%a%d]+):(%d+):([%a%d]+)")
+ if not hostname then
+ hostname, slots = line:match("^([%.%a%d]+):(%d+)")
+ if not hostname then
+ hostname = line:match("^([%.%a%d]+)")
+ slots = 1
+ interface = nil
+ else
+ interface = nil
+ end
+ end
+ table.insert(hostlist, {hostname=hostname, slots=slots, maxslots=slots, interface=interface})
+ end
+ end
+ if debug then
+ for i, host in pairs(hostlist) do
+ print(string.format("DEBUG: Read host %s with %d slots and %d slots maximally", host["hostname"], host["slots"], host["maxslots"]))
+ end
+ end
+ return hostlist
+end
+
+local function writeHostfileMvapich2(hostlist, filename)
+ if filename == nil or filename == "" then
+ return
+ end
+
+ local f = io.open(filename, "w")
+ if f == nil then
+ print("ERROR: Cannot open hostfile "..filename)
+ os.exit(1)
+ end
+ for i, hostcontent in pairs(hostlist) do
+ str = hostcontent["hostname"]
+ if hostcontent["slots"] then
+ str = str .. string.format(":%d", hostcontent["slots"])
+ end
+ if hostcontent["interface"] then
+ str = str .. string.format(":%s", hostcontent["interface"])
+ end
+ f:write(str .. "\n")
+ end
+ f:close()
+end
+
+local function getEnvironmentMvapich2()
+ local env = {}
+ env['MV2_ENABLE_AFFINITY'] = "0"
+ return env
+end
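+ -- Note: MV2_ENABLE_AFFINITY=0 switches off MVAPICH2's internal CPU affinity so
+ -- it does not interfere with the pinning done by the wrapper script.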
+
+local function executeMvapich2(wrapperscript, hostfile, env, nrNodes)
+ if wrapperscript:sub(1,1) ~= "/" then
+ wrapperscript = os.getenv("PWD").."/"..wrapperscript
+ end
+ if hostfile:sub(1,1) ~= "/" then
+ hostfile = os.getenv("PWD").."/"..hostfile
+ end
+
+ local envstr = ""
+ for i, e in pairs(env) do
+ envstr = envstr .. string.format("%s=%s ", i, e)
+ end
+
+ local cmd = string.format("%s -f %s -np %d -ppn %d %s %s",
+ mpiexecutable, hostfile,
+ np, ppn, envstr, wrapperscript)
+ if debug then
+ print("EXEC: "..cmd)
+ end
+ os.execute(cmd)
+end
+
+
+local function readHostfilePBS(filename)
+ local hostlist = {}
+ if filename == nil or filename == "" then
+ return {}
+ end
+ local f = io.open(filename, "r")
+ if f == nil then
+ print("ERROR: Cannot open hostfile "..filename)
+ os.exit(1)
+ end
+ if debug then
+ print("DEBUG: Reading hostfile from batch system")
+ end
+ local t = f:read("*all")
+ f:close()
+ for i, line in pairs(likwid.stringsplit(t,"\n")) do
+ if line:match("^#") == nil and line:match("^%s*$") == nil then
+ hostname = line:match("^([%.%a%d]+)")
+ local found = false
+ for i, host in pairs(hostlist) do
+ if host["hostname"] == hostname then
+ host["slots"] = host["slots"] + 1
+ host["maxslots"] = host["maxslots"] + 1
+ found = true
+ break
+ end
+ end
+ if not found then
+ table.insert(hostlist, {hostname=hostname, slots=1, maxslots=1})
+ end
+ end
+ end
+ if debug then
+ for i, host in pairs(hostlist) do
+ print(string.format("DEBUG: Read host %s with %d slots and %d slots maximally", host["hostname"], host["slots"], host["maxslots"]))
+ end
+ end
+ return hostlist
+end
+
+local function readHostfileSlurm(hostlist)
+ nperhost = tonumber(os.getenv("SLURM_TASKS_PER_NODE"):match("(%d+)"))
+ if hostlist and nperhost then
+ hostfile = write_hostlist_to_file(hostlist, nperhost)
+ hosts = readHostfilePBS(hostfile)
+ os.remove(hostfile)
+ end
+ return hosts
+end
+
+function write_hostlist_to_file(hostlist, nperhost)
+ if hostlist == "" then
+ return {}
+ end
+ outlist = {}
+ list = likwid.stringsplit(hostlist, ",")
+ for i, item in pairs(list) do
+ if not item:match("%[") then
+ table.insert(outlist, item)
+ else
+ prefixzeros = 0
+
+ host, start, ende,remain = item:match("(%w+)%[(%d+)-(%d+)%]([%w%d%[%]-]*)")
+ if host and start and ende then
+ if tonumber(start) ~= 0 then
+ for j=1,#start do
+ if start:sub(j,j) == '0' then
+ prefixzeros = prefixzeros + 1
+ else
+ break
+ end
+ end
+ end
+ if start and ende then
+ for j=start,ende do
+ newh = host..string.rep("0", prefixzeros)..tostring(math.tointeger(j))
+ if remain then
+ newh = newh .. remain
+ end
+ table.insert(outlist, newh)
+ end
+ end
+ end
+ end
+ end
+ fname = string.format("/tmp/hostlist.%d", likwid.getpid())
+ f = io.open(fname, "w")
+ if f ~= nil then
+ for i=1,#outlist do
+ for j=1, nperhost do
+ f:write(outlist[i].."\n")
+ end
+ end
+ f:close()
+ end
+ return fname
+end
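+ -- Illustrative example (hostnames are made up): a SLURM_NODELIST of
+ -- "node[1-3],login1" with nperhost=2 yields a temporary file containing
+ -- node1,node1,node2,node2,node3,node3,login1,login1 (one line per slot),
+ -- which readHostfilePBS then folds back into per-host slot counts.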
+
+local function writeHostfileSlurm(hostlist, filename)
+ l = {}
+
+ for i, h in pairs(hostlist) do
+ table.insert(l, h["hostname"])
+ end
+ print("SLURM_NODELIST", table.concat(l,","))
+ likwid.setenv("SLURM_NODELIST", table.concat(l,","))
+end
+
+local function getEnvironmentSlurm()
+ return {}
+end
+
+local function executeSlurm(wrapperscript, hostfile, env, nrNodes)
+ if wrapperscript:sub(1,1) ~= "/" then
+ wrapperscript = os.getenv("PWD").."/"..wrapperscript
+ end
+
+ local exec = string.format("srun -N %d --ntasks-per-node=%d --cpu_bind=none %s", nrNodes, ppn, wrapperscript)
+ if debug then
+ print("EXEC: "..exec)
+ end
+ os.execute(exec)
+end
+local function getNumberOfNodes(hostlist)
+ local n = 0
+ for i, h in pairs(hostlist) do
+ hostname = h["hostname"]
+ exists = false
+ for j=1,i-1 do
+ if hostlist[i]["hostname"] == hostlist[j]["hostname"] then
+ exists = true
+ end
+ end
+ if not exists then
+ n = n + 1
+ end
+ end
+ return n
+end
+
+local function getMpiType()
+ local mpitype = nil
+ if os.getenv("SLURM_JOB_ID") ~= nil then
+ return "slurm"
+ end
+ cmd = "bash -c 'tclsh /apps/modules/modulecmd.tcl sh list -t' 2>&1"
+ local f = io.popen(cmd, 'r')
+ if f == nil then
+ cmd = os.getenv("SHELL").." -c 'module -t list' 2>&1"
+ f = io.popen(cmd, 'r')
+ end
+ if f ~= nil then
+ local s = assert(f:read('*a'))
+ f:close()
+ s = string.gsub(s, '^%s+', '')
+ s = string.gsub(s, '%s+$', '')
+ for i,line in pairs(likwid.stringsplit(s, "\n")) do
+ if line:match("[iI]ntel[mM][pP][iI]") or (line:match("[iI]ntel") and line:match("[mM][pP][iI]")) then
+ mpitype = "intelmpi"
+ --libmpi%a*.so
+ elseif line:match("[oO]pen[mM][pP][iI]") or (line:match("[oO]pen") and line:match("[mM][pP][iI]")) then
+ mpitype = "openmpi"
+ --libmpi.so
+ elseif line:match("mvapich2") then
+ mpitype = "mvapich2"
+ --libmpich.so
+ end
+ end
+ end
+ for i, exec in pairs({"mpiexec.hydra", "mpiexec", "mpirun"}) do
+ f = io.popen(string.format("which %s 2>/dev/null", exec), 'r')
+ if f ~= nil then
+ local s = f:read('*line')
+ if s ~= nil then
+ f:close()
+ f = io.popen(string.format("%s --help 2>/dev/null", s), 'r')
+ if f ~= nil then
+ out = f:read("*a")
+ b,e = out:find("Intel")
+ if (b ~= nil) then
+ mpitype = "intelmpi"
+ break
+ end
+ b,e = out:find("OpenRTE")
+ if (b ~= nil) then
+ mpitype = "openmpi"
+ break
+ end
+ b,e = out:find("MPICH")
+ if (b ~= nil) then
+ mpitype = "mvapich2"
+ break
+ else
+ b,e = out:find("MVAPICH2")
+ if (b ~= nil) then
+ mpitype = "mvapich2"
+ break
+ end
+ end
+ end
+ end
+ end
+ end
+ if not mpitype then
+ print("WARN: No supported MPI loaded in module system")
+ end
+ return mpitype
+end
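+ -- Note: the MPI flavor is detected in three stages: a set SLURM_JOB_ID selects
+ -- SLURM directly, otherwise the loaded environment modules are scanned, and
+ -- finally the help output of mpiexec.hydra/mpiexec/mpirun is probed for Intel,
+ -- OpenRTE (Open MPI) or MPICH/MVAPICH2 markers.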
+
+local function getMpiExec(mpitype)
+ testing = {}
+ if mpitype == "intelmpi" then
+ testing = {"mpiexec.hydra", "mpiexec"}
+ executeCommand = executeIntelMPI
+ readHostfile = readHostfileIntelMPI
+ writeHostfile = writeHostfileIntelMPI
+ getEnvironment = getEnvironmentIntelMPI
+ elseif mpitype == "openmpi" then
+ testing = {"mpiexec", "mpirun"}
+ executeCommand = executeOpenMPI
+ readHostfile = readHostfileOpenMPI
+ writeHostfile = writeHostfileOpenMPI
+ getEnvironment = getEnvironmentOpenMPI
+ elseif mpitype == "mvapich2" then
+ testing = {"mpiexec", "mpirun"}
+ executeCommand = executeMvapich2
+ readHostfile = readHostfileMvapich2
+ writeHostfile = writeHostfileMvapich2
+ getEnvironment = getEnvironmentMvapich2
+ elseif mpitype == "slurm" then
+ testing = {"srun"}
+ executeCommand = executeSlurm
+ readHostfile = readHostfileSlurm
+ writeHostfile = writeHostfileSlurm
+ getEnvironment = getEnvironmentSlurm
+ end
+
+ for i, exec in pairs(testing) do
+ f = io.popen(string.format("which %s 2>/dev/null", exec), 'r')
+ if f ~= nil then
+ local s = f:read('*line')
+ if s ~= nil then
+ mpiexecutable = s
+ end
+ end
+ end
+end
+
+local function getOmpType()
+ local cmd = string.format("ldd `which %s` 2>/dev/null", executable[1])
+ local f = io.popen(cmd, 'r')
+ if f == nil then
+ cmd = string.format("ldd %s", executable[1])
+ f = io.popen(cmd, 'r')
+ end
+ omptype = nil
+ dyn_linked = true
+ if f ~= nil then
+ local s = f:read('*a')
+ f:close()
+ for i,line in pairs(likwid.stringsplit(s, "\n")) do
+ if line:match("libgomp.so") then
+ omptype = "gnu"
+ break
+ elseif line:match("libiomp%d*.so") then
+ omptype = "intel"
+ break
+ elseif line:match("not a dynamic executable") then
+ omptype = "none"
+ dyn_linked = false
+ break
+ end
+ end
+ end
+ if omptype == nil or dyn_linked == false then
+ print("WARN: Cannot get OpenMP variant from executable, trying module system")
+ cmd = "bash -c 'tclsh /apps/modules/modulecmd.tcl sh list -t' 2>&1"
+ local f = io.popen(cmd, 'r')
+ if f == nil then
+ cmd = os.getenv("SHELL").." -c 'module -t list' 2>&1"
+ f = io.popen(cmd, 'r')
+ end
+ if f ~= nil then
+ local s = f:read('*a')
+ f:close()
+ s = string.gsub(s, '^%s+', '')
+ s = string.gsub(s, '%s+$', '')
+ for i,line in pairs(likwid.stringsplit(s, "\n")) do
+ if line:match("[iI]ntel") or line:match("[iI][cC][cC]") then
+ omptype = "intel"
+ elseif line:match("[gG][nN][uU]") or line:match("[gG][cC][cC]") then
+ omptype = "gnu"
+ end
+ end
+ end
+ if not omptype then
+ print("WARN: No supported OpenMP loaded in module system")
+ end
+ end
+ if omptype == "none" then
+ return nil
+ end
+ return omptype
+end
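+ -- Note: the OpenMP runtime is guessed from the shared libraries the executable
+ -- links against (libgomp -> GNU, libiomp -> Intel); if that fails, e.g. for
+ -- statically linked binaries, the loaded environment modules are consulted as
+ -- a fallback.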
+
+local function assignHosts(hosts, np, ppn)
+ tmp = np
+ newhosts = {}
+ current = 0
+ if debug then
+ print(string.format("Assign %d processes with %d per node to %d hosts", np, ppn, #hosts))
+ print("Available hosts for scheduling:")
+ print("Host", "Slots", "MaxSlots", "Interface")
+ for i, h in pairs(hosts) do
+ print (h["hostname"], h["slots"], h["maxslots"],"", h["interface"])
+ end
+ end
+ local break_while = false
+ while tmp > 0 and #hosts > 0 do
+ for i, host in pairs(hosts) do
+ if host["slots"] and host["slots"] >= ppn then
+ if host["maxslots"] and host["maxslots"] < ppn then
+ table.insert(newhosts, {hostname=host["hostname"],
+ slots=host["maxslots"],
+ maxslots=host["maxslots"],
+ interface=host["interface"]})
+ if debug then
+ print(string.format("DEBUG: Add Host %s with %d slots to host list", host["hostname"], host["maxslots"]))
+ end
+ current = host["maxslots"]
+ hosts[i] = nil
+ else
+ table.insert(newhosts, {hostname=host["hostname"],
+ slots=ppn,
+ maxslots=host["slots"],
+ interface=host["interface"]})
+ if debug then
+ print(string.format("DEBUG: Add Host %s with %d slots to host list", host["hostname"], ppn))
+ end
+ current = ppn
+ hosts[i] = nil
+ end
+ elseif host["slots"] then
+ if host["maxslots"] then
+ if host["maxslots"] < ppn then
+ print(string.format("WARN: Oversubscription for host %s needed, but max-slots set to %d.",
+ host["hostname"], host["maxslots"]))
+ table.insert(newhosts, {hostname=host["hostname"],
+ slots=host["maxslots"],
+ maxslots=host["maxslots"],
+ interface=host["interface"]})
+ current = host["maxslots"]
+ host["maxslots"] = 0
+ hosts[i] = nil
+ else
+ print(string.format("WARN: Oversubscription for host %s.", host["hostname"]))
+ table.insert(newhosts, {hostname=host["hostname"],
+ slots=ppn,
+ maxslots=host["maxslots"],
+ interface=host["interface"]})
+ current = ppn
+
+ end
+ else
+ print(string.format("WARN: Oversubscription for host %s.", host["hostname"]))
+ table.insert(newhosts, {hostname=host["hostname"],
+ slots=ppn,
+ maxslots=host["slots"],
+ interface=host["interface"]})
+ current = ppn
+ end
+ else
+ table.insert(newhosts, {hostname=host["hostname"],
+ slots=ppn,
+ maxslots=host["slots"],
+ interface=host["interface"]})
+ if debug then
+ print(string.format("DEBUG: Add Host %s with %d slots to host list", host["hostname"], ppn))
+ end
+ current = ppn
+ end
+ tmp = tmp - current
+ if tmp < 1 then
+ break_while = true
+ break
+ elseif tmp < ppn then
+ ppn = tmp
+ end
+ end
+ if break_while then
+ break
+ end
+ end
+ for i=1, #newhosts do
+ if newhosts[i] then
+ for j=i+1,#newhosts do
+ if newhosts[j] then
+ if newhosts[i]["hostname"] == newhosts[j]["hostname"] then
+ newhosts[i]["slots"] = newhosts[i]["slots"] + newhosts[j]["slots"]
+ if newhosts[i]["maxslots"] ~= nil and newhosts[j]["maxslots"] ~= nil then
+ newhosts[i]["maxslots"] = newhosts[i]["maxslots"] + newhosts[j]["maxslots"]
+ end
+ if newhosts[i]["slots"] > ppn then
+ ppn = newhosts[i]["slots"]
+ end
+ table.remove(newhosts, j)
+ end
+ end
+ end
+ end
+ end
+ if debug then
+ print("DEBUG: Scheduling on hosts:")
+ for i, h in pairs(newhosts) do
+ if h["maxslots"] ~= nil then
+ str = string.format("DEBUG: Host %s with %d slots (max. %d slots)",
+ h["hostname"],h["slots"],h["maxslots"])
+ else
+ str = string.format("DEBUG: Host %s with %d slots", h["hostname"],h["slots"])
+ end
+ if h["interface"] then
+ str = str.. string.format(" using interface %s", h["interface"])
+ end
+ print(str)
+ end
+ end
+ return newhosts, ppn
+end
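+ -- Note: assignHosts walks the host list and assigns up to ppn processes per
+ -- host, honoring the slots/maxslots limits and warning when oversubscription
+ -- is required; duplicate host entries are merged afterwards and the effective
+ -- ppn is returned together with the new host list.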
+
+local function calculatePinExpr(cpuexprs)
+ local newexprs = {}
+ for i, expr in pairs(cpuexprs) do
+ local strList = {}
+ amount, list = likwid.cpustr_to_cpulist(expr)
+ for _, c in pairs(list) do
+ table.insert(strList, c)
+ end
+ table.insert(newexprs, table.concat(strList,","))
+ end
+ return newexprs
+end
+
+local function calculateCpuExprs(nperdomain, cpuexprs)
+ local topo = likwid.getCpuTopology()
+ local affinity = likwid.getAffinityInfo()
+ local domainlist = {}
+ local newexprs = {}
+ domainname, count = nperdomain:match("[E:]*(%g*):(%d+)")
+
+ for i, domain in pairs(affinity["domains"]) do
+ if domain["tag"]:match(domainname.."%d*") then
+ table.insert(domainlist, i)
+ end
+ end
+ if debug then
+ local str = "DEBUG: NperDomain string "..nperdomain.." covers the domains: "
+ for i, idx in pairs(domainlist) do
+ str = str .. affinity["domains"][idx]["tag"] .. " "
+ end
+ print(str)
+ end
+
+ for i, domidx in pairs(domainlist) do
+ local sortedlist = {}
+ for off=1,topo["numThreadsPerCore"] do
+ for i=0,affinity["domains"][domidx]["numberOfProcessors"]/topo["numThreadsPerCore"] do
+ table.insert(sortedlist, affinity["domains"][domidx]["processorList"][off + (i*topo["numThreadsPerCore"])])
+ end
+ end
+ local tmplist = {}
+ for j=1,count do
+ table.insert(newexprs, tostring(sortedlist[1]))
+ table.remove(sortedlist, 1)
+ end
+ end
+ if debug then
+ local str = "DEBUG: Resolved NperDomain string "..nperdomain.." to CPUs: "
+ for i, expr in pairs(newexprs) do
+ str = str .. expr .. " "
+ end
+ print(str)
+ end
+ return newexprs
+end
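+ -- Illustrative example: "-nperdomain S:2" resolves to one single-CPU pin
+ -- expression per process, two per socket domain; with the usual processor list
+ -- layout the reordering above prefers distinct physical cores over SMT siblings.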
+
+local function createEventString(eventlist)
+ if eventlist == nil or #eventlist == 0 then
+ print("ERROR: Empty event list. Failed to create event set string")
+ return ""
+ end
+ local str = ""
+ if eventlist[1] ~= nil and eventlist[1]["Event"] ~= nil and eventlist[1]["Counter"] ~= nil then
+ str = str .. eventlist[1]["Event"]..":"..eventlist[1]["Counter"]
+ end
+ for i=2,#eventlist do
+ if eventlist[i] ~= nil and eventlist[i]["Event"] ~= nil and eventlist[i]["Counter"] ~= nil then
+ str = str .. ","..eventlist[i]["Event"]..":"..eventlist[i]["Counter"]
+ end
+ end
+ return str
+end
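+ -- Illustrative example (event and counter names are made up): a list of two
+ -- events is rendered as "EVENT_A:PMC0,EVENT_B:PMC1", the event set string
+ -- format accepted by likwid-perfctr's -g option.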
+
+local function setPerfStrings(perflist, cpuexprs)
+ local uncore = false
+ local perfexprs = {}
+ local grouplist = {}
+ local cpuinfo = likwid.getCpuInfo()
+ local affinity = likwid.getAffinityInfo()
+ local socketList = {}
+ local socketListFlags = {}
+ for i, d in pairs(affinity["domains"]) do
+ if d["tag"]:match("S%d+") then
+ local tmpList = {}
+ for j,cpu in pairs(d["processorList"]) do
+ table.insert(tmpList, cpu)
+ end
+ table.insert(socketList, tmpList)
+ table.insert(socketListFlags, 1)
+ end
+ end
+
+ for k, perfStr in pairs(perflist) do
+ local coreevents = {}
+ local uncoreevents = {}
+ local gdata = nil
+ gdata = likwid.get_groupdata(perfStr)
+ if gdata == nil then
+ print("Cannot get data for group "..perfStr..". Skipping...")
+ else
+ table.insert(grouplist, gdata)
+ if perfexprs[k] == nil then
+ perfexprs[k] = {}
+ end
+
+ for i, e in pairs(gdata["Events"]) do
+ if not e["Counter"]:match("FIXC%d") and
+ not e["Counter"]:match("^PMC%d") and
+ not e["Counter"]:match("TMP%d") then
+ table.insert(uncoreevents, e)
+ else
+ table.insert(coreevents, e)
+ end
+ end
+
+ local tmpSocketFlags = {}
+ for _,e in pairs(socketListFlags) do
+ table.insert(tmpSocketFlags, e)
+ end
+
+ for i,cpuexpr in pairs(cpuexprs) do
+ for j, cpu in pairs(likwid.stringsplit(cpuexpr,",")) do
+ local uncore = false
+ for sidx, socket in pairs(socketList) do
+ local switchedFlag = false
+ for _,c in pairs(socket) do
+ if c == tonumber(cpu) then
+ if tmpSocketFlags[sidx] == 1 then
+ local eventStr = createEventString(coreevents)
+ if #uncoreevents > 0 then
+ eventStr = eventStr .. ","..createEventString(uncoreevents)
+ end
+ table.insert(perfexprs[k], eventStr)
+ tmpSocketFlags[sidx] = 0
+ switchedFlag = true
+ uncore = true
+ break
+ else
+ table.insert(perfexprs[k], createEventString(coreevents))
+ switchedFlag = true
+ uncore = true
+ end
+ end
+ end
+ if switchedFlag then break end
+ end
+ if uncore then break end
+ end
+ end
+
+ if debug then
+ for i, expr in pairs(perfexprs[k]) do
+ print(string.format("DEBUG: Process %d measures with event set: %s", i-1, expr))
+ end
+ end
+ end
+ end
+ return perfexprs, grouplist
+end
+
+local function checkLikwid()
+ local f = io.popen("which likwid-pin 2>/dev/null", "r")
+ if f ~= nil then
+ local s = f:read("*line")
+ if s ~= nil and s ~= LIKWID_PIN then
+ LIKWID_PIN = s
+ end
+ f:close()
+ end
+ f = io.popen("which likwid-perfctr 2>/dev/null", "r")
+ if f ~= nil then
+ local s = f:read("*line")
+ if s ~= nil and s ~= LIKWID_PERFCTR then
+ LIKWID_PERFCTR = s
+ end
+ f:close()
+ end
+end
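+ -- Note: checkLikwid prefers the likwid-pin/likwid-perfctr binaries found in
+ -- PATH over the preset LIKWID_PIN/LIKWID_PERFCTR paths, so the wrapper script
+ -- runs the same installation as the caller.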
+
+local function writeWrapperScript(scriptname, execStr, hosts, outputname)
+ if scriptname == nil or scriptname == "" then
+ return
+ end
+ local oversubscripted = {}
+ local commands = {}
+ tmphosts = {}
+ for i, host in pairs(hosts) do
+ if tmphosts[host["hostname"]] ~= nil then
+ tmphosts[host["hostname"]] = tmphosts[host["hostname"]] + host["slots"]
+ else
+ tmphosts[host["hostname"]] = host["slots"]
+ end
+ end
+
+ if mpitype == "openmpi" then
+ glsize_var = "$OMPI_COMM_WORLD_SIZE"
+ glrank_var = "${OMPI_COMM_WORLD_RANK:-$(($GLOBALSIZE * 2))}"
+ losize_var = "$OMPI_COMM_WORLD_LOCAL_SIZE"
+ elseif mpitype == "intelmpi" then
+ glrank_var = "${PMI_RANK:-$(($GLOBALSIZE * 2))}"
+ glsize_var = tostring(math.tointeger(np))
+ losize_var = tostring(math.tointeger(ppn))
+ elseif mpitype == "mvapich2" then
+ glrank_var = "${PMI_RANK:-$(($GLOBALSIZE * 2))}"
+ glsize_var = tostring(math.tointeger(np))
+ losize_var = tostring(math.tointeger(ppn))
+ elseif mpitype == "slurm" then
+ glrank_var = "${PMI_RANK:-$(($GLOBALSIZE * 2))}"
+ glsize_var = tostring(math.tointeger(np))
+ losize_var = "$MPI_LOCALNRANKS"
+ else
+ print("Invalid MPI vendor "..mpitype)
+ return
+ end
+
+ local taillength = np % ppn
+ if taillength ~= 0 then
+ local full = tostring(math.tointeger(np -taillength))
+ table.insert(oversubscripted, "if [ $GLOBALRANK >= "..tostring(math.tointeger(full)).." ]; then\n")
+ table.insert(oversubscripted, "\tLOCALRANK=$($GLOBALRANK - "..tostring(math.tointeger(full))..")\n")
+ table.insert(oversubscripted, "fi\n")
+ end
+
+ local f = io.open(scriptname, "w")
+ if f == nil then
+ print("ERROR: Cannot open hostfile "..scriptname)
+ os.exit(1)
+ end
+
+ if outputname:sub(1,1) ~= "/" then
+ outputname = os.getenv("PWD").."/"..outputname
+ end
+
+ for i=1,#cpuexprs do
+ local cmd = {}
+ local cpuexpr_opt = "-c"
+ if #perf > 0 then
+ table.insert(cmd,LIKWID_PERFCTR)
+ if use_marker then
+ table.insert(cmd,"-m")
+ end
+ cpuexpr_opt = "-C"
+ else
+ table.insert(cmd,LIKWID_PIN)
+ table.insert(cmd,"-q")
+ end
+ if force and #perf > 0 then
+ table.insert(cmd,"-f")
+ end
+ table.insert(cmd,skipStr)
+ table.insert(cmd,cpuexpr_opt)
+ table.insert(cmd,cpuexprs[i])
+ if #perf > 0 then
+ for j, expr in pairs(perfexprs) do
+ table.insert(cmd,"-g")
+ table.insert(cmd,expr[i])
+ end
+ table.insert(cmd,"-o")
+ table.insert(cmd,outputname)
+ end
+ table.insert(cmd,execStr)
+ commands[i] = table.concat(cmd, " ")
+ end
+
+ f:write("#!/bin/bash -l\n")
+ f:write("GLOBALSIZE="..glsize_var.."\n")
+ f:write("GLOBALRANK="..glrank_var.."\n")
+ f:write("unset OMP_NUM_THREADS\n")
+ if mpitype == "intelmpi" then
+ f:write("export I_MPI_PIN=disable\n")
+ end
+ f:write("LOCALSIZE="..losize_var.."\n\n")
+
+ if mpitype == "openmpi" then
+ f:write("LOCALRANK=$OMPI_COMM_WORLD_LOCAL_RANK\n\n")
+ elseif mpitype == "slurm" then
+ f:write("LOCALRANK=$MPI_LOCALRANKID\n\n")
+ else
+ local full = tostring(math.tointeger(np - (np % ppn)))
+ f:write("if [ \"$GLOBALRANK\" -lt "..tostring(math.tointeger(full)).." ]; then\n")
+ f:write("\tLOCALRANK=$(($GLOBALRANK % $LOCALSIZE))\n")
+ f:write("else\n")
+ f:write("\tLOCALRANK=$(($GLOBALRANK - ("..tostring(math.tointeger(full)).." - 1)))\n")
+ f:write("fi\n\n")
+ end
+
+ if #perf > 0 then
+ f:write("which `basename "..LIKWID_PERFCTR.."` 1>/dev/null 2>&1\n")
+ else
+ f:write("which `basename "..LIKWID_PIN.."` 1>/dev/null 2>&1\n")
+ end
+ f:write("if [ $? -eq 1 ]; then\n")
+ f:write("\tmodule load likwid 1>/dev/null 2>&1\n")
+ f:write("fi\n\n")
+
+ f:write("if [ \"$LOCALRANK\" -eq 0 ]; then\n")
+ if debug then
+ print("NODE_EXEC: "..commands[1])
+ end
+ f:write("\t"..commands[1].."\n")
+
+ for i=2,#commands do
+ f:write("elif [ \"$LOCALRANK\" -eq "..tostring(i-1).." ]; then\n")
+ if debug then
+ print("NODE_EXEC: "..commands[i])
+ end
+ f:write("\t"..commands[i].."\n")
+ end
+ f:write("else\n")
+ f:write("\techo \"Unknown local rank $LOCALRANK\"\n")
+ f:write("fi\n")
+
+ f:close()
+ os.execute("chmod +x "..scriptname)
+end
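+ -- Note: the generated bash wrapper computes GLOBALRANK/LOCALRANK from the
+ -- MPI-specific environment and then dispatches each local rank to its own
+ -- likwid-pin or likwid-perfctr command line with the matching CPU expression
+ -- and, when measuring, the event sets and a per-rank output file.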
+
+
+local function listdir(dir, infilepart)
+ local outlist = {}
+ local p = io.popen("find "..dir.." -maxdepth 1 -type f -name \"*"..infilepart.."*\"")
+ for file in p:lines() do
+ table.insert(outlist, file)
+ end
+ p:close()
+ if #outlist > 0 then
+ table.sort(outlist)
+ end
+ return outlist
+end
+
+
+local function parseOutputFile(filename)
+ local rank = 0
+ local host = nil
+ local cpulist = {}
+ local eventlist = {}
+ local counterlist = {}
+ local idx = 1
+ local gidx = 0
+ local results = {}
+ local f = io.open(filename, "r")
+ if f == nil then
+ print("ERROR: Cannot open output file "..filename)
+ os.exit(1)
+ end
+ rank, host = filename:match("output_%d+_(%d+)_(%g+).csv")
+
+ local t = f:read("*all")
+ f:close()
+ if t:len() == 0 then
+ print("Error Output file "..filename.." is empty")
+ os.exit(1)
+ end
+ for i, line in pairs(likwid.stringsplit(t, "\n")) do
+ if (not line:match("^-")) and
+ (not line:match("^CPU type:")) and
+ (not line:match("^CPU name:")) and
+ (not line:match("^TABLE")) and
+ (not line:match("^STRUCT")) and
+ (not line:match("^%s*$")) and
+ (not line:match("STAT")) then
+ if line:match("^Event") and not line:match("Sum,Min,Max,Avg") then
+ linelist = likwid.stringsplit(line,",")
+ table.remove(linelist,1)
+ table.remove(linelist,1)
+ for _, cpustr in pairs(linelist) do
+ local test = tonumber(cpustr:match("Core (%d+)"))
+ if test ~= nil then
+ for _, cpu in pairs(cpulist) do
+ if cpu == test then test = -1 end
+ end
+ if test >= 0 then
+ table.insert(cpulist, test)
+ end
+ end
+ end
+ gidx = gidx + 1
+ idx = 1
+ if results[gidx] == nil then
+ results[gidx] = {}
+ eventlist[gidx] = {}
+ counterlist[gidx] = {}
+ results[gidx]["time"] = {}
+ end
+ elseif not line:match("^CPU clock:") and not line:match("Sum,Min,Max,Avg") then
+ linelist = likwid.stringsplit(line,",")
+ event = linelist[1]
+ counter = linelist[2]
+ table.remove(linelist,1)
+ table.remove(linelist,1)
+ for j=#linelist,1,-1 do
+ if linelist[j] == "" then
+ table.remove(linelist, j)
+ end
+ end
+ if results[gidx][idx] == nil then
+ results[gidx][idx] = {}
+ end
+ for j, value in pairs(linelist) do
+ if event:match("[Rr]untime") then
+ results[gidx]["time"][cpulist[j]] = tonumber(value)
+ else
+ results[gidx][idx][cpulist[j]] = tonumber(value)
+ end
+ end
+ if not event:match("[Rr]untime") then
+ table.insert(eventlist[gidx], idx, event)
+ table.insert(counterlist[gidx], idx, counter)
+ idx = idx + 1
+ end
+ elseif line:match("^CPU clock:") then
+ results["clock"] = line:match("^CPU clock:,([%d.]+)")
+ results["clock"] = tonumber(results["clock"])*1.E09
+ end
+ end
+ end
+ return host, tonumber(rank), results, cpulist
+end
+
+local function parseMarkerOutputFile(filename)
+ local rank = 0
+ local host = nil
+ local cpulist = {}
+ local eventlist = {}
+ local counterlist = {}
+ local idx = 1
+
+ local results = {}
+ local f = io.open(filename, "r")
+ if f == nil then
+ print("ERROR: Cannot open output file "..filename)
+ os.exit(1)
+ end
+ rank, host = filename:match("output_%d+_(%d+)_(%g+).csv")
+ local t = f:read("*all")
+ f:close()
+ local parse_reg_info = false
+ local parse_reg_output = false
+ local current_region = nil
+ local gidx = 0
+ local gname = ""
+ local clock = 0
+
+ for i, line in pairs(likwid.stringsplit(t, "\n")) do
+ if (not line:match("^-")) and
+ (not line:match("^CPU type:")) and
+ (not line:match("^CPU name:")) and
+ (not line:match("STAT")) then
+
+ if line:match("^STRUCT,Info") and not parse_reg_info then
+ parse_reg_info = true
+ elseif line:match("^Event") and not line:match("Sum,Min,Max,Avg") then
+ parse_reg_info = false
+ parse_reg_output = true
+ idx = 1
+ elseif line:match("^Event") and line:match("Sum,Min,Max,Avg") then
+ parse_reg_output = false
+ elseif line:match("^CPU clock:,") then
+ clock = line:match("^CPU clock:,([%d.]+)")
+ clock = tonumber(clock)*1.E09
+ elseif parse_reg_info and line:match("TABLE,Region (%g+),Group (%d+) Raw,(%g+),") then
+ current_region, gidx, gname = line:match("TABLE,Region (%g+),Group (%d+) Raw,(%g+),")
+ gidx = tonumber(gidx)+1
+ if results[current_region] == nil then
+ results[current_region] = {}
+ end
+ if results[current_region][gidx] == nil then
+ results[current_region][gidx] = {}
+ results[current_region][gidx]["name"] = gname
+ results[current_region][gidx]["time"] = {}
+ results[current_region][gidx]["calls"] = {}
+ end
+ elseif parse_reg_info and line:match("^Region Info") then
+ linelist = likwid.stringsplit(line,",")
+ table.remove(linelist,1)
+ for _, cpustr in pairs(linelist) do
+ if cpustr:match("Core %d+") then
+ local test = tonumber(cpustr:match("Core (%d+)"))
+ if test ~= nil then
+ for _,cpu in pairs(cpulist) do
+ if test == cpu then test = -1 end
+ end
+ if test >= 0 then
+ table.insert(cpulist, test)
+ end
+ end
+ end
+ end
+ elseif parse_reg_info and line:match("^RDTSC") then
+ linelist = likwid.stringsplit(line,",")
+ table.remove(linelist,1)
+ for i, time in pairs(linelist) do
+ if time ~= "" then
+ results[current_region][gidx]["time"][cpulist[i]] = tonumber(time)
+ end
+ end
+ elseif parse_reg_info and line:match("^call count") then
+ linelist = likwid.stringsplit(line,",")
+ table.remove(linelist,1)
+ for j, calls in pairs(linelist) do
+ if calls:match("%d+") then
+ if calls ~= "" then
+ results[current_region][gidx]["calls"][cpulist[j]] = tonumber(calls)
+ end
+ end
+ end
+ elseif parse_reg_output then
+ linelist = likwid.stringsplit(line,",")
+ if linelist[2] ~= "TSC" then
+ table.remove(linelist,1)
+ table.remove(linelist,1)
+ for j=#linelist,1,-1 do
+ if linelist[j] == "" then
+ table.remove(linelist, j)
+ end
+ end
+ if results[current_region][gidx][idx] == nil then
+ results[current_region][gidx][idx] = {}
+ end
+ for j, value in pairs(linelist) do
+ results[current_region][gidx][idx][cpulist[j]] = tonumber(value)
+ end
+ idx = idx + 1
+ end
+ end
+ end
+ end
+ for region, data in pairs(results) do
+ results[region]["clock"] = clock
+ end
+
+ return host, tonumber(rank), results, cpulist
+end
+
+
+function percentile_table(inputtable, skip_cols, skip_lines)
+ local function percentile(sorted_valuelist, k)
+ index = tonumber(k)/100.0 * #sorted_valuelist
+ if index - math.floor(index) >= 0.5 then
+ index = math.ceil(index)
+ else
+ index = math.floor(index)
+ end
+ return tonumber(sorted_valuelist[index])
+ end
+ local outputtable = {}
+ local ncols = #inputtable
+ if ncols == 0 then
+ return outputtable
+ end
+ local nlines = #inputtable[1]
+ if nlines == 0 then
+ return outputtable
+ end
+ perc25 = {"%ile 25"}
+ perc50 = {"%ile 50"}
+ perc75 = {"%ile 75"}
+ for i=skip_lines+1,nlines do
+ perc25[i-skip_lines+1] = 0
+ perc50[i-skip_lines+1] = 0
+ perc75[i-skip_lines+1] = 0
+ end
+ for l=skip_lines+1,nlines do
+ valuelist = {}
+ for c=skip_cols+1, ncols do
+ table.insert(valuelist, inputtable[c][l])
+ end
+ table.sort(valuelist)
+ perc25[l-skip_lines+1] = likwid.num2str(percentile(valuelist, 25))
+ perc50[l-skip_lines+1] = likwid.num2str(percentile(valuelist, 50))
+ perc75[l-skip_lines+1] = likwid.num2str(percentile(valuelist, 75))
+ end
+ table.insert(outputtable, perc25)
+ table.insert(outputtable, perc50)
+ table.insert(outputtable, perc75)
+ return outputtable
+end
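+ -- Worked example: for the sorted per-thread values {1,2,3,4} the 25th/50th/75th
+ -- percentile indices are 1, 2 and 3 (nearest-rank, .5 rounded up), so the three
+ -- %ile rows appended to the statistics table hold the values 1, 2 and 3.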
+
+function printMpiOutput(group_list, all_results, regionname)
+ region = regionname or nil
+ if #group_list == 0 or likwid.tablelength(all_results) == 0 then
+ return
+ end
+ for gidx, gdata in pairs(group_list) do
+ local firsttab = {}
+ local firsttab_combined = {}
+ local secondtab = {}
+ local secondtab_combined = {}
+ local total_threads = 0
+ local all_counters = {}
+ for rank = 0, #all_results do
+ total_threads = total_threads + #all_results[rank]["cpus"]
+ end
+
+ desc = {"Event"}
+ if total_threads == 1 or not gdata["Metrics"] then
+ table.insert(desc, "Runtime (RDTSC) [s]")
+ end
+ if all_results[0]["results"][1]["calls"] then
+ table.insert(desc, "Region calls")
+ end
+ for i=1,#gdata["Events"] do
+ table.insert(desc, gdata["Events"][i]["Event"])
+ end
+ table.insert(firsttab, desc)
+
+ desc = {"Counter"}
+ if total_threads == 1 or not gdata["Metrics"] then
+ table.insert(desc, "TSC")
+ end
+ if all_results[0]["results"][1]["calls"] then
+ table.insert(desc, "CTR")
+ end
+ for i=1,#gdata["Events"] do
+ table.insert(desc, gdata["Events"][i]["Counter"])
+ end
+ table.insert(firsttab, desc)
+
+ for rank = 0, #all_results do
+ for i, cpu in pairs(all_results[rank]["cpus"]) do
+ column = {all_results[rank]["hostname"]..":"..tostring(rank)..":"..tostring(cpu)}
+ if total_threads == 1 or not gdata["Metrics"] then
+ table.insert(column, all_results[rank]["results"][gidx]["time"][cpu])
+ end
+ if all_results[0]["results"][1]["calls"] then
+ table.insert(column, all_results[rank]["results"][gidx]["calls"][cpu])
+ end
+ for j=1,#gdata["Events"] do
+ local value = "0"
+ if all_results[rank]["results"][gidx][j] and
+ all_results[rank]["results"][gidx][j][cpu] then
+ value = likwid.num2str(all_results[rank]["results"][gidx][j][cpu])
+ end
+ table.insert(column, value)
+ end
+ table.insert(firsttab, column)
+ end
+ end
+
+ if total_threads > 1 then
+ firsttab_combined = likwid.tableToMinMaxAvgSum(firsttab, 2, 1)
+ end
+ if gdata["Metrics"] then
+ secondtab[1] = {"Metric"}
+ for j=1,#gdata["Metrics"] do
+ table.insert(secondtab[1], gdata["Metrics"][j]["description"])
+ end
+
+ for rank = 0, #all_results do
+ for i, cpu in pairs(all_results[rank]["cpus"]) do
+ local counterlist = {}
+ for j=1,#gdata["Events"] do
+ local counter = gdata["Events"][j]["Counter"]
+ counterlist[counter] = 0
+ if all_results[rank]["results"][gidx][j] ~= nil and
+ all_results[rank]["results"][gidx][j][cpu] ~= nil then
+ counterlist[counter] = all_results[rank]["results"][gidx][j][cpu]
+ end
+ end
+ counterlist["time"] = all_results[rank]["results"][gidx]["time"][cpu]
+ counterlist["inverseClock"] = 1.0/all_results[rank]["results"]["clock"]
+ tmpList = {all_results[rank]["hostname"]..":"..tostring(rank)..":"..tostring(cpu)}
+ for j=1,#gdata["Metrics"] do
+ local tmp = likwid.num2str(likwid.calculate_metric(gdata["Metrics"][j]["formula"], counterlist))
+ table.insert(tmpList, tmp)
+ end
+ table.insert(secondtab,tmpList)
+ end
+ end
+
+ if total_threads > 1 then
+ secondtab_combined = likwid.tableToMinMaxAvgSum(secondtab, 1, 1)
+ local tmp = percentile_table(secondtab, 1, 1)
+ for i, col in pairs(tmp) do
+ table.insert(secondtab_combined, col)
+ end
+ end
+ end
+ if use_csv then
+ local maxLineFields = #firsttab
+ if #firsttab_combined > maxLineFields then maxLineFields = #firsttab_combined end
+ if gdata["Metrics"] then
+ if #secondtab > maxLineFields then maxLineFields = #secondtab end
+ if #secondtab_combined > maxLineFields then maxLineFields = #secondtab_combined end
+ end
+ if region then
+ print("Region,"..tostring(region).. string.rep(",", maxLineFields - 2))
+ end
+ print("Group,"..tostring(gidx) .. string.rep(",", maxLineFields - 2))
+ likwid.printcsv(firsttab, maxLineFields)
+ if total_threads > 1 then likwid.printcsv(firsttab_combined, maxLineFields) end
+ if gdata["Metrics"] then
+ likwid.printcsv(secondtab, maxLineFields)
+ if total_threads > 1 then likwid.printcsv(secondtab_combined, maxLineFields) end
+ end
+ else
+ if region then
+ print("Region: "..tostring(region))
+ end
+ print("Group: "..tostring(gidx))
+ likwid.printtable(firsttab)
+ if total_threads > 1 then likwid.printtable(firsttab_combined) end
+ if gdata["Metrics"] then
+ likwid.printtable(secondtab)
+ if total_threads > 1 then likwid.printtable(secondtab_combined) end
+ end
+ end
+ end
+end
+
+
+
+function cpuCount()
+ cputopo = likwid.getCpuTopology()
+ local cpus = cputopo["activeHWThreads"]
+ return cpus
+end
+
+if #arg == 0 then
+ usage()
+ os.exit(0)
+end
+
+for opt,arg in likwid.getopt(arg, {"n:","np:", "nperdomain:","pin:","hostfile:","h","help","v","g:","group:","mpi:","omp:","d","m","O","debug","marker","version","s:","skip:","f"}) do
+ if (type(arg) == "string") then
+ local s,e = arg:find("-")
+ if s == 1 then
+ print(string.format("ERROR: Argmument %s to option -%s starts with invalid character -.", arg, opt))
+ print("ERROR: Did you forget an argument to an option?")
+ os.exit(1)
+ end
+ end
+
+ if opt == "h" or opt == "help" then
+ usage()
+ os.exit(0)
+ elseif opt == "v" or opt == "version"then
+ version()
+ os.exit(0)
+ elseif opt == "d" or opt == "debug" then
+ debug = true
+ elseif opt == "m" or opt == "marker" then
+ use_marker = true
+ elseif opt == "O" then
+ use_csv = true
+ elseif opt == "f" then
+ force = true
+ elseif opt == "n" or opt == "np" then
+ np = tonumber(arg)
+ if np == nil then
+ print("Argument for -n/-np must be a number")
+ os.exit(1)
+ end
+ elseif opt == "nperdomain" then
+ nperdomain = arg
+ local domain, count = nperdomain:match("([NSCM]%d*):(%d+)")
+ if domain == nil then
+ print("Invalid option to -nperdomain")
+ os.exit(1)
+ end
+ elseif opt == "hostfile" then
+ hostfile = arg
+ elseif opt == "pin" then
+ cpuexprs = likwid.stringsplit(arg, "_")
+ elseif opt == "g" or opt == "group" then
+ table.insert(perf, arg)
+ elseif opt == "mpi" then
+ mpitype = arg
+ elseif opt == "omp" then
+ omptype = arg
+ elseif opt == "s" or opt == "skip" then
+ skipStr = "-s "..arg
+ elseif opt == "?" then
+ print("Invalid commandline option -"..arg)
+ os.exit(1)
+ elseif opt == "!" then
+ print("Option requires an argument")
+ os.exit(1)
+ end
+end
+
+
+if np == 0 and nperdomain == nil and #cpuexprs == 0 then
+ print("ERROR: No option -n/-np, -nperdomain or -pin")
+ os.exit(1)
+end
+
+if use_marker and #perf == 0 then
+ print("ERROR: You selected the MarkerAPI feature but didn't set any events on the commandline")
+ os.exit(1)
+end
+
+for i=1,#arg do
+ table.insert(executable, arg[i])
+end
+if #executable == 0 then
+ print("ERROR: No executable given on commandline")
+ os.exit(1)
+elseif os.execute(string.format("ls %s 1>/dev/null 2>&1", executable[1])) == 0 then
+ print("ERROR: Cannot find executable given on commandline")
+ os.exit(1)
+else
+ local f = io.popen(string.format("which %s 2>/dev/null", executable[1]))
+ if f ~= nil then
+ executable[1] = f:read("*line")
+ f:close()
+ end
+ if debug then
+ print("DEBUG: Executable given on commandline: "..table.concat(executable, " "))
+ end
+end
+
+if mpitype == nil then
+ mpitype = getMpiType()
+ if debug then
+ print("DEBUG: Using MPI implementation "..mpitype)
+ end
+end
+if mpitype ~= "intelmpi" and mpitype ~= "mvapich2" and mpitype ~= "openmpi" and mpitype ~= "slurm" then
+ print("ERROR: Cannot determine current MPI implementation. likwid-mpirun checks for openmpi, intelmpi and mvapich2 or if running in a SLURM environment")
+ os.exit(1)
+end
+
+getMpiExec(mpitype)
+if (mpiexecutable == nil) then
+ print(string.format("Cannot find executable for determined MPI implementation %s", mpitype))
+ os.exit(1)
+end
+
+if omptype == nil then
+ omptype = getOmpType()
+ if debug and omptype ~= nil then
+ print("DEBUG: Using OpenMP implementation "..omptype)
+ end
+end
+if omptype == nil then
+ print("WARN: Cannot extract OpenMP vendor from executable or commandline, assuming no OpenMP")
+end
+
+if not hostfile then
+ if os.getenv("PBS_NODEFILE") ~= nil then
+ hostfile = os.getenv("PBS_NODEFILE")
+ hosts = readHostfilePBS(hostfile)
+ elseif os.getenv("LOADL_HOSTFILE") ~= nil then
+ hostfile = os.getenv("LOADL_HOSTFILE")
+ hosts = readHostfilePBS(hostfile)
+ elseif mpitype == "slurm" and os.getenv("SLURM_NODELIST") ~= nil then
+ hostlist = os.getenv("SLURM_NODELIST")
+ hosts = readHostfileSlurm(hostlist)
+ else
+ local cpus = cpuCount()
+ table.insert(hosts, {hostname='localhost', slots=cpus, maxslots=cpus})
+ end
+else
+ hosts = readHostfile(hostfile)
+end
+
+local givenNrNodes = getNumberOfNodes(hosts)
+
+if skipStr == "" then
+ if mpitype == "intelmpi" then
+ if omptype == "intel" and givenNrNodes > 1 then
+ skipStr = '-s 0x3'
+ elseif omptype == "intel" and givenNrNodes == 1 then
+ skipStr = '-s 0x1'
+ elseif omptype == "gnu" and givenNrNodes > 1 then
+ skipStr = '-s 0x1'
+ elseif omptype == "gnu" and givenNrNodes == 1 then
+ skipStr = '-s 0x0'
+ end
+ elseif mpitype == "mvapich2" then
+ if omptype == "intel" and givenNrNodes > 1 then
+ skipStr = '-s 0x7'
+ end
+ elseif mpitype == "openmpi" then
+ if omptype == "intel" and givenNrNodes > 1 then
+ skipStr = '-s 0x7'
+ elseif omptype == "intel" and givenNrNodes == 1 then
+ skipStr = '-s 0x1'
+ elseif omptype == "gnu" and givenNrNodes > 1 then
+ skipStr = '-s 0x7'
+ elseif omptype == "gnu" and givenNrNodes == 1 then
+ skipStr = '-s 0x0'
+ end
+ end
+end
+if debug and skipStr ~= "" then
+ print(string.format("DEBUG: Using skip option %s to skip pinning of shepard threads", skipStr))
+end
+
+if #perf > 0 then
+ local sum_maxslots = 0
+ local topo = likwid.getCpuTopology()
+ if debug then
+ print("DEBUG: Switch to perfctr mode, there are "..tostring(#perf).." eventsets given on the commandline")
+ end
+ for i, host in pairs(hosts) do
+ if debug then
+ local str = string.format("DEBUG: Working on host %s with %d slots", host["hostname"], host["slots"])
+ if host["maxslots"] ~= nil then
+ str = str .. string.format(" and %d slots maximally", host["maxslots"])
+ end
+ print(str)
+ end
+ if host["maxslots"] ~= nil then
+ sum_maxslots = sum_maxslots + host["maxslots"]
+ elseif host["slots"] ~= nil then
+ sum_maxslots = sum_maxslots + host["slots"]
+ else
+ sum_maxslots = sum_maxslots + topo["numHWThreads"]
+ host["slots"] = topo["numHWThreads"]
+ end
+ end
+ if np > sum_maxslots then
+ print("ERROR: Processes requested exceeds maximally available slots of given hosts. Maximal processes: "..sum_maxslots)
+ os.exit(1)
+ end
+end
+
+if #cpuexprs > 0 then
+ cpuexprs = calculatePinExpr(cpuexprs)
+ likwid.tableprint(cpuexprs)
+ print(#cpuexprs)
+ ppn = #cpuexprs
+ if np == 0 then
+ if debug then
+ print(string.format("DEBUG: No -np given , setting according to pin expression and number of available hosts"))
+ end
+ np = givenNrNodes * #cpuexprs
+ ppn = #cpuexprs
+ elseif np < #cpuexprs*givenNrNodes then
+ while np < #cpuexprs*givenNrNodes and #cpuexprs > 1 do
+ print("remove")
+ table.remove(cpuexprs)
+ end
+ ppn = #cpuexprs
+ end
+ newhosts = assignHosts(hosts, np, ppn)
+ if np > #cpuexprs*#newhosts and #perf > 0 then
+ print("ERROR: Oversubsribing not allowed.")
+ print(string.format("ERROR: You want %d processes but the pinning expression has only expressions for %d processes. There are only %d hosts in the host list.", np, #cpuexprs*#newhosts, #newhosts))
+ os.exit(1)
+ end
+elseif nperdomain ~= nil then
+ cpuexprs = calculateCpuExprs(nperdomain, cpuexprs)
+ ppn = #cpuexprs
+ if np == 0 then
+ np = givenNrNodes * ppn
+ end
+ if np < ppn then
+ if debug then
+ print("WARN: Removing additional cpu expressions to get requested number of processes")
+ end
+ for i=np+1,ppn do
+ if debug then
+ print("WARN: Remove cpuexpr: "..cpuexprs[#cpuexprs])
+ end
+ table.remove(cpuexprs, #cpuexprs)
+ end
+ ppn = np
+ elseif np > (givenNrNodes * ppn) and #perf > 0 then
+ print("ERROR: Oversubsribing nodes not allowed!")
+ print(string.format("ERROR: You want %d processes with %d on each of the %d hosts", np, ppn, givenNrNodes))
+ os.exit(1)
+ end
+ newhosts, ppn = assignHosts(hosts, np, ppn)
+elseif ppn == 0 and np > 0 then
+ maxnp = 0
+ maxppn = 0
+ for i, host in pairs(hosts) do
+ maxnp = maxnp + host["slots"]
+ if host["slots"] > maxppn then
+ maxppn = host["slots"]
+ end
+ end
+
+ if ppn == 0 then
+ ppn = 1
+ end
+ if ppn > maxppn and np > maxppn then
+ ppn = maxppn
+ elseif np < maxppn then
+ ppn = np
+ elseif maxppn == np then
+ ppn = maxppn
+ end
+ if (ppn * givenNrNodes) < np then
+ if #perf == 0 then
+ print("ERROR: Processes cannot be equally distributed")
+ print(string.format("WARN: You want %d processes on %d hosts.", np, givenNrNodes))
+ ppn = np/givenNrNodes
+ print(string.format("WARN: Sanitizing number of processes per node to %d", ppn))
+ else
+ ppn = 0
+ os.exit(1)
+ end
+ end
+ local newexprs = calculateCpuExprs("E:N:"..tostring(ppn), cpuexprs)
+ local copynp = np
+ while copynp > 0 do
+ for i, expr in pairs(newexprs) do
+ local exprlist = likwid.stringsplit(expr, ",")
+ local seclength = math.ceil(#exprlist/ppn)
+ local offset = 0
+ for p=1, ppn do
+ local str = ""
+ for j=1, seclength do
+ if exprlist[((p-1)*seclength) + j] then
+ str = str .. exprlist[((p-1)*seclength) + j] ..","
+ end
+ end
+ if str ~= "" then
+ str = str:sub(1,#str-1)
+ table.insert(cpuexprs, str)
+ copynp = copynp - seclength
+ else
+ break
+ end
+ end
+ end
+ end
+ newhosts, ppn = assignHosts(hosts, np, ppn)
+ if np < ppn*#newhosts then
+ np = 0
+ for i, host in pairs(newhosts) do
+ np = np + host["slots"]
+ end
+ end
+else
+ print("ERROR: Commandline settings are not supported.")
+ os.exit(1)
+end
+
+local grouplist = {}
+if #perf > 0 then
+ perfexprs, grouplist = setPerfStrings(perf, cpuexprs)
+end
+
+local nrNodes = getNumberOfNodes(newhosts)
+
+local pid = likwid.getpid()
+local hostfilename = string.format(".hostfile_%s.txt", pid)
+local scriptfilename = string.format(".likwidscript_%s.txt", pid)
+local outfilename = string.format(os.getenv("PWD").."/.output_%s_%%r_%%h.csv", pid)
+
+checkLikwid()
+
+if writeHostfile == nil or getEnvironment == nil or executeCommand == nil then
+ print("ERROR: Initialization for MPI specific functions failed")
+ os.exit(1)
+end
+
+writeHostfile(newhosts, hostfilename)
+writeWrapperScript(scriptfilename, table.concat(executable, " "), newhosts, outfilename)
+local env = getEnvironment()
+executeCommand(scriptfilename, hostfilename, env, nrNodes)
+
+os.remove(scriptfilename)
+os.remove(hostfilename)
+
+infilepart = ".output_"..pid
+filelist = listdir(os.getenv("PWD"), infilepart)
+all_results = {}
+if not use_marker then
+ for i, file in pairs(filelist) do
+ local host, rank, results, cpulist = parseOutputFile(file)
+ if host ~= nil and rank ~= nil then
+ if all_results[rank] == nil then
+ all_results[rank] = {}
+ end
+ all_results[rank]["hostname"] = host
+ all_results[rank]["results"] = results
+ all_results[rank]["cpus"] = cpulist
+ os.remove(file)
+ end
+ end
+ if likwid.tablelength(all_results) > 0 then
+ printMpiOutput(grouplist, all_results)
+ end
+else
+ local tmpList = {}
+ local cpuCount = 0
+ for i, file in pairs(filelist) do
+ host, rank, results, cpulist = parseMarkerOutputFile(file)
+ if host ~= nil and rank ~= nil then
+ if all_results[rank] == nil then
+ all_results[rank] = {}
+ end
+ all_results[rank]["hostname"] = host
+ all_results[rank]["cpus"] = cpulist
+ cpuCount = cpuCount + #cpulist
+ tmpList[rank] = results
+ os.remove(file)
+ end
+ end
+ if likwid.tablelength(all_results) > 0 then
+ for reg, _ in pairs(tmpList[0]) do
+ for rank,_ in pairs(all_results) do
+ all_results[rank]["results"] = tmpList[rank][reg]
+ end
+ printMpiOutput(grouplist, all_results, reg)
+ end
+ end
+end
diff --git a/src/applications/likwid-perfctr.c b/src/applications/likwid-perfctr.c
deleted file mode 100644
index 6c9f98f..0000000
--- a/src/applications/likwid-perfctr.c
+++ /dev/null
@@ -1,528 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: likwid-perfctr.c
- *
- * Description: An application to read out performance counter registers
- * on x86 processors
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* ##### HEADER FILE INCLUDES ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <time.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-#include <signal.h>
-
-#include <error.h>
-#include <types.h>
-#include <bitUtil.h>
-#include <accessClient.h>
-#include <msr.h>
-#include <timer.h>
-#include <cpuid.h>
-#include <affinity.h>
-#include <cpuFeatures.h>
-#include <perfmon.h>
-#include <daemon.h>
-#include <bstrlib.h>
-#include <numa.h>
-#include <strUtil.h>
-
-
-/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
-#define HELP_MSG \
-fprintf(stdout, "likwid-perfctr -- Version %d.%d \n\n",VERSION,RELEASE); \
-fprintf(stdout, "\n"); \
-fprintf(stdout, "Example Usage: likwid-perfctr -C 2 ./a.out \n"); \
-fprintf(stdout, "Supported Options:\n"); \
-fprintf(stdout, "-h\t Help message\n"); \
-fprintf(stdout, "-v\t Version information\n"); \
-fprintf(stdout, "-V\t verbose output\n"); \
-fprintf(stdout, "-g\t performance group or event set string\n"); \
-fprintf(stdout, "-H\t Get group help (together with -g switch) \n"); \
-fprintf(stdout, "-t\t timeline mode with frequency in s or ms, e.g. 300ms\n"); \
-fprintf(stdout, "-S\t stethoscope mode with duration in s\n"); \
-fprintf(stdout, "-m\t use markers inside code \n"); \
-fprintf(stdout, "-s\t bitmask with threads to skip\n"); \
-fprintf(stdout, "-o\t Store output to file, with output conversation according to file suffix\n"); \
-fprintf(stdout, "\t Conversation scripts can be supplied in %s\n",TOSTRING(LIKWIDFILTERPATH)); \
-fprintf(stdout, "-O\t Output easily parseable CSV instead of fancy tables\n"); \
-fprintf(stdout, "-M\t set how MSR registers are accessed: 0=direct, 1=msrd\n"); \
-fprintf(stdout, "-a\t list available performance groups\n"); \
-fprintf(stdout, "-e\t list available counters and events\n"); \
-fprintf(stdout, "-i\t print cpu info\n"); \
-fprintf(stdout, "-c\t processor ids to measure (required), e.g 0,3-4,8\n"); \
-fprintf(stdout, "-C\t processor ids to measure (this variant also cares for pinning of process/threads)\n"); \
-fprintf(stdout, "\t\t for -c and -C, see likwid-pin -h for details\n"); \
-fflush(stdout);
-
-
-#define VERSION_MSG \
-fprintf(stdout, "likwid-perfctr %d.%d \n\n",VERSION,RELEASE); \
-fflush(stdout);
-
-/* To be able to give useful error messages instead of just dying without a
- * comment. Mainly happens because we get a SIGPIPE if the daemon drops us. */
-static void Signal_Handler(int sig)
-{
- fprintf(stderr, "ERROR - [%s:%d] Signal %d caught\n", __FILE__, __LINE__, sig);
-}
-
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-int main (int argc, char** argv)
-{
- int optInfo = 0;
- int optPrintGroups = 0;
- int optPrintGroupHelp = 0;
- int optPrintEvents = 0;
- int optUseMarker = 0;
- int optReport = 0;
- int optTimeline = 0;
- int optStethoscope = 0;
- int optPin = 0;
- int c;
- bstring eventString = bfromcstr("_NOGROUP");
- bstring argString;
- bstring pinString;
- bstring skipString;
- bstring filterScript = bfromcstr("NO");
- int skipMask = -1;
- BitMask counterMask;
- bstring filepath = bformat("/tmp/likwid_%u.txt", (uint32_t) getpid());
- int numThreads = 0;
- int threads[MAX_NUM_THREADS];
- threads[0] = 0;
- int i,j;
- FILE* OUTSTREAM = stdout;
- struct timespec interval;
-
- if (argc == 1)
- {
- HELP_MSG;
- bdestroy(filepath);
- bdestroy(eventString);
- exit (EXIT_SUCCESS);
- }
-
- if (cpuid_init() == EXIT_FAILURE)
- {
- ERROR_PLAIN_PRINT(Unsupported processor!);
- }
- numa_init();
- affinity_init();
-
- while ((c = getopt (argc, argv, "+ac:C:d:eg:hHimM:o:OPs:S:t:vV")) != -1)
- {
- switch (c)
- {
- case 'a':
- numThreads = 1; /*to get over the error message */
- threads[0] = 0;
- optPrintGroups = 1;
- break;
- case 'C':
- optPin = 1;
- CHECK_OPTION_STRING;
- numThreads = bstr_to_cpuset(threads, argString);
-
- if(!numThreads)
- {
- ERROR_PLAIN_PRINT(Failed to parse cpu list.);
- }
-
- break;
- case 'c':
- CHECK_OPTION_STRING;
- numThreads = bstr_to_cpuset(threads, argString);
- if(!numThreads)
- {
- ERROR_PLAIN_PRINT(Failed to parse cpu list.);
- }
-
- break;
- case 'd':
- fprintf(stdout, "Option -d for daemon mode is deprecated. Daemon mode has be renamed to timeline mode (Option -t)!\n");
- fflush(stdout);
- break;
- case 'e':
- numThreads=1; /*to get over the error message */
- threads[0]=0;
- optPrintEvents = 1;
- break;
- case 'g':
- CHECK_OPTION_STRING;
- eventString = bstrcpy(argString);
- break;
- case 'h':
- HELP_MSG;
- cpuid_print();
- bdestroy(filepath);
- bdestroy(eventString);
- exit (EXIT_SUCCESS);
- case 'H':
- numThreads=1; /*to get over the error message */
- threads[0]=0;
- optPrintGroupHelp = 1;
- break;
- case 'i':
- numThreads=1; /*to get over the error message */
- threads[0]=0;
- optInfo = 1;
- perfmon_verbose = 1;
- break;
- case 'm':
- optUseMarker = 1;
- break;
- case 'M': /* Set MSR Access mode */
- CHECK_OPTION_STRING;
- accessClient_setaccessmode(str2int((char*) argString->data));
- break;
- case 'o':
- CHECK_OPTION_STRING;
- OUTSTREAM = bstr_to_outstream(argString, filterScript);
-
- if(!OUTSTREAM)
- {
- ERROR_PLAIN_PRINT(Failed to parse out file pattern.);
- }
- break;
- case 'O':
- perfmon_setCSVMode(1);
- break;
- case 's':
- CHECK_OPTION_STRING;
- skipMask = strtoul((char*) argString->data,NULL,16);
- break;
- case 'S':
- CHECK_OPTION_STRING;
- optStethoscope = str2int((char*) argString->data);
- if (optStethoscope <= 0)
- {
- fprintf(stderr, "The measurement time must be larger than 0\n\n");
- HELP_MSG;
- exit(EXIT_FAILURE);
- }
- break;
- case 't':
- CHECK_OPTION_STRING;
- bstr_to_interval(argString, &interval);
- optTimeline = 1;
- break;
- case 'v':
- VERSION_MSG;
- bdestroy(filepath);
- bdestroy(eventString);
- exit (EXIT_SUCCESS);
- case 'V':
- perfmon_verbose = 1;
- break;
- case '?':
- if (optopt == 'S'||optopt == 't'||optopt == 'c'||optopt == 'C'||
- optopt == 'o'||optopt == 'M'||optopt == 'g')
- {
-
- }
- else if (isprint (optopt))
- {
- fprintf (stderr, "Unknown option `-%c'.\n", optopt);
- }
- else
- {
- fprintf (stderr,
- "Unknown option character `\\x%x'.\n",
- optopt);
- }
- return EXIT_FAILURE;
- default:
- HELP_MSG;
- bdestroy(filepath);
- bdestroy(eventString);
- exit (EXIT_SUCCESS);
- }
- }
-
- if (!numThreads)
- {
- fprintf (stderr, "ERROR: Required -c. You must specify at least one processor.\n");
- HELP_MSG;
- exit(EXIT_FAILURE);
- }
-
- if (optPin)
- {
-
- if ( getenv("OMP_NUM_THREADS") == NULL )
- {
- argString = bformat("%d",numThreads);
- setenv("OMP_NUM_THREADS",(char*) argString->data , 0);
- }
-
- if (numThreads > 1)
- {
- bstring ldPreload = bfromcstr(getenv("LD_PRELOAD"));
-
- pinString = bformat("%d",threads[1]);
-
- for (i=2; i < numThreads;i++)
- {
- bformata(pinString,",%d",threads[i]);
- }
-
- bformata(pinString,",%d",threads[0]);
-
- if (skipMask > 0)
- {
- skipString = bformat("%d",skipMask);
- setenv("LIKWID_SKIP",(char*) skipString->data , 1);
- }
- setenv("KMP_AFFINITY", "disabled", 1);
- setenv("LIKWID_PIN",(char*) pinString->data , 1);
-
- setenv("LIKWID_SILENT","true", 1);
- if (ldPreload == NULL)
- {
- setenv("LD_PRELOAD",TOSTRING(LIBLIKWIDPIN), 1);
- }
- else
- {
- bconchar(ldPreload, ':');
- bcatcstr(ldPreload, TOSTRING(LIBLIKWIDPIN));
- setenv("LD_PRELOAD", bdata(ldPreload), 1);
- }
- }
-
- affinity_pinProcess(threads[0]);
- }
-
-
- for (i = 0; i< numThreads;i++)
- {
- for (j = 0; j< numThreads;j++)
- {
- if(i != j && threads[i] == threads[j])
- {
- fprintf (stderr, "ERROR: Processor list (%d",threads[0]);
- for (c=1;c<numThreads;c++)
- {
- fprintf (stderr, ",%d",threads[c]);
- }
- fprintf (stderr, ") is not unique.\n");
- exit(EXIT_FAILURE);
- }
- }
- }
-
- { /* Init signal handler */
- struct sigaction sia;
- sia.sa_handler = Signal_Handler;
- sigemptyset(&sia.sa_mask);
- sia.sa_flags = 0;
- sigaction(SIGPIPE, &sia, NULL);
- }
-
- perfmon_init(numThreads, threads, OUTSTREAM);
-
- if (perfmon_verbose)
- {
- fprintf(OUTSTREAM,"CPU family:\t%u \n",cpuid_info.family);
- fprintf(OUTSTREAM,"CPU model:\t%u \n", cpuid_info.model);
- fprintf(OUTSTREAM,"CPU stepping:\t%u \n", cpuid_info.stepping);
- fprintf(OUTSTREAM,"CPU features:\t%s \n", cpuid_info.features);
-
- if( cpuid_info.family == P6_FAMILY && cpuid_info.perf_version)
- {
- fprintf(OUTSTREAM,HLINE);
- fprintf(OUTSTREAM,"PERFMON version:\t%u \n",cpuid_info.perf_version);
- fprintf(OUTSTREAM,"PERFMON number of counters:\t%u \n",cpuid_info.perf_num_ctr);
- fprintf(OUTSTREAM,"PERFMON width of counters:\t%u \n",cpuid_info.perf_width_ctr);
- fprintf(OUTSTREAM,"PERFMON number of fixed counters:\t%u \n",cpuid_info.perf_num_fixed_ctr);
- }
- }
- fprintf(OUTSTREAM,HLINE);
- fflush(OUTSTREAM);
-
- if (optInfo)
- {
- exit (EXIT_SUCCESS);
- }
- if (optPrintGroups)
- {
- perfmon_printAvailableGroups();
- exit (EXIT_SUCCESS);
- }
- if (optPrintGroupHelp)
- {
- perfmon_printGroupHelp(eventString);
- exit (EXIT_SUCCESS);
- }
- if (optPrintEvents)
- {
- perfmon_printCounters();
- perfmon_printEvents();
- exit (EXIT_SUCCESS);
- }
- if ((!optTimeline && !optStethoscope) && (optind == argc))
- {
- fprintf(OUTSTREAM,"NOTICE: You have to specify a program to measure as argument!\n");
- exit (EXIT_SUCCESS);
- }
- argv += optind;
- bstring exeString = bfromcstr(argv[0]);
- for (i=1; i<(argc-optind); i++)
- {
- bconchar(exeString, ' ');
- bcatcstr(exeString, argv[i]);
- }
- if (blength(exeString) == 0 && !optStethoscope)
- {
- fprintf(OUTSTREAM, "Executable must be given on commandline\n");
- fflush(OUTSTREAM);
- exit(EXIT_FAILURE);
- }
- if (biseqcstr(eventString,"_NOGROUP"))
- {
- fprintf(OUTSTREAM,"NOTICE: You have to specify a group or event set to measure using the -g option.\n");
- fprintf(OUTSTREAM," Use likwid-perfctr -a to get a list of available groups and likwid-perfctr -e for supported events.\n\n");
- exit (EXIT_SUCCESS);
- }
-
- timer_init();
-
- fprintf(OUTSTREAM,HLINE);
- fprintf(OUTSTREAM,"CPU type:\t%s \n",cpuid_info.name);
- fprintf(OUTSTREAM,"CPU clock:\t%3.2f GHz \n", (float) timer_getCpuClock() * 1.E-09);
- fflush(OUTSTREAM);
-
- fprintf(OUTSTREAM,HLINE);
- fflush(OUTSTREAM);
-
- if (optStethoscope)
- {
- perfmon_setupEventSet(eventString, &counterMask);
- perfmon_startCounters();
- sleep(optStethoscope);
- perfmon_stopCounters();
- perfmon_printCounterResults();
- }
- else if (optTimeline)
- {
- fprintf(OUTSTREAM,"CORES: %d", threads[0]);
- for (int i=1; i<numThreads; i++)
- {
- fprintf(OUTSTREAM," %d", threads[i]);
- }
- fprintf(OUTSTREAM," \n");
- fflush(OUTSTREAM);
-
- daemon_start(eventString, interval);
- if (system(bdata(exeString)) == EOF)
- {
- fprintf(stderr, "Failed to execute %s!\n", bdata(exeString));
- exit(EXIT_FAILURE);
- }
- daemon_stop(SIGINT);
- }
- else
- {
- if (perfmon_verbose)
- {
- fprintf(OUTSTREAM,"Executing: %s \n",bdata(exeString));
- fflush(OUTSTREAM);
- }
-
- if (optReport)
- {
- // multiplex_start();
- }
- else if (!optUseMarker && !optTimeline)
- {
- perfmon_setupEventSet(eventString, &counterMask);
- perfmon_startCounters();
- }
- else
- {
- if (getenv("LIKWID_FILEPATH") == NULL)
- setenv("LIKWID_FILEPATH",(char*) filepath->data, 1);
- perfmon_setupEventSet(eventString, &counterMask);
- char* modeStr = (char*) malloc(40 * sizeof(char));
- sprintf(modeStr,"%d",accessClient_mode);
- setenv("LIKWID_MODE", modeStr, 1);
- bitMask_toString(modeStr,counterMask);
- setenv("LIKWID_MASK", modeStr, 1);
- free(modeStr);
-
- perfmon_startCounters();
- }
-
- if (system(bdata(exeString)) == EOF)
- {
- fprintf(stderr, "Failed to execute %s!\n", bdata(exeString));
- exit(EXIT_FAILURE);
- }
-
- if (optReport)
- {
- // multiplex_stop();
- // perfmon_printReport(&set);
- }
- else
- {
- if (optUseMarker)
- {
- perfmon_stopCounters();
- perfmon_printMarkerResults(filepath);
- }
- else
- {
- perfmon_stopCounters();
- perfmon_printCounterResults();
- }
- }
- }
-
- bdestroy(filepath);
- bdestroy(exeString);
- perfmon_finalize();
- fflush(OUTSTREAM);
- fclose(OUTSTREAM);
- /* call filterscript if specified */
- if (!biseqcstr(filterScript,"NO"))
- {
- bcatcstr(filterScript, " perfctr");
- if (system(bdata(filterScript)) == EOF)
- {
- fprintf(stderr, "Failed to execute filter %s!\n", bdata(filterScript));
- exit(EXIT_FAILURE);
- }
- }
-
- return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-perfctr.lua b/src/applications/likwid-perfctr.lua
new file mode 100644
index 0000000..f49ecc7
--- /dev/null
+++ b/src/applications/likwid-perfctr.lua
@@ -0,0 +1,775 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ * Filename: likwid-perfctr.lua
+ *
+ * Description: An application to read out performance counter registers
+ * on x86 processors
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+local function version()
+ print(string.format("likwid-perfctr -- Version %d.%d",likwid.version,likwid.release))
+end
+
+local function examples()
+ print("Examples:")
+ print("Run command on CPU 2 and measure performance group TEST:")
+ print("likwid-perfctr -C 2 -g TEST ./a.out")
+end
+
+local function usage()
+ version()
+ print("A tool to read out performance counter registers on x86 processors\n")
+ print("Options:")
+ print("-h, --help\t\t Help message")
+ print("-v, --version\t\t Version information")
+ print("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
+ print("-c <list>\t\t Processor ids to measure (required), e.g. 1,2-4,8")
+ print("-C <list>\t\t Processor ids to pin threads and measure, e.g. 1,2-4,8")
+ print("\t\t\t For information about the <list> syntax, see likwid-pin")
+ print("-g, --group <string>\t Performance group or custom event set string")
+ print("-H\t\t\t Get group help (together with -g switch)")
+ print("-s, --skip <hex>\t Bitmask with threads to skip")
+ print("-M <0|1>\t\t Set how MSR registers are accessed, 0=direct, 1=accessDaemon")
+ print("-a\t\t\t List available performance groups")
+ print("-e\t\t\t List available events and counter registers")
+ print("-E <string>\t\t List available events and corresponding counters that match <string>")
+ print("-i, --info\t\t Print CPU info")
+ print("-T <time>\t\t Switch eventsets with given frequency")
+ print("-f, --force\t\t Force overwrite of registers if they are in use")
+ print("Modes:")
+ print("-S <time>\t\t Stethoscope mode with duration in s, ms or us, e.g 20ms")
+ print("-t <time>\t\t Timeline mode with frequency in s, ms or us, e.g. 300ms")
+ print("-m, --marker\t\t Use Marker API inside code")
+ print("Output options:")
+ print("-o, --output <file>\t Store output to file. (Optional: Apply text filter according to filename suffix)")
+ print("-O\t\t\t Output easily parseable CSV instead of fancy tables")
+ print("--stats\t\t\t Always print statistics table")
+ print("\n")
+ examples()
+end
+
+
+local config = likwid.getConfiguration()
+verbose = 0
+print_groups = false
+print_events = false
+print_event = nil
+print_info = false
+cpulist = nil
+num_cpus = 0
+pin_cpus = false
+group_string = nil
+event_string = nil
+event_string_list = {}
+avail_groups = {}
+num_avail_groups = 0
+group_list = {}
+group_ids = {}
+activeGroup = 0
+print_group_help = false
+skip_mask = nil
+counter_mask = {}
+access_flags = "e"
+if config["daemonMode"] < 0 then
+ access_mode = 1
+else
+ access_mode = config["daemonMode"]
+ if access_mode == 0 then
+ access_flags = "rw"
+ end
+end
+set_access_modes = false
+use_marker = false
+use_stethoscope = false
+use_timeline = false
+daemon_run = 0
+use_wrapper = false
+duration = 2.E06
+switch_interval = 5
+output = ""
+use_csv = false
+print_stats = false
+execString = nil
+outfile = nil
+forceOverwrite = 0
+gotC = false
+markerFile = string.format("/tmp/likwid_%d.txt",likwid.getpid())
+print_stdout = print
+cpuClock = 1
+likwid.catchSignal()
+
+if #arg == 0 then
+ usage()
+ os.exit(0)
+end
+
+for opt,arg in likwid.getopt(arg, {"a", "c:", "C:", "e", "E:", "g:", "h", "H", "i", "m", "M:", "o:", "O", "P", "s:", "S:", "t:", "v", "V:", "T:", "f", "group:", "help", "info", "version", "verbose:", "output:", "skip:", "marker", "force", "stats"}) do
+ if (type(arg) == "string") then
+ local s,e = arg:find("-");
+ if s == 1 then
+ print_stdout(string.format("Argmument %s to option -%s starts with invalid character -.", arg, opt))
+ print_stdout("Did you forget an argument to an option?")
+ os.exit(1)
+ end
+ end
+ if opt == "h" or opt == "help" then
+ usage()
+ os.exit(0)
+ elseif opt == "v" or opt == "version" then
+ version()
+ os.exit(0)
+ elseif opt == "V" or opt == "verbose" then
+ verbose = tonumber(arg)
+ likwid.setVerbosity(verbose)
+ elseif (opt == "c") then
+ num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+ gotC = true
+ elseif (opt == "C") then
+ num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+ pin_cpus = true
+ gotC = true
+ elseif (opt == "a") then
+ print_groups = true
+ elseif (opt == "e") then
+ print_events = true
+ elseif (opt == "E") then
+ print_event = arg
+ elseif opt == "f" or opt == "force" then
+ forceOverwrite = 1
+ elseif opt == "g" or opt == "group" then
+ table.insert(event_string_list, arg)
+ elseif (opt == "H") then
+ print_group_help = true
+ elseif opt == "s" or opt == "skip" then
+ if arg:match("0x[0-9A-F]") then
+ skip_mask = arg
+ else
+ if arg:match("[0-9A-F]") then
+ print("Given skip mask looks like hex, sanitizing arg to 0x"..arg)
+ skip_mask = "0x"..arg
+ else
+ print("Skip mask must be given in hex")
+ end
+ end
+ elseif (opt == "M") then
+ access_mode = tonumber(arg)
+ set_access_modes = true
+ if access_mode == 0 then
+ access_flags = "rw"
+ else
+ access_flags = "e"
+ end
+ if (access_mode < 0 or access_mode > 1) then
+ print_stdout("Access mode must be 0 for direct access and 1 for access daemon")
+ os.exit(1)
+ end
+ elseif opt == "i" or opt == "info" then
+ print_info = true
+ verbose = 1
+ elseif opt == "m" or opt == "marker" then
+ use_marker = true
+ use_wrapper = true
+ elseif (opt == "S") then
+ use_stethoscope = true
+ duration = likwid.parse_time(arg)
+ elseif (opt == "t") then
+ use_timeline = true
+ duration = likwid.parse_time(arg)
+ elseif (opt == "T") then
+ duration = likwid.parse_time(arg)
+ elseif opt == "o" or opt == "output" then
+ local suffix = ""
+ if string.match(arg, "%.") then
+ suffix = string.match(arg, ".-[^\\/]-%.?([^%.\\/]*)$")
+ end
+ if suffix ~= "txt" then
+ use_csv = true
+ end
+ outfile = arg:gsub("%%h", likwid.gethostname())
+ outfile = outfile:gsub("%%p", likwid.getpid())
+ outfile = outfile:gsub("%%j", likwid.getjid())
+ outfile = outfile:gsub("%%r", likwid.getMPIrank())
+ io.output(outfile..".tmp")
+ print = function(...) for k,v in pairs({...}) do io.write(v .. "\n") end end
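+ -- Illustrative note (hypothetical file name): with `-o perf_%h_%r.csv` on host
+ -- "node042" and MPI rank 3, output is first written to "perf_node042_3.csv.tmp"
+ -- and renamed or filtered into "perf_node042_3.csv" at the end of the run.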
+ elseif (opt == "O") then
+ use_csv = true
+ elseif (opt == "stats") then
+ print_stats = true
+ elseif opt == "?" then
+ print("Invalid commandline option -"..arg)
+ os.exit(1)
+ elseif opt == "!" then
+ print("Option requires an argument")
+ os.exit(1)
+ end
+end
+
+io.stdout:setvbuf("no")
+cpuinfo = likwid.getCpuInfo()
+cputopo = likwid.getCpuTopology()
+
+if not likwid.msr_available(access_flags) then
+ if access_mode == 1 then
+ print_stdout("MSR device files not available")
+ print_stdout("Please load msr kernel module before retrying")
+ os.exit(1)
+ else
+ print_stdout("MSR device files not readable and writeable")
+ print_stdout("Be sure that you have enough permissions to access the MSR files directly")
+ os.exit(1)
+ end
+end
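+-- Note: the check above expects either a usable access daemon (access_mode 1) or
+-- directly read-/writable MSR device files. On a typical Linux system the device
+-- files only exist after the msr kernel module has been loaded (e.g. `modprobe msr`),
+-- which is an administrative step outside of this script.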
+
+if num_cpus == 0 and
+ not gotC and
+ not print_events and
+ print_event == nil and
+ not print_groups and
+ not print_group_help and
+ not print_info then
+ print_stdout("Option -c <list> or -C <list> must be given on commandline")
+ usage()
+ os.exit(1)
+elseif num_cpus == 0 and
+ gotC and
+ not print_events and
+ print_event == nil and
+ not print_groups and
+ not print_group_help and
+ not print_info then
+ print_stdout("CPUs given on commandline are not valid in current environment, maybe it's limited by a cpuset.")
+ os.exit(1)
+end
+
+
+if num_cpus > 0 then
+ for i,cpu1 in pairs(cpulist) do
+ for j, cpu2 in pairs(cpulist) do
+ if i ~= j and cpu1 == cpu2 then
+ print_stdout("List of CPUs is not unique, got two times CPU " .. tostring(cpu1))
+ os.exit(1)
+ end
+ end
+ end
+end
+
+
+
+if print_events == true then
+ local tab = likwid.getEventsAndCounters()
+ print_stdout(string.format("This architecture has %d counters.", #tab["Counters"]))
+ local outstr = "Counters names: "
+ print_stdout("Counter tags(name, type<, options>):")
+ for _, counter in pairs(tab["Counters"]) do
+ outstr = string.format("%s, %s", counter["Name"], counter["TypeName"]);
+ if counter["Options"]:len() > 0 then
+ outstr = outstr .. string.format(", %s",counter["Options"])
+ end
+ print_stdout(outstr)
+ end
+ print_stdout("\n\n")
+ print_stdout(string.format("This architecture has %d events.",#tab["Events"]))
+ print_stdout("Event tags (tag, id, umask, counters<, options>):")
+ for _, eventTab in pairs(tab["Events"]) do
+ outstr = eventTab["Name"] .. ", "
+ outstr = outstr .. string.format("0x%X, 0x%X, ",eventTab["ID"],eventTab["UMask"])
+ outstr = outstr .. eventTab["Limit"]
+ if #eventTab["Options"] > 0 then
+ outstr = outstr .. string.format(", %s",eventTab["Options"])
+ end
+ print_stdout(outstr)
+ end
+ os.exit(0)
+end
+
+if print_event ~= nil then
+ function case_insensitive_pattern(pattern)
+ local p = pattern:gsub("(%%?)(.)", function(percent, letter)
+ if percent ~= "" or not letter:match("%a") then
+ return percent .. letter
+ else
+ return string.format("[%s%s]", letter:lower(), letter:upper())
+ end
+ end)
+ return p
+ end
+ local tab = likwid.getEventsAndCounters()
+ local events = {}
+ local counters = {}
+ local outstr = ""
+ for _, eventTab in pairs(tab["Events"]) do
+ if eventTab["Name"]:match(case_insensitive_pattern(print_event)) then
+ table.insert(events, eventTab)
+ end
+ end
+ for _, counter in pairs(tab["Counters"]) do
+ for _, event in pairs(events) do
+ if counter["Name"]:match(event["Limit"]) then
+ counters[counter["Name"]] = counter
+ end
+ end
+ end
+ print_stdout(string.format("Found %d event(s) with search key %s:", #events, print_event))
+ for _, eventTab in pairs(events) do
+ outstr = eventTab["Name"] .. ", "
+ outstr = outstr .. string.format("0x%X, 0x%X, ",eventTab["ID"],eventTab["UMask"])
+ outstr = outstr .. eventTab["Limit"]
+ if #eventTab["Options"] > 0 then
+ outstr = outstr .. string.format(", %s",eventTab["Options"])
+ end
+ print_stdout(outstr)
+ end
+ print_stdout("\nUsable counter(s) for above event(s):")
+ for i, counter in pairs(counters) do
+ outstr = string.format("%s, %s", counter["Name"], counter["TypeName"]);
+ if counter["Options"]:len() > 0 then
+ outstr = outstr .. string.format(", %s",counter["Options"])
+ end
+ print_stdout(outstr)
+ end
+ likwid.putTopology()
+ likwid.putConfiguration()
+ os.exit(0)
+end
+
+avail_groups = likwid.getGroups()
+if print_groups == true then
+ print_stdout(string.format("%11s\t%s","Group name", "Description"))
+ print_stdout(likwid.hline)
+ for i,g in pairs(avail_groups) do
+ print_stdout(string.format("%11s\t%s",g["Name"], g["Info"]))
+ end
+ likwid.putTopology()
+ likwid.putConfiguration()
+ os.exit(0)
+end
+
+if print_group_help == true then
+ if #event_string_list == 0 then
+ print_stdout("Group(s) must be given on commandline to get group help")
+ os.exit(1)
+ end
+ for i,event_string in pairs(event_string_list) do
+ local s,e = event_string:find(":")
+ if s ~= nil then
+ print_stdout("Given string is no group")
+ os.exit(1)
+ end
+ for i,g in pairs(avail_groups) do
+ if event_string == g["Name"] then
+ print_stdout(string.format("Group %s:",g["Name"]))
+ print_stdout(g["Long"])
+ end
+ end
+ end
+ likwid.putTopology()
+ likwid.putConfiguration()
+ os.exit(0)
+end
+
+if #event_string_list == 0 and not print_info then
+ print_stdout("Option(s) -g <string> must be given on commandline")
+ usage()
+ likwid.putTopology()
+ likwid.putConfiguration()
+ os.exit(1)
+end
+
+if (cpuinfo["clock"] > 0) then
+ cpuClock = cpuinfo["clock"]
+else
+ cpuClock = likwid.getCpuClock()
+end
+
+if outfile == nil then
+ print_stdout(likwid.hline)
+ print_stdout(string.format("CPU name:\t%s",cpuinfo["osname"]))
+ print_stdout(string.format("CPU type:\t%s",cpuinfo["name"]))
+ print_stdout(string.format("CPU clock:\t%3.2f GHz",cpuClock * 1.E-09))
+end
+
+if print_info or verbose > 0 then
+ print_stdout(string.format("CPU family:\t%u", cpuinfo["family"]))
+ print_stdout(string.format("CPU model:\t%u", cpuinfo["model"]))
+ print_stdout(string.format("CPU short:\t%s", cpuinfo["short_name"]))
+ print_stdout(string.format("CPU stepping:\t%u", cpuinfo["stepping"]))
+ print_stdout(string.format("CPU features:\t%s", cpuinfo["features"]))
+ P6_FAMILY = 6
+ if cpuinfo["family"] == P6_FAMILY and cpuinfo["perf_version"] > 0 then
+ print_stdout(likwid.hline)
+ print_stdout(string.format("PERFMON version:\t%u",cpuinfo["perf_version"]))
+ print_stdout(string.format("PERFMON number of counters:\t%u",cpuinfo["perf_num_ctr"]))
+ print_stdout(string.format("PERFMON width of counters:\t%u",cpuinfo["perf_width_ctr"]))
+ print_stdout(string.format("PERFMON number of fixed counters:\t%u",cpuinfo["perf_num_fixed_ctr"]))
+ end
+ print_stdout(likwid.hline)
+ if print_info then
+ likwid.printSupportedCPUs()
+ likwid.putTopology()
+ likwid.putConfiguration()
+ os.exit(0)
+ end
+end
+
+if use_stethoscope == false and use_timeline == false and use_marker == false then
+ use_wrapper = true
+end
+
+if use_wrapper and likwid.tablelength(arg)-2 == 0 and print_info == false then
+ print_stdout("No Executable can be found on commandline")
+ usage()
+ likwid.putTopology()
+ likwid.putConfiguration()
+ os.exit(0)
+end
+
+if use_marker then
+ if likwid.access(markerFile, "rw") ~= -1 then
+ print_stdout(string.format("ERROR: MarkerAPI file %s not accessible. Maybe a remaining file of another user.", markerFile))
+ print_stdout("Please purge all MarkerAPI files from /tmp.")
+ os.exit(1)
+ end
+ if not pin_cpus then
+ print_stdout("Warning: The Marker API requires the application to run on the selected CPUs.")
+ print_stdout("Warning: likwid-perfctr pins the application only when using the -C command line option.")
+ print_stdout("Warning: LIKWID assumes that the application does it before the first instrumented code region is started.")
+ print_stdout("Warning: You can use the string in the environment variable LIKWID_THREADS to pin you application to")
+ print_stdout("Warning: to the CPUs specified after the -c command line option.")
+ end
+end
+
+if verbose == 0 then
+ likwid.setenv("LIKWID_SILENT","true")
+end
+
+if pin_cpus then
+ local omp_threads = os.getenv("OMP_NUM_THREADS")
+ if omp_threads == nil then
+ likwid.setenv("OMP_NUM_THREADS",tostring(math.tointeger(num_cpus)))
+ elseif num_cpus > tonumber(omp_threads) then
+ print_stdout(string.format("Environment variable OMP_NUM_THREADS already set to %s but %d cpus required", omp_threads,num_cpus))
+ end
+ if os.getenv("CILK_NWORKERS") == nil then
+ likwid.setenv("CILK_NWORKERS", tostring(math.tointeger(num_cpus)))
+ end
+ if skip_mask then
+ likwid.setenv("LIKWID_SKIP",skip_mask)
+ end
+ likwid.setenv("KMP_AFFINITY","disabled")
+
+ if num_cpus > 1 then
+ local pinString = tostring(math.tointeger(cpulist[2]))
+ for i=3,likwid.tablelength(cpulist) do
+ pinString = pinString .. "," .. tostring(math.tointeger(cpulist[i]))
+ end
+ pinString = pinString .. "," .. tostring(math.tointeger(cpulist[1]))
+ likwid.setenv("LIKWID_PIN", pinString)
+
+ local preload = os.getenv("LD_PRELOAD")
+ if preload == nil then
+ likwid.setenv("LD_PRELOAD",likwid.pinlibpath)
+ else
+ likwid.setenv("LD_PRELOAD",likwid.pinlibpath .. ":" .. preload)
+ end
+ elseif num_cpus == 1 then
+ likwid.setenv("LIKWID_PIN", tostring(math.tointeger(cpulist[1])))
+ if verbose > 0 then
+ likwid.pinProcess(cpulist[1], 0)
+ else
+ likwid.pinProcess(cpulist[1], 1)
+ end
+ end
+end
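+--[[ Illustrative sketch (hypothetical values): for `likwid-perfctr -C 2,4-6 -g L2 ./a.out`
+the block above exports roughly
+  OMP_NUM_THREADS=4        (only if not already set)
+  KMP_AFFINITY=disabled
+  LIKWID_PIN=4,5,6,2       (remaining CPUs first, the first CPU of the list last)
+  LD_PRELOAD=<pin library>:<previous LD_PRELOAD, if any>
+The preloaded pin library then pins the threads listed in LIKWID_PIN as they are created. ]]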
+
+
+
+--[[for i, event_string in pairs(event_string_list) do
+ local groupdata = likwid.get_groupdata(event_string)
+ if groupdata == nil then
+ print_stdout("Cannot read event string, it's neither a performance group nor a proper event string <event>:<counter>:<options>,...")
+ usage()
+ likwid.putTopology()
+ likwid.putConfiguration()
+ os.exit(1)
+ end
+ table.insert(group_list, groupdata)
+ event_string_list[i] = groupdata["EventString"]
+end]]
+
+
+if set_access_modes then
+ if likwid.setAccessClientMode(access_mode) ~= 0 then
+ likwid.putTopology()
+ likwid.putConfiguration()
+ os.exit(1)
+ end
+end
+if likwid.init(num_cpus, cpulist) < 0 then
+ likwid.putTopology()
+ likwid.putConfiguration()
+ os.exit(1)
+end
+
+likwid.setenv("LIKWID_FORCE", tostring(forceOverwrite))
+for i, event_string in pairs(event_string_list) do
+ if event_string:len() > 0 then
+ local gid = likwid.addEventSet(event_string)
+ if gid < 0 then
+ likwid.putTopology()
+ likwid.putConfiguration()
+ likwid.finalize()
+ os.exit(1)
+ end
+ table.insert(group_ids, gid)
+ end
+end
+if #group_ids == 0 then
+ print("ERROR: No valid eventset given on commandline. Exiting...")
+ likwid.putTopology()
+ likwid.putConfiguration()
+ likwid.finalize()
+ os.exit(1)
+end
+
+activeGroup = group_ids[1]
+likwid.setupCounters(activeGroup)
+if outfile == nil then
+ print_stdout(likwid.hline)
+end
+
+if use_marker == true then
+ likwid.setenv("LIKWID_FILEPATH", markerFile)
+ likwid.setenv("LIKWID_MODE", tostring(access_mode))
+ likwid.setenv("LIKWID_DEBUG", tostring(verbose))
+ local str = table.concat(event_string_list, "|")
+ likwid.setenv("LIKWID_EVENTS", str)
+ likwid.setenv("LIKWID_THREADS", table.concat(cpulist,","))
+ likwid.setenv("LIKWID_FORCE", "-1")
+end
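+-- Illustrative note (hypothetical values): with `-C 0,1 -g FLOPS_DP -g L2 -m` the
+-- instrumented program would see, e.g., LIKWID_EVENTS="FLOPS_DP|L2",
+-- LIKWID_THREADS="0,1", LIKWID_MODE="1" and LIKWID_FILEPATH="/tmp/likwid_<pid>.txt".
+-- The Marker API reads these variables and writes its results to LIKWID_FILEPATH,
+-- which is picked up further below via likwid.getMarkerResults().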
+
+execString = table.concat(arg," ",1, likwid.tablelength(arg)-2)
+if verbose > 0 then
+ print_stdout(string.format("Executing: %s",execString))
+end
+local ldpath = os.getenv("LD_LIBRARY_PATH")
+local libpath = likwid.pinlibpath:match("([/%g]+)/%g+.so")
+if ldpath == nil then
+ likwid.setenv("LD_LIBRARY_PATH", libpath)
+elseif not ldpath:match(libpath) then
+ likwid.setenv("LD_LIBRARY_PATH", libpath..":"..ldpath)
+end
+
+
+if use_timeline == true then
+ local cores_string = "CORES: "
+ for i, cpu in pairs(cpulist) do
+ cores_string = cores_string .. tostring(cpu) .. "|"
+ end
+ io.stderr:write("# "..cores_string:sub(1,cores_string:len()-1).."\n")
+ for gid, group in pairs(group_list) do
+ local strlist = {}
+ if group["Metrics"] == nil then
+ for i,e in pairs(group["Events"]) do
+ table.insert(strlist, e["Event"])
+ end
+ else
+ for i,e in pairs(group["Metrics"]) do
+ table.insert(strlist, e["description"])
+ end
+ end
+ io.stderr:write("# "..table.concat(strlist, "|").."\n")
+ end
+end
+
+
+
+io.stdout:flush()
+local groupTime = {}
+if use_wrapper or use_timeline then
+ local start = likwid.startClock()
+ local stop = 0
+ local alltime = 0
+ local nr_events = likwid.getNumberOfEvents(activeGroup)
+ local nr_threads = likwid.getNumberOfThreads()
+ local firstrun = true
+
+ if use_wrapper and #group_ids == 1 then
+ duration = 30.E06
+ end
+
+ local ret = likwid.startCounters()
+ if ret < 0 then
+ print_stdout(string.format("Error starting counters for cpu %d.",cpulist[ret * (-1)]))
+ os.exit(1)
+ end
+
+ local pid = nil
+ if pin_cpus then
+ pid = likwid.startProgram(execString, #cpulist, cpulist)
+ else
+ pid = likwid.startProgram(execString, 0, cpulist)
+ end
+
+ if not pid then
+ print_stdout("Failed to execute command: ".. execString)
+ end
+ start = likwid.startClock()
+ groupTime[activeGroup] = 0
+ while true do
+ if likwid.getSignalState() ~= 0 then
+ likwid.killProgram()
+ break
+ end
+ local remain = likwid.sleep(duration)
+ if remain > 0 or not likwid.checkProgram(pid) then
+ io.stdout:flush()
+ break
+ end
+ if use_timeline == true then
+ stop = likwid.stopClock()
+ likwid.stopCounters()
+
+ local time = likwid.getClock(start, stop)
+ if likwid.getNumberOfMetrics(activeGroup) == 0 then
+ results = likwid.getLastResults()
+ else
+ results = likwid.getLastMetrics()
+ end
+ str = tostring(math.tointeger(activeGroup)) .. " "..tostring(#results[activeGroup]).." "..tostring(#cpulist).." "..tostring(time)
+ for i,l1 in pairs(results[activeGroup]) do
+ for j, value in pairs(l1) do
+ str = str .. " " .. tostring(value)
+ end
+ end
+ io.stderr:write(str.."\n")
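+ -- Illustrative sample line (hypothetical numbers): with one group, two metrics and
+ -- two CPUs the line written to stderr above looks like
+ --   1 2 2 5.0023 1234.5 1201.7 98.2 97.9
+ -- i.e. <group id> <#metrics/events> <#CPUs> <time> followed by one value per metric
+ -- and CPU. likwid-perfscope parses exactly this format.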
+ groupTime[activeGroup] = time
+ likwid.startCounters()
+ else
+ likwid.readCounters()
+ end
+ if #group_ids > 1 then
+ likwid.switchGroup(activeGroup + 1)
+ activeGroup = likwid.getIdOfActiveGroup()
+ if groupTime[activeGroup] == nil then
+ groupTime[activeGroup] = 0
+ end
+ nr_events = likwid.getNumberOfEvents(activeGroup)
+ end
+
+ end
+ stop = likwid.stopClock()
+elseif use_stethoscope then
+ local ret = likwid.startCounters()
+ if ret < 0 then
+ print_stdout(string.format("Error starting counters for cpu %d.",cpulist[ret * (-1)]))
+ os.exit(1)
+ end
+ likwid.sleep(duration)
+elseif use_marker then
+ local ret = likwid.startCounters()
+ if ret < 0 then
+ print_stdout(string.format("Error starting counters for cpu %d.",cpulist[ret * (-1)]))
+ os.exit(1)
+ end
+ local ret = os.execute(execString)
+ if ret == nil then
+ print_stdout("Failed to execute command: ".. execString)
+ end
+end
+
+local ret = likwid.stopCounters()
+if ret < 0 then
+ print_stdout(string.format("Error stopping counters for thread %d.",ret * (-1)))
+ likwid.finalize()
+ likwid.putTopology()
+ likwid.putConfiguration()
+ os.exit(1)
+end
+io.stdout:flush()
+if outfile == nil then
+ print_stdout(likwid.hline)
+end
+
+
+if use_marker == true then
+ results, metrics = likwid.getMarkerResults(markerFile, cpulist)
+ if #results == 0 then
+ print_stdout("No regions could be found in Marker API result file")
+ else
+ for r=1, #results do
+ likwid.printOutput(results[r], metrics[r], cpulist, r, print_stats)
+ end
+ end
+ os.remove(markerFile)
+elseif use_timeline == false then
+ results = likwid.getResults()
+ metrics = likwid.getMetrics()
+ likwid.printOutput(results, metrics, cpulist, nil, print_stats)
+end
+
+if outfile then
+ local suffix = ""
+ if string.match(outfile,"%.") then
+ suffix = string.match(outfile, ".-[^\\/]-%.?([^%.\\/]*)$")
+ end
+ local command = "<INSTALLED_PREFIX>/share/likwid/filter/" .. suffix
+ local tmpfile = outfile..".tmp"
+ if suffix == "" then
+ os.rename(tmpfile, outfile)
+ elseif suffix ~= "txt" and suffix ~= "csv" and likwid.access(command, "x") then
+ print_stdout("Cannot find filter script, save output in CSV format to file "..outfile)
+ os.rename(tmpfile, outfile)
+ else
+ if suffix ~= "txt" and suffix ~= "csv" then
+ command = command .." ".. tmpfile .. " perfctr"
+ local f = assert(io.popen(command))
+ if f ~= nil then
+ local o = f:read("*a")
+ if o:len() > 0 then
+ print_stdout(string.format("Failed to executed filter script %s.",command))
+ end
+ else
+ print_stdout("Failed to call filter script, save output in CSV format to file "..outfile)
+ os.rename(tmpfile, outfile)
+ os.remove(tmpfile)
+ end
+ else
+ os.rename(tmpfile, outfile)
+ os.remove(tmpfile)
+ end
+ end
+end
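+-- Illustrative note (hypothetical file name): `-o out.xml` looks for the filter script
+-- <INSTALLED_PREFIX>/share/likwid/filter/xml and runs it as
+--   <INSTALLED_PREFIX>/share/likwid/filter/xml out.xml.tmp perfctr
+-- falling back to plain CSV output in out.xml if the script is not usable.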
+
+likwid.finalize()
+likwid.putTopology()
+likwid.putNumaInfo()
+likwid.putConfiguration()
+os.exit(0)
diff --git a/src/applications/likwid-perfscope.lua b/src/applications/likwid-perfscope.lua
new file mode 100644
index 0000000..c1165a7
--- /dev/null
+++ b/src/applications/likwid-perfscope.lua
@@ -0,0 +1,560 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ * Filename: likwid-perfscope.lua
+ *
+ * Description: An application to use the timeline mode of likwid-perfctr to generate
+ * realtime plots using feedGnuplot
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+PERFCTR="<INSTALLED_BINPREFIX>/likwid-perfctr"
+FEEDGNUPLOT="<INSTALLED_BINPREFIX>/feedGnuplot"
+
+local predefined_plots = {
+ FLOPS_DP = {
+ perfgroup = "FLOPS_DP",
+ ymetricmatch = "MFlops/s",
+ title = "Double Precision Flop Rate",
+ ytitle = "MFlops/s",
+ y2title = nil,
+ xtitle = "Time"
+ },
+ FLOPS_SP = {
+ perfgroup = "FLOPS_SP",
+ ymetricmatch = "MFlops/s",
+ title = "Single Precision Flop Rate",
+ ytitle = "MFlops/s",
+ y2title = nil,
+ xtitle = "Time"
+ },
+ L2 = {
+ perfgroup = "L2",
+ ymetricmatch = "L2D load bandwidth [MBytes/s]",
+ title = "L2 cache bandwidth",
+ ytitle = "Load Bandwidth [MBytes/s]",
+ y2metricmatch = "L2D evict bandwidth [MBytes/s]",
+ y2title = "Evict Bandwidth [MBytes/s]",
+ xtitle = "Time"
+ },
+ L3 = {
+ perfgroup = "L3",
+ ymetricmatch = "L3 load bandwidth [MBytes/s]",
+ title = "L3 cache bandwidth",
+ ytitle = "Load Bandwidth [MBytes/s]",
+ y2title = "Evict Bandwidth [MBytes/s]",
+ y2metricmatch = "L3 evict bandwidth [MBytes/s]",
+ xtitle = "Time"
+ },
+ MEM = {
+ perfgroup = "MEM",
+ ymetricmatch = "Memory bandwidth [MBytes/s]",
+ title = "Memory bandwidth",
+ ytitle = "Bandwidth [MBytes/s]",
+ y2title = nil,
+ xtitle = "Time"
+ },
+ QPI = {
+ perfgroup = "QPI",
+ ymetricmatch = "QPI data bandwidth [MByte/s]",
+ title = "QPI bandwidth",
+ ytitle = "Bandwidth [MBytes/s]",
+ y2title = nil,
+ xtitle = "Time",
+ y2metricmatch = "QPI link bandwidth [MByte/s]"
+ },
+ ENERGY = {
+ perfgroup = "ENERGY",
+ ymetricmatch = "Power [W]",
+ title = "Consumed energy",
+ ytitle = "Power [W]",
+ y2title = "Power DRAM [W]",
+ y2metricmatch = "Power DRAM [W]",
+ xtitle = "Time"
+ },
+ TEMP = {
+ perfgroup = "ENERGY",
+ ymetricmatch = "Temperature [C]",
+ title = "Temperature",
+ ytitle = "Temperature [C]",
+ y2title = nil,
+ xtitle = "Time"
+ },
+ NUMA = {
+ perfgroup = "NUMA",
+ ymetricmatch = "Local DRAM bandwidth [MByte/s]",
+ title = "NUMA separated memory bandwidth",
+ ytitle = "Bandwidth [MBytes/s]",
+ y2metricmatch = "Remote DRAM bandwidth [MByte/s]",
+ y2title = nil,
+ xtitle = "Time"
+ },
+}
+
+local function version()
+ print(string.format("likwid-perfscope -- Version %d.%d",likwid.version,likwid.release))
+end
+
+local function examples()
+ print("Examples:")
+ print("Run command on CPU 2 and measure performance group TEST:")
+ print("likwid-perfscope -C 2 -g TEST -f 1s ./a.out")
+end
+
+local function usage()
+ version()
+ print("A tool to generate pictures on-the-fly from likwid-perfctr measurements\n")
+ print("Options:")
+ print("-h, --help\t\t Help message")
+ print("-v, --version\t\t Version information")
+ print("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
+ print("-a\t\t\t Print all preconfigured plot configurations for the current system.")
+ print("-c <list>\t\t Processor ids to measure, e.g. 1,2-4,8")
+ print("-C <list>\t\t Processor ids to pin threads and measure, e.g. 1,2-4,8")
+ print("-g, --group <string>\t Preconfigured plot group or custom event set string with plot config. See man page for information.")
+ print("-t, --time <time>\t Frequency in s, ms or us, e.g. 300ms, for the timeline mode of likwid-perfctr")
+ print("-f, --force\t\t Overwrite counter configuration although already in use")
+ print("-d, --dump\t\t Print output as it is send to feedGnuplot.")
+ print("-p, --plotdump\t\t Use dump functionality of feedGnuplot. Plots out plot configurations plus data to directly submit to gnuplot")
+ print("--host <host>\t\t Run likwid-perfctr on the selected host using SSH. Evaluation and plotting is done locally.")
+ print("\t\t\t This can be used for machines that have no gnuplot installed. All paths must be similar to the local machine.")
+ print("\n")
+ examples()
+end
+
+local function test_gnuplot()
+ local f = io.popen("which gnuplot 2>/dev/null")
+ if f ~= nil then
+ local path = f:read("*l")
+ io.close(f)
+ return path ~= nil and path:len() > 0
+ end
+ return false
+end
+
+local eventStrings = {}
+local terminal = "x11"
+local num_cpus = 0
+local cpulist = {}
+local matchstring = nil
+local group_list = {}
+local timeline = "1s"
+local print_configs = false
+local pinning = false
+local dump = false
+local plotdump = false
+local nrgroups, allgroups = likwid.get_groups()
+local mfreq = 1.0
+local plotrange = 0
+local host = nil
+local force = false
+
+if #arg == 0 then
+ usage()
+ os.exit(0)
+end
+
+for opt,arg in likwid.getopt(arg, {"h","v","g:","C:","c:","t:","r:","a","d","p","f","help", "version","group:","time:","dump","range:","plotdump","all", "host:", "force"}) do
+ if opt == "h" or opt == "help" then
+ usage()
+ os.exit(0)
+ elseif opt == "v" or opt == "version" then
+ version()
+ os.exit(0)
+ elseif opt == "g" or opt == "group" then
+ table.insert(eventStrings, arg)
+ elseif (opt == "c") then
+ num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+ elseif (opt == "C") then
+ num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+ pinning = true
+ elseif opt == "t" or opt == "time" then
+ timeline = arg
+ mfreq = likwid.parse_time(timeline) * 1.E-6
+ elseif opt == "d" or opt == "dump" then
+ dump = true
+ elseif opt == "p" or opt == "plotdump" then
+ plotdump = true
+ elseif opt == "r" or opt == "range" then
+ plotrange = tonumber(arg)
+ elseif opt == "a" or opt == "all" then
+ print_configs = true
+ elseif opt == "host" then
+ host = arg
+ elseif opt == "f" or opt == "force" then
+ force = true
+ elseif opt == "?" then
+ print("Invalid commandline option -"..arg)
+ os.exit(1)
+ end
+end
+
+if print_configs then
+ local num_groups, all_groups = likwid.get_groups()
+ for name, config in pairs(predefined_plots) do
+ for i,g in pairs(all_groups) do
+ if g == config["perfgroup"] then
+ print("Group "..name)
+ print("\tPerfctr group: "..config["perfgroup"])
+ print("\tMatch for metric: "..config["ymetricmatch"])
+ print("\tTitle of plot: "..config["title"])
+ print("\tTitle of x-axis: "..config["xtitle"])
+ print("\tTitle of y-axis: "..config["ytitle"])
+ if config["y2metricmatch"] then
+ print("\tMatch for second metric: "..config["y2metricmatch"])
+ end
+ if config["y2title"] then
+ print("\tTitle of y2-axis: "..config["y2title"])
+ elseif config["y2metricmatch"] then
+ print("\tTitle of y2-axis: "..config["ytitle"])
+ end
+ print("")
+ break
+ end
+ end
+ end
+ os.exit(0)
+end
+
+if not test_gnuplot() then
+ print("GnuPlot not available")
+ os.exit(1)
+end
+
+if num_cpus == 0 then
+ print("ERROR: CPU string must be given")
+ os.exit(1)
+end
+
+if #arg == 0 then
+ print("ERROR: Executable must be given on commandline")
+ os.exit(1)
+end
+
+for i, event_def in pairs(eventStrings) do
+ local eventlist = likwid.stringsplit(event_def,",")
+
+ event_string = nil
+ plotgroup = nil
+ plotgroupconfig = nil
+ plotdefgroup = false
+ for j, preconf in pairs(predefined_plots) do
+ if eventlist[1] == j then
+ for j,g in pairs(allgroups) do
+ if g == preconf["perfgroup"] then
+ event_string = preconf["perfgroup"]
+ plotdefgroup = true
+ plotgroupconfig = preconf
+ plotgroup = j
+ break;
+ end
+ end
+ break;
+ end
+ end
+ if #eventlist > 1 then
+ outopts = eventlist[#eventlist]
+ table.remove(eventlist, #eventlist)
+ end
+ if event_string == nil then
+ if plotdefgroup == false then
+ event_string = table.concat(eventlist,",")
+ end
+ end
+
+ local groupdata = nil
+ groupdata = likwid.get_groupdata(event_string)
+ if groupdata == nil then
+ print("Cannot read event string, it's neither a performance group nor a proper event string <event>:<counter>:<options>,...")
+ usage()
+ os.exit(1)
+ end
+ if group_list[i] == nil then
+ group_list[i] = {}
+ end
+ group_list[i]["gdata"] = groupdata
+
+ formulalist = nil
+ local title = nil
+ local ytitle = nil
+ local y2title = nil
+ local y2funcindex = nil
+ local xtitle = nil
+ local output = nil
+ if plotgroup ~= nil then
+ title = plotgroupconfig["title"]
+ ytitle = plotgroupconfig["ytitle"]
+ xtitle = plotgroupconfig["xtitle"]
+ if plotgroupconfig["y2title"] ~= nil then
+ y2title = plotgroupconfig["y2title"]
+ elseif plotgroupconfig["y2metricmatch"] ~= nil then
+ y2title = plotgroupconfig["ytitle"]
+ end
+ for i,mconfig in pairs(groupdata["Metrics"]) do
+ local mmatch = "%a*"..plotgroupconfig["ymetricmatch"]:gsub("%[","%%["):gsub("%]","%%]").."%a*"
+ if mconfig["description"]:match(mmatch) then
+ formulalist = {{name=mconfig["description"], index=i}}
+ end
+ if plotgroupconfig["y2metricmatch"] ~= nil then
+ mmatch = "%a*"..plotgroupconfig["y2metricmatch"]:gsub("%[","%%["):gsub("%]","%%]").."%a*"
+ if mconfig["description"]:match(mmatch) then
+ table.insert(formulalist, {name=mconfig["description"], index=i})
+ end
+ end
+ end
+ end
+
+ --[[for j,estr in pairs(likwid.stringsplit(outopts, ":")) do
+ if estr:match("^title=([%g%s]+)") then
+ title = estr:match("^title=([%g%s]+)")
+ elseif estr:match("^TITLE=([%g%s]+)") then
+ title = estr:match("^TITLE=([%g%s]+)")
+ elseif estr:match("ytitle=([%g%s]+)") then
+ ytitle = estr:match("ytitle=([%g%s]+)")
+ elseif estr:match("YTITLE=([%g%s]+)")then
+ ytitle = estr:match("YTITLE=([%g%s]+)")
+ elseif estr:match("y2title=(%d+)-([%g%s]+)") then
+ y2funcindex, y2title = estr:match("y2title=(%d+)-([%g%s]+)")
+ elseif estr:match("Y2TITLE=(%d+)-([%g%s]+)") then
+ y2funcindex, y2title = estr:match("Y2TITLE=(%d+)-([%g%s]+)")
+ elseif estr:match("y2title=([%g%s]+)") then
+ y2title = estr:match("y2title=([%g%s]+)")
+ elseif estr:match("Y2TITLE=([%g%s]+)") then
+ y2title = estr:match("Y2TITLE=([%g%s]+)")
+ elseif estr:match("xtitle=([%g%s]+)") then
+ xtitle = estr:match("xtitle=([%g%s]+)")
+ elseif estr:match("XTITLE=([%g%s]+)")then
+ xtitle = estr:match("XTITLE=([%g%s]+)")
+ elseif estr:match("[%g%s]+=[%g]+") then
+ fname, form = estr:match("([%g%s]+)=([%g]+)")
+ if formulalist == nil then
+ formulalist = {}
+ end
+ if groupdata["Metrics"] ~= nil then
+ for i,mconfig in pairs(groupdata["Metrics"]) do
+ if mconfig["description"]:match(fname) then
+ table.insert(formulalist, {name=fname, index=i})
+ break
+ end
+ end
+ else
+ table.insert(formulalist, {name=fname, formula=form})
+ end
+ end
+ end]]
+
+ group_list[i]["eventstring"] = event_string
+ group_list[i]["counterlist"] = {}
+ for k=1,#groupdata["Events"] do
+ table.insert(group_list[i]["counterlist"], groupdata["Events"][k]["Counter"])
+ end
+ if title then
+ group_list[i]["title"] = title
+ end
+ if ytitle then
+ group_list[i]["ytitle"] = ytitle
+ end
+ if y2title then
+ group_list[i]["y2title"] = y2title
+ end
+ if y2funcindex then
+ group_list[i]["y2funcindex"] = y2funcindex - 1
+ else
+ if formulalist ~= nil then
+ group_list[i]["y2funcindex"] = #formulalist - 1
+ end
+ end
+ if xtitle then
+ group_list[i]["xtitle"] = xtitle
+ end
+ if formulalist ~= nil then
+ group_list[i]["formulas"] = formulalist
+ else
+ group_list[i]["formulas"] = {}
+ end
+end
+
+cmd = ""
+if host ~= nil then
+ cmd = cmd .. "ssh "..host.. " \"/bin/bash -c \\\" "
+end
+cmd = cmd .. " " ..PERFCTR
+if pinning then
+ cmd = cmd .. string.format(" -C %s",table.concat(cpulist,","))
+else
+ cmd = cmd .. string.format(" -c %s",table.concat(cpulist,","))
+end
+if force then
+ cmd = cmd .. " -f"
+end
+cmd = cmd .. string.format(" -t %s", timeline)
+
+for i, group in pairs(group_list) do
+ cmd = cmd .. " -g "..group["eventstring"]
+end
+cmd = cmd .. " ".. table.concat(arg, " ")
+-- io.popen can only read the child's stdout, so stdout and stderr are swapped:
+-- the measurement data that likwid-perfctr writes to stderr is caught on the pipe,
+-- while the wrapped application's stdout ends up on the terminal.
+cmd = cmd .. " 3>&1 1>&2 2>&3 3>&-"
+if host ~= nil then
+ cmd = cmd .. " \\\" \" "
+end
+perfctr = assert (io.popen (cmd))
+
+
+for i, group in pairs(group_list) do
+ gnucmd = string.format("%s --stream %f --with linespoints --domain --nodataid", FEEDGNUPLOT, mfreq/#group_list)
+ if plotrange > 0 then
+ gnucmd = gnucmd .. string.format(" --xlen %d", plotrange)
+ else
+ gnucmd = gnucmd .. " --xmin 0"
+ end
+ if group["title"] ~= nil then
+ if #group_list > 1 then
+ gnucmd = gnucmd .. string.format(" --title %q", "Group "..i..": "..group["title"])
+ else
+ gnucmd = gnucmd .. string.format(" --title %q", group["title"])
+ end
+ end
+ if group["xtitle"] ~= nil then
+ gnucmd = gnucmd .. string.format(" --xlabel %q", group["xtitle"])
+ else
+ gnucmd = gnucmd .. string.format(" --xlabel %q", "Time")
+ end
+ if group["ytitle"] ~= nil then
+ gnucmd = gnucmd .. string.format(" --ylabel %q", group["ytitle"])
+ end
+ if group["y2title"] ~= nil then
+ gnucmd = gnucmd .. string.format(" --y2 %d --y2label %q", group["y2funcindex"], group["y2title"])
+ end
+ if group["formulas"] then
+ if #cpulist == 1 then
+ for f, fdesc in pairs(group["formulas"]) do
+ gnucmd = gnucmd .. string.format(" --legend %d %q", f-1, fdesc["name"])
+ end
+ else
+ local curveID = 0
+ for c,cpu in pairs(cpulist) do
+ for f, fdesc in pairs(group["formulas"]) do
+ gnucmd = gnucmd .. string.format(" --legend %d %q", curveID, "C"..cpu..": "..fdesc["name"])
+ curveID = curveID + 1
+ end
+ end
+ end
+ end
+ gnucmd = gnucmd .. " --set 'key outside bmargin bottom'"
+ if plotdump then
+ gnucmd = gnucmd .. " --dump"
+ else
+ gnucmd = gnucmd .. " 1>/dev/null 2>&1"
+ end
+ group_list[i]["output"] = assert(io.popen(gnucmd,"w"))
+end
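+--[[ Illustrative example (hypothetical values): for a single L2 plot on one CPU the
+assembled command is roughly
+  <INSTALLED_BINPREFIX>/feedGnuplot --stream 1.000000 --with linespoints --domain --nodataid
+    --xmin 0 --title "L2 cache bandwidth" --xlabel "Time"
+    --ylabel "Load Bandwidth [MBytes/s]" --y2 1 --y2label "Evict Bandwidth [MBytes/s]"
+    --legend 0 "L2D load bandwidth [MBytes/s]" --legend 1 "L2D evict bandwidth [MBytes/s]"
+    --set 'key outside bmargin bottom' 1>/dev/null 2>&1 ]]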
+
+
+likwid.catchSignal()
+local mtime = {}
+for i,g in pairs(group_list) do
+ local str = "0 "
+ for k,t in pairs(cpulist) do
+ for j,c in pairs(g["formulas"]) do
+ str = str .."0 "
+ end
+ end
+ mtime[i] = nil
+ g["output"]:write(str.."\n")
+ g["output"]:flush()
+ if dump then
+ print(tostring(i).." ".. str)
+ end
+end
+
+
+olddata = {}
+oldmetric = {}
+local perfctr_exited = false
+local oldtime = 0
+local clock = likwid.getCpuClock()
+while true do
+ local l = perfctr:read("*line")
+ if l == nil or l:match("^%s*$") then
+ break
+ end
+ if l:match("^%d+ %d+ %d+ [%d.]+ %d+") then
+ local data = {}
+ local diff = {}
+ linelist = likwid.stringsplit(l, " ")
+ group = tonumber(linelist[1])
+ nr_events = tonumber(linelist[2])
+ nr_threads = tonumber(linelist[3])
+ time = tonumber(linelist[4])
+ table.remove(linelist, 1)
+ table.remove(linelist, 1)
+ table.remove(linelist, 1)
+ table.remove(linelist, 1)
+
+ for i=1,nr_events do
+ if data[i] == nil then data[i] = {} end
+ for j=1,nr_threads do
+ data[i][j] = tonumber(linelist[1])
+ table.remove(linelist, 1)
+ end
+ end
+
+ str = tostring(time)
+ for f, flist in pairs(group_list[group]["formulas"]) do
+ if flist["index"] ~= nil then
+ for i=1,nr_threads do
+ str = str .." ".. data[flist["index"]][i]
+ end
+ end
+ end
+
+ group_list[group]["output"]:write(str.."\n")
+ group_list[group]["output"]:flush()
+ if dump then
+ print(tostring(group).." ".. str)
+ end
+ oldtime = time
+ end
+end
+
+if perfctr_exited == false then
+ while likwid.getSignalState() == 0 do
+ likwid.sleep(1E6)
+ end
+end
+for i, group in pairs(group_list) do
+ group["output"]:write("exit\n")
+ io.close(group["output"])
+end
+io.close(perfctr)
+
+
+
diff --git a/src/applications/likwid-pin.c b/src/applications/likwid-pin.c
deleted file mode 100644
index 3d9e85b..0000000
--- a/src/applications/likwid-pin.c
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: likwid-pin.c
- *
- * Description: An application to pin a program including threads
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-/* ##### HEADER FILE INCLUDES ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <error.h>
-#include <types.h>
-#include <bstrlib.h>
-#include <cpuid.h>
-#include <affinity.h>
-#include <numa.h>
-#include <memsweep.h>
-#include <strUtil.h>
-
-#ifdef COLOR
-#include <textcolor.h>
-#endif
-
-/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
-#define HELP_MSG \
- fprintf(stdout, "likwid-pin -- Version %d.%d \n\n",VERSION,RELEASE); \
- fprintf(stdout, "\n"); \
- fprintf(stdout, "Supported Options:\n"); \
- fprintf(stdout, "-h\t Help message\n"); \
- fprintf(stdout, "-v\t Version information\n"); \
- fprintf(stdout, "-i\t Set numa interleave policy with all involved numa nodes\n"); \
- fprintf(stdout, "-S\t Sweep memory in involved numa nodes\n"); \
- fprintf(stdout, "-c\t comma separated list of processor ids or expression\n"); \
- fprintf(stdout, "-s\t bitmask with threads to skip\n"); \
- fprintf(stdout, "-p\t Print available domains with mapping on physical ids\n"); \
- fprintf(stdout, " \t If used together with -c option outputs a physical processor ids.\n"); \
- fprintf(stdout, "-d\t Delimiter used for using -p to output physical processor list, default is comma.\n\n"); \
- fprintf(stdout, "-q\t Silent without output\n\n"); \
- fprintf(stdout, "There are three possibilities to provide a thread to processor list:\n\n"); \
- fprintf(stdout, "1. Thread list with physical or logical thread numberings and physical cores first.\n"); \
- fprintf(stdout, "Example usage thread list: likwid-pin -c N:0,4-6 ./myApp\n"); \
- fprintf(stdout, "You can pin with the following numberings:\n"); \
- fprintf(stdout, "\t1. Physical numbering of OS.\n"); \
- fprintf(stdout, "\t2. Logical numbering inside node. e.g. -c N:0-3\n"); \
- fprintf(stdout, "\t3. Logical numbering inside socket. e.g. -c S0:0-3\n"); \
- fprintf(stdout, "\t4. Logical numbering inside last level cache group. e.g. -c C0:0-3\n"); \
- fprintf(stdout, "\t5. Logical numbering inside NUMA domain. e.g. -c M0:0-3\n"); \
- fprintf(stdout, "\tYou can also mix domains separated by @, e.g. -c S0:0-3 at S1:0-3 \n\n"); \
- fprintf(stdout, "2. Expressions based thread list generation with compact processor numbering.\n"); \
- fprintf(stdout, "Example usage expression: likwid-pin -c E:N:8 ./myApp\n"); \
- fprintf(stdout, "This will generate a compact list of thread to processor mapping for the node domain with eight threads.\n"); \
- fprintf(stdout, "The following syntax variants are available:\n"); \
- fprintf(stdout, "\t1. -c E:<thread domain>:<number of threads>\n"); \
- fprintf(stdout, "\t2. -c E:<thread domain>:<number of threads>:<chunk size>:<stride>\n"); \
- fprintf(stdout, "\t For two SMT threads per core on a SMT 4 machine use e.g. -c E:N:122:2:4\n\n"); \
- fprintf(stdout, "3. Scatter policy among thread domain type.\n"); \
- fprintf(stdout, "Example usage scatter: likwid-pin -c M:scatter ./myApp\n"); \
- fprintf(stdout, "This will generate a thread to processor mapping scattered among all memory domains with physical cores first.\n\n"); \
- fprintf(stdout, "4. Logical pinning.\n"); \
- fprintf(stdout, "Example usage logical pinning: likwid-pin -c L:0,3,4 ./myApp\n"); \
- fprintf(stdout, "This will generate a mapping containing the processors with index 0, 3 and 4 in the currently available processor list.\n"); \
- fprintf(stdout, "If you are running inside a cpuset (taskset, cgroup) the sorted list of allowed processors is taken as processor list.\n"); \
- fprintf(stdout, "Example usage logical pinning inside cpuset:\n"); \
- fprintf(stdout, "taskset -c 4,7,2,1,5 likwid-pin -c L:0,2,4 ./myApp\n"); \
- fprintf(stdout, "This maps the application to the processors 1,4,7.\n\n"); \
- fprintf(stdout, "If you ommit the -c option likwid will use all processors available on the node\n"); \
- fprintf(stdout, "with physical cores first. likwid-pin will also set OMP_NUM_THREADS with as many\n"); \
- fprintf(stdout, "threads as specified in your pin expression if OMP_NUM_THREADS is not present\n"); \
- fprintf(stdout, "in your environment.\n\n"); \
- fflush(stdout);
-
-#define VERSION_MSG \
- fprintf(stdout, "likwid-pin %d.%d \n\n",VERSION,RELEASE); \
- fflush(stdout);
-
-/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
- static void
-pinPid(int cpuid, int silent)
-{
- int status;
- cpu_set_t cpuset;
-
- CPU_ZERO(&cpuset);
- CPU_SET(cpuid, &cpuset);
-
- status = sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
-
- if (status == -1)
- {
- fprintf(stderr, "sched_setaffinity failed : %s \n",strerror(errno));
- }
- else
- {
- if(!silent)
- {
-#ifdef COLOR
- color_on(BRIGHT, COLOR);
-#endif
- fprintf(stdout, "[likwid-pin] Main PID -> core %d - OK", cpuid);
-#ifdef COLOR
- color_reset();
-#endif
- fprintf(stdout, "\n");
- fflush(stdout);
- }
- }
-}
-
-
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-
-int main (int argc, char** argv)
-{
- int i;
- int c;
- int skipMask = -1;
- int optInterleaved = 0;
- int optMemSweep = 0;
- int optPrintDomains = 0;
- int optSilent = 0;
- int hasAffinity = 0;
- bstring pinString;
- bstring skipString;
- bstring argString;
- int numThreads=0;
- int threads[MAX_NUM_THREADS];
- char delimiter = ',';
- FILE* OUTSTREAM = stdout;
- threads[0] = 0;
-
- if (argc == 1) {
- HELP_MSG;
- exit (EXIT_SUCCESS);
- }
-
- if (cpuid_init() == EXIT_SUCCESS)
- {
- numa_init();
- affinity_init();
- hasAffinity = 1;
- }
-
- while ((c = getopt (argc, argv, "+c:d:hipqs:Sv")) != -1)
- {
- switch (c)
- {
- case 'c':
- CHECK_OPTION_STRING;
- if (hasAffinity)
- {
- numThreads = bstr_to_cpuset(threads, argString);
- }
- else
- {
- numThreads = bstr_to_cpuset_physical((uint32_t*) threads, argString);
- }
-
- if(!numThreads)
- {
- ERROR_PLAIN_PRINT(Failed to parse cpu list.);
- }
- break;
- case 'd':
- delimiter = optarg[0];
- break;
- case 'h':
- HELP_MSG;
- exit (EXIT_SUCCESS);
- case 'i':
- optInterleaved = 1;
- break;
- case 'p':
- if (!hasAffinity)
- {
- fprintf(stderr, "Option -p is not supported for unknown processor!\n");
- exit(EXIT_SUCCESS);
- }
- optPrintDomains = 1;
- break;
- case 'q':
- optSilent = 1;
- OUTSTREAM = NULL;
- setenv("LIKWID_SILENT","true", 1);
- break;
- case 's':
- CHECK_OPTION_STRING;
- skipMask = strtoul((char*) argString->data,NULL,16);
- break;
- case 'S':
- if (!hasAffinity)
- {
- fprintf(stderr, "Option -S is not supported for unknown processor!\n");
- exit(EXIT_SUCCESS);
- }
- optMemSweep = 1;
- break;
- case 'v':
- VERSION_MSG;
- exit (EXIT_SUCCESS);
- default:
- HELP_MSG;
- exit(EXIT_FAILURE);
- }
- }
- if (optind == argc && !optPrintDomains)
- {
- fprintf(stderr,"Executable must be given on commandline\n");
- exit(EXIT_FAILURE);
- }
-
- if (optPrintDomains && numThreads)
- {
- if ((!optSilent) && (OUTSTREAM))
- {
- fprintf(OUTSTREAM, "%d",threads[0]);
-
- for ( i=1; i< numThreads; i++)
- {
- fprintf(OUTSTREAM, "%c%d",delimiter,threads[i]);
- }
- fprintf(OUTSTREAM, "\n");
- fflush(OUTSTREAM);
- }
- exit (EXIT_SUCCESS);
- }
- else if ( optPrintDomains )
- {
- affinity_printDomains(OUTSTREAM);
- exit (EXIT_SUCCESS);
- }
-
- if (!numThreads)
- {
- argString = bformat("N:0-%u", cpuid_topology.numHWThreads-1);
- numThreads = bstr_to_cpuset(threads, argString);
- }
-
- /* CPU List:
- * pthread (default): pin main pid + all thread tids
- *
- * OpenMP: Pin OMP_NUM_THREADS
- * intel openmp: pin main pid + all thread tids (skip thread 1)
- * gcc openmp: pin main pid + all thread tids (one less)
- */
-
- if (optInterleaved)
- {
- if ((!optSilent) && (OUTSTREAM))
- {
- fprintf(OUTSTREAM, "Set mem_policy to interleaved\n");
- fflush(OUTSTREAM);
- }
- numa_setInterleaved(threads, numThreads);
- }
-
- if (optMemSweep)
- {
- if ((!optSilent) && (OUTSTREAM))
- {
- fprintf(OUTSTREAM, "Sweeping memory\n");
- fflush(OUTSTREAM);
- }
- memsweep_threadGroup(OUTSTREAM, threads, numThreads);
- }
-
- if ( getenv("OMP_NUM_THREADS") == NULL )
- {
- argString = bformat("%d",numThreads);
- setenv("OMP_NUM_THREADS",(char*) argString->data , 0);
- }
-
- if (numThreads > 1)
- {
- bstring ldPreload = bfromcstr(getenv("LD_PRELOAD"));
-
- pinString = bformat("%d",threads[1]);
-
- for (i=2; i < numThreads;i++)
- {
- bformata(pinString,",%d",threads[i]);
- }
-
- bformata(pinString,",%d",threads[0]);
-
- if (skipMask >= 0)
- {
- skipString = bformat("%d",skipMask);
- setenv("LIKWID_SKIP",(char*) bdata(skipString) , 1);
- }
-
- setenv("KMP_AFFINITY", "disabled", 1);
- setenv("LIKWID_PIN",(char*) bdata(pinString) , 1);
-
-
- if (ldPreload == NULL)
- {
- setenv("LD_PRELOAD",TOSTRING(LIBLIKWIDPIN), 1);
- }
- else
- {
- bconchar(ldPreload, ':');
- bcatcstr(ldPreload, TOSTRING(LIBLIKWIDPIN));
- setenv("LD_PRELOAD", bdata(ldPreload), 1);
- }
- }
-
- pinPid(threads[0], optSilent);
- fflush(stdout);
-
- argv += optind;
- execvp(argv[0], argv);
- perror("execvp");
- fprintf(stderr,"failed to execute %s\n", argv[0]);
-
- return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-pin.lua b/src/applications/likwid-pin.lua
new file mode 100644
index 0000000..de57652
--- /dev/null
+++ b/src/applications/likwid-pin.lua
@@ -0,0 +1,275 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ * Filename: likwid-pin.lua
+ *
+ * Description: An application to pin a program including threads
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+local function version()
+ print(string.format("likwid-pin.lua -- Version %d.%d",likwid.version,likwid.release))
+end
+
+local function examples()
+ print("Examples:")
+ print("There are three possibilities to provide a thread to processor list:")
+ print("1. Thread list with physical thread IDs")
+ print("Example: likwid-pin.lua -c 0,4-6 ./myApp")
+ print("Pins the application to cores 0,4,5 and 6")
+ print("2. Thread list with logical thread numberings in physical cores first sorted list.")
+ print("Example usage thread list: likwid-pin.lua -c N:0,4-6 ./myApp")
+ print("You can pin with the following numberings:")
+ print("\t2. Logical numbering inside node.\n\t e.g. -c N:0,1,2,3 for the first 4 physical cores of the node")
+ print("\t3. Logical numbering inside socket.\n\t e.g. -c S0:0-1 for the first 2 physical cores of the socket")
+ print("\t4. Logical numbering inside last level cache group.\n\t e.g. -c C0:0-3 for the first 4 physical cores in the first LLC")
+ print("\t5. Logical numbering inside NUMA domain.\n\t e.g. -c M0:0-3 for the first 4 physical cores in the first NUMA domain")
+ print("\tYou can also mix domains separated by @,\n\te.g. -c S0:0-3 at S1:0-3 for the 4 first physical cores on both sockets.")
+ print("3. Expressions based thread list generation with compact processor numbering.")
+ print("Example usage expression: likwid-pin.lua -c E:N:8 ./myApp")
+ print("This will generate a compact list of thread to processor mapping for the node domain")
+ print("with eight threads.")
+ print("The following syntax variants are available:")
+ print("\t1. -c E:<thread domain>:<number of threads>")
+ print("\t2. -c E:<thread domain>:<number of threads>:<chunk size>:<stride>")
+ print("\tFor two SMT threads per core on a SMT 4 machine use e.g. -c E:N:122:2:4")
+ print("4. Scatter policy among thread domain type.")
+ print("Example usage scatter: likwid-pin.lua -c M:scatter ./myApp")
+ print("This will generate a thread to processor mapping scattered among all memory domains")
+ print("with physical cores first.")
+ print("")
+ print("likwid-pin sets OMP_NUM_THREADS with as many threads as specified")
+ print("in your pin expression if OMP_NUM_THREADS is not present in your environment.")
+end
+
+local function usage()
+ version()
+ print("An application to pin a program including threads.\n")
+ print("Options:")
+ print("-h, --help\t\t Help message")
+ print("-v, --version\t\t Version information")
+ print("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
+ print("-i\t\t\t Set numa interleave policy with all involved numa nodes")
+ print("-S, --sweep\t\t Sweep memory and LLC of involved NUMA nodes")
+ print("-c <list>\t\t Comma separated processor IDs or expression")
+ print("-s, --skip <hex>\t Bitmask with threads to skip")
+ print("-p\t\t\t Print available domains with mapping on physical IDs")
+ print("\t\t\t If used together with -p option outputs a physical processor IDs.")
+ print("-d <string>\t\t Delimiter used for using -p to output physical processor list, default is comma.")
+ print("-q, --quiet\t\t Silent without output")
+ print("\n")
+ examples()
+end
+
+delimiter = ','
+quiet = 0
+sweep_sockets = false
+interleaved_policy = false
+print_domains = false
+cpu_list = {}
+skip_mask = nil
+affinity = nil
+num_threads = 0
+
+config = likwid.getConfiguration()
+cputopo = likwid.getCpuTopology()
+affinity = likwid.getAffinityInfo()
+
+if (#arg == 0) then
+ usage()
+ os.exit(0)
+end
+
+for opt,arg in likwid.getopt(arg, {"c:", "d:", "h", "i", "p", "q", "s:", "S", "t:", "v", "V:", "verbose:", "help", "version", "skip","sweep", "quiet"}) do
+ if opt == "h" or opt == "help" then
+ usage()
+ likwid.putTopology()
+ likwid.putAffinityInfo()
+ likwid.putConfiguration()
+ os.exit(0)
+ elseif opt == "v" or opt == "version" then
+ version()
+ likwid.putTopology()
+ likwid.putAffinityInfo()
+ likwid.putConfiguration()
+ os.exit(0)
+ elseif opt == "V" or opt == "verbose" then
+ verbose = tonumber(arg)
+ likwid.setVerbosity(verbose)
+ elseif (opt == "c") then
+ if (affinity ~= nil) then
+ num_threads,cpu_list = likwid.cpustr_to_cpulist(arg)
+ else
+ num_threads,cpu_list = likwid.cpustr_to_cpulist_physical(arg)
+ end
+ if (num_threads == 0) then
+ print("Failed to parse cpulist " .. arg)
+ likwid.putTopology()
+ likwid.putAffinityInfo()
+ likwid.putConfiguration()
+ os.exit(1)
+ end
+ elseif (opt == "d") then
+ delimiter = arg
+ elseif opt == "S" or opt == "sweep" then
+ if (affinity == nil) then
+ print("Option -S is not supported for unknown processor!")
+ likwid.putTopology()
+ likwid.putAffinityInfo()
+ likwid.putConfiguration()
+ os.exit(1)
+ end
+ sweep_sockets = true
+ elseif (opt == "i") then
+ interleaved_policy = true
+ elseif (opt == "p") then
+ print_domains = true
+ elseif opt == "s" or opt == "skip" then
+ local s,e = arg:find("0x")
+ if s == nil then
+ print("Skip mask must be given in hex, hence start with 0x")
+ os.exit(1)
+ end
+ skip_mask = arg
+ elseif opt == "q" or opt == "quiet" then
+ likwid.setenv("LIKWID_SILENT","true")
+ quiet = 1
+ elseif opt == "?" then
+ print("Invalid commandline option -"..arg)
+ likwid.putTopology()
+ likwid.putAffinityInfo()
+ likwid.putConfiguration()
+ os.exit(1)
+ elseif opt == "!" then
+ print("Option requires an argument")
+ likwid.putTopology()
+ likwid.putAffinityInfo()
+ likwid.putConfiguration()
+ os.exit(1)
+ end
+end
+
+
+if print_domains and num_threads > 0 then
+ outstr = ""
+ for i, cpu in pairs(cpu_list) do
+ outstr = outstr .. delimiter .. cpu
+ end
+ print(outstr:sub(2,outstr:len()))
+ likwid.putTopology()
+ likwid.putAffinityInfo()
+ likwid.putConfiguration()
+ os.exit(0)
+elseif print_domains then
+ for k,v in pairs(affinity["domains"]) do
+ print(string.format("Domain %s:", v["tag"]))
+ print("\t" .. table.concat(v["processorList"], ","))
+ print("")
+ end
+ likwid.putTopology()
+ likwid.putAffinityInfo()
+ likwid.putConfiguration()
+ os.exit(0)
+end
+
+if num_threads == 0 then
+ num_threads, cpu_list = likwid.cpustr_to_cpulist("N:0-"..cputopo["numHWThreads"]-1)
+end
+if (#arg == 0) then
+ print("Executable must be given on commandline")
+ os.exit(1)
+end
+
+if interleaved_policy then
+ print("Set mem_policy to interleaved")
+ likwid.setMemInterleaved(num_threads, cpu_list)
+end
+
+if sweep_sockets then
+ print("Sweeping memory")
+ likwid.memSweep(num_threads, cpu_list)
+end
+
+local omp_threads = os.getenv("OMP_NUM_THREADS")
+if omp_threads == nil then
+ likwid.setenv("OMP_NUM_THREADS",tostring(math.tointeger(num_threads)))
+elseif num_threads > tonumber(omp_threads) then
+ print(string.format("Environment variable OMP_NUM_THREADS already set to %s but %d cpus required", omp_threads,num_threads))
+end
+
+likwid.setenv("KMP_AFFINITY","disabled")
+
+if os.getenv("CILK_NWORKERS") == nil then
+ likwid.setenv("CILK_NWORKERS", tostring(math.tointeger(num_threads)))
+end
+if skip_mask then
+ likwid.setenv("LIKWID_SKIP", skip_mask)
+end
+
+if num_threads > 1 then
+ local pinString = tostring(math.tointeger(cpu_list[2]))
+ for i=3,likwid.tablelength(cpu_list) do
+ pinString = pinString .. "," .. tostring(math.tointeger(cpu_list[i]))
+ end
+ pinString = pinString .. "," .. tostring(math.tointeger(cpu_list[1]))
+ likwid.setenv("LIKWID_PIN", pinString)
+
+ local preload = os.getenv("LD_PRELOAD")
+ if preload == nil then
+ likwid.setenv("LD_PRELOAD",likwid.pinlibpath)
+ else
+ likwid.setenv("LD_PRELOAD",likwid.pinlibpath .. ":" .. preload)
+ end
+ local ldpath = os.getenv("LD_LIBRARY_PATH")
+ local libpath = likwid.pinlibpath:match("([/%g]+)/%g+.so")
+ if ldpath == nil then
+ likwid.setenv("LD_LIBRARY_PATH", libpath)
+ elseif not ldpath:match(libpath) then
+ likwid.setenv("LD_LIBRARY_PATH", libpath..":"..ldpath)
+ end
+else
+ likwid.setenv("LIKWID_PIN", tostring(math.tointeger(cpu_list[1])))
+ likwid.pinProcess(cpu_list[1], quiet)
+end
+
+local exec = table.concat(arg," ",1, likwid.tablelength(arg)-2)
+local pid = likwid.startProgram(exec, num_threads, cpu_list)
+if (pid == nil) then
+ print("Failed to execute command: ".. exec)
+ likwid.putTopology()
+ likwid.putAffinityInfo()
+ likwid.putConfiguration()
+ os.exit(1)
+end
+
+likwid.waitpid(pid)
+
+likwid.putAffinityInfo()
+likwid.putTopology()
+likwid.putConfiguration()
+os.exit(0)
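
A note on the pinning hand-off, which the removed C tool above and the new Lua script implement in essentially the same way: the resolved CPU list is rotated by one position, the launcher pins itself to the first CPU directly, and LIKWID_PIN lists the remaining CPUs followed by the first one so the preloaded pin library can place threads 2..N. A minimal standalone sketch of that step (not taken from the patch; the CPU list is hypothetical):

local cpu_list = {0, 4, 5, 6}                 -- e.g. resolved from "-c 0,4-6"
local parts = {}
for i = 2, #cpu_list do                       -- CPUs for threads 2..N, pinned via the preloaded library
    table.insert(parts, tostring(cpu_list[i]))
end
table.insert(parts, tostring(cpu_list[1]))    -- master CPU goes last; the launcher pins itself to it
print("LIKWID_PIN=" .. table.concat(parts, ","))   -- prints LIKWID_PIN=4,5,6,0
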
diff --git a/src/applications/likwid-powermeter.c b/src/applications/likwid-powermeter.c
deleted file mode 100644
index 4daa393..0000000
--- a/src/applications/likwid-powermeter.c
+++ /dev/null
@@ -1,507 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: likwid-powermeter.c
- *
- * Description: An application to get information about power
- * consumption on architectures implementing the RAPL interface.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <types.h>
-#include <strUtil.h>
-#include <error.h>
-#include <lock.h>
-#include <timer.h>
-#include <cpuid.h>
-#include <numa.h>
-#include <accessClient.h>
-#include <msr.h>
-#include <affinity.h>
-#include <perfmon.h>
-#include <power.h>
-#include <thermal.h>
-#include <bstrlib.h>
-
-/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
-
-#define HELP_MSG \
-fprintf(stdout, "\nlikwid-powermeter -- Version %d.%d \n\n",VERSION,RELEASE); \
-fprintf(stdout, "A tool to print Power and Clocking information on Intel SandyBridge CPUS.\n"); \
-fprintf(stdout, "Options:\n"); \
-fprintf(stdout, "-h\t\t Help message\n"); \
-fprintf(stdout, "-v\t\t Version information\n"); \
-fprintf(stdout, "-M <0|1>\t set how MSR registers are accessed: 0=direct, 1=msrd \n"); \
-fprintf(stdout, "-c <list>\t specify sockets to measure\n"); \
-fprintf(stdout, "-i\t\t print information from MSR_PKG_POWER_INFO register and Turbo Mode\n"); \
-fprintf(stdout, "-s <duration>\t set measure duration in sec. (default 2s) \n"); \
-fprintf(stdout, "-p\t\t print dynamic clocking and CPI values (requires executable)\n\n"); \
-fprintf(stdout, "Usage: likwid-powermeter -s 4 -c 1 \n"); \
-fprintf(stdout, "Alternative as wrapper: likwid-powermeter -c 1 ./a.out\n"); \
-fflush(stdout);
-
-#define VERSION_MSG \
-fprintf(stdout, "likwid-powermeter %d.%d \n\n",VERSION,RELEASE); \
-fflush(stdout);
-
-
-int main (int argc, char** argv)
-{
- int socket_fd = -1;
- int optInfo = 0;
- int optClock = 0;
- int optStethoscope = 0;
- int optSockets = 0;
- int optTemp = 0;
- double runtime;
- int hasDRAM = 0;
- int hasPP0 = 0;
- int hasPP1 = 0;
- int c, i;
- bstring argString;
- bstring eventString = bfromcstr("CLOCK");
- int numSockets=1;
- int numThreads=0;
- int threadsSockets[MAX_NUM_NODES*2];
- int threads[MAX_NUM_THREADS];
- const AffinityDomain* socketDomains[MAX_NUM_NODES*2];
- threadsSockets[0] = 0;
-
- if (argc == 1)
- {
- HELP_MSG;
- exit (EXIT_SUCCESS);
- }
-
- while ((c = getopt (argc, argv, "+c:hiM:ps:vt")) != -1)
- {
- switch (c)
- {
- case 'c':
- CHECK_OPTION_STRING;
- numSockets = bstr_to_cpuset_physical((uint32_t*) threadsSockets, argString);
- bdestroy(argString);
- optSockets = 1;
- break;
-
- case 'h':
- HELP_MSG;
- exit (EXIT_SUCCESS);
- case 'i':
- optInfo = 1;
- break;
- case 'M': /* Set MSR Access mode */
- CHECK_OPTION_STRING;
- accessClient_setaccessmode(str2int((char*) argString->data));
- bdestroy(argString);
- break;
- case 'p':
- optClock = 1;
- break;
- case 's':
- CHECK_OPTION_STRING;
- optStethoscope = str2int((char*) argString->data);
- bdestroy(argString);
- break;
- case 'v':
- VERSION_MSG;
- exit (EXIT_SUCCESS);
- case 't':
- optTemp = 1;
- break;
- case '?':
- if (optopt == 's' || optopt == 'M' || optopt == 'c')
- {
- HELP_MSG;
- }
- else if (isprint (optopt))
- {
- fprintf (stderr, "Unknown option `-%c'.\n", optopt);
- }
- else
- {
- fprintf (stderr,
- "Unknown option character `\\x%x'.\n",
- optopt);
- }
- exit( EXIT_FAILURE);
- default:
- HELP_MSG;
- exit (EXIT_SUCCESS);
- }
- }
-
- if (!lock_check())
- {
- fprintf(stderr,"Access to performance counters is locked.\n");
- exit(EXIT_FAILURE);
- }
- if (optClock && optind == argc)
- {
- fprintf(stderr,"Commandline option -p requires an executable.\n");
- exit(EXIT_FAILURE);
- }
- if (optSockets && !optStethoscope && optind == argc)
- {
- fprintf(stderr,"Commandline option -c requires an executable if not used in combination with -s.\n");
- exit(EXIT_FAILURE);
- }
- if (optStethoscope == 0 && optind == argc && !optInfo)
- {
- fprintf(stderr,"Either -s <seconds> or executable must be given on commandline.\n");
- exit(EXIT_FAILURE);
- }
-
- if (cpuid_init() == EXIT_FAILURE)
- {
- fprintf(stderr, "CPU not supported\n");
- exit(EXIT_FAILURE);
- }
- if (numSockets > cpuid_topology.numSockets)
- {
- fprintf(stderr, "System has only %d sockets but %d are given on commandline.\n",
- cpuid_topology.numSockets, numSockets);
- exit(EXIT_FAILURE);
- }
-
- numa_init();
- affinity_init();
-
- for (c = 0; c < numSockets; c++)
- {
- if (threadsSockets[c] >= cpuid_topology.numSockets)
- {
- fprintf(stderr, "System has no socket %d\n", threadsSockets[c]);
- exit(EXIT_FAILURE);
- }
- bstring socketStr = bformat("S%d",threadsSockets[c]);
- socketDomains[threadsSockets[c]] = affinity_getDomain(socketStr);
- }
-
- accessClient_init(&socket_fd);
- msr_init(socket_fd);
- timer_init();
-
- /* check for supported processors */
- if ((cpuid_info.model == SANDYBRIDGE_EP) ||
- (cpuid_info.model == SANDYBRIDGE) ||
- (cpuid_info.model == IVYBRIDGE) ||
- (cpuid_info.model == IVYBRIDGE_EP) ||
- (cpuid_info.model == HASWELL) ||
- (cpuid_info.model == HASWELL_EX) ||
- (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
- (cpuid_info.model == NEHALEM_LYNNFIELD) ||
- (cpuid_info.model == NEHALEM_WESTMERE) ||
- (cpuid_info.model == ATOM_SILVERMONT_C) ||
- (cpuid_info.model == ATOM_SILVERMONT_E) ||
- (cpuid_info.model == ATOM_SILVERMONT_F1) ||
- (cpuid_info.model == ATOM_SILVERMONT_F2) ||
- (cpuid_info.model == ATOM_SILVERMONT_F3))
- {
- if (numSockets == 0)
- {
- numSockets = numa_info.numberOfNodes;
- }
- for(int i=0; i<numSockets; i++)
- {
- power_init(socketDomains[threadsSockets[i]]->processorList[0]);
- }
- }
- else
- {
- fprintf (stderr, "Query Turbo Mode only supported on Intel Nehalem/Westmere/SandyBridge/IvyBridge/Haswell/Silvermont processors!\n");
- exit(EXIT_FAILURE);
- }
-
- double clock = (double) timer_getCpuClock();
-
- fprintf(stdout, HLINE);
- fprintf(stdout, "CPU name:\t%s \n",cpuid_info.name);
- fprintf(stdout, "CPU clock:\t%3.2f GHz \n", (float) clock * 1.E-09);
- fprintf(stdout, HLINE);
- fflush(stdout);
-
- if (optInfo)
- {
- if (power_info.turbo.numSteps != 0)
- {
- fprintf(stdout, "Base clock:\t%.2f MHz \n", power_info.baseFrequency );
- fprintf(stdout, "Minimal clock:\t%.2f MHz \n", power_info.minFrequency );
- fprintf(stdout, "Turbo Boost Steps:\n");
- for (int i=0; i < power_info.turbo.numSteps; i++ )
- {
- fprintf(stdout, "C%d %.2f MHz \n",i+1, power_info.turbo.steps[i] );
- }
- }
- fprintf(stdout, HLINE);
- fflush(stdout);
- }
-
- if ((cpuid_info.model == SANDYBRIDGE_EP) ||
- (cpuid_info.model == IVYBRIDGE_EP) ||
- (cpuid_info.model == HASWELL_EX) ||
- (cpuid_info.model == HASWELL))
- {
- hasDRAM = 1;
- }
- if ((cpuid_info.model == SANDYBRIDGE_EP) ||
- (cpuid_info.model == SANDYBRIDGE) ||
- (cpuid_info.model == IVYBRIDGE_EP) ||
- (cpuid_info.model == IVYBRIDGE) ||
- (cpuid_info.model == HASWELL) ||
- (cpuid_info.model == ATOM_SILVERMONT_E) ||
- (cpuid_info.model == ATOM_SILVERMONT_F1) ||
- (cpuid_info.model == ATOM_SILVERMONT_F2) ||
- (cpuid_info.model == ATOM_SILVERMONT_F3))
- {
- hasPP0 = 1;
- }
- if ((cpuid_info.model == HASWELL) ||
- (cpuid_info.model == SANDYBRIDGE) ||
- (cpuid_info.model == IVYBRIDGE))
- {
- hasPP1 = 1;
- }
- if ((cpuid_info.model != SANDYBRIDGE) &&
- (cpuid_info.model != SANDYBRIDGE_EP) &&
- (cpuid_info.model != IVYBRIDGE) &&
- (cpuid_info.model != IVYBRIDGE_EP) &&
- (cpuid_info.model != HASWELL) &&
- (cpuid_info.model != HASWELL_M1) &&
- (cpuid_info.model != HASWELL_M2) &&
- (cpuid_info.model != HASWELL_EX) &&
- (cpuid_info.model != ATOM_SILVERMONT_C) &&
- (cpuid_info.model != ATOM_SILVERMONT_E) &&
- (cpuid_info.model != ATOM_SILVERMONT_F1) &&
- (cpuid_info.model != ATOM_SILVERMONT_F2) &&
- (cpuid_info.model != ATOM_SILVERMONT_F3))
- {
- fprintf (stderr, "RAPL not supported on this processor!\n");
- exit(EXIT_FAILURE);
- }
-
- if (optInfo)
- {
- fprintf(stdout, "Thermal Spec Power: %g Watts \n", power_info.tdp );
- fprintf(stdout, "Minimum Power: %g Watts \n", power_info.minPower);
- fprintf(stdout, "Maximum Power: %g Watts \n", power_info.maxPower);
- fprintf(stdout, "Maximum Time Window: %g micro sec \n", power_info.maxTimeWindow);
- fprintf(stdout, HLINE);
- fflush(stdout);
- exit(EXIT_SUCCESS);
- }
-
- if (optClock)
- {
- affinity_init();
- argString = bformat("S%u:0-%u", threadsSockets[0],
- socketDomains[threadsSockets[0]]->numberOfProcessors-1);
- for (int i=1; i<numSockets; i++)
- {
- bstring tExpr = bformat("@S%u:0-%u", threadsSockets[i],
- socketDomains[threadsSockets[i]]->numberOfProcessors-1);
- bconcat(argString, tExpr);
- }
- numThreads = bstr_to_cpuset(threads, argString);
- bdestroy(argString);
- perfmon_init(numThreads, threads, stdout);
- perfmon_setupEventSet(eventString, NULL);
- }
-
- {
- PowerData pDataPkg[MAX_NUM_NODES*2];
- PowerData pDataDram[MAX_NUM_NODES*2];
- PowerData pDataPP0[MAX_NUM_NODES*2];
- PowerData pDataPP1[MAX_NUM_NODES*2];
- fprintf(stdout, "Measure on sockets: %d", threadsSockets[0]);
- for (int i=1; i<numSockets; i++)
- {
- fprintf(stdout, ", %d", threadsSockets[i]);
- }
- fprintf(stdout, "\n");
- fflush(stdout);
-
- if (optStethoscope)
- {
- if (optClock)
- {
- perfmon_startCounters();
- }
- else
- {
- for (int i=0; i<numSockets; i++)
- {
- int cpuId = socketDomains[threadsSockets[i]]->processorList[0];
- if (hasDRAM) power_start(&(pDataDram[i]), cpuId, DRAM);
- if (hasPP0) power_start(&(pDataPP0[i]), cpuId, PP0);
- if (hasPP1) power_start(&(pDataPP1[i]), cpuId, PP1);
- power_start(&(pDataPkg[i]), cpuId, PKG);
- }
- }
- sleep(optStethoscope);
-
- if (optClock)
- {
- perfmon_stopCounters();
- perfmon_printCounterResults();
- perfmon_finalize();
- }
- else
- {
- for (int i=0; i<numSockets; i++)
- {
- int cpuId = socketDomains[threadsSockets[i]]->processorList[0];
- power_stop(&(pDataPkg[i]), cpuId, PKG);
- if (hasPP1) power_stop(&(pDataPP1[i]), cpuId, PP1);
- if (hasPP0) power_stop(&(pDataPP0[i]), cpuId, PP0);
- if (hasDRAM) power_stop(&(pDataDram[i]), cpuId, DRAM);
- }
- }
- runtime = (double) optStethoscope;
- }
- else
- {
- TimerData time;
- argv += optind;
- bstring exeString = bfromcstr(argv[0]);
-
- for (int i=1; i<(argc-optind); i++)
- {
- bconchar(exeString, ' ');
- bcatcstr(exeString, argv[i]);
- }
- fprintf(stdout, "Executing: %s\n",bdata(exeString));
- fflush(stdout);
-
-
- if (optClock)
- {
- perfmon_startCounters();
- }
- else
- {
- for (int i=0; i<numSockets; i++)
- {
- int cpuId = socketDomains[threadsSockets[i]]->processorList[0];
- if (hasDRAM) power_start(&(pDataDram[i]), cpuId, DRAM);
- if (hasPP0) power_start(&(pDataPP0[i]), cpuId, PP0);
- if (hasPP1) power_start(&(pDataPP1[i]), cpuId, PP1);
- power_start(&(pDataPkg[i]), cpuId, PKG);
- }
-
- timer_start(&time);
- }
-
- if (system(bdata(exeString)) == EOF)
- {
- fprintf(stderr, "Failed to execute %s!\n", bdata(exeString));
- exit(EXIT_FAILURE);
- }
-
- if (optClock)
- {
- perfmon_stopCounters();
- perfmon_printCounterResults();
- perfmon_finalize();
- }
- else
- {
- timer_stop(&time);
-
- for (int i=0; i<numSockets; i++)
- {
- int cpuId = socketDomains[threadsSockets[i]]->processorList[0];
- power_stop(&(pDataPkg[i]), cpuId, PKG);
- if (hasDRAM) power_stop(&(pDataDram[i]), cpuId, DRAM);
- if (hasPP0) power_stop(&(pDataPP0[i]), cpuId, PP0);
- if (hasPP1) power_stop(&(pDataPP1[i]), cpuId, PP1);
- }
- runtime = timer_print(&time);
- }
- }
-
- if (!optClock)
- {
- fprintf(stdout, "Runtime: %g second \n",runtime);
- fprintf(stdout, HLINE);
- for (int i=0; i<numSockets; i++)
- {
- fprintf(stdout, "Socket %d (Measured on CPU %d)\n",threadsSockets[i],
- socketDomains[threadsSockets[i]]->processorList[0]);
- fprintf(stdout, "Domain: PKG \n");
- fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataPkg[i])));
- fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataPkg[i])) / runtime );
- if (hasDRAM)
- {
- fprintf(stdout, "Domain: DRAM \n");
- fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataDram[i])));
- fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataDram[i])) / runtime );
- }
- if (hasPP0)
- {
- fprintf(stdout, "Domain: PP0 \n");
- fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataPP0[i])));
- fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataPP0[i])) / runtime );
- }
- if (hasPP1)
- {
- fprintf(stdout, "Domain: PP1 \n");
- fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataPP1[i])));
- fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataPP1[i])) / runtime );
- }
- fprintf(stdout, "\n");
- }
- fflush(stdout);
- }
- }
-
-
- if ( optTemp && cpuid_hasFeature(TM2))
- {
- printf("Current core temperatures:\n");
- for (i = 0; i < numSockets; i++)
- {
- printf("Socket %d\n",threadsSockets[i]);
- for (c = 0; c < socketDomains[threadsSockets[i]]->numberOfProcessors; c++ )
- {
- thermal_init(i);
- printf("Core %d: %u C\n",
- socketDomains[threadsSockets[i]]->processorList[c],
- thermal_read(socketDomains[threadsSockets[i]]->processorList[c]));
- }
- }
- }
-
-
- msr_finalize();
- return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-powermeter.lua b/src/applications/likwid-powermeter.lua
new file mode 100644
index 0000000..3aa742f
--- /dev/null
+++ b/src/applications/likwid-powermeter.lua
@@ -0,0 +1,388 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ * Filename: likwid-powermeter.lua
+ *
+ * Description: An application to get information about power
+ * consumption on architectures implementing the RAPL interface.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+local function version()
+ print(string.format("likwid-powermeter -- Version %d.%d",likwid.version,likwid.release))
+end
+
+local function examples()
+ print("Examples:")
+ print("Measure the power consumption for 4 seconds on socket 1")
+ print("likwid-powermeter -s 4 -c 1")
+ print("")
+ print("Use it as wrapper for an application to measure the energy for the whole execution")
+ print("likwid-powermeter -c 1 ./a.out")
+end
+
+local function usage()
+ version()
+ print("A tool to print power and clocking information on x86 CPUs.\n")
+ print("Options:")
+ print("-h, --help\t Help message")
+ print("-v, --version\t Version information")
+ print("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
+ print("-M <0|1>\t\t Set how MSR registers are accessed, 0=direct, 1=accessDaemon")
+ print("-c <list>\t\t Specify sockets to measure")
+ print("-i, --info\t Print information from MSR_PKG_POWER_INFO register and Turbo mode")
+ print("-s <duration>\t Set measure duration in us, ms or s. (default 2s)")
+ print("-p\t\t Print dynamic clocking and CPI values, uses likwid-perfctr")
+ print("-t\t\t Print current temperatures of all CPU cores")
+ print("-f\t\t Print current temperatures in Fahrenheit")
+ print("")
+ examples()
+end
+
+local config = likwid.getConfiguration();
+
+print_info = false
+use_perfctr = false
+stethoscope = false
+fahrenheit = false
+print_temp = false
+verbose = 0
+if config["daemonMode"] < 0 then
+ access_mode = 1
+else
+ access_mode = config["daemonMode"]
+end
+time_interval = 2.E06
+time_orig = "2s"
+read_interval = 30.E06
+sockets = {}
+domainList = {"PKG", "PP0", "PP1", "DRAM"}
+
+cpuinfo = likwid.getCpuInfo()
+cputopo = likwid.getCpuTopology()
+numatopo = likwid.getNumaInfo()
+affinity = likwid.getAffinityInfo()
+
+for opt,arg in likwid.getopt(arg, {"V:", "c:", "h", "i", "M:", "p", "s:", "v", "f", "t", "help", "info", "version", "verbose:"}) do
+ if (type(arg) == "string") then
+ local s,e = arg:find("-");
+ if s == 1 then
+ print(string.format("Argmument %s to option -%s starts with invalid character -.", arg, opt))
+ print("Did you forget an argument to an option?")
+ os.exit(1)
+ end
+ end
+ if opt == "h" or opt == "help" then
+ usage()
+ os.exit(0)
+ elseif opt == "v" or opt == "version" then
+ version()
+ os.exit(0)
+ elseif (opt == "c") then
+ num_sockets, sockets = likwid.sockstr_to_socklist(arg)
+ if num_sockets == 0 then
+ os.exit(1)
+ end
+ elseif (opt == "M") then
+ access_mode = tonumber(arg)
+ if (access_mode == nil) then
+ print("Access mode (-M) must be an number")
+ usage()
+ os.exit(1)
+ elseif (access_mode < 0) or (access_mode > 1) then
+ print(string.format("Access mode (-M) %d not valid.",access_mode))
+ usage()
+ os.exit(1)
+ end
+
+ elseif opt == "i" or opt == "info" then
+ print_info = true
+ elseif (opt == "p") then
+ use_perfctr = true
+ elseif (opt == "f") then
+ fahrenheit = true
+ print_temp = true
+ elseif (opt == "t") then
+ print_temp = true
+ elseif opt == "V" or opt == "verbose" then
+ verbose = tonumber(arg)
+ likwid.setVerbosity(verbose)
+ elseif (opt == "s") then
+ time_interval = likwid.parse_time(arg)
+ time_orig = arg
+ stethoscope = true
+ elseif opt == "?" then
+ print("Invalid commandline option -"..arg)
+ os.exit(1)
+ elseif opt == "!" then
+ print("Option requires an argument")
+ os.exit(1)
+ end
+end
+
+
+
+cpulist = {}
+before = {}
+after = {}
+if #sockets > 0 then
+ for i,socketId in pairs(sockets) do
+ local affinityID = "S"..tostring(socketId)
+ for j, domain in pairs(affinity["domains"]) do
+ if domain["tag"] == affinityID then
+ table.insert(cpulist,domain["processorList"][1])
+ before[domain["processorList"][1]] = {}
+ after[domain["processorList"][1]] = {}
+ for _, id in pairs(domainList) do
+ before[domain["processorList"][1]][id] = 0
+ after[domain["processorList"][1]][id] = 0
+ end
+ end
+ end
+ end
+else
+ for j, domain in pairs(affinity["domains"]) do
+ if domain["tag"]:match("S%d+") then
+ table.insert(cpulist,domain["processorList"][1])
+ table.insert(sockets, domain["tag"]:match("S(%d+)"))
+ before[domain["processorList"][1]] = {}
+ after[domain["processorList"][1]] = {}
+ for _, id in pairs(domainList) do
+ before[domain["processorList"][1]][id] = 0
+ after[domain["processorList"][1]][id] = 0
+ end
+ end
+ end
+end
+
+
+if likwid.setAccessClientMode(access_mode) ~= 0 then
+ os.exit(1)
+end
+
+power = likwid.getPowerInfo()
+if not power then
+ print(string.format("The %s does not support reading power data",cpuinfo["name"]))
+ os.exit(1)
+end
+
+
+if not use_perfctr then
+ print(likwid.hline);
+ print(string.format("CPU name:\t%s",cpuinfo["osname"]))
+ print(string.format("CPU type:\t%s",cpuinfo["name"]))
+ if cpuinfo["clock"] > 0 then
+ print(string.format("CPU clock:\t%3.2f GHz",cpuinfo["clock"] * 1.E-09))
+ else
+ print(string.format("CPU clock:\t%3.2f GHz",likwid.getCpuClock() * 1.E-09))
+ end
+ print(likwid.hline)
+end
+
+if print_info or verbose > 0 then
+ if (power["turbo"]["numSteps"] > 0) then
+ print(string.format("Base clock:\t%.2f MHz", power["baseFrequency"]))
+ print(string.format("Minimal clock:\t%.2f MHz", power["minFrequency"]))
+ print("Turbo Boost Steps:")
+ for i,step in pairs(power["turbo"]["steps"]) do
+ print(string.format("C%d %.2f MHz",i-1,power["turbo"]["steps"][i]))
+ end
+ end
+ print(likwid.hline)
+end
+
+if power["hasRAPL"] == 0 then
+ print("Measuring power is not supported on this machine")
+ os.exit(1)
+end
+
+if (print_info) then
+ for i, dname in pairs(domainList) do
+ local domain = power["domains"][dname]
+ if domain["supportInfo"] then
+ print(string.format("Info for RAPL domain %s:", dname));
+ print(string.format("Thermal Spec Power: %g Watt",domain["tdp"]*1E-6))
+ print(string.format("Minimum Power: %g Watt",domain["minPower"]*1E-6))
+ print(string.format("Maximum Power: %g Watt",domain["maxPower"]*1E-6))
+ print(string.format("Maximum Time Window: %g micro sec",domain["maxTimeWindow"]))
+ print()
+ end
+ end
+ print(likwid.hline)
+end
+
+if (stethoscope) and (time_interval < power["timeUnit"]) then
+ print("Time interval too short, minimum measurement time is "..tostring(power["timeUnit"]).. " us")
+ os.exit(1)
+end
+
+local execString = ""
+if use_perfctr then
+ affinity = likwid.getAffinityInfo()
+ argString = ""
+ for i,socket in pairs(sockets) do
+ argString = argString .. string.format("S%u:0-%u",socket,(cputopo["numCoresPerSocket"]*cputopo["numThreadsPerCore"])-1)
+ if (i < #sockets) then
+ argString = argString .. "@"
+ end
+ end
+ execString = string.format("<INSTALLED_PREFIX>/bin/likwid-perfctr -C %s -f -g CLOCK ",argString)
+end
+
+
+if #arg == 0 then
+ if use_perfctr then
+ execString = execString .. string.format(" -S %s ", time_orig)
+ stethoscope = false
+ else
+ stethoscope = true
+ end
+else
+ if use_perfctr then
+ execString = execString .. table.concat(arg," ",1, likwid.tablelength(arg)-2)
+ else
+ execString = table.concat(arg," ",1, likwid.tablelength(arg)-2)
+ end
+end
+
+if not print_info and not print_temp then
+ if stethoscope or (#arg > 0 and not use_perfctr) then
+ for i,socket in pairs(sockets) do
+ cpu = cpulist[i]
+ for idx, dom in pairs(domainList) do
+ if (power["domains"][dom]["supportStatus"]) then before[cpu][dom] = likwid.startPower(cpu, idx) end
+ end
+ end
+
+ time_before = likwid.startClock()
+ if stethoscope then
+ if read_interval < time_interval then
+ while ((read_interval <= time_interval) and (time_interval > 0)) do
+ likwid.sleep(read_interval)
+ for i,socket in pairs(sockets) do
+ cpu = cpulist[i]
+ for idx, dom in pairs(domainList) do
+ if (power["domains"][dom]["supportStatus"]) then after[cpu][dom] = likwid.stopPower(cpu, idx) end
+ end
+ end
+ time_interval = time_interval - read_interval
+ if time_interval < read_interval then
+ read_interval = time_interval
+ end
+ end
+ else
+ likwid.sleep(time_interval)
+ end
+ else
+ local pid = likwid.startProgram(execString, 0, {})
+ if not pid then
+ print(string.format("Failed to execute %s!",execString))
+ likwid.finalize()
+ os.exit(1)
+ end
+ while true do
+ if likwid.getSignalState() ~= 0 then
+ likwid.killProgram()
+ break
+ end
+ local remain = likwid.sleep(read_interval)
+ for i,socket in pairs(sockets) do
+ cpu = cpulist[i]
+ for idx, dom in pairs(domainList) do
+ if (power["domains"][dom]["supportStatus"]) then after[cpu][dom] = likwid.stopPower(cpu, idx) end
+ end
+ end
+ if remain > 0 or not likwid.checkProgram(pid) then
+ io.stdout:flush()
+ break
+ end
+ end
+ end
+ time_after = likwid.stopClock()
+
+ for i,socket in pairs(sockets) do
+ cpu = cpulist[i]
+ for idx, dom in pairs(domainList) do
+ if (power["domains"][dom]["supportStatus"]) then after[cpu][dom] = likwid.stopPower(cpu, idx) end
+ end
+ end
+ runtime = likwid.getClock(time_before, time_after)
+
+ print(likwid.hline)
+ print(string.format("Runtime: %g s",runtime))
+
+ for i,socket in pairs(sockets) do
+ cpu = cpulist[i]
+ print(string.format("Measure for socket %d on CPU %d", socket,cpu ))
+ for j, dom in pairs(domainList) do
+ if power["domains"][dom]["supportStatus"] then
+ local energy = likwid.calcPower(before[cpu][dom], after[cpu][dom], 0)
+ print(string.format("Domain %s:", dom))
+ print(string.format("Energy consumed: %g Joules",energy))
+ print(string.format("Power consumed: %g Watt",energy/runtime))
+ end
+ end
+ if i < #sockets then print("") end
+ end
+ print(likwid.hline)
+ else
+ err = os.execute(execString)
+ if err == false then
+ print(string.format("Failed to execute %s!",execString))
+ likwid.putPowerInfo()
+ likwid.finalize()
+ os.exit(1)
+ end
+ end
+end
+
+if print_temp and (string.find(cpuinfo["features"],"TM2") ~= nil) then
+ print(likwid.hline)
+ print("Current core temperatures:");
+ for i=1,cputopo["numSockets"] do
+ local tag = "S" .. tostring(i-1)
+ for _, domain in pairs(affinity["domains"]) do
+ if domain["tag"] == tag then
+ for j=1,#domain["processorList"] do
+ local cpuid = domain["processorList"][j]
+ likwid.initTemp(cpuid);
+ if (fahrenheit) then
+ local f = 1.8*tonumber(likwid.readTemp(cpuid))+32
+ print(string.format("Socket %d Core %d: %.0f F",i-1,cpuid, f));
+ else
+ print(string.format("Socket %d Core %d: %.0f C",i-1,cpuid, tonumber(likwid.readTemp(cpuid))));
+ end
+ end
+ end
+ end
+ end
+ print(likwid.hline)
+end
+
+likwid.putPowerInfo()
+likwid.finalize()
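
The measurement path above reduces to a difference of energy readings: for every selected socket and supported RAPL domain, likwid.startPower()/stopPower() capture the register values before and after the window (re-read every read_interval of 30 s during long runs, presumably to cope with wrap-around of the energy registers), and likwid.calcPower() turns the pair into Joules, which are then divided by the runtime. A minimal sketch of that arithmetic, with hypothetical readings and an assumed energy unit:

-- hypothetical raw readings for one socket/domain, as returned by likwid.startPower()/stopPower()
local before_reading, after_reading = 100000, 165536
local energyUnit = 6.103515625e-05            -- Joules per counter increment (assumed; CPU dependent)
local runtime = 4.0                           -- seconds, from likwid.getClock(time_before, time_after)
local energy = (after_reading - before_reading) * energyUnit
print(string.format("Energy consumed: %g Joules", energy))        -- 4 Joules
print(string.format("Power consumed: %g Watt", energy / runtime)) -- 1 Watt
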
diff --git a/src/applications/likwid-setFrequencies.lua b/src/applications/likwid-setFrequencies.lua
new file mode 100644
index 0000000..7a56921
--- /dev/null
+++ b/src/applications/likwid-setFrequencies.lua
@@ -0,0 +1,396 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ * Filename: likwid-setFrequencies.lua
+ *
+ * Description: An application to set the CPU frequency of CPU cores and domains.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+sys_base_path = "/sys/devices/system/cpu"
+set_command = "<INSTALLED_PREFIX>/sbin/likwid-setFreq"
+
+
+function version()
+ print(string.format("likwid-setFrequencies -- Version %d.%d",likwid.version,likwid.release))
+end
+
+function usage()
+ version()
+ print("A tool to adjust frequencies and governors on x86 CPUs.\n")
+ print("Options:")
+ print("-h\t Help message")
+ print("-v\t Version information")
+ print("-c dom\t Likwid thread domain which to apply settings (default are all CPUs)")
+ print("\t See likwid-pin -h for details")
+ print("-g gov\t Set governor (" .. table.concat(getAvailGovs(nil), ", ") .. ") (set to ondemand if omitted)")
+ print("-f freq\t Set fixed frequency, implicitly sets userspace governor")
+ print("-p\t Print current frequencies")
+ print("-l\t List available frequencies")
+ print("-m\t List available governors")
+end
+
+function getCurrentMinFreq(cpuid)
+ local min = 10000000
+ if cpuid == nil or cpuid < 0 then
+ for cpuid=0,topo["numHWThreads"]-1 do
+ fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_min_freq")
+ if verbosity == 3 then
+ print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_min_freq" )
+ end
+ line = fp:read("*l")
+ if tonumber(line)/1E6 < min then
+ min = tonumber(line)/1E6
+ end
+ fp:close()
+ end
+ else
+ fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_min_freq")
+ if verbosity == 3 then
+ print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_min_freq" )
+ end
+ line = fp:read("*l")
+ if tonumber(line)/1E6 < min then
+ min = tonumber(line)/1E6
+ end
+ fp:close()
+ end
+ return min
+end
+
+function getCurrentMaxFreq(cpuid)
+ local max = 0
+ if cpuid == nil or cpuid < 0 then
+ for cpuid=0,topo["numHWThreads"]-1 do
+ fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_max_freq")
+ if verbosity == 3 then
+ print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_max_freq" )
+ end
+ line = fp:read("*l")
+ if tonumber(line)/1E6 > max then
+ max = tonumber(line)/1E6
+ end
+ fp:close()
+ end
+ else
+ fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_max_freq")
+ if verbosity == 3 then
+ print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_max_freq" )
+ end
+ line = fp:read("*l")
+ if tonumber(line)/1E6 > max then
+ max = tonumber(line)/1E6
+ end
+ fp:close()
+ end
+ return max
+end
+
+
+function getAvailFreq(cpuid)
+ if cpuid == nil then
+ cpuid = 0
+ end
+ if cpuid < 0 then
+ cpuid = 0
+ end
+ fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_available_frequencies")
+ if verbosity == 3 then
+ print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_available_frequencies" )
+ end
+ line = fp:read("*l")
+ fp:close()
+
+ local tmp = likwid.stringsplit(line:gsub("^%s*(.-)%s*$", "%1"), " ", nil, " ")
+ local avail = {}
+ local turbo = tonumber(tmp[1])/1E6
+ local j = 1
+ for i=2,#tmp do
+ local freq = tonumber(tmp[i])/1E6
+ avail[j] = tostring(freq)
+ if not avail[j]:match("%d+.%d+") then
+ avail[j] = avail[j] ..".0"
+ end
+ j = j + 1
+ end
+ if verbosity == 1 then
+ print(string.format("The system provides %d scaling frequencies, frequency %s is taken as turbo mode", #avail,turbo))
+ end
+ return avail, tostring(turbo)
+end
+
+function getCurFreq()
+ local freqs = {}
+ local govs = {}
+ for cpuid=0,topo["numHWThreads"]-1 do
+ local fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_cur_freq")
+ if verbosity == 3 then
+ print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_cur_freq" )
+ end
+ local line = fp:read("*l")
+ fp:close()
+ freqs[cpuid] = tostring(tonumber(line)/1E6)
+ if not freqs[cpuid]:match("%d.%d") then
+ freqs[cpuid] = freqs[cpuid] ..".0"
+ end
+ local fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_governor")
+ if verbosity == 3 then
+ print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_governor" )
+ end
+ local line = fp:read("*l")
+ fp:close()
+ govs[cpuid] = line
+ end
+ return freqs, govs
+end
+
+function getAvailGovs(cpuid)
+ if (cpuid == nil) or (cpuid < 1) then
+ cpuid = 0
+ end
+ local fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_available_governors")
+ if verbosity == 3 then
+ print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_available_governors" )
+ end
+ local line = fp:read("*l")
+ fp:close()
+ local avail = likwid.stringsplit(line:gsub("^%s*(.-)%s*$", "%1"), "%s+", nil, "%s+")
+ for i=1,#avail do
+ if avail[i] == "userspace" then
+ table.remove(avail, i)
+ break
+ end
+ end
+ table.insert(avail, "turbo")
+ if verbosity == 1 then
+ print(string.format("The system provides %d scaling governors", #avail))
+ end
+ return avail
+end
+
+local function testDriver()
+ local fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",0) .. "/cpufreq/scaling_driver")
+ if verbosity == 3 then
+ print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",0) .. "/cpufreq/scaling_driver" )
+ end
+ local line = fp:read("*l")
+ fp:close()
+ if line == "acpi-cpufreq" then
+ return true
+ end
+ return false
+end
+
+verbosity = 0
+governor = nil
+frequency = nil
+domain = nil
+printCurFreq = false
+printAvailFreq = false
+printAvailGovs = false
+
+if #arg == 0 then
+ usage()
+ os.exit(0)
+end
+
+
+for opt,arg in likwid.getopt(arg, {"g:", "c:", "f:", "l", "p", "h", "v", "m", "help","version","freq:"}) do
+ if opt == "h" or opt == "help" then
+ usage()
+ os.exit(0)
+ elseif opt == "v" or opt == "version" then
+ version()
+ os.exit(0)
+ elseif (opt == "c") then
+ domain = arg
+ elseif (opt == "g") then
+ governor = arg
+ elseif opt == "f" or opt == "freq" then
+ frequency = arg
+ elseif (opt == "p") then
+ printCurFreq = true
+ elseif (opt == "l") then
+ printAvailFreq = true
+ elseif (opt == "m") then
+ printAvailGovs = true
+ elseif opt == "?" then
+ print("Invalid commandline option -"..arg)
+ os.exit(1)
+ elseif opt == "!" then
+ print("Option requires an argument")
+ os.exit(1)
+ end
+end
+if not testDriver() then
+ print("The system does not use the acpi-cpufreq driver, other drivers are not usable with likwid-setFrequencies.")
+ os.exit(1)
+end
+
+topo = likwid.getCpuTopology()
+affinity = likwid.getAffinityInfo()
+if not domain or domain == "N" then
+ domain = "N:0-" .. tostring(topo["numHWThreads"]-1)
+end
+if domain:match("[SCM]%d") then
+ for i, dom in pairs(affinity["domains"]) do
+ if dom["tag"]:match(domain) then
+ domain = domain..":0-"..tostring(dom["numberOfProcessors"]-1)
+ end
+ end
+end
+cpulist = {}
+numthreads, cpulist = likwid.cpustr_to_cpulist(domain)
+if verbosity == 3 then
+ print(string.format("Given CPU expression expands to %d CPU cores:", numthreads))
+ local str = tostring(cpulist[1])
+ for i=2, numthreads do
+ str = str .. "," .. tostring(cpulist[i])
+ end
+ print(str)
+end
+
+
+if printAvailGovs then
+ local govs = getAvailGovs(nil)
+ print("Available governors:")
+ print(table.concat(govs, ", "))
+end
+
+if printAvailFreq then
+ print("Available frequencies:")
+ local out = {}
+ local i = 1;
+ local freqs, turbo = getAvailFreq(nil)
+ if turbo ~= "0" then
+ table.insert(out, turbo)
+ end
+ for i=1,#freqs do
+ table.insert(out, freqs[i])
+ end
+
+ print(table.concat(out, " "))
+end
+
+if printCurFreq then
+ print("Current frequencies:")
+ local freqs = {}
+ local govs = {}
+ freqs, govs = getCurFreq()
+ for i=1,#cpulist do
+ print(string.format("CPU %d: governor %12s frequency %5s GHz",cpulist[i],govs[cpulist[i]], freqs[cpulist[i]]))
+ end
+end
+
+if printAvailGovs or printAvailFreq or printCurFreq then
+ os.exit(0)
+end
+
+if numthreads > 0 and not (frequency or governor) then
+ print("You need to set either a frequency or governor for the selected CPUs on commandline")
+ os.exit(1)
+end
+
+if frequency then
+ for i=1,#cpulist do
+ local freqs, turbo = getAvailFreq(cpulist[i])
+ local valid_freq = false
+ for k,v in pairs(freqs) do
+ if (frequency == v) then
+ valid_freq = true
+ break
+ end
+ end
+ if frequency == turbo then
+ valid_freq = true
+ end
+ if not valid_freq then
+ print(string.format("Frequency %s not available for CPU %d! Please select one of\n%s", frequency, cpulist[i], table.concat(freqs, ", ")))
+ os.exit(1)
+ end
+
+ local cmd = set_command .. " " .. tostring(cpulist[i]) .. " " .. tostring(tonumber(frequency)*1E6)
+ if governor then
+ cmd = cmd .. " " .. governor
+ end
+ if verbosity == 3 then
+ print("Execute: ".. cmd)
+ end
+ local err = os.execute(cmd)
+ if err == false or err == nil then
+ print("Failed to set frequency for CPU "..tostring(cpulist[i]))
+ end
+ end
+ if governor then
+ governor = nil
+ end
+end
+
+if governor then
+ local govs = getAvailGovs(nil)
+ local freqs, turbo = getAvailFreq(nil)
+ local cur_freqs, cur_govs = getCurFreq()
+ local valid_gov = false
+ for k,v in pairs(govs) do
+ if (governor == v) then
+ valid_gov = true
+ break
+ end
+ end
+ if governor == "turbo" and turbo ~= "0" then
+ valid_gov = true
+ for i=1,#cpulist do
+ cur_freqs[cpulist[i]] = turbo
+ end
+ end
+ if not valid_gov then
+ print(string.format("Governor %s not available! Please select one of\n%s", governor, table.concat(govs, ", ")))
+ os.exit(1)
+ end
+ for i=1,#cpulist do
+ if governor ~= cur_govs[cpulist[i]] then
+ local cmd = set_command .. " " .. tostring(cpulist[i]) .. " "
+ if governor == "turbo" then
+ cmd = cmd .. tostring(tonumber(turbo)*1E6)
+ else
+ cmd = cmd .. tostring(tonumber(cur_freqs[cpulist[i]])*1E6) .. " " .. governor
+ end
+ if verbosity == 3 then
+ print("Execute: ".. cmd)
+ end
+ local err = os.execute(cmd)
+ if err == false or err == nil then
+ print("Failed to set governor for CPU "..tostring(cpulist[i]))
+ end
+ end
+ end
+end
+likwid.putAffinityInfo()
+likwid.putTopology()
+os.exit(0)
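
likwid-setFrequencies.lua does not write to sysfs itself; for every selected CPU it shells out to the separate likwid-setFreq helper, passing the CPU ID, the target frequency converted from GHz to kHz and, optionally, the governor. A short sketch of the command assembly with hypothetical values:

local set_command = "<INSTALLED_PREFIX>/sbin/likwid-setFreq"
local cpuid, frequency, governor = 3, "2.7", "performance"   -- hypothetical CPU, GHz value and governor
local cmd = set_command .. " " .. tostring(cpuid) .. " " .. tostring(tonumber(frequency) * 1E6)
if governor then
    cmd = cmd .. " " .. governor   -- omitted when only a fixed frequency is requested
end
print(cmd)   -- e.g. <INSTALLED_PREFIX>/sbin/likwid-setFreq 3 2700000 performance
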
diff --git a/src/applications/likwid-topology.c b/src/applications/likwid-topology.c
deleted file mode 100644
index 7ba0e33..0000000
--- a/src/applications/likwid-topology.c
+++ /dev/null
@@ -1,509 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: likwid-topology.c
- *
- * Description: A application to determine the thread and cache topology
- * on x86 processors.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-/* ##### HEADER FILE INCLUDES ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <types.h>
-#include <error.h>
-#include <cpuid.h>
-#include <timer.h>
-#include <affinity.h>
-#include <numa.h>
-#include <cpuFeatures.h>
-#include <tree.h>
-#include <asciiBoxes.h>
-#include <strUtil.h>
-
-/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
-
-#define HELP_MSG \
- fprintf(OUTSTREAM, "\nlikwid-topology -- Version %d.%d \n\n",VERSION,RELEASE); \
- fprintf(OUTSTREAM, "A tool to print the thread and cache topology on x86 CPUs.\n"); \
- fprintf(OUTSTREAM, "Options:\n"); \
- fprintf(OUTSTREAM, "-h\t Help message\n"); \
- fprintf(OUTSTREAM, "-v\t Version information\n"); \
- fprintf(OUTSTREAM, "-c\t list cache information\n"); \
- fprintf(OUTSTREAM, "-C\t measure processor clock\n"); \
- fprintf(OUTSTREAM, "-o\t Store output to file, with output conversion according to file suffix\n"); \
- fprintf(OUTSTREAM, "\t Conversion scripts can be supplied in %s\n",TOSTRING(LIKWIDFILTERPATH)); \
- fprintf(OUTSTREAM, "-g\t graphical output\n\n"); \
- fflush(OUTSTREAM);
-
-#define VERSION_MSG \
- fprintf(OUTSTREAM, "likwid-topology %d.%d \n\n",VERSION,RELEASE); \
- fflush(OUTSTREAM);
-
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-
-int main (int argc, char** argv)
-{
- int optGraphical = 0;
- int optCaches = 0;
- int optClock = 0;
- int c;
- int tmp;
- TreeNode* socketNode;
- TreeNode* coreNode;
- TreeNode* threadNode;
- BoxContainer* container;
- bstring argString;
- bstring filterScript = bfromcstr("NO");
- FILE* OUTSTREAM = stdout;
-
- while ((c = getopt (argc, argv, "hvcCgo:")) != -1)
- {
- switch (c)
- {
- case 'h':
- HELP_MSG;
- exit (EXIT_SUCCESS);
- case 'v':
- VERSION_MSG;
- exit (EXIT_SUCCESS);
- case 'g':
- optGraphical = 1;
- break;
- case 'c':
- optCaches = 1;
- break;
- case 'C':
- optClock = 1;
- break;
- case 'o':
- if (! (argString = bSecureInput(200,optarg)))
- {
- fprintf(stderr, "Failed to read argument string!\n");
- }
-
- OUTSTREAM = bstr_to_outstream(argString, filterScript);
-
- if(!OUTSTREAM)
- {
- fprintf(stderr, "Failed to parse out file pattern.\n");
- }
- break;
- case '?':
- if (isprint (optopt))
- {
- fprintf (stderr, "Unknown option `-%c'.\n", optopt);
- }
- else
- {
- fprintf (stderr,
- "Unknown option character `\\x%x'.\n",
- optopt);
- }
- return EXIT_FAILURE;
- default:
- HELP_MSG;
- exit (EXIT_SUCCESS);
- }
- }
-
- if (cpuid_init() == EXIT_FAILURE)
- {
- ERROR_PLAIN_PRINT(Unsupported processor!);
- }
- affinity_init();
- numa_init();
-
- fprintf(OUTSTREAM, HLINE);
- fprintf(OUTSTREAM, "CPU type:\t%s\n",cpuid_info.name);
-
- if (optClock)
- {
- timer_init();
- fprintf(OUTSTREAM, "CPU clock:\t%3.2f GHz\n", (float) timer_getCpuClock() * 1.E-09);
- }
-
- /*----------------------------------------------------------------------
- * Thread Topology
- *----------------------------------------------------------------------*/
- fprintf(OUTSTREAM, SLINE);
- fprintf(OUTSTREAM, "Hardware Thread Topology\n");
- fprintf(OUTSTREAM, SLINE);
- fprintf(OUTSTREAM, "Sockets:\t%u \n", cpuid_topology.numSockets);
- fprintf(OUTSTREAM, "Cores per socket:\t%u \n", cpuid_topology.numCoresPerSocket);
- fprintf(OUTSTREAM, "Threads per core:\t%u \n", cpuid_topology.numThreadsPerCore);
- fprintf(OUTSTREAM, HLINE);
- fprintf(OUTSTREAM, "HWThread\tThread\t\tCore\t\tSocket\n");
-
- for ( uint32_t i=0; i < cpuid_topology.numHWThreads; i++)
- {
- fprintf(OUTSTREAM, "%d\t\t%u\t\t%u\t\t%u\n",i
- ,cpuid_topology.threadPool[i].threadId
- ,cpuid_topology.threadPool[i].coreId
- ,cpuid_topology.threadPool[i].packageId);
- }
- fprintf(OUTSTREAM, HLINE);
-
- socketNode = tree_getChildNode(cpuid_topology.topologyTree);
- while (socketNode != NULL)
- {
- fprintf(OUTSTREAM, "Socket %d: ( ",socketNode->id);
- coreNode = tree_getChildNode(socketNode);
-
- while (coreNode != NULL)
- {
- threadNode = tree_getChildNode(coreNode);
-
- while (threadNode != NULL)
- {
- fprintf(OUTSTREAM, "%d ",threadNode->id);
- threadNode = tree_getNextNode(threadNode);
- }
- coreNode = tree_getNextNode(coreNode);
- }
- socketNode = tree_getNextNode(socketNode);
- fprintf(OUTSTREAM, ")\n");
- }
- fprintf(OUTSTREAM, HLINE"\n");
- fflush(OUTSTREAM);
-
- /*----------------------------------------------------------------------
- * Cache Topology
- *----------------------------------------------------------------------*/
- fprintf(OUTSTREAM, SLINE);
- fprintf(OUTSTREAM, "Cache Topology\n");
- fprintf(OUTSTREAM, SLINE);
-
- for ( uint32_t i=0; i < cpuid_topology.numCacheLevels; i++)
- {
- if (cpuid_topology.cacheLevels[i].type != INSTRUCTIONCACHE)
- {
- fprintf(OUTSTREAM, "Level:\t%d\n",cpuid_topology.cacheLevels[i].level);
- if (cpuid_topology.cacheLevels[i].size < 1048576)
- {
- fprintf(OUTSTREAM, "Size:\t%d kB\n",
- cpuid_topology.cacheLevels[i].size/1024);
- }
- else
- {
- fprintf(OUTSTREAM, "Size:\t%d MB\n",
- cpuid_topology.cacheLevels[i].size/1048576);
- }
-
- if( optCaches)
- {
- switch (cpuid_topology.cacheLevels[i].type) {
- case DATACACHE:
- fprintf(OUTSTREAM, "Type:\tData cache\n");
- break;
-
- case INSTRUCTIONCACHE:
- fprintf(OUTSTREAM, "Type:\tInstruction cache\n");
- break;
-
- case UNIFIEDCACHE:
- fprintf(OUTSTREAM, "Type:\tUnified cache\n");
- break;
- default:
- /* make the compiler happy */
- break;
- }
- fprintf(OUTSTREAM, "Associativity:\t%d\n",
- cpuid_topology.cacheLevels[i].associativity);
- fprintf(OUTSTREAM, "Number of sets:\t%d\n",
- cpuid_topology.cacheLevels[i].sets);
- fprintf(OUTSTREAM, "Cache line size:\t%d\n",
- cpuid_topology.cacheLevels[i].lineSize);
- if(cpuid_topology.cacheLevels[i].inclusive)
- {
- fprintf(OUTSTREAM, "Non Inclusive cache\n");
- }
- else
- {
- fprintf(OUTSTREAM, "Inclusive cache\n");
- }
- fprintf(OUTSTREAM, "Shared among %d threads\n",
- cpuid_topology.cacheLevels[i].threads);
- }
- fprintf(OUTSTREAM, "Cache groups:\t");
- tmp = cpuid_topology.cacheLevels[i].threads;
- socketNode = tree_getChildNode(cpuid_topology.topologyTree);
- fprintf(OUTSTREAM, "( ");
- while (socketNode != NULL)
- {
- coreNode = tree_getChildNode(socketNode);
-
- while (coreNode != NULL)
- {
- threadNode = tree_getChildNode(coreNode);
-
- while (threadNode != NULL)
- {
-
- if (tmp)
- {
- fprintf(OUTSTREAM, "%d ",threadNode->id);
- tmp--;
- }
- else
- {
- fprintf(OUTSTREAM, ") ( %d ",threadNode->id);
- tmp = cpuid_topology.cacheLevels[i].threads;
- tmp--;
- }
-
- threadNode = tree_getNextNode(threadNode);
- }
- coreNode = tree_getNextNode(coreNode);
- }
- socketNode = tree_getNextNode(socketNode);
- }
- fprintf(OUTSTREAM, ")\n");
-
- fprintf(OUTSTREAM, HLINE);
- }
- }
-
- fprintf(OUTSTREAM, "\n");
- fflush(OUTSTREAM);
-
- /*----------------------------------------------------------------------
- * NUMA Topology
- *----------------------------------------------------------------------*/
- fprintf(OUTSTREAM, SLINE);
- fprintf(OUTSTREAM, "NUMA Topology\n");
- fprintf(OUTSTREAM, SLINE);
-
- if (numa_init() < 0)
- {
- fprintf(OUTSTREAM, "NUMA is not supported on this node!\n");
- }
- else
- {
- fprintf(OUTSTREAM, "NUMA domains: %d \n",numa_info.numberOfNodes);
- fprintf(OUTSTREAM, HLINE);
-
- for ( uint32_t i = 0; i < numa_info.numberOfNodes; i++)
- {
- fprintf(OUTSTREAM, "Domain %d:\n", i);
- fprintf(OUTSTREAM, "Processors: ");
-
- for ( int j = 0; j < numa_info.nodes[i].numberOfProcessors; j++)
- {
- fprintf(OUTSTREAM, " %d",numa_info.nodes[i].processors[j]);
- }
- fprintf(OUTSTREAM, "\n");
-
- fprintf(OUTSTREAM, "Relative distance to nodes: ");
-
- for ( int j = 0; j < numa_info.nodes[i].numberOfDistances; j++)
- {
- fprintf(OUTSTREAM, " %d",numa_info.nodes[i].distances[j]);
- }
- fprintf(OUTSTREAM, "\n");
-
- fprintf(OUTSTREAM, "Memory: %g MB free of total %g MB\n",
- numa_info.nodes[i].freeMemory/1024.0, numa_info.nodes[i].totalMemory/1024.0);
- fprintf(OUTSTREAM, HLINE);
- }
- }
- fprintf(OUTSTREAM, "\n");
- fflush(OUTSTREAM);
-
- /*----------------------------------------------------------------------
- * Graphical topology
- *----------------------------------------------------------------------*/
- if(optGraphical)
- {
- int j;
- bstring boxLabel = bfromcstr("0");
-
- fprintf(OUTSTREAM, SLINE);
- fprintf(OUTSTREAM, "Graphical:\n");
- fprintf(OUTSTREAM, SLINE);
-
- /* Allocate without instruction cache */
- if ( cpuid_info.family == P6_FAMILY || cpuid_info.family == MIC_FAMILY )
- {
- container = asciiBoxes_allocateContainer(
- cpuid_topology.numCacheLevels,
- cpuid_topology.numCoresPerSocket);
- }
- else
- {
- container = asciiBoxes_allocateContainer(
- cpuid_topology.numCacheLevels+1,
- cpuid_topology.numCoresPerSocket);
- }
-
- socketNode = tree_getChildNode(cpuid_topology.topologyTree);
- while (socketNode != NULL)
- {
- fprintf(OUTSTREAM, "Socket %d:\n",socketNode->id);
- j=0;
- coreNode = tree_getChildNode(socketNode);
-
- /* add threads */
- while (coreNode != NULL)
- {
- threadNode = tree_getChildNode(coreNode);
- tmp =0;
-
- while (threadNode != NULL)
- {
- if (tmp > 0)
- {
- bformata(boxLabel," %d", threadNode->id);
- }
- else
- {
- boxLabel = bformat("%d",threadNode->id);
- }
- tmp++;
- threadNode = tree_getNextNode(threadNode);
- }
- asciiBoxes_addBox(container, 0, j, boxLabel);
- j++;
- coreNode = tree_getNextNode(coreNode);
- }
-
- /* add caches */
- {
- int columnCursor=0;
- int lineCursor=1;
- uint32_t sharedCores;
- int numCachesPerLevel;
- int cacheWidth;
-
- for ( uint32_t i=0; i < cpuid_topology.numCacheLevels; i++ )
- {
- sharedCores = cpuid_topology.cacheLevels[i].threads /
- cpuid_topology.numThreadsPerCore;
-
- if (cpuid_topology.cacheLevels[i].type != INSTRUCTIONCACHE)
- {
- if ( sharedCores > cpuid_topology.numCoresPerSocket )
- {
- numCachesPerLevel = 1;
- }
- else
- {
- numCachesPerLevel =
- cpuid_topology.numCoresPerSocket/sharedCores;
- }
-
- columnCursor=0;
- for ( j=0; j < numCachesPerLevel; j++ )
- {
- if (cpuid_topology.cacheLevels[i].size < 1048576)
- {
- boxLabel = bformat("%dkB",
- cpuid_topology.cacheLevels[i].size/1024);
- }
- else
- {
- boxLabel = bformat("%dMB",
- cpuid_topology.cacheLevels[i].size/1048576);
- }
-
- if (sharedCores > 1)
- {
- if (sharedCores > cpuid_topology.numCoresPerSocket)
- {
- cacheWidth = cpuid_topology.numCoresPerSocket-1;
- }
- else
- {
- cacheWidth = sharedCores-1;
- }
- asciiBoxes_addJoinedBox(
- container,
- lineCursor,
- columnCursor,
- columnCursor+cacheWidth,
- boxLabel);
-
- columnCursor += sharedCores;
- }
- else
- {
- asciiBoxes_addBox(
- container,
- lineCursor,
- columnCursor,
- boxLabel);
-
- columnCursor++;
- }
-
- }
- lineCursor++;
- }
- }
- }
-
- asciiBoxes_print(OUTSTREAM, container);
- socketNode = tree_getNextNode(socketNode);
- }
- bdestroy(boxLabel);
- }
-
- fflush(OUTSTREAM);
-
- /* call filterscript if specified */
- if (!biseqcstr(filterScript,"NO"))
- {
- struct bstrList* tokens;
- tokens = bsplit(filterScript,' ');
- if (access(bdata(tokens->entry[0]), F_OK))
- {
- fprintf(stderr, "Cannot find filter %s!\n", bdata(tokens->entry[0]));
- bstrListDestroy(tokens);
- exit(EXIT_FAILURE);
- }
- if (access(bdata(tokens->entry[0]), X_OK))
- {
- fprintf(stderr, "Cannot execute filter %s!\n", bdata(tokens->entry[0]));
- bstrListDestroy(tokens);
- exit(EXIT_FAILURE);
- }
- bstrListDestroy(tokens);
- bcatcstr(filterScript, " topology");
-
- if (system(bdata(filterScript)) == EOF)
- {
- fprintf(stderr, "Failed to execute filter %s!\n", bdata(filterScript));
- exit(EXIT_FAILURE);
- }
- }
-
- return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-topology.lua b/src/applications/likwid-topology.lua
new file mode 100644
index 0000000..0123f65
--- /dev/null
+++ b/src/applications/likwid-topology.lua
@@ -0,0 +1,394 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ * Filename: likwid-topology.lua
+ *
+ * Description: An application to determine the thread and cache topology
+ * on x86 processors.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+stdout_print = print
+
+function version()
+ print(string.format("likwid-topology -- Version %d.%d",likwid.version,likwid.release))
+end
+
+function usage()
+ version()
+ print("A tool to print the thread and cache topology on x86 CPUs.\n")
+ print("Options:")
+ print("-h, --help\t\t Help message")
+ print("-v, --version\t\t Version information")
+ print("-V, --verbose <level>\t Set verbosity")
+ print("-c, --caches\t\t List cache information")
+ print("-C, --clock\t\t Measure processor clock")
+ print("-O\t\t\t CSV output")
+ print("-o, --output <file>\t Store output to file. (Optional: Apply text filter)")
+ print("-g\t\t\t Graphical output")
+end
+
+print_caches = false
+print_graphical = false
+measure_clock = false
+outfile = nil
+output_csv = {}
+
+for opt,arg in likwid.getopt(arg, {"h","v","c","C","g","o:","V:","O","help","version","verbose:","clock","caches","output:"}) do
+ if (type(arg) == "string") then
+ local s,e = arg:find("-");
+ if s == 1 then
+ print(string.format("Argument %s to option -%s starts with invalid character -.", arg, opt))
+ print("Did you forget an argument to an option?")
+ os.exit(1)
+ end
+ end
+ if opt == "h" or opt == "help" then
+ usage()
+ os.exit(0)
+ elseif opt == "v" or opt == "version" then
+ version()
+ os.exit(0)
+ elseif opt == "V" or opt == "verbose" then
+ if tonumber(arg) >= 0 and tonumber(arg) <=3 then
+ likwid.setVerbosity(tonumber(arg))
+ else
+ print("Verbosity level not valid. Must be between 0 (only errors) and 3 (developer output)")
+ end
+ elseif opt == "c" or opt == "caches" then
+ print_caches = true
+ elseif opt == "C" or opt == "clock" then
+ measure_clock = true
+ elseif opt == "g" then
+ print_graphical = true
+ elseif opt == "O" then
+ print_csv = true
+ elseif opt == "o" or opt == "output" then
+ local suffix = ""
+ if string.match(arg, "%.") then
+ suffix = string.match(arg, ".-[^\\/]-%.?([^%.\\/]*)$")
+ end
+ if suffix ~= "txt" then
+ print_csv = true
+ end
+ outfile = arg:gsub("%%h", likwid.gethostname())
+ io.output(arg..".tmp")
+ print = function(...) for k,v in pairs({...}) do io.write(v .. "\n") end end
+ elseif opt == "?" then
+ print("Invalid commandline option -"..arg)
+ os.exit(1)
+ elseif opt == "!" then
+ print("Option requires an argument")
+ os.exit(1)
+ end
+end
+
+local config = likwid.getConfiguration()
+local cpuinfo = likwid.getCpuInfo()
+local cputopo = likwid.getCpuTopology()
+local numainfo = likwid.getNumaInfo()
+local affinity = likwid.getAffinityInfo()
+
+
+table.insert(output_csv, likwid.hline)
+local lines = 3
+if measure_clock then
+ lines = 4
+end
+table.insert(output_csv, "STRUCT,Info,"..tostring(lines))
+table.insert(output_csv, string.format("CPU name:\t%s",cpuinfo["osname"]))
+table.insert(output_csv, string.format("CPU type:\t%s",cpuinfo["name"]))
+table.insert(output_csv, string.format("CPU stepping:\t%s",cpuinfo["stepping"]))
+if (measure_clock) then
+ if cpuinfo["clock"] == 0 then
+ table.insert(output_csv, string.format("CPU clock:\t%3.2f GHz", likwid.getCpuClock() * 1.E-09))
+ else
+ table.insert(output_csv, string.format("CPU clock:\t%3.2f GHz", cpuinfo["clock"] * 1.E-09))
+ end
+end
+
+table.insert(output_csv, likwid.sline)
+table.insert(output_csv, "STRUCT,Hardware Thread Topology,3")
+table.insert(output_csv, "Hardware Thread Topology")
+table.insert(output_csv, likwid.sline)
+table.insert(output_csv, string.format("Sockets:\t\t%u",cputopo["numSockets"]))
+table.insert(output_csv, string.format("Cores per socket:\t%u",cputopo["numCoresPerSocket"]))
+table.insert(output_csv, string.format("Threads per core:\t%u",cputopo["numThreadsPerCore"]))
+table.insert(output_csv, likwid.hline)
+table.insert(output_csv, "TABLE,Topology,"..tostring(cputopo["numHWThreads"]))
+table.insert(output_csv, "HWThread\tThread\t\tCore\t\tSocket\t\tAvailable")
+
+for cntr=0,cputopo["numHWThreads"]-1 do
+ if cputopo["threadPool"][cntr]["inCpuSet"] then
+ table.insert(output_csv, string.format("%d\t\t%u\t\t%u\t\t%u\t\t*",cntr,
+ cputopo["threadPool"][cntr]["threadId"],
+ cputopo["threadPool"][cntr]["coreId"],
+ cputopo["threadPool"][cntr]["packageId"]))
+ else
+ table.insert(output_csv, string.format("%d\t\t%u\t\t%u\t\t%u",cntr,
+ cputopo["threadPool"][cntr]["threadId"],
+ cputopo["threadPool"][cntr]["coreId"],
+ cputopo["threadPool"][cntr]["packageId"]))
+ end
+end
+table.insert(output_csv, likwid.hline)
+
+table.insert(output_csv, "STRUCT,Sockets,"..tostring(cputopo["numSockets"]))
+for socket=0,cputopo["numSockets"]-1 do
+ csv_str = string.format("Socket %d:\t\t( ",cputopo["topologyTree"][socket]["ID"])
+ for core=0,cputopo["numCoresPerSocket"]-1 do
+ for thread=0, cputopo["numThreadsPerCore"]-1 do
+ csv_str = csv_str ..tostring(cputopo["topologyTree"][socket]["Childs"][core]["Childs"][thread]).. ","
+ end
+ end
+ table.insert(output_csv, csv_str:sub(1,#csv_str-1).." )")
+end
+
+table.insert(output_csv, likwid.hline)
+
+
+table.insert(output_csv, likwid.sline)
+table.insert(output_csv, "Cache Topology")
+table.insert(output_csv, likwid.sline)
+
+for level=1,cputopo["numCacheLevels"] do
+ if (cputopo["cacheLevels"][level]["type"] ~= "INSTRUCTIONCACHE") then
+ lines = 3
+ if print_caches then lines = 9 end
+ table.insert(output_csv, string.format("STRUCT,Cache Topology L%d,%d", cputopo["cacheLevels"][level]["level"],lines))
+ table.insert(output_csv, string.format("Level:\t\t\t%d",cputopo["cacheLevels"][level]["level"]))
+ if (cputopo["cacheLevels"][level]["size"] < 1048576) then
+ table.insert(output_csv, string.format("Size:\t\t\t%.0f kB",cputopo["cacheLevels"][level]["size"]/1024))
+ else
+ table.insert(output_csv, string.format("Size:\t\t\t%.0f MB",cputopo["cacheLevels"][level]["size"]/1048576))
+ end
+
+ if (print_caches) then
+ if (cputopo["cacheLevels"][level]["type"] == "DATACACHE") then
+ table.insert(output_csv, "Type:\t\t\tData cache")
+ elseif (cputopo["cacheLevels"][level]["type"] == "UNIFIEDCACHE") then
+ table.insert(output_csv, "Type:\t\t\tUnified cache")
+ end
+
+ table.insert(output_csv, string.format("Associativity:\t\t%d",cputopo["cacheLevels"][level]["associativity"]))
+ table.insert(output_csv, string.format("Number of sets:\t\t%d",cputopo["cacheLevels"][level]["sets"]))
+ table.insert(output_csv, string.format("Cache line size:\t%d",cputopo["cacheLevels"][level]["lineSize"]))
+
+ if (cputopo["cacheLevels"][level]["inclusive"] == 0) then
+ table.insert(output_csv, "Cache type:\t\tNon Inclusive")
+ else
+ table.insert(output_csv, "Cache type:\t\tInclusive")
+ end
+ table.insert(output_csv, string.format("Shared by threads:\t%d",cputopo["cacheLevels"][level]["threads"]))
+ end
+ local threads = cputopo["cacheLevels"][level]["threads"]
+ str = "Cache groups:\t\t( "
+ for socket=0,cputopo["numSockets"]-1 do
+ for core=0,cputopo["numCoresPerSocket"]-1 do
+ for cpu=0,cputopo["numThreadsPerCore"]-1 do
+ if (threads ~= 0) then
+ str = str .. cputopo["topologyTree"][socket]["Childs"][core]["Childs"][cpu] .. " "
+ threads = threads - 1
+ else
+ str = str .. string.format(") ( %d ",cputopo["topologyTree"][socket]["Childs"][core]["Childs"][cpu])
+ threads = cputopo["cacheLevels"][level]["threads"]
+ threads = threads - 1
+ end
+ end
+ end
+ end
+ str = str .. ")"
+ table.insert(output_csv, str)
+ table.insert(output_csv, likwid.hline)
+ end
+end
+
+
+table.insert(output_csv, likwid.sline)
+table.insert(output_csv, "NUMA Topology")
+table.insert(output_csv, likwid.sline)
+
+if (numainfo["numberOfNodes"] == 0) then
+ table.insert(output_csv, "No NUMA")
+else
+ table.insert(output_csv, string.format("NUMA domains:\t\t%d",numainfo["numberOfNodes"]))
+ table.insert(output_csv, likwid.hline)
+ for node=1,numainfo["numberOfNodes"] do
+ table.insert(output_csv, string.format("STRUCT,NUMA Topology %d,5",numainfo["nodes"][node]["id"]))
+ table.insert(output_csv, string.format("Domain:\t\t\t%d",numainfo["nodes"][node]["id"]))
+ csv_str = "Processors:\t\t( "
+ for cpu=1,numainfo["nodes"][node]["numberOfProcessors"] do
+ csv_str = csv_str .. numainfo["nodes"][node]["processors"][cpu] .. ","
+ end
+ table.insert(output_csv, csv_str:sub(1,#csv_str-1).. " )")
+ csv_str = "Distances:\t\t"
+ for cpu=1,numainfo["nodes"][node]["numberOfDistances"] do
+ csv_str = csv_str .. numainfo["nodes"][node]["distances"][cpu][cpu-1] .. ","
+ end
+ table.insert(output_csv, csv_str:sub(1,#csv_str-1))
+ table.insert(output_csv, string.format("Free memory:\t\t%g MB",tonumber(numainfo["nodes"][node]["freeMemory"]/1024.0)))
+ table.insert(output_csv, string.format("Total memory:\t\t%g MB",tonumber(numainfo["nodes"][node]["totalMemory"]/1024.0)))
+ table.insert(output_csv, likwid.hline)
+ end
+end
+
+
+
+if print_csv then
+ longest_line = 0
+ local tmpList = {}
+ for i=#output_csv,1,-1 do
+ output_csv[i] = output_csv[i]:gsub("[\t]+",",")
+ output_csv[i] = output_csv[i]:gsub("%( ","")
+ output_csv[i] = output_csv[i]:gsub(" %)[%s]*",",")
+ output_csv[i] = output_csv[i]:gsub(",$","")
+ if output_csv[i]:sub(1,1) == "*" or
+ output_csv[i]:sub(1,1) == "-" or
+ output_csv[i]:match("^Hardware Thread Topology") or
+ output_csv[i]:match("^Cache Topology") or
+ output_csv[i]:match("^NUMA Topology") then
+ table.remove(output_csv,i)
+ end
+ tmpList = likwid.stringsplit(output_csv[i],",")
+ if #tmpList > longest_line then longest_line = #tmpList end
+ end
+ for i=1,#output_csv do
+ tmpList = likwid.stringsplit(output_csv[i],",")
+ if #tmpList < longest_line then
+ output_csv[i] = output_csv[i]..string.rep(",",longest_line-#tmpList)
+ end
+ end
+else
+ for i=#output_csv,1,-1 do
+ output_csv[i] = output_csv[i]:gsub(","," ")
+ if output_csv[i]:match("^TABLE") or
+ output_csv[i]:match("^STRUCT") then
+ table.remove(output_csv,i)
+ end
+ end
+end
+
+for _,line in pairs(output_csv) do print(line) end
+
+if print_graphical and not print_csv then
+ print("\n")
+ print(likwid.sline)
+ print("Graphical Topology")
+ print(likwid.sline)
+ for socket=0,cputopo["numSockets"]-1 do
+ print(string.format("Socket %d:",cputopo["topologyTree"][socket]["ID"]))
+ container = {}
+ for core=0,cputopo["numCoresPerSocket"]-1 do
+ local tmpString = ""
+ for thread=0,cputopo["numThreadsPerCore"]-1 do
+ if thread == 0 then
+ tmpString = tmpString .. tostring(cputopo["topologyTree"][socket]["Childs"][core]["Childs"][thread])
+ else
+ tmpString = tmpString .. " " .. tostring(cputopo["topologyTree"][socket]["Childs"][core]["Childs"][thread]).. " "
+ end
+ end
+ likwid.addSimpleAsciiBox(container, 1, core+1, tmpString)
+ end
+
+ local columnCursor = 1
+ local lineCursor = 2
+ for cache=1,cputopo["numCacheLevels"] do
+ if cputopo["cacheLevels"][cache]["type"] ~= "INSTRUCTIONCACHE" then
+ local cachesAtCurLevel = 0
+ local sharedCores = cputopo["cacheLevels"][cache]["threads"]/cputopo["numThreadsPerCore"]
+ if sharedCores >= cputopo["numCoresPerSocket"] then
+ cachesAtCurLevel = 1
+ else
+ cachesAtCurLevel = cputopo["numCoresPerSocket"]/sharedCores
+ end
+ columnCursor = 1
+ for cachesAtLevel=1,cachesAtCurLevel do
+ local tmpString = ""
+ local cacheWidth = 0
+ if cputopo["cacheLevels"][cache]["size"] < 1048576 then
+ tmpString = string.format("%dkB", cputopo["cacheLevels"][cache]["size"]/1024)
+ else
+ tmpString = string.format("%dMB", cputopo["cacheLevels"][cache]["size"]/1048576)
+ end
+ if sharedCores > 1 then
+ if sharedCores > cputopo["numCoresPerSocket"] then
+ cacheWidth = sharedCores
+ else
+ cacheWidth = sharedCores - 1
+ end
+ likwid.addJoinedAsciiBox(container, lineCursor, columnCursor,columnCursor + cacheWidth, tmpString)
+ columnCursor = columnCursor + cacheWidth
+ else
+ likwid.addSimpleAsciiBox(container, lineCursor, columnCursor, tmpString)
+ columnCursor = columnCursor + 1
+ end
+ end
+ lineCursor = lineCursor + 1
+ end
+ end
+ likwid.printAsciiBox(container);
+ end
+end
+
+if outfile then
+ local suffix = ""
+ if string.match(outfile, "%.") then
+ suffix = string.match(outfile, ".-[^\\/]-%.?([^%.\\/]*)$")
+ end
+ local command = "<INSTALLED_PREFIX>/share/likwid/filter/" .. suffix
+ local tmpfile = outfile..".tmp"
+ if suffix == "" then
+ os.rename(tmpfile, outfile)
+ elseif suffix ~= "txt" and suffix ~= "csv" and likwid.access(command,"x") then
+ stdout_print("Cannot find filter script, saving output in CSV format to file "..outfile)
+ os.rename(tmpfile, outfile)
+ else
+ if suffix ~= "txt" and suffix ~= "csv" then
+ command = command .." ".. tmpfile .. " topology"
+ local f = assert(io.popen(command))
+ if f ~= nil then
+ local o = f:read("*a")
+ if o:len() > 0 then
+ stdout_print(string.format("Failed to execute filter script %s.",command))
+ end
+ else
+ stdout_print("Failed to call filter script, saving output in CSV format to file "..outfile)
+ os.rename(tmpfile, outfile)
+ os.remove(tmpfile)
+ end
+ else
+ os.rename(tmpfile, outfile)
+ os.remove(tmpfile)
+ end
+ end
+end
+
+likwid.putAffinityInfo()
+likwid.putNumaInfo()
+likwid.putTopology()
+likwid.putConfiguration()
+os.exit(0)
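
The socket listing and the cache-group strings above both walk the same nested table returned by likwid.getCpuTopology(): topologyTree[socket]["Childs"][core]["Childs"][thread] holds the hardware-thread ID. The following is a minimal standalone sketch of that traversal, assuming the Lua module shipped with this version is installed and on package.path; all field and function names are taken from the code above.

    local likwid = require("likwid")
    local config = likwid.getConfiguration()
    local cputopo = likwid.getCpuTopology()

    -- One line per socket, listing the hardware-thread IDs it contains,
    -- in the order used for the "Socket n: ( ... )" lines above.
    for socket = 0, cputopo["numSockets"] - 1 do
        local ids = {}
        for core = 0, cputopo["numCoresPerSocket"] - 1 do
            for thread = 0, cputopo["numThreadsPerCore"] - 1 do
                table.insert(ids, cputopo["topologyTree"][socket]["Childs"][core]["Childs"][thread])
            end
        end
        print(string.format("Socket %d: ( %s )", cputopo["topologyTree"][socket]["ID"], table.concat(ids, " ")))
    end

    likwid.putTopology()
    likwid.putConfiguration()
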
diff --git a/src/applications/likwid.lua b/src/applications/likwid.lua
new file mode 100644
index 0000000..a6ffee3
--- /dev/null
+++ b/src/applications/likwid.lua
@@ -0,0 +1,1142 @@
+--[[
+ * =======================================================================================
+ *
+ * Filename: likwid.lua
+ *
+ * Description: Lua LIKWID interface library
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+local likwid = {}
+package.cpath = '<INSTALLED_LIBPREFIX>/?.so;' .. package.cpath
+require("liblikwid")
+require("math")
+
+likwid.groupfolder = "<LIKWIDGROUPPATH>"
+
+likwid.version = <VERSION>
+likwid.release = <RELEASE>
+likwid.pinlibpath = "<LIBLIKWIDPIN>"
+likwid.dline = string.rep("=",80)
+likwid.hline = string.rep("-",80)
+likwid.sline = string.rep("*",80)
+
+
+
+likwid.getConfiguration = likwid_getConfiguration
+likwid.setGroupPath = likwid_setGroupPath
+likwid.putConfiguration = likwid_putConfiguration
+likwid.setAccessClientMode = likwid_setAccessClientMode
+likwid.init = likwid_init
+likwid.addEventSet = likwid_addEventSet
+likwid.setupCounters = likwid_setupCounters
+likwid.startCounters = likwid_startCounters
+likwid.stopCounters = likwid_stopCounters
+likwid.readCounters = likwid_readCounters
+likwid.switchGroup = likwid_switchGroup
+likwid.finalize = likwid_finalize
+likwid.getEventsAndCounters = likwid_getEventsAndCounters
+likwid.getResult = likwid_getResult
+likwid.getLastResult = likwid_getLastResult
+likwid.getMetric = likwid_getMetric
+likwid.getLastMetric = likwid_getLastMetric
+likwid.getNumberOfGroups = likwid_getNumberOfGroups
+likwid.getRuntimeOfGroup = likwid_getRuntimeOfGroup
+likwid.getIdOfActiveGroup = likwid_getIdOfActiveGroup
+likwid.getNumberOfEvents = likwid_getNumberOfEvents
+likwid.getNumberOfThreads = likwid_getNumberOfThreads
+likwid.getNumberOfMetrics = likwid_getNumberOfMetrics
+likwid.getNameOfMetric = likwid_getNameOfMetric
+likwid.getNameOfEvent = likwid_getNameOfEvent
+likwid.getNameOfCounter = likwid_getNameOfCounter
+likwid.getNameOfGroup = likwid_getNameOfGroup
+likwid.getGroups = likwid_getGroups
+likwid.getShortInfoOfGroup = likwid_getShortInfoOfGroup
+likwid.getLongInfoOfGroup = likwid_getLongInfoOfGroup
+likwid.getCpuInfo = likwid_getCpuInfo
+likwid.getCpuTopology = likwid_getCpuTopology
+likwid.putTopology = likwid_putTopology
+likwid.getNumaInfo = likwid_getNumaInfo
+likwid.putNumaInfo = likwid_putNumaInfo
+likwid.setMemInterleaved = likwid_setMemInterleaved
+likwid.getAffinityInfo = likwid_getAffinityInfo
+likwid.putAffinityInfo = likwid_putAffinityInfo
+likwid.getPowerInfo = likwid_getPowerInfo
+likwid.putPowerInfo = likwid_putPowerInfo
+likwid.getOnlineDevices = likwid_getOnlineDevices
+likwid.printSupportedCPUs = likwid_printSupportedCPUs
+likwid.getCpuClock = likwid_getCpuClock
+likwid.getCycleClock = likwid_getCycleClock
+likwid.startClock = likwid_startClock
+likwid.stopClock = likwid_stopClock
+likwid.getClockCycles = likwid_getClockCycles
+likwid.getClock = likwid_getClock
+likwid.sleep = sleep
+likwid.startPower = likwid_startPower
+likwid.stopPower = likwid_stopPower
+likwid.calcPower = likwid_printEnergy
+likwid.getPowerLimit = likwid_powerLimitGet
+likwid.setPowerLimit = likwid_powerLimitSet
+likwid.statePowerLimit = likwid_powerLimitState
+likwid.initTemp = likwid_initTemp
+likwid.readTemp = likwid_readTemp
+likwid.memSweep = likwid_memSweep
+likwid.memSweepDomain = likwid_memSweepDomain
+likwid.pinProcess = likwid_pinProcess
+likwid.setenv = likwid_setenv
+likwid.getpid = likwid_getpid
+likwid.setVerbosity = likwid_setVerbosity
+likwid.access = likwid_access
+likwid.startProgram = likwid_startProgram
+likwid.checkProgram = likwid_checkProgram
+likwid.killProgram = likwid_killProgram
+likwid.catchSignal = likwid_catchSignal
+likwid.getSignalState = likwid_getSignalState
+likwid.waitpid = likwid_waitwid
+likwid.cpustr_to_cpulist = likwid_cpustr_to_cpulist
+likwid.nodestr_to_nodelist = likwid_nodestr_to_nodelist
+likwid.sockstr_to_socklist = likwid_sockstr_to_socklist
+likwid.markerInit = likwid_markerInit
+likwid.markerThreadInit = likwid_markerThreadInit
+likwid.markerClose = likwid_markerClose
+likwid.markerNextGroup = likwid_markerNextGroup
+likwid.registerRegion = likwid_registerRegion
+likwid.startRegion = likwid_startRegion
+likwid.stopRegion = likwid_stopRegion
+likwid.getRegion = likwid_getRegion
+likwid.initCpuFeatures = likwid_cpuFeaturesInit
+likwid.getCpuFeatures = likwid_cpuFeaturesGet
+likwid.enableCpuFeatures = likwid_cpuFeaturesEnable
+likwid.disableCpuFeatures = likwid_cpuFeaturesDisable
+likwid.readMarkerFile = likwid_readMarkerFile
+likwid.destroyMarkerFile = likwid_destroyMarkerFile
+likwid.markerNumRegions = likwid_markerNumRegions
+likwid.markerRegionGroup = likwid_markerRegionGroup
+likwid.markerRegionTag = likwid_markerRegionTag
+likwid.markerRegionEvents = likwid_markerRegionEvents
+likwid.markerRegionCpulist = likwid_markerRegionCpulist
+likwid.markerRegionThreads = likwid_markerRegionThreads
+likwid.markerRegionTime = likwid_markerRegionTime
+likwid.markerRegionCount = likwid_markerRegionCount
+likwid.markerRegionResult = likwid_markerRegionResult
+likwid.markerRegionMetric = likwid_markerRegionMetric
+
+likwid.cpuFeatures = { [0]="HW_PREFETCHER", [1]="CL_PREFETCHER", [2]="DCU_PREFETCHER", [3]="IP_PREFETCHER",
+ [4]="FAST_STRINGS", [5]="THERMAL_CONTROL", [6]="PERF_MON", [7]="FERR_MULTIPLEX",
+ [8]="BRANCH_TRACE_STORAGE", [9]="XTPR_MESSAGE", [10]="PEBS", [11]="SPEEDSTEP",
+ [12]="MONITOR", [13]="SPEEDSTEP_LOCK", [14]="CPUID_MAX_VAL", [15]="XD_BIT",
+ [16]="DYN_ACCEL", [17]="TURBO_MODE", [18]="TM2" }
+
+infinity = math.huge
+
+
+local function getopt(args, ostrlist)
+ local arg, place,placeend = nil, 0, 0;
+ return function ()
+ if place == 0 then -- update scanning pointer
+ place = 1
+ if #args == 0 or args[1]:sub(1, 1) ~= '-' then place = 0; return nil end
+ if #args[1] >= 2 then
+ if args[1]:sub(2, 2) == '-' then
+ if #args[1] == 2 then -- found "--"
+ place = 0
+ table.remove(args, 1)
+ return args[1], nil
+ end
+ place = place + 1
+ end
+ if args[1]:sub(3, 3) == '-' then
+ place = 0
+ table.remove(args, 1)
+ return args[1], nil
+ end
+ place = place + 1
+ placeend = #args[1]
+ end
+ end
+ local optopt = args[1]:sub(place, placeend)
+ place = place + 1;
+ local givopt = ""
+ local needarg = false
+ for _, ostr in pairs(ostrlist) do
+ local matchstring = "^"..ostr.."$"
+ placeend = place + #ostr -1
+ if ostr:sub(#ostr,#ostr) == ":" then
+ matchstring = "^"..ostr:sub(1,#ostr-1).."$"
+ needarg = true
+ placeend = place + #ostr -2
+ end
+ if optopt:match(matchstring) then
+ givopt = ostr
+ break
+ end
+ needarg = false
+ end
+ if givopt == "" then -- unknown option
+ if optopt == '-' then return nil end
+ if place > #args[1] then
+ table.remove(args, 1)
+ place = 0;
+ end
+ return '?', optopt;
+ end
+
+ if not needarg then -- do not need argument
+ arg = true;
+ table.remove(args, 1)
+ place = 0;
+ else -- need an argument
+ if placeend < #args[1] then -- no white space
+ arg = args[1]:sub(placeend,#args[1])
+ else
+ table.remove(args, 1);
+ if #args == 0 then -- an option requiring argument is the last one
+ place = 0
+ if givopt:sub(placeend, placeend) == ':' then return ':' end
+ return '!', optopt
+ else arg = args[1] end
+ end
+ table.remove(args, 1)
+ place = 0;
+ end
+ return optopt, arg
+ end
+end
+
+
+likwid.getopt = getopt
+
+local function tablelength(T)
+ local count = 0
+ if T == nil then return count end
+ if type(T) ~= "table" then return count end
+ for _ in pairs(T) do count = count + 1 end
+ return count
+end
+
+likwid.tablelength = tablelength
+
+local function tableprint(T, long)
+ if T == nil or type(T) ~= "table" or tablelength(T) == 0 then
+ print("[]")
+ return
+ end
+ local start_index = 0
+ local end_index = #T
+ if T[start_index] == nil then
+ start_index = 1
+ end_index = #T
+ end
+ outstr = ""
+ if T[start_index] ~= nil then
+ for i=start_index,end_index do
+ if not long then
+ outstr = outstr .. "," .. tostring(T[i])
+ else
+ outstr = outstr .. "," .. "[" .. tostring(i) .. "] = ".. tostring(T[i])
+ end
+ end
+ else
+ for k,v in pairs(T) do
+ if not long then
+ outstr = outstr .. "," .. tostring(v)
+ else
+ outstr = outstr .. "," .. "[" .. tostring(k) .. "] = ".. tostring(v)
+ end
+ end
+ end
+ print("["..outstr:sub(2,outstr:len()).."]")
+end
+
+likwid.tableprint = tableprint
+
+local function get_spaces(str, min_space, max_space)
+ local length = str:len()
+ local back = 0
+ local front = 0
+ back = math.ceil((max_space-str:len()) /2)
+ front = max_space - back - str:len()
+
+ if (front < back) then
+ local tmp = front
+ front = back
+ back = tmp
+ end
+ return string.rep(" ", front),string.rep(" ", back)
+end
+
+local function calculate_metric(formula, counters_to_values)
+ local function cmp(a,b)
+ if a:len() > b:len() then return true end
+ return false
+ end
+ local result = "Nan"
+ local err = false
+ local clist = {}
+ for counter,value in pairs(counters_to_values) do
+ table.insert(clist, counter)
+ end
+ table.sort(clist, cmp)
+ for _,counter in pairs(clist) do
+ formula = string.gsub(formula, tostring(counter), tostring(counters_to_values[counter]))
+ end
+ for c in formula:gmatch"." do
+ if c ~= "+" and c ~= "-" and c ~= "*" and c ~= "/" and c ~= "(" and c ~= ")" and c ~= "." and c:lower() ~= "e" then
+ local tmp = tonumber(c)
+ if type(tmp) ~= "number" then
+ print("Not all formula entries can be substituted with measured values")
+ print("Current formula: "..formula)
+ err = true
+ break
+ end
+ end
+ end
+ if not err then
+ if formula then
+ result = assert(load("return (" .. formula .. ")")())
+ if (result == nil or result ~= result or result == infinity or result == -infinity) then
+ result = 0
+ end
+ else
+ result = 0
+ end
+ end
+ return result
+end
+
+likwid.calculate_metric = calculate_metric
+
+local function printtable(tab)
+ local nr_columns = tablelength(tab)
+ if nr_columns == 0 then
+ print("Table has no columns. Empty table?")
+ return
+ end
+ local nr_lines = tablelength(tab[1])
+ local min_lengths = {}
+ local max_lengths = {}
+ for i, col in pairs(tab) do
+ if tablelength(col) ~= nr_lines then
+ print("Not all columns have the same row count, nr_lines "..tostring(nr_lines)..", current "..tablelength(col))
+ return
+ end
+ if min_lengths[i] == nil then
+ min_lengths[i] = 10000000
+ max_lengths[i] = 0
+ end
+ for j, field in pairs(col) do
+ if tostring(field):len() > max_lengths[i] then
+ max_lengths[i] = tostring(field):len()
+ end
+ if tostring(field):len() < min_lengths[i] then
+ min_lengths[i] = tostring(field):len()
+ end
+ end
+ end
+ hline = ""
+ for i=1,#max_lengths do
+ hline = hline .. "+-"..string.rep("-",max_lengths[i]).."-"
+ end
+ hline = hline .. "+"
+ print(hline)
+
+ str = "| "
+ for i=1,nr_columns do
+ front, back = get_spaces(tostring(tab[i][1]), min_lengths[i],max_lengths[i])
+ str = str .. front.. tostring(tab[i][1]) ..back
+ if i<nr_columns then
+ str = str .. " | "
+ else
+ str = str .. " |"
+ end
+ end
+ print(str)
+ print(hline)
+
+ for j=2,nr_lines do
+ str = "| "
+ for i=1,nr_columns do
+ front, back = get_spaces(tostring(tab[i][j]), min_lengths[i],max_lengths[i])
+ str = str .. front.. tostring(tab[i][j]) ..back
+ if i<nr_columns then
+ str = str .. " | "
+ else
+ str = str .. " |"
+ end
+ end
+ print(str)
+ end
+ if nr_lines > 1 then
+ print(hline)
+ end
+ print()
+end
+
+likwid.printtable = printtable
+
+local function printcsv(tab, linelength)
+ local nr_columns = tablelength(tab)
+ if nr_columns == 0 then
+ print("Table has no columns. Empty table?")
+ return
+ end
+ local nr_lines = tablelength(tab[1])
+ local str = ""
+ for j=1,nr_lines do
+ str = ""
+ for i=1,nr_columns do
+ str = str .. tostring(tab[i][j])
+ if (i ~= nr_columns) then
+ str = str .. ","
+ end
+ end
+ if nr_columns < linelength then
+ str = str .. string.rep(",", linelength-nr_columns)
+ end
+ print(str)
+ end
+
+end
+
+likwid.printcsv = printcsv
+
+local function stringsplit(astr, sSeparator, nMax, bRegexp)
+ assert(sSeparator ~= '')
+ assert(nMax == nil or nMax >= 1)
+ if astr == nil then return {} end
+ local aRecord = {}
+
+ if astr:len() > 0 then
+ local bPlain = not bRegexp
+ nMax = nMax or -1
+
+ local nField=1 nStart=1
+ local nFirst,nLast = astr:find(sSeparator, nStart, bPlain)
+ while nFirst and nMax ~= 0 do
+ aRecord[nField] = astr:sub(nStart, nFirst-1)
+ nField = nField+1
+ nStart = nLast+1
+ nFirst,nLast = astr:find(sSeparator, nStart, bPlain)
+ nMax = nMax-1
+ end
+ aRecord[nField] = astr:sub(nStart)
+ end
+
+ return aRecord
+end
+
+likwid.stringsplit = stringsplit
+
+local function get_groups()
+ groups = {}
+ local cpuinfo = likwid.getCpuInfo()
+ if cpuinfo == nil then return 0, {} end
+ local f = io.popen("ls " .. likwid.groupfolder .. "/" .. cpuinfo["short_name"] .."/*.txt 2>/dev/null")
+ if f ~= nil then
+ t = stringsplit(f:read("*a"),"\n")
+ f:close()
+ for i, a in pairs(t) do
+ if a ~= "" then
+ table.insert(groups,a:sub((a:match'^.*()/')+1,a:len()-4))
+ end
+ end
+ end
+ f = io.popen("ls " ..os.getenv("HOME") .. "/.likwid/groups/" .. cpuinfo["short_name"] .."/*.txt 2>/dev/null")
+ if f ~= nil then
+ t = stringsplit(f:read("*a"),"\n")
+ f:close()
+ for i, a in pairs(t) do
+ if a ~= "" then
+ table.insert(groups,a:sub((a:match'^.*()/')+1,a:len()-4))
+ end
+ end
+ end
+ return #groups,groups
+end
+
+likwid.get_groups = get_groups
+
+local function new_groupdata(eventString, fix_ctrs)
+ local gdata = {}
+ local num_events = 1
+ gdata["Events"] = {}
+ gdata["EventString"] = ""
+ gdata["GroupString"] = ""
+ local s,e = eventString:find(":")
+ if s == nil then
+ return gdata
+ end
+ if fix_ctrs > 0 then
+ if not eventString:match("FIXC0") and fix_ctrs >= 1 then
+ eventString = eventString..",INSTR_RETIRED_ANY:FIXC0"
+ end
+ if not eventString:match("FIXC1") and fix_ctrs >= 2 then
+ eventString = eventString..",CPU_CLK_UNHALTED_CORE:FIXC1"
+ end
+ if not eventString:match("FIXC2") and fix_ctrs == 3 then
+ eventString = eventString..",CPU_CLK_UNHALTED_REF:FIXC2"
+ end
+
+
+ end
+ gdata["EventString"] = eventString
+ gdata["GroupString"] = eventString
+ local eventslist = likwid.stringsplit(eventString,",")
+ for i,e in pairs(eventslist) do
+ eventlist = likwid.stringsplit(e,":")
+ gdata["Events"][num_events] = {}
+ gdata["Events"][num_events]["Event"] = eventlist[1]
+ gdata["Events"][num_events]["Counter"] = eventlist[2]
+ if #eventlist > 2 then
+ table.remove(eventlist, 2)
+ table.remove(eventlist, 1)
+ gdata["Events"][num_events]["Options"] = eventlist
+ end
+ num_events = num_events + 1
+ end
+ return gdata
+end
+
+
+local function get_groupdata(group)
+ groupdata = {}
+ local group_exist = 0
+ local cpuinfo = likwid.getCpuInfo()
+ if cpuinfo == nil then return nil end
+
+ num_groups, groups = get_groups()
+ for i, a in pairs(groups) do
+ if (a == group) then group_exist = 1 end
+ end
+ if (group_exist == 0) then return new_groupdata(group, cpuinfo["perf_num_fixed_ctr"]) end
+
+ local f = io.open(likwid.groupfolder .. "/" .. cpuinfo["short_name"] .. "/" .. group .. ".txt", "r")
+ if f == nil then
+ f = io.open(os.getenv("HOME") .. "/.likwid/groups/" .. cpuinfo["short_name"] .."/" .. group .. ".txt", "r")
+ if f == nil then
+ print("Cannot read data for group "..group)
+ print("Tried folders:")
+ print(likwid.groupfolder .. "/" .. cpuinfo["short_name"] .. "/" .. group .. ".txt")
+ print(os.getenv("HOME") .. "/.likwid/groups/" .. cpuinfo["short_name"] .."/*.txt")
+ return groupdata
+ end
+ end
+ local t = f:read("*all")
+ f:close()
+ local parse_eventset = false
+ local parse_metrics = false
+ local parse_long = false
+ groupdata["EventString"] = ""
+ groupdata["Events"] = {}
+ groupdata["Metrics"] = {}
+ groupdata["LongDescription"] = ""
+ groupdata["GroupString"] = group
+ nr_events = 1
+ nr_metrics = 1
+ for i, line in pairs(stringsplit(t,"\n")) do
+
+ if (parse_eventset or parse_metrics or parse_long) and line:len() == 0 then
+ parse_eventset = false
+ parse_metrics = false
+ parse_long = false
+ end
+
+ if line:match("^SHORT%a*") ~= nil then
+ linelist = stringsplit(line, "%s+", nil, "%s+")
+ table.remove(linelist, 1)
+ groupdata["ShortDescription"] = table.concat(linelist, " ")
+ end
+
+ if line:match("^EVENTSET$") ~= nil then
+ parse_eventset = true
+ end
+
+ if line:match("^METRICS$") ~= nil then
+ parse_metrics = true
+ end
+
+ if line:match("^LONG$") ~= nil then
+ parse_long = true
+ end
+
+ if parse_eventset and line:match("^EVENTSET$") == nil then
+ linelist = stringsplit(line:gsub("^%s*(.-)%s*$", "%1"), "%s+", nil, "%s+")
+ eventstring = linelist[2] .. ":" .. linelist[1]
+ if #linelist > 2 then
+ table.remove(linelist,2)
+ table.remove(linelist,1)
+ eventstring = eventstring .. ":".. table.concat(":",linelist)
+ end
+ groupdata["EventString"] = groupdata["EventString"] .. "," .. eventstring
+ groupdata["Events"][nr_events] = {}
+ groupdata["Events"][nr_events]["Event"] = linelist[2]:gsub("^%s*(.-)%s*$", "%1")
+ groupdata["Events"][nr_events]["Counter"] = linelist[1]:gsub("^%s*(.-)%s*$", "%1")
+ nr_events = nr_events + 1
+ end
+
+ if parse_metrics and line:match("^METRICS$") == nil then
+ linelist = stringsplit(line:gsub("^%s*(.-)%s*$", "%1"), "%s+", nil, "%s+")
+ formula = linelist[#linelist]
+ table.remove(linelist)
+ groupdata["Metrics"][nr_metrics] = {}
+ groupdata["Metrics"][nr_metrics]["description"] = table.concat(linelist, " ")
+ groupdata["Metrics"][nr_metrics]["formula"] = formula
+ nr_metrics = nr_metrics + 1
+ end
+
+ if parse_long and line:match("^LONG$") == nil then
+ groupdata["LongDescription"] = groupdata["LongDescription"] .. "\n" .. line
+ end
+ end
+ groupdata["LongDescription"] = groupdata["LongDescription"]:sub(2)
+ groupdata["EventString"] = groupdata["EventString"]:sub(2)
+
+ return groupdata
+
+end
+
+likwid.get_groupdata = get_groupdata
+
+
+
+
+local function parse_time(timestr)
+ local duration = 0
+ local s1,e1 = timestr:find("ms")
+ local s2,e2 = timestr:find("us")
+ if s1 ~= nil then
+ duration = tonumber(timestr:sub(1,s1-1)) * 1.E03
+ elseif s2 ~= nil then
+ duration = tonumber(timestr:sub(1,s2-1))
+ else
+ s1,e1 = timestr:find("s")
+ if s1 == nil then
+ print("Cannot parse time, '" .. timestr .. "' is not well formatted, a time unit like s, ms or us is required")
+ os.exit(1)
+ end
+ duration = tonumber(timestr:sub(1,s1-1)) * 1.E06
+ end
+ return duration
+end
+
+likwid.parse_time = parse_time
+
+local function num2str(value)
+ local tmp = "0"
+ if value ~= 0 then
+ if tostring(value):match("%.0$") or value == math.tointeger(value) then
+ tmp = tostring(math.tointeger(value))
+ elseif string.format("%.4f", value):len() < 12 and
+ tonumber(string.format("%.4f", value)) ~= 0 then
+ tmp = string.format("%.4f", value)
+ else
+ tmp = string.format("%e", value)
+ end
+ end
+ return tmp
+end
+
+likwid.num2str = num2str
+
+local function min_max_avg(values)
+ min = math.huge
+ max = 0.0
+ sum = 0.0
+ count = 0
+ for _, value in pairs(values) do
+ if value ~= nil then
+ if (value < min) then min = value end
+ if (value > max) then max = value end
+ sum = sum + value
+ count = count + 1
+ end
+ end
+ return min, max, sum/count
+end
+
+local function tableMinMaxAvgSum(inputtable, skip_cols, skip_lines)
+ local outputtable = {}
+ local nr_columns = #inputtable
+ if nr_columns == 0 then
+ return {}
+ end
+ local nr_lines = #inputtable[1]
+ if nr_lines == 0 then
+ return {}
+ end
+ minOfLine = {"Min"}
+ maxOfLine = {"Max"}
+ sumOfLine = {"Sum"}
+ avgOfLine = {"Avg"}
+ for i=skip_lines+1,nr_lines do
+ minOfLine[i-skip_lines+1] = math.huge
+ maxOfLine[i-skip_lines+1] = 0
+ sumOfLine[i-skip_lines+1] = 0
+ avgOfLine[i-skip_lines+1] = 0
+ end
+ for j=skip_cols+1,nr_columns do
+ for i=skip_lines+1, nr_lines do
+ local res = tonumber(inputtable[j][i])
+ if res ~= nil then
+ minOfLine[i-skip_lines+1] = math.min(res, minOfLine[i-skip_lines+1])
+ maxOfLine[i-skip_lines+1] = math.max(res, maxOfLine[i-skip_lines+1])
+ sumOfLine[i-skip_lines+1] = sumOfLine[i-skip_lines+1] + res
+ else
+ minOfLine[i-skip_lines+1] = 0
+ maxOfLine[i-skip_lines+1] = 0
+ sumOfLine[i-skip_lines+1] = 0
+ end
+ avgOfLine[i-skip_lines+1] = sumOfLine[i-skip_lines+1]/(nr_columns-skip_cols)
+ end
+ end
+ for i=2,#minOfLine do
+ minOfLine[i] = likwid.num2str(minOfLine[i])
+ maxOfLine[i] = likwid.num2str(maxOfLine[i])
+ sumOfLine[i] = likwid.num2str(sumOfLine[i])
+ avgOfLine[i] = likwid.num2str(avgOfLine[i])
+ end
+
+ local tmptable = {}
+ table.insert(tmptable, inputtable[1][1])
+ for j=2,#inputtable[1] do
+ table.insert(tmptable, inputtable[1][j].." STAT")
+ end
+ table.insert(outputtable, tmptable)
+ for i=2,skip_cols do
+ local tmptable = {}
+ table.insert(tmptable, inputtable[i][1])
+ for j=2,#inputtable[i] do
+ table.insert(tmptable, inputtable[i][j])
+ end
+ table.insert(outputtable, tmptable)
+ end
+ table.insert(outputtable, sumOfLine)
+ table.insert(outputtable, minOfLine)
+ table.insert(outputtable, maxOfLine)
+ table.insert(outputtable, avgOfLine)
+ return outputtable
+end
+
+likwid.tableToMinMaxAvgSum = tableMinMaxAvgSum
+
+local function printOutput(results, metrics, cpulist, region, stats)
+ local maxLineFields = 0
+ local cpuinfo = likwid_getCpuInfo()
+ local clock = likwid.getCpuClock()
+ local regionName = likwid.markerRegionTag(region)
+ local regionThreads = likwid.markerRegionThreads(region)
+ local cur_cpulist = cpulist
+ if region ~= nil then
+ cur_cpulist = likwid.markerRegionCpulist(region)
+ end
+
+ for g, group in pairs(results) do
+ local infotab = {}
+ local firsttab = {}
+ local firsttab_combined = {}
+ local secondtab = {}
+ local secondtab_combined = {}
+ local runtime = likwid.getRuntimeOfGroup(g)
+ local groupName = likwid.getNameOfGroup(g)
+ if region ~= nil then
+ infotab[1] = {"Region Info","RDTSC Runtime [s]","call count"}
+ for c, cpu in pairs(cur_cpulist) do
+ local tmpList = {}
+ table.insert(tmpList, "Core "..tostring(cpu))
+ table.insert(tmpList, string.format("%.6f", likwid.markerRegionTime(region, c)))
+ table.insert(tmpList, tostring(likwid.markerRegionCount(region, c)))
+ table.insert(infotab, tmpList)
+ end
+ end
+ firsttab[1] = {"Event"}
+ firsttab_combined[1] = {"Event"}
+ firsttab[2] = {"Counter"}
+ firsttab_combined[2] = {"Counter"}
+ if likwid.getNumberOfMetrics(g) == 0 then
+ table.insert(firsttab[1],"Runtime (RDTSC) [s]")
+ table.insert(firsttab[2],"TSC")
+ end
+ for e, event in pairs(group) do
+ eventname = likwid.getNameOfEvent(g, e)
+ countername = likwid.getNameOfCounter(g, e)
+ table.insert(firsttab[1], eventname)
+ table.insert(firsttab[2], countername)
+ table.insert(firsttab_combined[1], eventname .. " STAT")
+ table.insert(firsttab_combined[2], countername)
+ end
+ for c, cpu in pairs(cur_cpulist) do
+ local tmpList = {"Core "..tostring(cpu)}
+ if likwid.getNumberOfMetrics(g) == 0 then
+ if region == nil then
+ table.insert(tmpList, string.format("%e", runtime))
+ else
+ table.insert(tmpList, string.format("%e", likwid.markerRegionTime(region, c)))
+ end
+ end
+
+ for e, event in pairs(group) do
+ local tmp = tostring(likwid.num2str(event[c]))
+ table.insert(tmpList, tmp)
+ end
+ table.insert(firsttab, tmpList)
+ end
+ if #cpulist > 1 or stats == true then
+ firsttab_combined = tableMinMaxAvgSum(firsttab, 2, 1)
+ end
+ if likwid.getNumberOfMetrics(g) > 0 then
+ secondtab[1] = {"Metric"}
+ secondtab_combined[1] = {"Metric"}
+ for m=1, likwid.getNumberOfMetrics(g) do
+ table.insert(secondtab[1], likwid.getNameOfMetric(g, m))
+ table.insert(secondtab_combined[1], likwid.getNameOfMetric(g, m).." STAT" )
+ end
+ for c, cpu in pairs(cur_cpulist) do
+ local tmpList = {"Core "..tostring(cpu)}
+ for m=1, likwid.getNumberOfMetrics(g) do
+ local tmp = tostring(likwid.num2str(metrics[g][m][c]))
+ table.insert(tmpList, tmp)
+ end
+ table.insert(secondtab, tmpList)
+ end
+ if #cpulist > 1 or stats == true then
+ secondtab_combined = tableMinMaxAvgSum(secondtab, 1, 1)
+ end
+ end
+ maxLineFields = math.max(#firsttab, #firsttab_combined,
+ #secondtab, #secondtab_combined)
+ if use_csv then
+ print(string.format("STRUCT,Info,3%s",string.rep(",",maxLineFields-3)))
+ print(string.format("CPU name:,%s%s", cpuinfo["osname"],string.rep(",",maxLineFields-2)))
+ print(string.format("CPU type:,%s%s", cpuinfo["name"],string.rep(",",maxLineFields-2)))
+ print(string.format("CPU clock:,%s GHz%s", clock*1.E-09,string.rep(",",maxLineFields-2)))
+ if region == nil then
+ print(string.format("TABLE,Group %d Raw,%s,%d%s",g,groupName,#firsttab[1]-1,string.rep(",",maxLineFields-4)))
+ else
+ print(string.format("TABLE,Region %s,Group %d Raw,%s,%d%s",regionName,g,groupName,#firsttab[1]-1,string.rep(",",maxLineFields-5)))
+ end
+ if #infotab > 0 then
+ likwid.printcsv(infotab, maxLineFields)
+ end
+ likwid.printcsv(firsttab, maxLineFields)
+ else
+ if outfile ~= nil then
+ print(likwid.hline)
+ print(string.format("CPU name:\t%s",cpuinfo["osname"]))
+ print(string.format("CPU type:\t%s",cpuinfo["name"]))
+ print(string.format("CPU clock:\t%3.2f GHz",clock * 1.E-09))
+ print(likwid.hline)
+ end
+ if region == nil then
+ print("Group "..tostring(g)..": "..groupName)
+ else
+ print("Region "..regionName..", Group "..tostring(g)..": "..groupName)
+ end
+ if #infotab > 0 then
+ likwid.printtable(infotab)
+ end
+ likwid.printtable(firsttab)
+ end
+ if #cur_cpulist > 1 or stats == true then
+ if use_csv then
+ if region == nil then
+ print(string.format("TABLE,Group %d Raw Stat,%s,%d%s",g,groupName,#firsttab_combined[1]-1,string.rep(",",maxLineFields-4)))
+ else
+ print(string.format("TABLE,Region %s,Group %d Raw Stat,%s,%d%s",regionName, g,groupName,#firsttab_combined[1]-1,string.rep(",",maxLineFields-5)))
+ end
+ likwid.printcsv(firsttab_combined, maxLineFields)
+ else
+ likwid.printtable(firsttab_combined)
+ end
+ end
+ if likwid.getNumberOfMetrics(g) > 0 then
+ if use_csv then
+ if region == nil then
+ print(string.format("TABLE,Group %d Metric,%s,%d%s",g,groupName,#secondtab[1]-1,string.rep(",",maxLineFields-4)))
+ else
+ print(string.format("TABLE,Region %s,Group %d Metric,%s,%d%s",regionName,g,groupName,#secondtab[1]-1,string.rep(",",maxLineFields-5)))
+ end
+ likwid.printcsv(secondtab, maxLineFields)
+ else
+ likwid.printtable(secondtab)
+ end
+ if #cur_cpulist > 1 or stats == true then
+ if use_csv then
+ if region == nil then
+ print(string.format("TABLE,Group %d Metric Stat,%s,%d%s",g,groupName,#secondtab_combined[1]-1,string.rep(",",maxLineFields-4)))
+ else
+ print(string.format("TABLE,Region %s,Group %d Metric Stat,%s,%d%s",regionName,g,groupName,#secondtab_combined[1]-1,string.rep(",",maxLineFields-5)))
+ end
+ likwid.printcsv(secondtab_combined, maxLineFields)
+ else
+ likwid.printtable(secondtab_combined)
+ end
+ end
+ end
+ end
+end
+
+likwid.printOutput = printOutput
+
+
+
+local function getResults()
+ local results = {}
+ local nr_groups = likwid_getNumberOfGroups()
+ local nr_threads = likwid_getNumberOfThreads()
+ for i=1,nr_groups do
+ results[i] = {}
+ local nr_events = likwid_getNumberOfEvents(i)
+ for j=1,nr_events do
+ results[i][j] = {}
+ for k=1, nr_threads do
+ results[i][j][k] = likwid_getResult(i,j,k)
+ end
+ end
+ end
+ return results
+end
+
+likwid.getResults = getResults
+
+local function getLastResults()
+ local results = {}
+ local nr_groups = likwid_getNumberOfGroups()
+ local nr_threads = likwid_getNumberOfThreads()
+ for i=1,nr_groups do
+ results[i] = {}
+ local nr_events = likwid_getNumberOfEvents(i)
+ for j=1,nr_events do
+ results[i][j] = {}
+ for k=1, nr_threads do
+ results[i][j][k] = likwid_getLastResult(i,j,k)
+ end
+ end
+ end
+ return results
+end
+
+likwid.getLastResults = getLastResults
+
+local function getMetrics()
+ local results = {}
+ local nr_groups = likwid_getNumberOfGroups()
+ local nr_threads = likwid_getNumberOfThreads()
+ for i=1,nr_groups do
+ results[i] = {}
+ local nr_metrics = likwid_getNumberOfMetrics(i)
+ for j=1,nr_metrics do
+ results[i][j] = {}
+ for k=1, nr_threads do
+ results[i][j][k] = likwid_getMetric(i,j, k)
+ end
+ end
+ end
+ return results
+end
+
+likwid.getMetrics = getMetrics
+
+local function getLastMetrics()
+ local results = {}
+ local nr_groups = likwid_getNumberOfGroups()
+ local nr_threads = likwid_getNumberOfThreads()
+ for i=1,nr_groups do
+ results[i] = {}
+ local nr_metrics = likwid_getNumberOfMetrics(i)
+ for j=1,nr_metrics do
+ results[i][j] = {}
+ for k=1, nr_threads do
+ results[i][j][k] = likwid_getLastMetric(i,j, k)
+ end
+ end
+ end
+ return results
+end
+
+likwid.getLastMetrics = getLastMetrics
+
+local function getMarkerResults(filename, cpulist)
+ local cpuinfo = likwid.getCpuInfo()
+ likwid.readMarkerFile(filename)
+ results = {}
+ metrics = {}
+ for i=1, likwid.markerNumRegions() do
+ local regionName = likwid.markerRegionTag(i)
+ local groupID = likwid.markerRegionGroup(i)
+ local regionThreads = likwid.markerRegionThreads(i)
+ results[i] = {}
+ metrics[i] = {}
+ results[i][groupID] = {}
+ metrics[i][groupID] = {}
+ for k=1, likwid.markerRegionEvents(i) do
+ local eventName = likwid.getNameOfEvent(groupID, k)
+ local counterName = likwid.getNameOfCounter(groupID, k)
+ results[i][groupID][k] = {}
+ for j=1, regionThreads do
+ results[i][groupID][k][j] = likwid.markerRegionResult(i,k,j)
+ end
+ end
+ if likwid.getNumberOfMetrics(groupID) > 0 then
+ for k=1, likwid.getNumberOfMetrics(likwid.markerRegionGroup(i)) do
+ local metricName = likwid.getNameOfMetric(groupID, k)
+ metrics[i][groupID][k] = {}
+ for j=1, regionThreads do
+ metrics[i][groupID][k][j] = likwid.markerRegionMetric(i,k,j)
+ end
+ end
+ end
+ end
+ return results, metrics
+end
+
+likwid.getMarkerResults = getMarkerResults
+
+
+local function msr_available(flags)
+ local ret = likwid_access("/dev/cpu/0/msr", flags)
+ if ret == 0 then
+ return true
+ else
+ local ret = likwid_access("/dev/msr0", flags)
+ if ret == 0 then
+ return true
+ end
+ end
+ return false
+end
+likwid.msr_available = msr_available
+
+
+local function addSimpleAsciiBox(container,lineIdx, colIdx, label)
+ local box = {}
+ if container[lineIdx] == nil then
+ container[lineIdx] = {}
+ end
+ box["width"] = 1
+ box["label"] = label
+ table.insert(container[lineIdx], box)
+end
+likwid.addSimpleAsciiBox = addSimpleAsciiBox
+
+local function addJoinedAsciiBox(container,lineIdx, startColIdx, endColIdx, label)
+ local box = {}
+ if container[lineIdx] == nil then
+ container[lineIdx] = {}
+ end
+ box["width"] = endColIdx-startColIdx+1
+ box["label"] = label
+ table.insert(container[lineIdx], box)
+end
+likwid.addJoinedAsciiBox = addJoinedAsciiBox
+
+local function printAsciiBox(container)
+ local boxwidth = 0
+ local numLines = #container
+ local maxNumColumns = 0
+ for i=1,numLines do
+ if #container[i] > maxNumColumns then
+ maxNumColumns = #container[i]
+ end
+ for j=1,#container[i] do
+ if container[i][j]["label"]:len() > boxwidth then
+ boxwidth = container[i][j]["label"]:len()
+ end
+ end
+ end
+ boxwidth = boxwidth + 2
+ boxline = "+" .. string.rep("-",((maxNumColumns * (boxwidth+2)) + maxNumColumns+1)) .. "+"
+ print(boxline)
+ for i=1,numLines do
+ innerboxline = "| "
+ local numColumns = #container[i]
+ for j=1,numColumns do
+ innerboxline = innerboxline .. "+"
+ if container[i][j]["width"] == 1 then
+ innerboxline = innerboxline .. string.rep("-", boxwidth)
+ else
+ innerboxline = innerboxline .. string.rep("-", (container[i][j]["width"] * boxwidth + (container[i][j]["width"]-1)*3))
+ end
+ innerboxline = innerboxline .. "+ "
+ end
+
+ boxlabelline = "| "
+ for j=1,numColumns do
+ local offset = 0
+ local width = 0
+ local labellen = container[i][j]["label"]:len()
+ local boxlen = container[i][j]["width"]
+ if container[i][j]["width"] == 1 then
+ width = (boxwidth - labellen)/2;
+ offset = (boxwidth - labellen)%2;
+ else
+ width = (boxlen * boxwidth + ((boxlen-1)*3) - labellen)/2;
+ offset = (boxlen * boxwidth + ((boxlen-1)*3) - labellen)%2;
+ end
+ boxlabelline = boxlabelline .. "|" .. string.rep(" ", math.floor(width+offset))
+ boxlabelline = boxlabelline .. container[i][j]["label"]
+ boxlabelline = boxlabelline .. string.rep(" ",math.floor(width)) .. "| "
+ end
+ print(innerboxline .. "|")
+ print(boxlabelline .. "|")
+ print(innerboxline .. "|")
+ end
+ print(boxline)
+end
+likwid.printAsciiBox = printAsciiBox
+
+-- Some helpers for output file substitutions
+-- getpid already defined by Lua-C-Interface
+local function gethostname()
+ local f = io.popen("hostname -s","r")
+ local hostname = f:read("*all"):gsub("^%s*(.-)%s*$", "%1")
+ f:close()
+ return hostname
+end
+
+likwid.gethostname = gethostname
+
+local function getjid()
+ local jid = os.getenv("PBS_JOBID")
+ if jid == nil then
+ jid = "X"
+ end
+ return jid
+end
+
+likwid.getjid = getjid
+
+local function getMPIrank()
+ local rank = os.getenv("PMI_RANK")
+ if rank == nil then
+ rank = os.getenv("OMPI_COMM_WORLD_RANK")
+ if rank == nil then
+ rank = "X"
+ end
+ end
+ return rank
+end
+
+likwid.getMPIrank = getMPIrank
+
+return likwid
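
The addSimpleAsciiBox/addJoinedAsciiBox/printAsciiBox helpers above take over the role of the C asciiBoxes module removed below: a container is a plain Lua table of lines, each line holding boxes with a "width" and a "label" field. A minimal sketch of a caller, using the signatures defined above (the labels are made up for illustration):

    local likwid = require("likwid")

    -- Line 1: four single-width boxes (e.g. hardware-thread IDs),
    -- line 2: one box joined across columns 1-4 (e.g. a shared cache).
    local container = {}
    for col = 1, 4 do
        likwid.addSimpleAsciiBox(container, 1, col, tostring(col - 1))
    end
    likwid.addJoinedAsciiBox(container, 2, 1, 4, "8MB")
    likwid.printAsciiBox(container)
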
diff --git a/src/asciiBoxes.c b/src/asciiBoxes.c
deleted file mode 100644
index a22dab5..0000000
--- a/src/asciiBoxes.c
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: asciiBoxes.c
- *
- * Description: Module implementing output of nested ascii art boxes
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-/* ##### HEADER FILE INCLUDES ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-
-#include <error.h>
-#include <types.h>
-#include <asciiBoxes.h>
-
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-
-BoxContainer*
-asciiBoxes_allocateContainer(int numLines, int numColumns)
-{
- BoxContainer* container;
-
- container = (BoxContainer*) malloc(sizeof(BoxContainer));
- container->numLines = numLines;
- container->numColumns = numColumns;
-
- container->boxes = (Box**) malloc(numLines * sizeof(Box*));
-
- for ( int i=0; i < numLines; i++ )
- {
- container->boxes[i] = (Box*) malloc(numColumns * sizeof(Box));
- }
-
- for(int i=0; i<numLines; i++)
- {
- for(int j=0; j<numColumns; j++)
- {
- container->boxes[i][j].width = 0;
- container->boxes[i][j].label = NULL;
- }
- }
-
- return container;
-}
-
-void
-asciiBoxes_addBox(BoxContainer* container, int line, int column, bstring label)
-{
- if ( line >= container->numLines )
- {
- ERROR_PRINT(line id %d too large,line);
- }
- if ( column >= container->numColumns )
- {
- ERROR_PRINT(column id %d too large,column);
- }
-
- container->boxes[line][column].width = 1;
- container->boxes[line][column].label = bstrcpy(label);
-}
-
-
-void
-asciiBoxes_addJoinedBox(
- BoxContainer* container,
- int line,
- int startColumn,
- int endColumn,
- bstring label)
-{
- if ( line >= container->numLines )
- {
- ERROR_PRINT(line id %d too large,line);
- }
-
- if ( endColumn >= container->numColumns )
- {
- ERROR_PRINT(column id %d too large,endColumn);
- }
-
- container->boxes[line][startColumn].width = (endColumn-startColumn)+1;
- container->boxes[line][startColumn].label = bstrcpy(label);
-}
-
-void
-asciiBoxes_print(FILE* OUTSTREAM, BoxContainer* container)
-{
- int width;
- int boxwidth=0; /* box width is inner width of box */
-
- /* determine maximum label width */
- for ( int i=0; i < container->numLines; i++ )
- {
- for ( int j=0; j < container->numColumns; j++ )
- {
- btrimws(container->boxes[i][j].label);
- boxwidth = MAX(boxwidth,blength(container->boxes[i][j].label));
-
- /* if box is joined increase counter */
- if ( container->boxes[i][j].width > 1 )
- {
- j += container->boxes[i][j].width;
- }
- }
- }
- boxwidth += 2; /* add one space each side */
-
- /* top line */
- printf("+");
-
- for ( int i=0; i < (container->numColumns * (boxwidth+2) +
- (container->numColumns+1)); /* one space between boxes */
- i++ )
- {
- printf("-");
- }
- printf("+\n");
-
- for ( int i=0; i < container->numLines; i++ )
- {
- /* Box top line */
- printf("| ");
-
- for ( int j=0; j < container->numColumns; j++ )
- {
- printf("+");
-
- if ( container->boxes[i][j].width == 1 )
- {
- for ( int k=0; k < boxwidth; k++ )
- {
- printf("-");
- }
- }
- else
- {
- for ( int k=0; k < (container->boxes[i][j].width * boxwidth +
- (container->boxes[i][j].width-1)*3);
- k++)
- {
- printf("-");
- }
- j += container->boxes[i][j].width-1;
- }
- printf("+ ");
- }
- printf("|\n");
- printf("| ");
-
- /* Box label line */
- for ( int j=0; j < container->numColumns; j++ )
- {
- int offset=0;
-
- /* center label */
- if ( container->boxes[i][j].width == 1 )
- {
- width = (boxwidth - blength(container->boxes[i][j].label))/2;
- offset = (boxwidth - blength(container->boxes[i][j].label))%2;
- }
- else
- {
- width = (container->boxes[i][j].width * boxwidth +
- ((container->boxes[i][j].width-1)*3) -
- blength(container->boxes[i][j].label))/2;
-
- offset = (container->boxes[i][j].width * boxwidth +
- ((container->boxes[i][j].width-1)*3) -
- blength(container->boxes[i][j].label))%2;
- }
- printf("|");
-
- for ( int k=0; k < (width+offset); k++ )
- {
- printf(" ");
- }
-
- printf("%s",container->boxes[i][j].label->data);
-
- for ( int k=0; k < width; k++ )
- {
- printf(" ");
- }
- printf("| ");
-
- if ( container->boxes[i][j].width != 1 )
- {
- j+= container->boxes[i][j].width-1;
- }
- }
- printf("|\n");
- printf("| ");
-
- /* Box bottom line */
- for ( int j=0; j < container->numColumns; j++ )
- {
- printf("+");
-
- if ( container->boxes[i][j].width == 1 )
- {
- for ( int k=0; k < boxwidth; k++ )
- {
- printf("-");
- }
- }
- else
- {
- for ( int k=0; k < (container->boxes[i][j].width * boxwidth +
- (container->boxes[i][j].width-1)*3);
- k++ )
- {
- printf("-");
- }
- j+= container->boxes[i][j].width-1;
- }
- printf("+ ");
- }
- printf("|\n");
- }
-
- /* bottom line */
- printf("+");
- for ( int i=0; i < (container->numColumns * (boxwidth+2) +
- container->numColumns+1); i++ )
- {
- printf("-");
- }
- printf("+\n");
- fflush(stdout);
-}
-
diff --git a/src/asciiTable.c b/src/asciiTable.c
deleted file mode 100644
index 29b615a..0000000
--- a/src/asciiTable.c
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: asciiTable.c
- *
- * Description: Module implementing output of ascii table.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-
-/* ##### HEADER FILE INCLUDES ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-
-#include <error.h>
-#include <types.h>
-#include <strUtil.h>
-#include <asciiTable.h>
-
-/* ##### LOCAL VARIABLES ########################################### */
-
-static FILE* OUTPUT;
-
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-
-void
-asciiTable_setOutput(FILE* stream)
-{
- OUTPUT = stream;
-}
-
-TableContainer*
-asciiTable_allocate(int numRows,int numColumns, bstrList* headerLabels)
-{
- int i;
- TableContainer* container;
- OUTPUT = stdout;
-
- container = (TableContainer*) malloc(sizeof(TableContainer));
- container->numRows = numRows;
- container->numColumns = numColumns;
- container->currentRow = 0;
- container->printed = 0;
-
- if (numColumns != headerLabels->qty)
- {
- ERROR_PRINT(Number of columns %d not equal to number of header labels %d,numColumns,headerLabels->qty);
- }
-
- container->header = bstrListCreate();
- bstrListAlloc (container->header, numColumns);
-
- for(i=0; i<numColumns; i++)
- {
- container->header->entry[i] = bstrcpy(headerLabels->entry[i]);
- }
-
- container->rows = (bstrList**) malloc( numRows * sizeof(bstrList*));
-
- for(i=0; i<numRows; i++)
- {
- container->rows[i] = bstrListCreate();
- bstrListAlloc (container->rows[i], numColumns);
- }
-
- return container;
-}
-
-void
-asciiTable_free(TableContainer* container)
-{
- int i;
-
- if(container == NULL)
- {
- ERROR_PLAIN_PRINT(Cannot free NULL reference);
- }
-
- bstrListDestroy(container->header);
-
- for(i=0; i<container->numRows; i++)
- {
- bstrListDestroy(container->rows[i]);
- }
-
- free(container->rows);
-}
-
-void
-asciiTable_insertRow(TableContainer* container, int row, bstrList* fields)
-{
- int i;
-
- if (container->numColumns != fields->qty)
- {
- ERROR_PRINT(Number of columns %d not equal to number of field labels %d,container->numColumns,fields->qty);
- }
-
- if (row >= container->numRows)
- {
- ERROR_PRINT(Number of Rows %d smaller than requested row index %d, container->numRows,row);
- }
-
- for(i=0; i<container->numColumns; i++)
- {
- container->rows[row]->entry[i] = bstrcpy(fields->entry[i]);
- container->rows[row]->qty++;
- }
-}
-
-void
-asciiTable_appendRow(TableContainer* container, bstrList* fields)
-{
- asciiTable_insertRow(container, container->currentRow++, fields);
-}
-
-void
-asciiTable_setCurrentRow(TableContainer* container, int row)
-{
- container->currentRow = row;
-}
-
-void
-asciiTable_print(TableContainer* container)
-{
- int i;
- int j;
- int* boxwidth;
-
- boxwidth = (int*) malloc(container->numColumns * sizeof(int));
-
- for (j=0; j<container->numColumns; j++) boxwidth[j] = 0;
-
- for (j=0; j<container->numColumns; j++)
- {
- boxwidth[j] = MAX(boxwidth[j],blength(container->header->entry[j]));
- }
-
- /* determine maximum label width in each column */
- for (i=0; i<container->numRows; i++)
- {
- for (j=0; j<container->numColumns; j++)
- {
- // btrimws(container->rows[i]->entry[j]);
- boxwidth[j] = MAX(boxwidth[j],blength(container->rows[i]->entry[j]));
- }
- }
-
- if (! container->printed)
- {
- /* Increase boxwidth by two spaces */
- for (j=0; j<container->numColumns; j++) boxwidth[j] +=2;
- }
-
- /* print header */
-
- for (j=0; j<container->numColumns; j++)
- {
- fprintf(OUTPUT,"+");
- for (i=0;i<boxwidth[j];i++)
- {
- fprintf(OUTPUT,"-");
- }
- }
- fprintf(OUTPUT,"+\n");
-
- for (j=0; j<container->numColumns; j++)
- {
- fprintf(OUTPUT,"|");
- bJustifyCenter(container->header->entry[j],boxwidth[j]);
- fprintf(OUTPUT,"%s",bdata(container->header->entry[j]));
- }
- fprintf(OUTPUT,"|\n");
-
- for (j=0; j<container->numColumns; j++)
- {
- fprintf(OUTPUT,"+");
- for (i=0;i<boxwidth[j];i++)
- {
- fprintf(OUTPUT,"-");
- }
- }
- fprintf(OUTPUT,"+\n");
-
- for (i=0; i<container->numRows; i++)
- {
- for (j=0; j<container->numColumns; j++)
- {
- fprintf(OUTPUT,"|");
- bJustifyCenter(container->rows[i]->entry[j],boxwidth[j]);
- fprintf(OUTPUT,"%s",bdata(container->rows[i]->entry[j]));
- }
- fprintf(OUTPUT,"|\n");
- }
-
- for (j=0; j<container->numColumns; j++)
- {
- fprintf(OUTPUT,"+");
- for (i=0;i<boxwidth[j];i++)
- {
- fprintf(OUTPUT,"-");
- }
- }
- fprintf(OUTPUT,"+\n");
- container->printed = 1;
-
- free(boxwidth);
-}
-
-
-
-
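
For reference, the exported API of the table module deleted here can be driven as below. This is only a sketch against the removed headers: it assumes the bstrlib helpers shipped in this tree (bfromcstr, bsplit, bstrListDestroy, bdestroy), and the header and field strings are made up.

#include <stdio.h>
#include <bstrlib.h>
#include <asciiTable.h>

int main(void)
{
    /* build the header and one data row from comma separated strings */
    bstring hdr = bfromcstr("Thread,Core,Cycles");
    bstring row = bfromcstr("0,4,123456");
    bstrList* labels = bsplit(hdr, ',');
    bstrList* fields = bsplit(row, ',');

    TableContainer* table = asciiTable_allocate(1, 3, labels);
    asciiTable_appendRow(table, fields);     /* fills row 0 */
    asciiTable_setOutput(stdout);            /* stdout is already the default */
    asciiTable_print(table);

    asciiTable_free(table);
    bstrListDestroy(labels);
    bstrListDestroy(fields);
    bdestroy(hdr);
    bdestroy(row);
    return 0;
}
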
diff --git a/src/barrier.c b/src/barrier.c
deleted file mode 100644
index 3a93f92..0000000
--- a/src/barrier.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: barrier.c
- *
- * Description: Implementation of threaded spin loop barrier
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-/* ##### HEADER FILE INCLUDES ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-
-#include <error.h>
-#include <types.h>
-#include <barrier.h>
-
-/* ##### EXPORTED VARIABLES ########################################### */
-
-
-/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
-
-#define CACHELINE_SIZE 64
-
-/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
-
-static BarrierGroup* groups;
-static int currentGroupId = 0;
-static int maxGroupId = 0;
-
-/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
-
-
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-
-int
-barrier_registerGroup(int numThreads)
-{
- int ret;
-
- if (currentGroupId > maxGroupId)
- {
- ERROR_PRINT(Group ID %d larger than maxGroupID %d,currentGroupId,maxGroupId);
- }
-
- groups[currentGroupId].numberOfThreads = numThreads;
- ret = posix_memalign(
- (void**) &groups[currentGroupId].groupBval,
- CACHELINE_SIZE,
- numThreads * 32 * sizeof(int));
-
- if (ret < 0)
- {
- ERROR;
- }
-
-
- return currentGroupId++;
-}
-
-void
-barrier_registerThread(BarrierData* barr, int groupId, int threadId)
-{
- int ret;
- int i;
- int j = 1;
- if (groupId > currentGroupId)
- {
- ERROR_PLAIN_PRINT(Group not yet registered);
- }
- if (threadId > groups[groupId].numberOfThreads)
- {
- ERROR_PRINT(Thread ID %d too large,threadId);
- }
-
- barr->numberOfThreads = groups[groupId].numberOfThreads;
- barr->offset = 0;
- barr->val = 1;
- barr->bval = groups[groupId].groupBval;
- ret = posix_memalign(
- (void**) &(barr->index),
- CACHELINE_SIZE,
- barr->numberOfThreads * sizeof(int));
-
- if (ret < 0)
- {
- ERROR;
- }
-
-
- barr->index[0] = threadId;
-
- for (i = 0; i < barr->numberOfThreads; i++)
- {
- if (!(i == threadId))
- {
- barr->index[j++] = i;
- }
- }
-}
-
-
-void
-barrier_init(int numberOfGroups)
-{
- maxGroupId = numberOfGroups-1;
- groups = (BarrierGroup*) malloc(numberOfGroups * sizeof(BarrierGroup));
-}
-
-void
-barrier_synchronize(BarrierData* barr)
-{
- int i;
-
- barr->bval[barr->index[0] * 32 + barr->offset * 16] = barr->val;
-
- for (i = 1; i < barr->numberOfThreads; i++)
- {
- while (barr->bval[barr->index[i] * 32 + barr->offset * 16] != barr->val)
- {
- __asm__ ("pause");
- }
- }
-
- if (barr->offset)
- {
- barr->val = !barr->val;
- }
- barr->offset = !barr->offset;
-}
-
-void barrier_destroy(void)
-{
- free(groups);
-}
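
The barrier deleted here has every thread publish its flag value in a cache-line padded slot and spin, with a pause hint, until all other slots show the same value; the offset/val toggling implements sense reversal so back-to-back barriers cannot interfere. For comparison, the same sense-reversal idea written as a compact counter-based barrier with C11 atomics; a sketch, not part of this patch and not a drop-in replacement for the code above:

#include <stdatomic.h>
#include <stdbool.h>

typedef struct {
    atomic_int  count;   /* threads still missing in the current round */
    atomic_bool sense;   /* global sense, flipped once per round       */
    int nthreads;
} SpinBarrier;

static void spin_barrier_init(SpinBarrier* b, int nthreads)
{
    atomic_init(&b->count, nthreads);
    atomic_init(&b->sense, false);
    b->nthreads = nthreads;
}

/* Every thread passes its own local sense flag, initialized to true. */
static void spin_barrier_wait(SpinBarrier* b, bool* local_sense)
{
    bool my_sense = *local_sense;
    if (atomic_fetch_sub(&b->count, 1) == 1)
    {
        /* last arrival: re-arm the counter, then release the waiters */
        atomic_store(&b->count, b->nthreads);
        atomic_store(&b->sense, my_sense);
    }
    else
    {
        while (atomic_load(&b->sense) != my_sense)
        {
            __asm__ ("pause");   /* same spin hint as in the removed code */
        }
    }
    *local_sense = !my_sense;    /* reverse the sense for the next round */
}
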
diff --git a/src/bench.c b/src/bench.c
deleted file mode 100644
index 3a0b81b..0000000
--- a/src/bench.c
+++ /dev/null
@@ -1,537 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: bench.c
- *
- * Description: Benchmarking framework for likwid-bench
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-/* ##### HEADER FILE INCLUDES ######################################### */
-
-#include <pthread.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <sys/syscall.h>
-#include <string.h>
-#include <sched.h>
-#include <types.h>
-#include <unistd.h>
-
-#include <timer.h>
-#include <threads.h>
-#include <affinity.h>
-#include <barrier.h>
-#include <likwid.h>
-#ifdef PAPI
-#include <papi.h>
-#endif
-
-/* ##### EXPORTED VARIABLES ########################################### */
-
-
-/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
-
-//#define BARRIER pthread_barrier_wait(&threads_barrier)
-#define BARRIER barrier_synchronize(&barr)
-
-#ifdef PERFMON
-#define START_PERFMON likwid_markerStartRegion("bench");
-#define STOP_PERFMON likwid_markerStopRegion("bench");
-#define LIKWID_THREAD_INIT likwid_markerThreadInit();
-#define EXECUTE EXECUTE_LIKWID
-#else
-#ifdef PAPI
-#define START_PERFMON(event_set) PAPI_start(event_set);
-#define STOP_PERFMON(event_set, result) PAPI_stop ( event_set ,result );
-#define LIKWID_THREAD_INIT
-#define EXECUTE EXECUTE_PAPI
-#else
-#define START_PERFMON
-#define STOP_PERFMON
-#define LIKWID_THREAD_INIT
-#define EXECUTE EXECUTE_LIKWID
-#endif
-#endif
-
-#define EXECUTE_LIKWID(func) \
- BARRIER; \
- if (data->threadId == 0) \
- { \
- timer_start(&time); \
- } \
- START_PERFMON \
- for (i=0; i< data->data.iter; i++) \
- { \
- func; \
- } \
- BARRIER; \
- STOP_PERFMON \
- if (data->threadId == 0) \
- { \
- timer_stop(&time); \
- data->cycles = timer_printCycles(&time); \
- } \
- BARRIER
-
-#define EXECUTE_PAPI(func) \
- BARRIER; \
- if (data->threadId == 0) \
- { \
- timer_start(&time); \
- } \
- START_PERFMON(event_set) \
- for (i=0; i< data->data.iter; i++) \
- { \
- func; \
- } \
- BARRIER; \
- STOP_PERFMON(event_set, &(result[0])) \
- if (data->threadId == 0) \
- { \
- timer_stop(&time); \
- data->cycles = timer_printCycles(&time); \
- } \
- BARRIER
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-
-void* runTest(void* arg)
-{
- int threadId;
- int offset;
- size_t size;
- size_t i;
- BarrierData barr;
- ThreadData* data;
- ThreadUserData* myData;
- TimerData time;
- FuncPrototype func;
- FILE* OUTSTREAM;
-#ifdef PAPI
- int event_set = PAPI_NULL;
- char groupname[50];
- char* group_ptr = &(groupname[0]);
- long long int result[4] = {0,0,0,0};
- group_ptr = getenv("PAPI_BENCH");
- PAPI_create_eventset(&event_set);
- PAPI_add_event(event_set, PAPI_TOT_CYC);
- // L3 group
- if (strncmp(group_ptr,"L3",2) == 0)
- {
- PAPI_add_event(event_set, PAPI_L3_TCA);
- }
- // L2 group
- else if (strncmp(group_ptr,"L2",2) == 0)
- {
- PAPI_add_event(event_set, PAPI_L2_TCA);
- }
- // FLOPS_AVX
- else if (strncmp(group_ptr,"FLOPS_AVX",9) == 0)
- {
- PAPI_add_event(event_set, PAPI_VEC_SP);
- PAPI_add_event(event_set, PAPI_VEC_DP);
- PAPI_add_event(event_set, PAPI_FP_INS);
- }
- // FLOPS_DP
- else if (strncmp(group_ptr,"FLOPS_DP",8) == 0)
- {
- PAPI_add_event(event_set, PAPI_DP_OPS);
- }
- // FLOPS_SP
- else if (strncmp(group_ptr,"FLOPS_SP",8) == 0)
- {
- PAPI_add_event(event_set, PAPI_SP_OPS);
- }
-#endif
-
- data = (ThreadData*) arg;
- myData = &(data->data);
- func = myData->test->kernel;
- threadId = data->threadId;
- OUTSTREAM = data->output;
- barrier_registerThread(&barr, 0, data->globalThreadId);
-
- /* Prepare ptrs for thread */
- size = myData->size / data->numberOfThreads;
- size -= (size%myData->test->stride);
- offset = data->threadId * size;
- myData->size = size;
-
- switch ( myData->test->type )
- {
- case SINGLE_RAND:
- case SINGLE:
- {
- float* sptr;
- for (i=0; i < myData->test->streams; i++)
- {
- sptr = (float*) myData->streams[i];
- sptr += offset;
- // sptr += size;
- myData->streams[i] = (float*) sptr;
- }
- }
- break;
- case DOUBLE_RAND:
- case DOUBLE:
- {
- double* dptr;
- for (i=0; i < myData->test->streams; i++)
- {
- dptr = (double*) myData->streams[i];
- dptr += offset;
- // dptr += size;
- myData->streams[i] = (double*) dptr;
- }
- }
- break;
- }
-
- /* pin the thread */
- affinity_pinThread(myData->processors[threadId]);
-
- sleep(1);
- LIKWID_THREAD_INIT;
- BARRIER;
- if (OUTSTREAM)
- {
- fprintf(OUTSTREAM, "Group: %d Thread %d Global Thread %d running on core %d - Vector length %llu Offset %d\n",
- data->groupId,
- threadId,
- data->globalThreadId,
- affinity_threadGetProcessorId(),
- LLU_CAST size,
- offset);
- }
- BARRIER;
-
- /* For up to 10 streams the following registers are used for the array pointers:
- * Size: rdi
- * in registers: rsi rdx rcx r8 r9
- * passed on the stack, then loaded into: r10 r11 r12 r13 r14 r15
- * If more than 10 streams are used, the first 5 streams stay in registers; above 5 a macro
- * must be used to load them from the stack.
- * */
-
- switch ( myData->test->streams ) {
- case STREAM_1:
- EXECUTE(func(size,myData->streams[0]));
- break;
- case STREAM_2:
- EXECUTE(func(size,myData->streams[0],myData->streams[1]));
- break;
- case STREAM_3:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2]));
- break;
- case STREAM_4:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3]));
- break;
- case STREAM_5:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4]));
- break;
- case STREAM_6:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5]));
- break;
- case STREAM_7:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6]));
- break;
- case STREAM_8:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7]));
- break;
- case STREAM_9:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8]));
- break;
- case STREAM_10:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9]));
- break;
- case STREAM_11:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10]));
- break;
- case STREAM_12:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11]));
- break;
- case STREAM_13:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12]));
- break;
- case STREAM_14:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13]));
- break;
- case STREAM_15:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14]));
- break;
- case STREAM_16:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15]));
- break;
- case STREAM_17:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16]));
- break;
- case STREAM_18:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17]));
- break;
- case STREAM_19:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18]));
- break;
- case STREAM_20:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19]));
- break;
- case STREAM_21:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20]));
- break;
- case STREAM_22:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21]));
- break;
- case STREAM_23:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21],myData->streams[22]));
- break;
- case STREAM_24:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23]));
- break;
- case STREAM_25:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
- myData->streams[24]));
- break;
- case STREAM_26:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
- myData->streams[24],myData->streams[25]));
- break;
- case STREAM_27:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
- myData->streams[24],myData->streams[25],myData->streams[26]));
- break;
- case STREAM_28:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
- myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27]));
- break;
- case STREAM_29:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
- myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
- myData->streams[28]));
- break;
- case STREAM_30:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
- myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
- myData->streams[28],myData->streams[29]));
- break;
- case STREAM_31:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
- myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
- myData->streams[28],myData->streams[29],myData->streams[30]));
- break;
- case STREAM_32:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
- myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
- myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31]));
- break;
- case STREAM_33:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
- myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
- myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
- myData->streams[32]));
- break;
- case STREAM_34:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
- myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
- myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
- myData->streams[32],myData->streams[33]));
- break;
- case STREAM_35:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
- myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
- myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
- myData->streams[32],myData->streams[33],myData->streams[34]));
- break;
- case STREAM_36:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
- myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
- myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
- myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35]));
- break;
- case STREAM_37:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
- myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
- myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
- myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
- myData->streams[36]));
- break;
- case STREAM_38:
- EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
- myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
- myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
- myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
- myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
- myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
- myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
- myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
- myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
- myData->streams[36],myData->streams[37]));
- break;
- default:
- break;
- }
-#ifdef PAPI
- double papi_result = 0.0;
- // L2 & L3 group
- if (strncmp(group_ptr,"L3",2) == 0 ||
- strncmp(group_ptr,"L2",2) == 0)
- {
- papi_result = ((double)result[1]) * 64.0;
- }
- // FLOPS_AVX
- else if (strncmp(group_ptr,"FLOPS",5) == 0)
- {
- papi_result = (double) result[1]+ (double) result[2];
- }
- if (OUTSTREAM)
- {
- fprintf(OUTSTREAM, "Thread %d Result %f\n",threadId, papi_result);
- }
-#endif
- pthread_exit(NULL);
-}
-
-
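
In the runTest() deleted above, each thread first carves a stride-aligned chunk out of the shared streams (size = total/threads, rounded down to a multiple of the kernel stride; offset = threadId * size) before the EXECUTE macro times the kernel loop between barriers. A tiny standalone illustration of that partitioning, with invented numbers:

#include <stdio.h>

int main(void)
{
    size_t total    = 100003;   /* hypothetical overall vector length */
    int    stride   = 16;       /* hypothetical kernel stride         */
    int    nthreads = 4;

    for (int tid = 0; tid < nthreads; tid++)
    {
        size_t size = total / (size_t)nthreads;
        size -= size % (size_t)stride;        /* keep every chunk stride aligned */
        size_t offset = (size_t)tid * size;
        printf("thread %d: offset %zu size %zu\n", tid, offset, size);
    }
    return 0;
}
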
diff --git a/src/bitUtil.c b/src/bitUtil.c
index cdce490..099626c 100644
--- a/src/bitUtil.c
+++ b/src/bitUtil.c
@@ -5,13 +5,13 @@
*
* Description: Utility routines manipulating bit arrays.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -72,18 +72,18 @@ extractBitField(uint32_t inField, uint32_t width, uint32_t offset)
uint32_t
getBitFieldWidth(uint32_t number)
{
- uint32_t fieldWidth;
+ uint32_t fieldWidth=0;
number--;
if (number == 0)
{
return 0;
}
-
+#ifdef __x86_64
__asm__ volatile ( "bsr %%eax, %%ecx\n\t"
: "=c" (fieldWidth)
: "a"(number));
-
+#endif
return fieldWidth+1; /* bsr returns the position, we want the width */
}
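
With the hunk above, fieldWidth stays 0 on non-x86_64 builds, so getBitFieldWidth() returns 1 there. Should a portable fallback ever be wanted, the same result can be computed with the GCC/Clang builtin; a sketch, not part of this patch:

#include <stdint.h>

/* Portable variant of getBitFieldWidth(): position of the highest set
 * bit of (number-1), plus one, computed with __builtin_clz instead of
 * the bsr instruction. Name and placement are illustrative only. */
static uint32_t
getBitFieldWidth_portable(uint32_t number)
{
    number--;
    if (number == 0)
    {
        return 0;
    }
    uint32_t msb = 31U - (uint32_t)__builtin_clz(number);
    return msb + 1;   /* bsr returns the position, we want the width */
}
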
diff --git a/src/bstrlib.c b/src/bstrlib.c
index 52f5a99..380269c 100644
--- a/src/bstrlib.c
+++ b/src/bstrlib.c
@@ -64,27 +64,27 @@
/* Compute the snapped size for a given requested size. By snapping to powers
of 2 like this, repeated reallocations are avoided. */
static int snapUpSize (int i) {
- if (i < 8) {
- i = 8;
- } else {
- unsigned int j;
- j = (unsigned int) i;
-
- j |= (j >> 1);
- j |= (j >> 2);
- j |= (j >> 4);
- j |= (j >> 8); /* Ok, since int >= 16 bits */
+ if (i < 8) {
+ i = 8;
+ } else {
+ unsigned int j;
+ j = (unsigned int) i;
+
+ j |= (j >> 1);
+ j |= (j >> 2);
+ j |= (j >> 4);
+ j |= (j >> 8); /* Ok, since int >= 16 bits */
#if (UINT_MAX != 0xffff)
- j |= (j >> 16); /* For 32 bit int systems */
+ j |= (j >> 16); /* For 32 bit int systems */
#if (UINT_MAX > 0xffffffffUL)
- j |= (j >> 32); /* For 64 bit int systems */
+ j |= (j >> 32); /* For 64 bit int systems */
#endif
#endif
- /* Least power of two greater than i */
- j++;
- if ((int) j >= i) i = (int) j;
- }
- return i;
+ /* Least power of two greater than i */
+ j++;
+ if ((int) j >= i) i = (int) j;
+ }
+ return i;
}
/* int balloc (bstring b, int len)
@@ -92,59 +92,59 @@ static int snapUpSize (int i) {
* Increase the size of the memory backing the bstring b to at least len.
*/
int balloc (bstring b, int olen) {
- int len;
- if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen <= 0 ||
- b->mlen < b->slen || olen <= 0) {
- return BSTR_ERR;
- }
+ int len;
+ if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen <= 0 ||
+ b->mlen < b->slen || olen <= 0) {
+ return BSTR_ERR;
+ }
- if (olen >= b->mlen) {
- unsigned char * x;
+ if (olen >= b->mlen) {
+ unsigned char * x;
- if ((len = snapUpSize (olen)) <= b->mlen) return BSTR_OK;
+ if ((len = snapUpSize (olen)) <= b->mlen) return BSTR_OK;
- /* Assume probability of a non-moving realloc is 0.125 */
- if (7 * b->mlen < 8 * b->slen) {
+ /* Assume probability of a non-moving realloc is 0.125 */
+ if (7 * b->mlen < 8 * b->slen) {
- /* If slen is close to mlen in size then use realloc to reduce
- the memory defragmentation */
+ /* If slen is close to mlen in size then use realloc to reduce
+ the memory defragmentation */
- reallocStrategy:;
+ reallocStrategy:;
- x = (unsigned char *) bstr__realloc (b->data, (size_t) len);
- if (x == NULL) {
+ x = (unsigned char *) bstr__realloc (b->data, (size_t) len);
+ if (x == NULL) {
- /* Since we failed, try allocating the tightest possible
- allocation */
+ /* Since we failed, try allocating the tightest possible
+ allocation */
- if (NULL == (x = (unsigned char *) bstr__realloc (b->data, (size_t) (len = olen)))) {
- return BSTR_ERR;
- }
- }
- } else {
+ if (NULL == (x = (unsigned char *) bstr__realloc (b->data, (size_t) (len = olen)))) {
+ return BSTR_ERR;
+ }
+ }
+ } else {
- /* If slen is not close to mlen then avoid the penalty of copying
- the extra bytes that are allocated, but not considered part of
- the string */
+ /* If slen is not close to mlen then avoid the penalty of copying
+ the extra bytes that are allocated, but not considered part of
+ the string */
- if (NULL == (x = (unsigned char *) bstr__alloc ((size_t) len))) {
+ if (NULL == (x = (unsigned char *) bstr__alloc ((size_t) len))) {
- /* Perhaps there is no available memory for the two
- allocations to be in memory at once */
+ /* Perhaps there is no available memory for the two
+ allocations to be in memory at once */
- goto reallocStrategy;
+ goto reallocStrategy;
- } else {
- if (b->slen) bstr__memcpy ((char *) x, (char *) b->data, (size_t) b->slen);
- bstr__free (b->data);
- }
- }
- b->data = x;
- b->mlen = len;
- b->data[b->slen] = (unsigned char) '\0';
- }
+ } else {
+ if (b->slen) bstr__memcpy ((char *) x, (char *) b->data, (size_t) b->slen);
+ bstr__free (b->data);
+ }
+ }
+ b->data = x;
+ b->mlen = len;
+ b->data[b->slen] = (unsigned char) '\0';
+ }
- return BSTR_OK;
+ return BSTR_OK;
}
/* int ballocmin (bstring b, int len)
@@ -154,24 +154,24 @@ int balloc (bstring b, int olen) {
* performance.
*/
int ballocmin (bstring b, int len) {
- unsigned char * s;
+ unsigned char * s;
- if (b == NULL || b->data == NULL || (b->slen+1) < 0 || b->mlen <= 0 ||
- b->mlen < b->slen || len <= 0) {
- return BSTR_ERR;
- }
+ if (b == NULL || b->data == NULL || (b->slen+1) < 0 || b->mlen <= 0 ||
+ b->mlen < b->slen || len <= 0) {
+ return BSTR_ERR;
+ }
- if (len < b->slen + 1) len = b->slen + 1;
+ if (len < b->slen + 1) len = b->slen + 1;
- if (len != b->mlen) {
- s = (unsigned char *) bstr__realloc (b->data, (size_t) len);
- if (NULL == s) return BSTR_ERR;
- s[b->slen] = (unsigned char) '\0';
- b->data = s;
- b->mlen = len;
- }
+ if (len != b->mlen) {
+ s = (unsigned char *) bstr__realloc (b->data, (size_t) len);
+ if (NULL == s) return BSTR_ERR;
+ s[b->slen] = (unsigned char) '\0';
+ b->data = s;
+ b->mlen = len;
+ }
- return BSTR_OK;
+ return BSTR_OK;
}
/* bstring bfromcstr (const char * str)
@@ -184,21 +184,21 @@ bstring b;
int i;
size_t j;
- if (str == NULL) return NULL;
- j = (strlen) (str);
- i = snapUpSize ((int) (j + (2 - (j != 0))));
- if (i <= (int) j) return NULL;
+ if (str == NULL) return NULL;
+ j = (strlen) (str);
+ i = snapUpSize ((int) (j + (2 - (j != 0))));
+ if (i <= (int) j) return NULL;
- b = (bstring) bstr__alloc (sizeof (struct tagbstring));
- if (NULL == b) return NULL;
- b->slen = (int) j;
- if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
- bstr__free (b);
- return NULL;
- }
+ b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+ if (NULL == b) return NULL;
+ b->slen = (int) j;
+ if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
+ bstr__free (b);
+ return NULL;
+ }
- bstr__memcpy (b->data, str, j+1);
- return b;
+ bstr__memcpy (b->data, str, j+1);
+ return b;
}
/* bstring bfromcstralloc (int mlen, const char * str)
@@ -212,23 +212,23 @@ bstring b;
int i;
size_t j;
- if (str == NULL) return NULL;
- j = (strlen) (str);
- i = snapUpSize ((int) (j + (2 - (j != 0))));
- if (i <= (int) j) return NULL;
+ if (str == NULL) return NULL;
+ j = (strlen) (str);
+ i = snapUpSize ((int) (j + (2 - (j != 0))));
+ if (i <= (int) j) return NULL;
- b = (bstring) bstr__alloc (sizeof (struct tagbstring));
- if (b == NULL) return NULL;
- b->slen = (int) j;
- if (i < mlen) i = mlen;
+ b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+ if (b == NULL) return NULL;
+ b->slen = (int) j;
+ if (i < mlen) i = mlen;
- if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
- bstr__free (b);
- return NULL;
- }
+ if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
+ bstr__free (b);
+ return NULL;
+ }
- bstr__memcpy (b->data, str, j+1);
- return b;
+ bstr__memcpy (b->data, str, j+1);
+ return b;
}
/* bstring blk2bstr (const void * blk, int len)
@@ -240,26 +240,26 @@ bstring blk2bstr (const void * blk, int len) {
bstring b;
int i;
- if (blk == NULL || len < 0) return NULL;
- b = (bstring) bstr__alloc (sizeof (struct tagbstring));
- if (b == NULL) return NULL;
- b->slen = len;
+ if (blk == NULL || len < 0) return NULL;
+ b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+ if (b == NULL) return NULL;
+ b->slen = len;
- i = len + (2 - (len != 0));
- i = snapUpSize (i);
+ i = len + (2 - (len != 0));
+ i = snapUpSize (i);
- b->mlen = i;
+ b->mlen = i;
- b->data = (unsigned char *) bstr__alloc ((size_t) b->mlen);
- if (b->data == NULL) {
- bstr__free (b);
- return NULL;
- }
+ b->data = (unsigned char *) bstr__alloc ((size_t) b->mlen);
+ if (b->data == NULL) {
+ bstr__free (b);
+ return NULL;
+ }
- if (len > 0) bstr__memcpy (b->data, blk, (size_t) len);
- b->data[len] = (unsigned char) '\0';
+ if (len > 0) bstr__memcpy (b->data, blk, (size_t) len);
+ b->data[len] = (unsigned char) '\0';
- return b;
+ return b;
}
/* char * bstr2cstr (const_bstring s, char z)
@@ -273,18 +273,18 @@ char * bstr2cstr (const_bstring b, char z) {
int i, l;
char * r;
- if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
- l = b->slen;
- r = (char *) bstr__alloc ((size_t) (l + 1));
- if (r == NULL) return r;
+ if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+ l = b->slen;
+ r = (char *) bstr__alloc ((size_t) (l + 1));
+ if (r == NULL) return r;
- for (i=0; i < l; i ++) {
- r[i] = (char) ((b->data[i] == '\0') ? z : (char) (b->data[i]));
- }
+ for (i=0; i < l; i ++) {
+ r[i] = (char) ((b->data[i] == '\0') ? z : (char) (b->data[i]));
+ }
- r[l] = (unsigned char) '\0';
+ r[l] = (unsigned char) '\0';
- return r;
+ return r;
}
/* int bcstrfree (char * s)
@@ -299,11 +299,11 @@ char * r;
* redefinitions.
*/
int bcstrfree (char * s) {
- if (s) {
- bstr__free (s);
- return BSTR_OK;
- }
- return BSTR_ERR;
+ if (s) {
+ bstr__free (s);
+ return BSTR_OK;
+ }
+ return BSTR_ERR;
}
/* int bconcat (bstring b0, const_bstring b1)
@@ -314,28 +314,28 @@ int bconcat (bstring b0, const_bstring b1) {
int len, d;
bstring aux = (bstring) b1;
- if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL) return BSTR_ERR;
+ if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL) return BSTR_ERR;
- d = b0->slen;
- len = b1->slen;
- if ((d | (b0->mlen - d) | len | (d + len)) < 0) return BSTR_ERR;
+ d = b0->slen;
+ len = b1->slen;
+ if ((d | (b0->mlen - d) | len | (d + len)) < 0) return BSTR_ERR;
- if (b0->mlen <= d + len + 1) {
- ptrdiff_t pd = b1->data - b0->data;
- if (0 <= pd && pd < b0->mlen) {
- if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
- }
- if (balloc (b0, d + len + 1) != BSTR_OK) {
- if (aux != b1) bdestroy (aux);
- return BSTR_ERR;
- }
- }
+ if (b0->mlen <= d + len + 1) {
+ ptrdiff_t pd = b1->data - b0->data;
+ if (0 <= pd && pd < b0->mlen) {
+ if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
+ }
+ if (balloc (b0, d + len + 1) != BSTR_OK) {
+ if (aux != b1) bdestroy (aux);
+ return BSTR_ERR;
+ }
+ }
- bBlockCopy (&b0->data[d], &aux->data[0], (size_t) len);
- b0->data[d + len] = (unsigned char) '\0';
- b0->slen = d + len;
- if (aux != b1) bdestroy (aux);
- return BSTR_OK;
+ bBlockCopy (&b0->data[d], &aux->data[0], (size_t) len);
+ b0->data[d + len] = (unsigned char) '\0';
+ b0->slen = d + len;
+ if (aux != b1) bdestroy (aux);
+ return BSTR_OK;
}
/* int bconchar (bstring b, char c)
@@ -345,13 +345,13 @@ bstring aux = (bstring) b1;
int bconchar (bstring b, char c) {
int d;
- if (b == NULL) return BSTR_ERR;
- d = b->slen;
- if ((d | (b->mlen - d)) < 0 || balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
- b->data[d] = (unsigned char) c;
- b->data[d + 1] = (unsigned char) '\0';
- b->slen++;
- return BSTR_OK;
+ if (b == NULL) return BSTR_ERR;
+ d = b->slen;
+ if ((d | (b->mlen - d)) < 0 || balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
+ b->data[d] = (unsigned char) c;
+ b->data[d + 1] = (unsigned char) '\0';
+ b->slen++;
+ return BSTR_OK;
}
/* int bcatcstr (bstring b, const char * s)
@@ -362,22 +362,22 @@ int bcatcstr (bstring b, const char * s) {
char * d;
int i, l;
- if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
- || b->mlen <= 0 || s == NULL) return BSTR_ERR;
+ if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
+ || b->mlen <= 0 || s == NULL) return BSTR_ERR;
- /* Optimistically concatenate directly */
- l = b->mlen - b->slen;
- d = (char *) &b->data[b->slen];
- for (i=0; i < l; i++) {
- if ((*d++ = *s++) == '\0') {
- b->slen += i;
- return BSTR_OK;
- }
- }
- b->slen += i;
+ /* Optimistically concatenate directly */
+ l = b->mlen - b->slen;
+ d = (char *) &b->data[b->slen];
+ for (i=0; i < l; i++) {
+ if ((*d++ = *s++) == '\0') {
+ b->slen += i;
+ return BSTR_OK;
+ }
+ }
+ b->slen += i;
- /* Need to explicitly resize and concatenate tail */
- return bcatblk (b, (const void *) s, (int) strlen (s));
+ /* Need to explicitly resize and concatenate tail */
+ return bcatblk (b, (const void *) s, (int) strlen (s));
}
/* int bcatblk (bstring b, const void * s, int len)
@@ -387,16 +387,16 @@ int i, l;
int bcatblk (bstring b, const void * s, int len) {
int nl;
- if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
- || b->mlen <= 0 || s == NULL || len < 0) return BSTR_ERR;
+ if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
+ || b->mlen <= 0 || s == NULL || len < 0) return BSTR_ERR;
- if (0 > (nl = b->slen + len)) return BSTR_ERR; /* Overflow? */
- if (b->mlen <= nl && 0 > balloc (b, nl + 1)) return BSTR_ERR;
+ if (0 > (nl = b->slen + len)) return BSTR_ERR; /* Overflow? */
+ if (b->mlen <= nl && 0 > balloc (b, nl + 1)) return BSTR_ERR;
- bBlockCopy (&b->data[b->slen], s, (size_t) len);
- b->slen = nl;
- b->data[nl] = (unsigned char) '\0';
- return BSTR_OK;
+ bBlockCopy (&b->data[b->slen], s, (size_t) len);
+ b->slen = nl;
+ b->data[nl] = (unsigned char) '\0';
+ return BSTR_OK;
}
/* bstring bstrcpy (const_bstring b)
@@ -407,36 +407,36 @@ bstring bstrcpy (const_bstring b) {
bstring b0;
int i,j;
- /* Attempted to copy an invalid string? */
- if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+ /* Attempted to copy an invalid string? */
+ if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
- b0 = (bstring) bstr__alloc (sizeof (struct tagbstring));
- if (b0 == NULL) {
- /* Unable to allocate memory for string header */
- return NULL;
- }
+ b0 = (bstring) bstr__alloc (sizeof (struct tagbstring));
+ if (b0 == NULL) {
+ /* Unable to allocate memory for string header */
+ return NULL;
+ }
- i = b->slen;
- j = snapUpSize (i + 1);
+ i = b->slen;
+ j = snapUpSize (i + 1);
- b0->data = (unsigned char *) bstr__alloc (j);
- if (b0->data == NULL) {
- j = i + 1;
- b0->data = (unsigned char *) bstr__alloc (j);
- if (b0->data == NULL) {
- /* Unable to allocate memory for string data */
- bstr__free (b0);
- return NULL;
- }
- }
+ b0->data = (unsigned char *) bstr__alloc (j);
+ if (b0->data == NULL) {
+ j = i + 1;
+ b0->data = (unsigned char *) bstr__alloc (j);
+ if (b0->data == NULL) {
+ /* Unable to allocate memory for string data */
+ bstr__free (b0);
+ return NULL;
+ }
+ }
- b0->mlen = j;
- b0->slen = i;
+ b0->mlen = j;
+ b0->slen = i;
- if (i) bstr__memcpy ((char *) b0->data, (char *) b->data, i);
- b0->data[b0->slen] = (unsigned char) '\0';
+ if (i) bstr__memcpy ((char *) b0->data, (char *) b->data, i);
+ b0->data[b0->slen] = (unsigned char) '\0';
- return b0;
+ return b0;
}
/* int bassign (bstring a, const_bstring b)
@@ -444,19 +444,19 @@ int i,j;
* Overwrite the string a with the contents of string b.
*/
int bassign (bstring a, const_bstring b) {
- if (b == NULL || b->data == NULL || b->slen < 0)
- return BSTR_ERR;
- if (b->slen != 0) {
- if (balloc (a, b->slen) != BSTR_OK) return BSTR_ERR;
- bstr__memmove (a->data, b->data, b->slen);
- } else {
- if (a == NULL || a->data == NULL || a->mlen < a->slen ||
- a->slen < 0 || a->mlen == 0)
- return BSTR_ERR;
- }
- a->data[b->slen] = (unsigned char) '\0';
- a->slen = b->slen;
- return BSTR_OK;
+ if (b == NULL || b->data == NULL || b->slen < 0)
+ return BSTR_ERR;
+ if (b->slen != 0) {
+ if (balloc (a, b->slen) != BSTR_OK) return BSTR_ERR;
+ bstr__memmove (a->data, b->data, b->slen);
+ } else {
+ if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+ a->slen < 0 || a->mlen == 0)
+ return BSTR_ERR;
+ }
+ a->data[b->slen] = (unsigned char) '\0';
+ a->slen = b->slen;
+ return BSTR_OK;
}
/* int bassignmidstr (bstring a, const_bstring b, int left, int len)
@@ -466,29 +466,29 @@ int bassign (bstring a, const_bstring b) {
* len are clamped to the ends of b as with the function bmidstr.
*/
int bassignmidstr (bstring a, const_bstring b, int left, int len) {
- if (b == NULL || b->data == NULL || b->slen < 0)
- return BSTR_ERR;
+ if (b == NULL || b->data == NULL || b->slen < 0)
+ return BSTR_ERR;
- if (left < 0) {
- len += left;
- left = 0;
- }
+ if (left < 0) {
+ len += left;
+ left = 0;
+ }
- if (len > b->slen - left) len = b->slen - left;
+ if (len > b->slen - left) len = b->slen - left;
- if (a == NULL || a->data == NULL || a->mlen < a->slen ||
- a->slen < 0 || a->mlen == 0)
- return BSTR_ERR;
+ if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+ a->slen < 0 || a->mlen == 0)
+ return BSTR_ERR;
- if (len > 0) {
- if (balloc (a, len) != BSTR_OK) return BSTR_ERR;
- bstr__memmove (a->data, b->data + left, len);
- a->slen = len;
- } else {
- a->slen = 0;
- }
- a->data[a->slen] = (unsigned char) '\0';
- return BSTR_OK;
+ if (len > 0) {
+ if (balloc (a, len) != BSTR_OK) return BSTR_ERR;
+ bstr__memmove (a->data, b->data + left, len);
+ a->slen = len;
+ } else {
+ a->slen = 0;
+ }
+ a->data[a->slen] = (unsigned char) '\0';
+ return BSTR_OK;
}
/* int bassigncstr (bstring a, const char * str)
@@ -500,24 +500,24 @@ int bassignmidstr (bstring a, const_bstring b, int left, int len) {
int bassigncstr (bstring a, const char * str) {
int i;
size_t len;
- if (a == NULL || a->data == NULL || a->mlen < a->slen ||
- a->slen < 0 || a->mlen == 0 || NULL == str)
- return BSTR_ERR;
-
- for (i=0; i < a->mlen; i++) {
- if ('\0' == (a->data[i] = str[i])) {
- a->slen = i;
- return BSTR_OK;
- }
- }
-
- a->slen = i;
- len = strlen (str + i);
- if (len > INT_MAX || i + len + 1 > INT_MAX ||
- 0 > balloc (a, (int) (i + len + 1))) return BSTR_ERR;
- bBlockCopy (a->data + i, str + i, (size_t) len + 1);
- a->slen += (int) len;
- return BSTR_OK;
+ if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+ a->slen < 0 || a->mlen == 0 || NULL == str)
+ return BSTR_ERR;
+
+ for (i=0; i < a->mlen; i++) {
+ if ('\0' == (a->data[i] = str[i])) {
+ a->slen = i;
+ return BSTR_OK;
+ }
+ }
+
+ a->slen = i;
+ len = strlen (str + i);
+ if (len > INT_MAX || i + len + 1 > INT_MAX ||
+ 0 > balloc (a, (int) (i + len + 1))) return BSTR_ERR;
+ bBlockCopy (a->data + i, str + i, (size_t) len + 1);
+ a->slen += (int) len;
+ return BSTR_OK;
}
/* int bassignblk (bstring a, const void * s, int len)
@@ -527,14 +527,14 @@ size_t len;
* occurs BSTR_ERR is returned and a is not overwritten.
*/
int bassignblk (bstring a, const void * s, int len) {
- if (a == NULL || a->data == NULL || a->mlen < a->slen ||
- a->slen < 0 || a->mlen == 0 || NULL == s || len + 1 < 1)
- return BSTR_ERR;
- if (len + 1 > a->mlen && 0 > balloc (a, len + 1)) return BSTR_ERR;
- bBlockCopy (a->data, s, (size_t) len);
- a->data[len] = (unsigned char) '\0';
- a->slen = len;
- return BSTR_OK;
+ if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+ a->slen < 0 || a->mlen == 0 || NULL == s || len + 1 < 1)
+ return BSTR_ERR;
+ if (len + 1 > a->mlen && 0 > balloc (a, len + 1)) return BSTR_ERR;
+ bBlockCopy (a->data, s, (size_t) len);
+ a->data[len] = (unsigned char) '\0';
+ a->slen = len;
+ return BSTR_OK;
}
/* int btrunc (bstring b, int n)
@@ -542,13 +542,13 @@ int bassignblk (bstring a, const void * s, int len) {
* Truncate the bstring to at most n characters.
*/
int btrunc (bstring b, int n) {
- if (n < 0 || b == NULL || b->data == NULL || b->mlen < b->slen ||
- b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
- if (b->slen > n) {
- b->slen = n;
- b->data[n] = (unsigned char) '\0';
- }
- return BSTR_OK;
+ if (n < 0 || b == NULL || b->data == NULL || b->mlen < b->slen ||
+ b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+ if (b->slen > n) {
+ b->slen = n;
+ b->data[n] = (unsigned char) '\0';
+ }
+ return BSTR_OK;
}
#define upcase(c) (toupper ((unsigned char) c))
@@ -561,12 +561,12 @@ int btrunc (bstring b, int n) {
*/
int btoupper (bstring b) {
int i, len;
- if (b == NULL || b->data == NULL || b->mlen < b->slen ||
- b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
- for (i=0, len = b->slen; i < len; i++) {
- b->data[i] = (unsigned char) upcase (b->data[i]);
- }
- return BSTR_OK;
+ if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+ b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+ for (i=0, len = b->slen; i < len; i++) {
+ b->data[i] = (unsigned char) upcase (b->data[i]);
+ }
+ return BSTR_OK;
}
/* int btolower (bstring b)
@@ -575,12 +575,12 @@ int i, len;
*/
int btolower (bstring b) {
int i, len;
- if (b == NULL || b->data == NULL || b->mlen < b->slen ||
- b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
- for (i=0, len = b->slen; i < len; i++) {
- b->data[i] = (unsigned char) downcase (b->data[i]);
- }
- return BSTR_OK;
+ if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+ b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+ for (i=0, len = b->slen; i < len; i++) {
+ b->data[i] = (unsigned char) downcase (b->data[i]);
+ }
+ return BSTR_OK;
}
/* int bstricmp (const_bstring b0, const_bstring b1)
@@ -595,28 +595,28 @@ int i, len;
int bstricmp (const_bstring b0, const_bstring b1) {
int i, v, n;
- if (bdata (b0) == NULL || b0->slen < 0 ||
- bdata (b1) == NULL || b1->slen < 0) return SHRT_MIN;
- if ((n = b0->slen) > b1->slen) n = b1->slen;
- else if (b0->slen == b1->slen && b0->data == b1->data) return BSTR_OK;
-
- for (i = 0; i < n; i ++) {
- v = (char) downcase (b0->data[i])
- - (char) downcase (b1->data[i]);
- if (0 != v) return v;
- }
-
- if (b0->slen > n) {
- v = (char) downcase (b0->data[n]);
- if (v) return v;
- return UCHAR_MAX + 1;
- }
- if (b1->slen > n) {
- v = - (char) downcase (b1->data[n]);
- if (v) return v;
- return - (int) (UCHAR_MAX + 1);
- }
- return BSTR_OK;
+ if (bdata (b0) == NULL || b0->slen < 0 ||
+ bdata (b1) == NULL || b1->slen < 0) return SHRT_MIN;
+ if ((n = b0->slen) > b1->slen) n = b1->slen;
+ else if (b0->slen == b1->slen && b0->data == b1->data) return BSTR_OK;
+
+ for (i = 0; i < n; i ++) {
+ v = (char) downcase (b0->data[i])
+ - (char) downcase (b1->data[i]);
+ if (0 != v) return v;
+ }
+
+ if (b0->slen > n) {
+ v = (char) downcase (b0->data[n]);
+ if (v) return v;
+ return UCHAR_MAX + 1;
+ }
+ if (b1->slen > n) {
+ v = - (char) downcase (b1->data[n]);
+ if (v) return v;
+ return - (int) (UCHAR_MAX + 1);
+ }
+ return BSTR_OK;
}
/* int bstrnicmp (const_bstring b0, const_bstring b1, int n)
@@ -632,31 +632,31 @@ int i, v, n;
int bstrnicmp (const_bstring b0, const_bstring b1, int n) {
int i, v, m;
- if (bdata (b0) == NULL || b0->slen < 0 ||
- bdata (b1) == NULL || b1->slen < 0 || n < 0) return SHRT_MIN;
- m = n;
- if (m > b0->slen) m = b0->slen;
- if (m > b1->slen) m = b1->slen;
+ if (bdata (b0) == NULL || b0->slen < 0 ||
+ bdata (b1) == NULL || b1->slen < 0 || n < 0) return SHRT_MIN;
+ m = n;
+ if (m > b0->slen) m = b0->slen;
+ if (m > b1->slen) m = b1->slen;
- if (b0->data != b1->data) {
- for (i = 0; i < m; i ++) {
- v = (char) downcase (b0->data[i]);
- v -= (char) downcase (b1->data[i]);
- if (v != 0) return b0->data[i] - b1->data[i];
- }
- }
+ if (b0->data != b1->data) {
+ for (i = 0; i < m; i ++) {
+ v = (char) downcase (b0->data[i]);
+ v -= (char) downcase (b1->data[i]);
+ if (v != 0) return b0->data[i] - b1->data[i];
+ }
+ }
- if (n == m || b0->slen == b1->slen) return BSTR_OK;
+ if (n == m || b0->slen == b1->slen) return BSTR_OK;
- if (b0->slen > m) {
- v = (char) downcase (b0->data[m]);
- if (v) return v;
- return UCHAR_MAX + 1;
- }
+ if (b0->slen > m) {
+ v = (char) downcase (b0->data[m]);
+ if (v) return v;
+ return UCHAR_MAX + 1;
+ }
- v = - (char) downcase (b1->data[m]);
- if (v) return v;
- return - (int) (UCHAR_MAX + 1);
+ v = - (char) downcase (b1->data[m]);
+ if (v) return v;
+ return - (int) (UCHAR_MAX + 1);
}
/* int biseqcaseless (const_bstring b0, const_bstring b1)
@@ -670,17 +670,17 @@ int i, v, m;
int biseqcaseless (const_bstring b0, const_bstring b1) {
int i, n;
- if (bdata (b0) == NULL || b0->slen < 0 ||
- bdata (b1) == NULL || b1->slen < 0) return BSTR_ERR;
- if (b0->slen != b1->slen) return BSTR_OK;
- if (b0->data == b1->data || b0->slen == 0) return 1;
- for (i=0, n=b0->slen; i < n; i++) {
- if (b0->data[i] != b1->data[i]) {
- unsigned char c = (unsigned char) downcase (b0->data[i]);
- if (c != (unsigned char) downcase (b1->data[i])) return 0;
- }
- }
- return 1;
+ if (bdata (b0) == NULL || b0->slen < 0 ||
+ bdata (b1) == NULL || b1->slen < 0) return BSTR_ERR;
+ if (b0->slen != b1->slen) return BSTR_OK;
+ if (b0->data == b1->data || b0->slen == 0) return 1;
+ for (i=0, n=b0->slen; i < n; i++) {
+ if (b0->data[i] != b1->data[i]) {
+ unsigned char c = (unsigned char) downcase (b0->data[i]);
+ if (c != (unsigned char) downcase (b1->data[i])) return 0;
+ }
+ }
+ return 1;
}
/* int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len)
@@ -695,18 +695,18 @@ int i, n;
int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len) {
int i;
- if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
- return BSTR_ERR;
- if (b0->slen < len) return BSTR_OK;
- if (b0->data == (const unsigned char *) blk || len == 0) return 1;
+ if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
+ return BSTR_ERR;
+ if (b0->slen < len) return BSTR_OK;
+ if (b0->data == (const unsigned char *) blk || len == 0) return 1;
- for (i = 0; i < len; i ++) {
- if (b0->data[i] != ((const unsigned char *) blk)[i]) {
- if (downcase (b0->data[i]) !=
- downcase (((const unsigned char *) blk)[i])) return 0;
- }
- }
- return 1;
+ for (i = 0; i < len; i ++) {
+ if (b0->data[i] != ((const unsigned char *) blk)[i]) {
+ if (downcase (b0->data[i]) !=
+ downcase (((const unsigned char *) blk)[i])) return 0;
+ }
+ }
+ return 1;
}
/*
@@ -717,18 +717,18 @@ int i;
int bltrimws (bstring b) {
int i, len;
- if (b == NULL || b->data == NULL || b->mlen < b->slen ||
- b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+ if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+ b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
- for (len = b->slen, i = 0; i < len; i++) {
- if (!wspace (b->data[i])) {
- return bdelete (b, 0, i);
- }
- }
+ for (len = b->slen, i = 0; i < len; i++) {
+ if (!wspace (b->data[i])) {
+ return bdelete (b, 0, i);
+ }
+ }
- b->data[0] = (unsigned char) '\0';
- b->slen = 0;
- return BSTR_OK;
+ b->data[0] = (unsigned char) '\0';
+ b->slen = 0;
+ return BSTR_OK;
}
/*
@@ -739,20 +739,20 @@ int i, len;
int brtrimws (bstring b) {
int i;
- if (b == NULL || b->data == NULL || b->mlen < b->slen ||
- b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+ if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+ b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
- for (i = b->slen - 1; i >= 0; i--) {
- if (!wspace (b->data[i])) {
- if (b->mlen > i) b->data[i+1] = (unsigned char) '\0';
- b->slen = i + 1;
- return BSTR_OK;
- }
- }
+ for (i = b->slen - 1; i >= 0; i--) {
+ if (!wspace (b->data[i])) {
+ if (b->mlen > i) b->data[i+1] = (unsigned char) '\0';
+ b->slen = i + 1;
+ return BSTR_OK;
+ }
+ }
- b->data[0] = (unsigned char) '\0';
- b->slen = 0;
- return BSTR_OK;
+ b->data[0] = (unsigned char) '\0';
+ b->slen = 0;
+ return BSTR_OK;
}
/*
@@ -763,21 +763,21 @@ int i;
int btrimws (bstring b) {
int i, j;
- if (b == NULL || b->data == NULL || b->mlen < b->slen ||
- b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+ if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+ b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
- for (i = b->slen - 1; i >= 0; i--) {
- if (!wspace (b->data[i])) {
- if (b->mlen > i) b->data[i+1] = (unsigned char) '\0';
- b->slen = i + 1;
- for (j = 0; wspace (b->data[j]); j++) {}
- return bdelete (b, 0, j);
- }
- }
+ for (i = b->slen - 1; i >= 0; i--) {
+ if (!wspace (b->data[i])) {
+ if (b->mlen > i) b->data[i+1] = (unsigned char) '\0';
+ b->slen = i + 1;
+ for (j = 0; wspace (b->data[j]); j++) {}
+ return bdelete (b, 0, j);
+ }
+ }
- b->data[0] = (unsigned char) '\0';
- b->slen = 0;
- return BSTR_OK;
+ b->data[0] = (unsigned char) '\0';
+ b->slen = 0;
+ return BSTR_OK;
}
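
The three trim routines above differ only in which end they clean; btrimws() handles both and reuses bdelete() for the leading run. A short sketch (include name assumed):

    #include <stdio.h>
    #include "bstrlib.h"

    int main (void) {
        bstring b = bfromcstr ("   likwid-bench  \t\n");

        /* strips leading and trailing whitespace in place */
        if (btrimws (b) == BSTR_OK)
            printf ("[%s] len=%d\n", (char *) bdata (b), blength (b));

        bdestroy (b);
        return 0;
    }
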
/* int biseq (const_bstring b0, const_bstring b1)
@@ -788,11 +788,11 @@ int i, j;
* O(1). '\0' termination characters are not treated in any special way.
*/
int biseq (const_bstring b0, const_bstring b1) {
- if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
- b0->slen < 0 || b1->slen < 0) return BSTR_ERR;
- if (b0->slen != b1->slen) return BSTR_OK;
- if (b0->data == b1->data || b0->slen == 0) return 1;
- return !bstr__memcmp (b0->data, b1->data, b0->slen);
+ if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
+ b0->slen < 0 || b1->slen < 0) return BSTR_ERR;
+ if (b0->slen != b1->slen) return BSTR_OK;
+ if (b0->data == b1->data || b0->slen == 0) return 1;
+ return !bstr__memcmp (b0->data, b1->data, b0->slen);
}
/* int bisstemeqblk (const_bstring b0, const void * blk, int len)
@@ -806,15 +806,15 @@ int biseq (const_bstring b0, const_bstring b1) {
int bisstemeqblk (const_bstring b0, const void * blk, int len) {
int i;
- if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
- return BSTR_ERR;
- if (b0->slen < len) return BSTR_OK;
- if (b0->data == (const unsigned char *) blk || len == 0) return 1;
+ if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
+ return BSTR_ERR;
+ if (b0->slen < len) return BSTR_OK;
+ if (b0->data == (const unsigned char *) blk || len == 0) return 1;
- for (i = 0; i < len; i ++) {
- if (b0->data[i] != ((const unsigned char *) blk)[i]) return BSTR_OK;
- }
- return 1;
+ for (i = 0; i < len; i ++) {
+ if (b0->data[i] != ((const unsigned char *) blk)[i]) return BSTR_OK;
+ }
+ return 1;
}
/* int biseqcstr (const_bstring b, const char *s)
@@ -830,11 +830,11 @@ int i;
*/
int biseqcstr (const_bstring b, const char * s) {
int i;
- if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
- for (i=0; i < b->slen; i++) {
- if (s[i] == '\0' || b->data[i] != (unsigned char) s[i]) return BSTR_OK;
- }
- return s[i] == '\0';
+ if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
+ for (i=0; i < b->slen; i++) {
+ if (s[i] == '\0' || b->data[i] != (unsigned char) s[i]) return BSTR_OK;
+ }
+ return s[i] == '\0';
}
/* int biseqcstrcaseless (const_bstring b, const char *s)
@@ -851,14 +851,14 @@ int i;
*/
int biseqcstrcaseless (const_bstring b, const char * s) {
int i;
- if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
- for (i=0; i < b->slen; i++) {
- if (s[i] == '\0' ||
- (b->data[i] != (unsigned char) s[i] &&
- downcase (b->data[i]) != (unsigned char) downcase (s[i])))
- return BSTR_OK;
- }
- return s[i] == '\0';
+ if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
+ for (i=0; i < b->slen; i++) {
+ if (s[i] == '\0' ||
+ (b->data[i] != (unsigned char) s[i] &&
+ downcase (b->data[i]) != (unsigned char) downcase (s[i])))
+ return BSTR_OK;
+ }
+ return s[i] == '\0';
}
/* int bstrcmp (const_bstring b0, const_bstring b1)
@@ -878,21 +878,21 @@ int i;
int bstrcmp (const_bstring b0, const_bstring b1) {
int i, v, n;
- if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
- b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
- n = b0->slen; if (n > b1->slen) n = b1->slen;
- if (b0->slen == b1->slen && (b0->data == b1->data || b0->slen == 0))
- return BSTR_OK;
+ if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
+ b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
+ n = b0->slen; if (n > b1->slen) n = b1->slen;
+ if (b0->slen == b1->slen && (b0->data == b1->data || b0->slen == 0))
+ return BSTR_OK;
- for (i = 0; i < n; i ++) {
- v = ((char) b0->data[i]) - ((char) b1->data[i]);
- if (v != 0) return v;
- if (b0->data[i] == (unsigned char) '\0') return BSTR_OK;
- }
+ for (i = 0; i < n; i ++) {
+ v = ((char) b0->data[i]) - ((char) b1->data[i]);
+ if (v != 0) return v;
+ if (b0->data[i] == (unsigned char) '\0') return BSTR_OK;
+ }
- if (b0->slen > n) return 1;
- if (b1->slen > n) return -1;
- return BSTR_OK;
+ if (b0->slen > n) return 1;
+ if (b1->slen > n) return -1;
+ return BSTR_OK;
}
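
bstrcmp() follows strcmp()-style ordering but reserves SHRT_MIN as its error value, so the sign should only be interpreted after ruling that out. Sketch:

    #include <stdio.h>
    #include <limits.h>
    #include "bstrlib.h"

    int main (void) {
        bstring a = bfromcstr ("alpha");
        bstring b = bfromcstr ("beta");
        int r = bstrcmp (a, b);

        if (r == SHRT_MIN)      fprintf (stderr, "invalid operands\n");
        else if (r < 0)         printf ("a sorts before b\n");
        else if (r > 0)         printf ("a sorts after b\n");
        else                    printf ("identical\n");

        bdestroy (a);
        bdestroy (b);
        return 0;
    }
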
/* int bstrncmp (const_bstring b0, const_bstring b1, int n)
@@ -908,24 +908,24 @@ int i, v, n;
int bstrncmp (const_bstring b0, const_bstring b1, int n) {
int i, v, m;
- if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
- b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
- m = n;
- if (m > b0->slen) m = b0->slen;
- if (m > b1->slen) m = b1->slen;
+ if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
+ b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
+ m = n;
+ if (m > b0->slen) m = b0->slen;
+ if (m > b1->slen) m = b1->slen;
- if (b0->data != b1->data) {
- for (i = 0; i < m; i ++) {
- v = ((char) b0->data[i]) - ((char) b1->data[i]);
- if (v != 0) return v;
- if (b0->data[i] == (unsigned char) '\0') return BSTR_OK;
- }
- }
+ if (b0->data != b1->data) {
+ for (i = 0; i < m; i ++) {
+ v = ((char) b0->data[i]) - ((char) b1->data[i]);
+ if (v != 0) return v;
+ if (b0->data[i] == (unsigned char) '\0') return BSTR_OK;
+ }
+ }
- if (n == m || b0->slen == b1->slen) return BSTR_OK;
+ if (n == m || b0->slen == b1->slen) return BSTR_OK;
- if (b0->slen > m) return 1;
- return -1;
+ if (b0->slen > m) return 1;
+ return -1;
}
/* bstring bmidstr (const_bstring b, int left, int len)
@@ -937,17 +937,17 @@ int i, v, m;
*/
bstring bmidstr (const_bstring b, int left, int len) {
- if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+ if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
- if (left < 0) {
- len += left;
- left = 0;
- }
+ if (left < 0) {
+ len += left;
+ left = 0;
+ }
- if (len > b->slen - left) len = b->slen - left;
+ if (len > b->slen - left) len = b->slen - left;
- if (len <= 0) return bfromcstr ("");
- return blk2bstr (b->data + left, len);
+ if (len <= 0) return bfromcstr ("");
+ return blk2bstr (b->data + left, len);
}
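
bmidstr() clamps the requested window to the source instead of failing, so an oversized len simply yields the remaining tail. Sketch:

    #include <stdio.h>
    #include "bstrlib.h"

    int main (void) {
        bstring b = bfromcstr ("likwid-perfctr");
        bstring mid = bmidstr (b, 7, 100);   /* len is clamped to the tail */

        if (mid != NULL) {
            printf ("%s\n", (char *) bdata (mid));   /* prints "perfctr" */
            bdestroy (mid);
        }
        bdestroy (b);
        return 0;
    }
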
/* int bdelete (bstring b, int pos, int len)
@@ -958,27 +958,27 @@ bstring bmidstr (const_bstring b, int left, int len) {
* len) is clamped to boundaries of the bstring b.
*/
int bdelete (bstring b, int pos, int len) {
- /* Clamp to left side of bstring */
- if (pos < 0) {
- len += pos;
- pos = 0;
- }
-
- if (len < 0 || b == NULL || b->data == NULL || b->slen < 0 ||
- b->mlen < b->slen || b->mlen <= 0)
- return BSTR_ERR;
- if (len > 0 && pos < b->slen) {
- if (pos + len >= b->slen) {
- b->slen = pos;
- } else {
- bBlockCopy ((char *) (b->data + pos),
- (char *) (b->data + pos + len),
- b->slen - (pos+len));
- b->slen -= len;
- }
- b->data[b->slen] = (unsigned char) '\0';
- }
- return BSTR_OK;
+ /* Clamp to left side of bstring */
+ if (pos < 0) {
+ len += pos;
+ pos = 0;
+ }
+
+ if (len < 0 || b == NULL || b->data == NULL || b->slen < 0 ||
+ b->mlen < b->slen || b->mlen <= 0)
+ return BSTR_ERR;
+ if (len > 0 && pos < b->slen) {
+ if (pos + len >= b->slen) {
+ b->slen = pos;
+ } else {
+ bBlockCopy ((char *) (b->data + pos),
+ (char *) (b->data + pos + len),
+ b->slen - (pos+len));
+ b->slen -= len;
+ }
+ b->data[b->slen] = (unsigned char) '\0';
+ }
+ return BSTR_OK;
}
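
bdelete() clamps (pos, len) the same way and shifts the tail down with bBlockCopy() when the removed range is interior. Sketch:

    #include <stdio.h>
    #include "bstrlib.h"

    int main (void) {
        bstring b = bfromcstr ("hardware counters");

        /* remove "hardware " (9 characters starting at offset 0) */
        if (bdelete (b, 0, 9) == BSTR_OK)
            printf ("%s\n", (char *) bdata (b));   /* "counters" */

        bdestroy (b);
        return 0;
    }
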
/* int bdestroy (bstring b)
@@ -989,21 +989,21 @@ int bdelete (bstring b, int pos, int len) {
* been bdestroyed is undefined.
*/
int bdestroy (bstring b) {
- if (b == NULL || b->slen < 0 || b->mlen <= 0 || b->mlen < b->slen ||
- b->data == NULL)
- return BSTR_ERR;
+ if (b == NULL || b->slen < 0 || b->mlen <= 0 || b->mlen < b->slen ||
+ b->data == NULL)
+ return BSTR_ERR;
- bstr__free (b->data);
+ bstr__free (b->data);
- /* In case there is any stale usage, there is one more chance to
- notice this error. */
+ /* In case there is any stale usage, there is one more chance to
+ notice this error. */
- b->slen = -1;
- b->mlen = -__LINE__;
- b->data = NULL;
+ b->slen = -1;
+ b->mlen = -__LINE__;
+ b->data = NULL;
- bstr__free (b);
- return BSTR_OK;
+ bstr__free (b);
+ return BSTR_OK;
}
/* int binstr (const_bstring b1, int pos, const_bstring b2)
@@ -1023,74 +1023,74 @@ register unsigned char * d1;
register unsigned char c1;
register int i;
- if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
- b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
- if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
- if (b1->slen < pos || pos < 0) return BSTR_ERR;
- if (b2->slen == 0) return pos;
+ if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+ b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+ if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
+ if (b1->slen < pos || pos < 0) return BSTR_ERR;
+ if (b2->slen == 0) return pos;
- /* No space to find such a string? */
- if ((lf = b1->slen - b2->slen + 1) <= pos) return BSTR_ERR;
+ /* No space to find such a string? */
+ if ((lf = b1->slen - b2->slen + 1) <= pos) return BSTR_ERR;
- /* An obvious alias case */
- if (b1->data == b2->data && pos == 0) return 0;
+ /* An obvious alias case */
+ if (b1->data == b2->data && pos == 0) return 0;
- i = pos;
+ i = pos;
- d0 = b2->data;
- d1 = b1->data;
- ll = b2->slen;
+ d0 = b2->data;
+ d1 = b1->data;
+ ll = b2->slen;
- /* Peel off the b2->slen == 1 case */
- c0 = d0[0];
- if (1 == ll) {
- for (;i < lf; i++) if (c0 == d1[i]) return i;
- return BSTR_ERR;
- }
+ /* Peel off the b2->slen == 1 case */
+ c0 = d0[0];
+ if (1 == ll) {
+ for (;i < lf; i++) if (c0 == d1[i]) return i;
+ return BSTR_ERR;
+ }
- c1 = c0;
- j = 0;
- lf = b1->slen - 1;
+ c1 = c0;
+ j = 0;
+ lf = b1->slen - 1;
- ii = -1;
- if (i < lf) do {
- /* Unrolled current character test */
- if (c1 != d1[i]) {
- if (c1 != d1[1+i]) {
- i += 2;
- continue;
- }
- i++;
- }
+ ii = -1;
+ if (i < lf) do {
+ /* Unrolled current character test */
+ if (c1 != d1[i]) {
+ if (c1 != d1[1+i]) {
+ i += 2;
+ continue;
+ }
+ i++;
+ }
- /* Take note if this is the start of a potential match */
- if (0 == j) ii = i;
+ /* Take note if this is the start of a potential match */
+ if (0 == j) ii = i;
- /* Shift the test character down by one */
- j++;
- i++;
+ /* Shift the test character down by one */
+ j++;
+ i++;
- /* If this isn't past the last character continue */
- if (j < ll) {
- c1 = d0[j];
- continue;
- }
+ /* If this isn't past the last character continue */
+ if (j < ll) {
+ c1 = d0[j];
+ continue;
+ }
- N0:;
+ N0:;
- /* If no characters mismatched, then we matched */
- if (i == ii+j) return ii;
+ /* If no characters mismatched, then we matched */
+ if (i == ii+j) return ii;
- /* Shift back to the beginning */
- i -= j;
- j = 0;
- c1 = c0;
- } while (i < lf);
+ /* Shift back to the beginning */
+ i -= j;
+ j = 0;
+ c1 = c0;
+ } while (i < lf);
- /* Deal with last case if unrolling caused a misalignment */
- if (i == lf && ll == j+1 && c1 == d1[i]) goto N0;
+ /* Deal with last case if unrolling caused a misalignment */
+ if (i == lf && ll == j+1 && c1 == d1[i]) goto N0;
- return BSTR_ERR;
+ return BSTR_ERR;
}
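
The loop above is a partially unrolled forward scan; from the caller's point of view binstr() simply returns the match offset or BSTR_ERR. Sketch:

    #include <stdio.h>
    #include "bstrlib.h"

    int main (void) {
        bstring hay = bfromcstr ("FLOPS_DP FLOPS_SP L2CACHE");
        bstring needle = bfromcstr ("FLOPS_SP");
        int pos = binstr (hay, 0, needle);   /* BSTR_ERR when absent */

        if (pos >= 0)
            printf ("found at offset %d\n", pos);   /* 9 */

        bdestroy (needle);
        bdestroy (hay);
        return 0;
    }
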
/* int binstrr (const_bstring b1, int pos, const_bstring b2)
@@ -1106,38 +1106,38 @@ int binstrr (const_bstring b1, int pos, const_bstring b2) {
int j, i, l;
unsigned char * d0, * d1;
- if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
- b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
- if (b1->slen == pos && b2->slen == 0) return pos;
- if (b1->slen < pos || pos < 0) return BSTR_ERR;
- if (b2->slen == 0) return pos;
+ if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+ b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+ if (b1->slen == pos && b2->slen == 0) return pos;
+ if (b1->slen < pos || pos < 0) return BSTR_ERR;
+ if (b2->slen == 0) return pos;
- /* Obvious alias case */
- if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return 0;
+ /* Obvious alias case */
+ if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return 0;
- i = pos;
- if ((l = b1->slen - b2->slen) < 0) return BSTR_ERR;
+ i = pos;
+ if ((l = b1->slen - b2->slen) < 0) return BSTR_ERR;
- /* If no space to find such a string then snap back */
- if (l + 1 <= i) i = l;
- j = 0;
+ /* If no space to find such a string then snap back */
+ if (l + 1 <= i) i = l;
+ j = 0;
- d0 = b2->data;
- d1 = b1->data;
- l = b2->slen;
+ d0 = b2->data;
+ d1 = b1->data;
+ l = b2->slen;
- for (;;) {
- if (d0[j] == d1[i + j]) {
- j ++;
- if (j >= l) return i;
- } else {
- i --;
- if (i < 0) break;
- j=0;
- }
- }
+ for (;;) {
+ if (d0[j] == d1[i + j]) {
+ j ++;
+ if (j >= l) return i;
+ } else {
+ i --;
+ if (i < 0) break;
+ j=0;
+ }
+ }
- return BSTR_ERR;
+ return BSTR_ERR;
}
/* int binstrcaseless (const_bstring b1, int pos, const_bstring b2)
@@ -1153,39 +1153,39 @@ int binstrcaseless (const_bstring b1, int pos, const_bstring b2) {
int j, i, l, ll;
unsigned char * d0, * d1;
- if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
- b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
- if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
- if (b1->slen < pos || pos < 0) return BSTR_ERR;
- if (b2->slen == 0) return pos;
+ if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+ b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+ if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
+ if (b1->slen < pos || pos < 0) return BSTR_ERR;
+ if (b2->slen == 0) return pos;
- l = b1->slen - b2->slen + 1;
+ l = b1->slen - b2->slen + 1;
- /* No space to find such a string? */
- if (l <= pos) return BSTR_ERR;
+ /* No space to find such a string? */
+ if (l <= pos) return BSTR_ERR;
- /* An obvious alias case */
- if (b1->data == b2->data && pos == 0) return BSTR_OK;
+ /* An obvious alias case */
+ if (b1->data == b2->data && pos == 0) return BSTR_OK;
- i = pos;
- j = 0;
+ i = pos;
+ j = 0;
- d0 = b2->data;
- d1 = b1->data;
- ll = b2->slen;
+ d0 = b2->data;
+ d1 = b1->data;
+ ll = b2->slen;
- for (;;) {
- if (d0[j] == d1[i + j] || downcase (d0[j]) == downcase (d1[i + j])) {
- j ++;
- if (j >= ll) return i;
- } else {
- i ++;
- if (i >= l) break;
- j=0;
- }
- }
+ for (;;) {
+ if (d0[j] == d1[i + j] || downcase (d0[j]) == downcase (d1[i + j])) {
+ j ++;
+ if (j >= ll) return i;
+ } else {
+ i ++;
+ if (i >= l) break;
+ j=0;
+ }
+ }
- return BSTR_ERR;
+ return BSTR_ERR;
}
/* int binstrrcaseless (const_bstring b1, int pos, const_bstring b2)
@@ -1201,38 +1201,38 @@ int binstrrcaseless (const_bstring b1, int pos, const_bstring b2) {
int j, i, l;
unsigned char * d0, * d1;
- if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
- b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
- if (b1->slen == pos && b2->slen == 0) return pos;
- if (b1->slen < pos || pos < 0) return BSTR_ERR;
- if (b2->slen == 0) return pos;
+ if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+ b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+ if (b1->slen == pos && b2->slen == 0) return pos;
+ if (b1->slen < pos || pos < 0) return BSTR_ERR;
+ if (b2->slen == 0) return pos;
- /* Obvious alias case */
- if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return BSTR_OK;
+ /* Obvious alias case */
+ if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return BSTR_OK;
- i = pos;
- if ((l = b1->slen - b2->slen) < 0) return BSTR_ERR;
+ i = pos;
+ if ((l = b1->slen - b2->slen) < 0) return BSTR_ERR;
- /* If no space to find such a string then snap back */
- if (l + 1 <= i) i = l;
- j = 0;
+ /* If no space to find such a string then snap back */
+ if (l + 1 <= i) i = l;
+ j = 0;
- d0 = b2->data;
- d1 = b1->data;
- l = b2->slen;
+ d0 = b2->data;
+ d1 = b1->data;
+ l = b2->slen;
- for (;;) {
- if (d0[j] == d1[i + j] || downcase (d0[j]) == downcase (d1[i + j])) {
- j ++;
- if (j >= l) return i;
- } else {
- i --;
- if (i < 0) break;
- j=0;
- }
- }
+ for (;;) {
+ if (d0[j] == d1[i + j] || downcase (d0[j]) == downcase (d1[i + j])) {
+ j ++;
+ if (j >= l) return i;
+ } else {
+ i --;
+ if (i < 0) break;
+ j=0;
+ }
+ }
- return BSTR_ERR;
+ return BSTR_ERR;
}
@@ -1244,10 +1244,10 @@ unsigned char * d0, * d1;
int bstrchrp (const_bstring b, int c, int pos) {
unsigned char * p;
- if (b == NULL || b->data == NULL || b->slen <= pos || pos < 0) return BSTR_ERR;
- p = (unsigned char *) bstr__memchr ((b->data + pos), (unsigned char) c, (b->slen - pos));
- if (p) return (int) (p - b->data);
- return BSTR_ERR;
+ if (b == NULL || b->data == NULL || b->slen <= pos || pos < 0) return BSTR_ERR;
+ p = (unsigned char *) bstr__memchr ((b->data + pos), (unsigned char) c, (b->slen - pos));
+ if (p) return (int) (p - b->data);
+ return BSTR_ERR;
}
/* int bstrrchrp (const_bstring b, int c, int pos)
@@ -1258,11 +1258,11 @@ unsigned char * p;
int bstrrchrp (const_bstring b, int c, int pos) {
int i;
- if (b == NULL || b->data == NULL || b->slen <= pos || pos < 0) return BSTR_ERR;
- for (i=pos; i >= 0; i--) {
- if (b->data[i] == (unsigned char) c) return i;
- }
- return BSTR_ERR;
+ if (b == NULL || b->data == NULL || b->slen <= pos || pos < 0) return BSTR_ERR;
+ for (i=pos; i >= 0; i--) {
+ if (b->data[i] == (unsigned char) c) return i;
+ }
+ return BSTR_ERR;
}
#if !defined (BSTRLIB_AGGRESSIVE_MEMORY_FOR_SPEED_TRADEOFF)
@@ -1274,8 +1274,8 @@ int i;
struct charField { LONG_TYPE content[CFCLEN]; };
#define testInCharField(cf,c) ((cf)->content[(c) >> LONG_LOG_BITS_QTY] & (((long)1) << ((c) & (LONG_BITS_QTY-1))))
#define setInCharField(cf,idx) { \
- unsigned int c = (unsigned int) (idx); \
- (cf)->content[c >> LONG_LOG_BITS_QTY] |= (LONG_TYPE) (1ul << (c & (LONG_BITS_QTY-1))); \
+ unsigned int c = (unsigned int) (idx); \
+ (cf)->content[c >> LONG_LOG_BITS_QTY] |= (LONG_TYPE) (1ul << (c & (LONG_BITS_QTY-1))); \
}
#else
@@ -1290,27 +1290,27 @@ struct charField { unsigned char content[CFCLEN]; };
/* Convert a bstring to charField */
static int buildCharField (struct charField * cf, const_bstring b) {
int i;
- if (b == NULL || b->data == NULL || b->slen <= 0) return BSTR_ERR;
- memset ((void *) cf->content, 0, sizeof (struct charField));
- for (i=0; i < b->slen; i++) {
- setInCharField (cf, b->data[i]);
- }
- return BSTR_OK;
+ if (b == NULL || b->data == NULL || b->slen <= 0) return BSTR_ERR;
+ memset ((void *) cf->content, 0, sizeof (struct charField));
+ for (i=0; i < b->slen; i++) {
+ setInCharField (cf, b->data[i]);
+ }
+ return BSTR_OK;
}
static void invertCharField (struct charField * cf) {
int i;
- for (i=0; i < CFCLEN; i++) cf->content[i] = ~cf->content[i];
+ for (i=0; i < CFCLEN; i++) cf->content[i] = ~cf->content[i];
}
/* Inner engine for binchr */
static int binchrCF (const unsigned char * data, int len, int pos, const struct charField * cf) {
int i;
- for (i=pos; i < len; i++) {
- unsigned char c = (unsigned char) data[i];
- if (testInCharField (cf, c)) return i;
- }
- return BSTR_ERR;
+ for (i=pos; i < len; i++) {
+ unsigned char c = (unsigned char) data[i];
+ if (testInCharField (cf, c)) return i;
+ }
+ return BSTR_ERR;
}
/* int binchr (const_bstring b0, int pos, const_bstring b1);
@@ -1321,21 +1321,21 @@ int i;
*/
int binchr (const_bstring b0, int pos, const_bstring b1) {
struct charField chrs;
- if (pos < 0 || b0 == NULL || b0->data == NULL ||
- b0->slen <= pos) return BSTR_ERR;
- if (1 == b1->slen) return bstrchrp (b0, b1->data[0], pos);
- if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
- return binchrCF (b0->data, b0->slen, pos, &chrs);
+ if (pos < 0 || b0 == NULL || b0->data == NULL ||
+ b0->slen <= pos) return BSTR_ERR;
+ if (1 == b1->slen) return bstrchrp (b0, b1->data[0], pos);
+ if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
+ return binchrCF (b0->data, b0->slen, pos, &chrs);
}
/* Inner engine for binchrr */
static int binchrrCF (const unsigned char * data, int pos, const struct charField * cf) {
int i;
- for (i=pos; i >= 0; i--) {
- unsigned int c = (unsigned int) data[i];
- if (testInCharField (cf, c)) return i;
- }
- return BSTR_ERR;
+ for (i=pos; i >= 0; i--) {
+ unsigned int c = (unsigned int) data[i];
+ if (testInCharField (cf, c)) return i;
+ }
+ return BSTR_ERR;
}
/* int binchrr (const_bstring b0, int pos, const_bstring b1);
@@ -1346,12 +1346,12 @@ int i;
*/
int binchrr (const_bstring b0, int pos, const_bstring b1) {
struct charField chrs;
- if (pos < 0 || b0 == NULL || b0->data == NULL || b1 == NULL ||
- b0->slen < pos) return BSTR_ERR;
- if (pos == b0->slen) pos--;
- if (1 == b1->slen) return bstrrchrp (b0, b1->data[0], pos);
- if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
- return binchrrCF (b0->data, pos, &chrs);
+ if (pos < 0 || b0 == NULL || b0->data == NULL || b1 == NULL ||
+ b0->slen < pos) return BSTR_ERR;
+ if (pos == b0->slen) pos--;
+ if (1 == b1->slen) return bstrrchrp (b0, b1->data[0], pos);
+ if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
+ return binchrrCF (b0->data, pos, &chrs);
}
/* int bninchr (const_bstring b0, int pos, const_bstring b1);
@@ -1362,11 +1362,11 @@ struct charField chrs;
*/
int bninchr (const_bstring b0, int pos, const_bstring b1) {
struct charField chrs;
- if (pos < 0 || b0 == NULL || b0->data == NULL ||
- b0->slen <= pos) return BSTR_ERR;
- if (buildCharField (&chrs, b1) < 0) return BSTR_ERR;
- invertCharField (&chrs);
- return binchrCF (b0->data, b0->slen, pos, &chrs);
+ if (pos < 0 || b0 == NULL || b0->data == NULL ||
+ b0->slen <= pos) return BSTR_ERR;
+ if (buildCharField (&chrs, b1) < 0) return BSTR_ERR;
+ invertCharField (&chrs);
+ return binchrCF (b0->data, b0->slen, pos, &chrs);
}
/* int bninchrr (const_bstring b0, int pos, const_bstring b1);
@@ -1377,12 +1377,12 @@ struct charField chrs;
*/
int bninchrr (const_bstring b0, int pos, const_bstring b1) {
struct charField chrs;
- if (pos < 0 || b0 == NULL || b0->data == NULL ||
- b0->slen < pos) return BSTR_ERR;
- if (pos == b0->slen) pos--;
- if (buildCharField (&chrs, b1) < 0) return BSTR_ERR;
- invertCharField (&chrs);
- return binchrrCF (b0->data, pos, &chrs);
+ if (pos < 0 || b0 == NULL || b0->data == NULL ||
+ b0->slen < pos) return BSTR_ERR;
+ if (pos == b0->slen) pos--;
+ if (buildCharField (&chrs, b1) < 0) return BSTR_ERR;
+ invertCharField (&chrs);
+ return binchrrCF (b0->data, pos, &chrs);
}
/* int bsetstr (bstring b0, int pos, bstring b1, unsigned char fill)
@@ -1397,47 +1397,47 @@ int d, newlen;
ptrdiff_t pd;
bstring aux = (bstring) b1;
- if (pos < 0 || b0 == NULL || b0->slen < 0 || NULL == b0->data ||
- b0->mlen < b0->slen || b0->mlen <= 0) return BSTR_ERR;
- if (b1 != NULL && (b1->slen < 0 || b1->data == NULL)) return BSTR_ERR;
+ if (pos < 0 || b0 == NULL || b0->slen < 0 || NULL == b0->data ||
+ b0->mlen < b0->slen || b0->mlen <= 0) return BSTR_ERR;
+ if (b1 != NULL && (b1->slen < 0 || b1->data == NULL)) return BSTR_ERR;
- d = pos;
+ d = pos;
- /* Aliasing case */
- if (NULL != aux) {
- if ((pd = (ptrdiff_t) (b1->data - b0->data)) >= 0 && pd < (ptrdiff_t) b0->mlen) {
- if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
- }
- d += aux->slen;
- }
+ /* Aliasing case */
+ if (NULL != aux) {
+ if ((pd = (ptrdiff_t) (b1->data - b0->data)) >= 0 && pd < (ptrdiff_t) b0->mlen) {
+ if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
+ }
+ d += aux->slen;
+ }
- /* Increase memory size if necessary */
- if (balloc (b0, d + 1) != BSTR_OK) {
- if (aux != b1) bdestroy (aux);
- return BSTR_ERR;
- }
+ /* Increase memory size if necessary */
+ if (balloc (b0, d + 1) != BSTR_OK) {
+ if (aux != b1) bdestroy (aux);
+ return BSTR_ERR;
+ }
- newlen = b0->slen;
+ newlen = b0->slen;
- /* Fill in "fill" character as necessary */
- if (pos > newlen) {
- bstr__memset (b0->data + b0->slen, (int) fill, (size_t) (pos - b0->slen));
- newlen = pos;
- }
+ /* Fill in "fill" character as necessary */
+ if (pos > newlen) {
+ bstr__memset (b0->data + b0->slen, (int) fill, (size_t) (pos - b0->slen));
+ newlen = pos;
+ }
- /* Copy b1 to position pos in b0. */
- if (aux != NULL) {
- bBlockCopy ((char *) (b0->data + pos), (char *) aux->data, aux->slen);
- if (aux != b1) bdestroy (aux);
- }
+ /* Copy b1 to position pos in b0. */
+ if (aux != NULL) {
+ bBlockCopy ((char *) (b0->data + pos), (char *) aux->data, aux->slen);
+ if (aux != b1) bdestroy (aux);
+ }
- /* Indicate the potentially increased size of b0 */
- if (d > newlen) newlen = d;
+ /* Indicate the potentially increased size of b0 */
+ if (d > newlen) newlen = d;
- b0->slen = newlen;
- b0->data[newlen] = (unsigned char) '\0';
+ b0->slen = newlen;
+ b0->data[newlen] = (unsigned char) '\0';
- return BSTR_OK;
+ return BSTR_OK;
}
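
bsetstr() writes b1 at pos and, when pos lies past the current end, pads the gap with the fill character first (a NULL b1 gives pure padding). Sketch:

    #include <stdio.h>
    #include "bstrlib.h"

    int main (void) {
        bstring b = bfromcstr ("core");
        bstring tag = bfromcstr ("0");

        /* pad with '.' up to offset 8, then write "0" there -> "core....0" */
        if (bsetstr (b, 8, tag, '.') == BSTR_OK)
            printf ("%s\n", (char *) bdata (b));

        bdestroy (tag);
        bdestroy (b);
        return 0;
    }
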
/* int binsert (bstring b1, int pos, bstring b2, unsigned char fill)
@@ -1452,40 +1452,40 @@ int d, l;
ptrdiff_t pd;
bstring aux = (bstring) b2;
- if (pos < 0 || b1 == NULL || b2 == NULL || b1->slen < 0 ||
- b2->slen < 0 || b1->mlen < b1->slen || b1->mlen <= 0) return BSTR_ERR;
-
- /* Aliasing case */
- if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->mlen) {
- if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
- }
-
- /* Compute the two possible end pointers */
- d = b1->slen + aux->slen;
- l = pos + aux->slen;
- if ((d|l) < 0) return BSTR_ERR;
-
- if (l > d) {
- /* Inserting past the end of the string */
- if (balloc (b1, l + 1) != BSTR_OK) {
- if (aux != b2) bdestroy (aux);
- return BSTR_ERR;
- }
- bstr__memset (b1->data + b1->slen, (int) fill, (size_t) (pos - b1->slen));
- b1->slen = l;
- } else {
- /* Inserting in the middle of the string */
- if (balloc (b1, d + 1) != BSTR_OK) {
- if (aux != b2) bdestroy (aux);
- return BSTR_ERR;
- }
- bBlockCopy (b1->data + l, b1->data + pos, d - l);
- b1->slen = d;
- }
- bBlockCopy (b1->data + pos, aux->data, aux->slen);
- b1->data[b1->slen] = (unsigned char) '\0';
- if (aux != b2) bdestroy (aux);
- return BSTR_OK;
+ if (pos < 0 || b1 == NULL || b2 == NULL || b1->slen < 0 ||
+ b2->slen < 0 || b1->mlen < b1->slen || b1->mlen <= 0) return BSTR_ERR;
+
+ /* Aliasing case */
+ if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->mlen) {
+ if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
+ }
+
+ /* Compute the two possible end pointers */
+ d = b1->slen + aux->slen;
+ l = pos + aux->slen;
+ if ((d|l) < 0) return BSTR_ERR;
+
+ if (l > d) {
+ /* Inserting past the end of the string */
+ if (balloc (b1, l + 1) != BSTR_OK) {
+ if (aux != b2) bdestroy (aux);
+ return BSTR_ERR;
+ }
+ bstr__memset (b1->data + b1->slen, (int) fill, (size_t) (pos - b1->slen));
+ b1->slen = l;
+ } else {
+ /* Inserting in the middle of the string */
+ if (balloc (b1, d + 1) != BSTR_OK) {
+ if (aux != b2) bdestroy (aux);
+ return BSTR_ERR;
+ }
+ bBlockCopy (b1->data + l, b1->data + pos, d - l);
+ b1->slen = d;
+ }
+ bBlockCopy (b1->data + pos, aux->data, aux->slen);
+ b1->data[b1->slen] = (unsigned char) '\0';
+ if (aux != b2) bdestroy (aux);
+ return BSTR_OK;
}
/* int breplace (bstring b1, int pos, int len, bstring b2,
@@ -1495,44 +1495,44 @@ bstring aux = (bstring) b2;
* fill is used if pos > b1->slen.
*/
int breplace (bstring b1, int pos, int len, const_bstring b2,
- unsigned char fill) {
+ unsigned char fill) {
int pl, ret;
ptrdiff_t pd;
bstring aux = (bstring) b2;
- if (pos < 0 || len < 0 || (pl = pos + len) < 0 || b1 == NULL ||
- b2 == NULL || b1->data == NULL || b2->data == NULL ||
- b1->slen < 0 || b2->slen < 0 || b1->mlen < b1->slen ||
- b1->mlen <= 0) return BSTR_ERR;
-
- /* Straddles the end? */
- if (pl >= b1->slen) {
- if ((ret = bsetstr (b1, pos, b2, fill)) < 0) return ret;
- if (pos + b2->slen < b1->slen) {
- b1->slen = pos + b2->slen;
- b1->data[b1->slen] = (unsigned char) '\0';
- }
- return ret;
- }
-
- /* Aliasing case */
- if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->slen) {
- if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
- }
-
- if (aux->slen > len) {
- if (balloc (b1, b1->slen + aux->slen - len) != BSTR_OK) {
- if (aux != b2) bdestroy (aux);
- return BSTR_ERR;
- }
- }
-
- if (aux->slen != len) bstr__memmove (b1->data + pos + aux->slen, b1->data + pos + len, b1->slen - (pos + len));
- bstr__memcpy (b1->data + pos, aux->data, aux->slen);
- b1->slen += aux->slen - len;
- b1->data[b1->slen] = (unsigned char) '\0';
- if (aux != b2) bdestroy (aux);
- return BSTR_OK;
+ if (pos < 0 || len < 0 || (pl = pos + len) < 0 || b1 == NULL ||
+ b2 == NULL || b1->data == NULL || b2->data == NULL ||
+ b1->slen < 0 || b2->slen < 0 || b1->mlen < b1->slen ||
+ b1->mlen <= 0) return BSTR_ERR;
+
+ /* Straddles the end? */
+ if (pl >= b1->slen) {
+ if ((ret = bsetstr (b1, pos, b2, fill)) < 0) return ret;
+ if (pos + b2->slen < b1->slen) {
+ b1->slen = pos + b2->slen;
+ b1->data[b1->slen] = (unsigned char) '\0';
+ }
+ return ret;
+ }
+
+ /* Aliasing case */
+ if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->slen) {
+ if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
+ }
+
+ if (aux->slen > len) {
+ if (balloc (b1, b1->slen + aux->slen - len) != BSTR_OK) {
+ if (aux != b2) bdestroy (aux);
+ return BSTR_ERR;
+ }
+ }
+
+ if (aux->slen != len) bstr__memmove (b1->data + pos + aux->slen, b1->data + pos + len, b1->slen - (pos + len));
+ bstr__memcpy (b1->data + pos, aux->data, aux->slen);
+ b1->slen += aux->slen - len;
+ b1->data[b1->slen] = (unsigned char) '\0';
+ if (aux != b2) bdestroy (aux);
+ return BSTR_OK;
}
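
breplace() splices b2 over the range [pos, pos+len), covering the aliasing and end-straddling cases shown above. Sketch:

    #include <stdio.h>
    #include "bstrlib.h"

    int main (void) {
        bstring b = bfromcstr ("likwid-topology");
        bstring rep = bfromcstr ("pin");

        /* replace the 8 characters of "topology" with "pin" */
        if (breplace (b, 7, 8, rep, ' ') == BSTR_OK)
            printf ("%s\n", (char *) bdata (b));   /* "likwid-pin" */

        bdestroy (rep);
        bdestroy (b);
        return 0;
    }
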
/* int bfindreplace (bstring b, const_bstring find, const_bstring repl,
@@ -1552,123 +1552,123 @@ ptrdiff_t pd;
bstring auxf = (bstring) find;
bstring auxr = (bstring) repl;
- if (b == NULL || b->data == NULL || find == NULL ||
- find->data == NULL || repl == NULL || repl->data == NULL ||
- pos < 0 || find->slen <= 0 || b->mlen < 0 || b->slen > b->mlen ||
- b->mlen <= 0 || b->slen < 0 || repl->slen < 0) return BSTR_ERR;
- if (pos > b->slen - find->slen) return BSTR_OK;
-
- /* Alias with find string */
- pd = (ptrdiff_t) (find->data - b->data);
- if ((ptrdiff_t) (pos - find->slen) < pd && pd < (ptrdiff_t) b->slen) {
- if (NULL == (auxf = bstrcpy (find))) return BSTR_ERR;
- }
-
- /* Alias with repl string */
- pd = (ptrdiff_t) (repl->data - b->data);
- if ((ptrdiff_t) (pos - repl->slen) < pd && pd < (ptrdiff_t) b->slen) {
- if (NULL == (auxr = bstrcpy (repl))) {
- if (auxf != find) bdestroy (auxf);
- return BSTR_ERR;
- }
- }
-
- delta = auxf->slen - auxr->slen;
-
- /* in-place replacement since find and replace strings are of equal
- length */
- if (delta == 0) {
- while ((pos = instr (b, pos, auxf)) >= 0) {
- bstr__memcpy (b->data + pos, auxr->data, auxr->slen);
- pos += auxf->slen;
- }
- if (auxf != find) bdestroy (auxf);
- if (auxr != repl) bdestroy (auxr);
- return BSTR_OK;
- }
-
- /* shrinking replacement since auxf->slen > auxr->slen */
- if (delta > 0) {
- acc = 0;
-
- while ((i = instr (b, pos, auxf)) >= 0) {
- if (acc && i > pos)
- bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
- if (auxr->slen)
- bstr__memcpy (b->data + i - acc, auxr->data, auxr->slen);
- acc += delta;
- pos = i + auxf->slen;
- }
-
- if (acc) {
- i = b->slen;
- if (i > pos)
- bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
- b->slen -= acc;
- b->data[b->slen] = (unsigned char) '\0';
- }
-
- if (auxf != find) bdestroy (auxf);
- if (auxr != repl) bdestroy (auxr);
- return BSTR_OK;
- }
-
- /* expanding replacement since find->slen < repl->slen. Its a lot
- more complicated. */
-
- mlen = 32;
- d = (int *) static_d; /* Avoid malloc for trivial cases */
- acc = slen = 0;
-
- while ((pos = instr (b, pos, auxf)) >= 0) {
- if (slen + 1 >= mlen) {
- int sl;
- int * t;
- mlen += mlen;
- sl = sizeof (int *) * mlen;
- if (static_d == d) d = NULL;
- if (sl < mlen || NULL == (t = (int *) bstr__realloc (d, sl))) {
- ret = BSTR_ERR;
- goto done;
- }
- if (NULL == d) bstr__memcpy (t, static_d, sizeof (static_d));
- d = t;
- }
- d[slen] = pos;
- slen++;
- acc -= delta;
- pos += auxf->slen;
- if (pos < 0 || acc < 0) {
- ret = BSTR_ERR;
- goto done;
- }
- }
- d[slen] = b->slen;
-
- if (BSTR_OK == (ret = balloc (b, b->slen + acc + 1))) {
- b->slen += acc;
- for (i = slen-1; i >= 0; i--) {
- int s, l;
- s = d[i] + auxf->slen;
- l = d[i+1] - s;
- if (l) {
- bstr__memmove (b->data + s + acc, b->data + s, l);
- }
- if (auxr->slen) {
- bstr__memmove (b->data + s + acc - auxr->slen,
- auxr->data, auxr->slen);
- }
- acc += delta;
- }
- b->data[b->slen] = (unsigned char) '\0';
- }
-
- done:;
- if (static_d == d) d = NULL;
- bstr__free (d);
- if (auxf != find) bdestroy (auxf);
- if (auxr != repl) bdestroy (auxr);
- return ret;
+ if (b == NULL || b->data == NULL || find == NULL ||
+ find->data == NULL || repl == NULL || repl->data == NULL ||
+ pos < 0 || find->slen <= 0 || b->mlen < 0 || b->slen > b->mlen ||
+ b->mlen <= 0 || b->slen < 0 || repl->slen < 0) return BSTR_ERR;
+ if (pos > b->slen - find->slen) return BSTR_OK;
+
+ /* Alias with find string */
+ pd = (ptrdiff_t) (find->data - b->data);
+ if ((ptrdiff_t) (pos - find->slen) < pd && pd < (ptrdiff_t) b->slen) {
+ if (NULL == (auxf = bstrcpy (find))) return BSTR_ERR;
+ }
+
+ /* Alias with repl string */
+ pd = (ptrdiff_t) (repl->data - b->data);
+ if ((ptrdiff_t) (pos - repl->slen) < pd && pd < (ptrdiff_t) b->slen) {
+ if (NULL == (auxr = bstrcpy (repl))) {
+ if (auxf != find) bdestroy (auxf);
+ return BSTR_ERR;
+ }
+ }
+
+ delta = auxf->slen - auxr->slen;
+
+ /* in-place replacement since find and replace strings are of equal
+ length */
+ if (delta == 0) {
+ while ((pos = instr (b, pos, auxf)) >= 0) {
+ bstr__memcpy (b->data + pos, auxr->data, auxr->slen);
+ pos += auxf->slen;
+ }
+ if (auxf != find) bdestroy (auxf);
+ if (auxr != repl) bdestroy (auxr);
+ return BSTR_OK;
+ }
+
+ /* shrinking replacement since auxf->slen > auxr->slen */
+ if (delta > 0) {
+ acc = 0;
+
+ while ((i = instr (b, pos, auxf)) >= 0) {
+ if (acc && i > pos)
+ bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
+ if (auxr->slen)
+ bstr__memcpy (b->data + i - acc, auxr->data, auxr->slen);
+ acc += delta;
+ pos = i + auxf->slen;
+ }
+
+ if (acc) {
+ i = b->slen;
+ if (i > pos)
+ bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
+ b->slen -= acc;
+ b->data[b->slen] = (unsigned char) '\0';
+ }
+
+ if (auxf != find) bdestroy (auxf);
+ if (auxr != repl) bdestroy (auxr);
+ return BSTR_OK;
+ }
+
+ /* expanding replacement since find->slen < repl->slen. It's a lot
+ more complicated. */
+
+ mlen = 32;
+ d = (int *) static_d; /* Avoid malloc for trivial cases */
+ acc = slen = 0;
+
+ while ((pos = instr (b, pos, auxf)) >= 0) {
+ if (slen + 1 >= mlen) {
+ int sl;
+ int * t;
+ mlen += mlen;
+ sl = sizeof (int *) * mlen;
+ if (static_d == d) d = NULL;
+ if (sl < mlen || NULL == (t = (int *) bstr__realloc (d, sl))) {
+ ret = BSTR_ERR;
+ goto done;
+ }
+ if (NULL == d) bstr__memcpy (t, static_d, sizeof (static_d));
+ d = t;
+ }
+ d[slen] = pos;
+ slen++;
+ acc -= delta;
+ pos += auxf->slen;
+ if (pos < 0 || acc < 0) {
+ ret = BSTR_ERR;
+ goto done;
+ }
+ }
+ d[slen] = b->slen;
+
+ if (BSTR_OK == (ret = balloc (b, b->slen + acc + 1))) {
+ b->slen += acc;
+ for (i = slen-1; i >= 0; i--) {
+ int s, l;
+ s = d[i] + auxf->slen;
+ l = d[i+1] - s;
+ if (l) {
+ bstr__memmove (b->data + s + acc, b->data + s, l);
+ }
+ if (auxr->slen) {
+ bstr__memmove (b->data + s + acc - auxr->slen,
+ auxr->data, auxr->slen);
+ }
+ acc += delta;
+ }
+ b->data[b->slen] = (unsigned char) '\0';
+ }
+
+ done:;
+ if (static_d == d) d = NULL;
+ bstr__free (d);
+ if (auxf != find) bdestroy (auxf);
+ if (auxr != repl) bdestroy (auxr);
+ return ret;
}
/* int bfindreplace (bstring b, const_bstring find, const_bstring repl,
@@ -1678,7 +1678,7 @@ bstring auxr = (bstring) repl;
* given point in a bstring.
*/
int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos) {
- return findreplaceengine (b, find, repl, pos, binstr);
+ return findreplaceengine (b, find, repl, pos, binstr);
}
/* int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl,
@@ -1688,7 +1688,7 @@ int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos) {
* string after a given point in a bstring.
*/
int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, int pos) {
- return findreplaceengine (b, find, repl, pos, binstrcaseless);
+ return findreplaceengine (b, find, repl, pos, binstrcaseless);
}
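
findreplaceengine() above treats the equal, shrinking, and expanding size relations separately; the two wrappers only pick the matcher. Sketch of a typical expanding replacement:

    #include <stdio.h>
    #include "bstrlib.h"

    int main (void) {
        bstring b = bfromcstr ("CPU 0, CPU 1, CPU 2");
        bstring find = bfromcstr ("CPU");
        bstring repl = bfromcstr ("core");

        /* expanding replacement of every occurrence from offset 0 on */
        if (bfindreplace (b, find, repl, 0) == BSTR_OK)
            printf ("%s\n", (char *) bdata (b));   /* "core 0, core 1, core 2" */

        bdestroy (repl);
        bdestroy (find);
        bdestroy (b);
        return 0;
    }
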
/* int binsertch (bstring b, int pos, int len, unsigned char fill)
@@ -1701,31 +1701,31 @@ int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, int
int binsertch (bstring b, int pos, int len, unsigned char fill) {
int d, l, i;
- if (pos < 0 || b == NULL || b->slen < 0 || b->mlen < b->slen ||
- b->mlen <= 0 || len < 0) return BSTR_ERR;
-
- /* Compute the two possible end pointers */
- d = b->slen + len;
- l = pos + len;
- if ((d|l) < 0) return BSTR_ERR;
-
- if (l > d) {
- /* Inserting past the end of the string */
- if (balloc (b, l + 1) != BSTR_OK) return BSTR_ERR;
- pos = b->slen;
- b->slen = l;
- } else {
- /* Inserting in the middle of the string */
- if (balloc (b, d + 1) != BSTR_OK) return BSTR_ERR;
- for (i = d - 1; i >= l; i--) {
- b->data[i] = b->data[i - len];
- }
- b->slen = d;
- }
-
- for (i=pos; i < l; i++) b->data[i] = fill;
- b->data[b->slen] = (unsigned char) '\0';
- return BSTR_OK;
+ if (pos < 0 || b == NULL || b->slen < 0 || b->mlen < b->slen ||
+ b->mlen <= 0 || len < 0) return BSTR_ERR;
+
+ /* Compute the two possible end pointers */
+ d = b->slen + len;
+ l = pos + len;
+ if ((d|l) < 0) return BSTR_ERR;
+
+ if (l > d) {
+ /* Inserting past the end of the string */
+ if (balloc (b, l + 1) != BSTR_OK) return BSTR_ERR;
+ pos = b->slen;
+ b->slen = l;
+ } else {
+ /* Inserting in the middle of the string */
+ if (balloc (b, d + 1) != BSTR_OK) return BSTR_ERR;
+ for (i = d - 1; i >= l; i--) {
+ b->data[i] = b->data[i - len];
+ }
+ b->slen = d;
+ }
+
+ for (i=pos; i < l; i++) b->data[i] = fill;
+ b->data[b->slen] = (unsigned char) '\0';
+ return BSTR_OK;
}
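
binsertch() inserts len copies of fill at pos, growing the string when pos is beyond the current end. Sketch:

    #include <stdio.h>
    #include "bstrlib.h"

    int main (void) {
        bstring b = bfromcstr ("S0S1");

        /* insert three '-' characters between the two socket tags */
        if (binsertch (b, 2, 3, '-') == BSTR_OK)
            printf ("%s\n", (char *) bdata (b));   /* "S0---S1" */

        bdestroy (b);
        return 0;
    }
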
/* int bpattern (bstring b, int len)
@@ -1738,15 +1738,15 @@ int d, l, i;
int bpattern (bstring b, int len) {
int i, d;
- d = blength (b);
- if (d <= 0 || len < 0 || balloc (b, len + 1) != BSTR_OK) return BSTR_ERR;
- if (len > 0) {
- if (d == 1) return bsetstr (b, len, NULL, b->data[0]);
- for (i = d; i < len; i++) b->data[i] = b->data[i - d];
- }
- b->data[len] = (unsigned char) '\0';
- b->slen = len;
- return BSTR_OK;
+ d = blength (b);
+ if (d <= 0 || len < 0 || balloc (b, len + 1) != BSTR_OK) return BSTR_ERR;
+ if (len > 0) {
+ if (d == 1) return bsetstr (b, len, NULL, b->data[0]);
+ for (i = d; i < len; i++) b->data[i] = b->data[i - d];
+ }
+ b->data[len] = (unsigned char) '\0';
+ b->slen = len;
+ return BSTR_OK;
}
#define BS_BUFF_SZ (1024)
@@ -1760,20 +1760,20 @@ int i, d;
int breada (bstring b, bNread readPtr, void * parm) {
int i, l, n;
- if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
- b->mlen <= 0 || readPtr == NULL) return BSTR_ERR;
+ if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
+ b->mlen <= 0 || readPtr == NULL) return BSTR_ERR;
- i = b->slen;
- for (n=i+16; ; n += ((n < BS_BUFF_SZ) ? n : BS_BUFF_SZ)) {
- if (BSTR_OK != balloc (b, n + 1)) return BSTR_ERR;
- l = (int) readPtr ((void *) (b->data + i), 1, n - i, parm);
- i += l;
- b->slen = i;
- if (i < n) break;
- }
+ i = b->slen;
+ for (n=i+16; ; n += ((n < BS_BUFF_SZ) ? n : BS_BUFF_SZ)) {
+ if (BSTR_OK != balloc (b, n + 1)) return BSTR_ERR;
+ l = (int) readPtr ((void *) (b->data + i), 1, n - i, parm);
+ i += l;
+ b->slen = i;
+ if (i < n) break;
+ }
- b->data[i] = (unsigned char) '\0';
- return BSTR_OK;
+ b->data[i] = (unsigned char) '\0';
+ return BSTR_OK;
}
/* bstring bread (bNread readPtr, void * parm)
@@ -1785,11 +1785,11 @@ int i, l, n;
bstring bread (bNread readPtr, void * parm) {
bstring buff;
- if (0 > breada (buff = bfromcstr (""), readPtr, parm)) {
- bdestroy (buff);
- return NULL;
- }
- return buff;
+ if (0 > breada (buff = bfromcstr (""), readPtr, parm)) {
+ bdestroy (buff);
+ return NULL;
+ }
+ return buff;
}
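
breada()/bread() pull data through a bNread callback whose shape matches fread(), so an open FILE * can be drained directly; the (bNread) cast below is the usual bstrlib idiom and an assumption here, not something this patch adds. Sketch:

    #include <stdio.h>
    #include "bstrlib.h"

    int main (void) {
        FILE * fp = fopen ("/proc/cpuinfo", "r");
        bstring whole;

        if (fp == NULL) return 1;

        /* fread() already has the bNread shape: (buf, elsize, nelem, parm) */
        whole = bread ((bNread) fread, fp);
        fclose (fp);

        if (whole != NULL) {
            printf ("read %d bytes\n", blength (whole));
            bdestroy (whole);
        }
        return 0;
    }
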
/* int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator)
@@ -1808,26 +1808,26 @@ bstring buff;
int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator) {
int c, d, e;
- if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
- b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
- d = 0;
- e = b->mlen - 2;
+ if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
+ b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
+ d = 0;
+ e = b->mlen - 2;
- while ((c = getcPtr (parm)) >= 0) {
- if (d > e) {
- b->slen = d;
- if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
- e = b->mlen - 2;
- }
- b->data[d] = (unsigned char) c;
- d++;
- if (c == terminator) break;
- }
+ while ((c = getcPtr (parm)) >= 0) {
+ if (d > e) {
+ b->slen = d;
+ if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
+ e = b->mlen - 2;
+ }
+ b->data[d] = (unsigned char) c;
+ d++;
+ if (c == terminator) break;
+ }
- b->data[d] = (unsigned char) '\0';
- b->slen = d;
+ b->data[d] = (unsigned char) '\0';
+ b->slen = d;
- return d == 0 && c < 0;
+ return d == 0 && c < 0;
}
/* int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator)
@@ -1846,26 +1846,26 @@ int c, d, e;
int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator) {
int c, d, e;
- if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
- b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
- d = b->slen;
- e = b->mlen - 2;
+ if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
+ b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
+ d = b->slen;
+ e = b->mlen - 2;
- while ((c = getcPtr (parm)) >= 0) {
- if (d > e) {
- b->slen = d;
- if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
- e = b->mlen - 2;
- }
- b->data[d] = (unsigned char) c;
- d++;
- if (c == terminator) break;
- }
+ while ((c = getcPtr (parm)) >= 0) {
+ if (d > e) {
+ b->slen = d;
+ if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
+ e = b->mlen - 2;
+ }
+ b->data[d] = (unsigned char) c;
+ d++;
+ if (c == terminator) break;
+ }
- b->data[d] = (unsigned char) '\0';
- b->slen = d;
+ b->data[d] = (unsigned char) '\0';
+ b->slen = d;
- return d == 0 && c < 0;
+ return d == 0 && c < 0;
}
/* bstring bgets (bNgetc getcPtr, void * parm, char terminator)
@@ -1882,19 +1882,19 @@ int c, d, e;
bstring bgets (bNgetc getcPtr, void * parm, char terminator) {
bstring buff;
- if (0 > bgetsa (buff = bfromcstr (""), getcPtr, parm, terminator) || 0 >= buff->slen) {
- bdestroy (buff);
- buff = NULL;
- }
- return buff;
+ if (0 > bgetsa (buff = bfromcstr (""), getcPtr, parm, terminator) || 0 >= buff->slen) {
+ bdestroy (buff);
+ buff = NULL;
+ }
+ return buff;
}
struct bStream {
- bstring buff; /* Buffer for over-reads */
- void * parm; /* The stream handle for core stream */
- bNread readFnPtr; /* fread compatible fnptr for core stream */
- int isEOF; /* track file's EOF state */
- int maxBuffSz;
+ bstring buff; /* Buffer for over-reads */
+ void * parm; /* The stream handle for core stream */
+ bNread readFnPtr; /* fread compatible fnptr for core stream */
+ int isEOF; /* track file's EOF state */
+ int maxBuffSz;
};
/* struct bStream * bsopen (bNread readPtr, void * parm)
@@ -1906,15 +1906,15 @@ struct bStream {
struct bStream * bsopen (bNread readPtr, void * parm) {
struct bStream * s;
- if (readPtr == NULL) return NULL;
- s = (struct bStream *) bstr__alloc (sizeof (struct bStream));
- if (s == NULL) return NULL;
- s->parm = parm;
- s->buff = bfromcstr ("");
- s->readFnPtr = readPtr;
- s->maxBuffSz = BS_BUFF_SZ;
- s->isEOF = 0;
- return s;
+ if (readPtr == NULL) return NULL;
+ s = (struct bStream *) bstr__alloc (sizeof (struct bStream));
+ if (s == NULL) return NULL;
+ s->parm = parm;
+ s->buff = bfromcstr ("");
+ s->readFnPtr = readPtr;
+ s->maxBuffSz = BS_BUFF_SZ;
+ s->isEOF = 0;
+ return s;
}
/* int bsbufflength (struct bStream * s, int sz)
@@ -1924,15 +1924,15 @@ struct bStream * s;
*/
int bsbufflength (struct bStream * s, int sz) {
int oldSz;
- if (s == NULL || sz < 0) return BSTR_ERR;
- oldSz = s->maxBuffSz;
- if (sz > 0) s->maxBuffSz = sz;
- return oldSz;
+ if (s == NULL || sz < 0) return BSTR_ERR;
+ oldSz = s->maxBuffSz;
+ if (sz > 0) s->maxBuffSz = sz;
+ return oldSz;
}
int bseof (const struct bStream * s) {
- if (s == NULL || s->readFnPtr == NULL) return BSTR_ERR;
- return s->isEOF && (s->buff->slen == 0);
+ if (s == NULL || s->readFnPtr == NULL) return BSTR_ERR;
+ return s->isEOF && (s->buff->slen == 0);
}
/* void * bsclose (struct bStream * s)
@@ -1942,15 +1942,15 @@ int bseof (const struct bStream * s) {
*/
void * bsclose (struct bStream * s) {
void * parm;
- if (s == NULL) return NULL;
- s->readFnPtr = NULL;
- if (s->buff) bdestroy (s->buff);
- s->buff = NULL;
- parm = s->parm;
- s->parm = NULL;
- s->isEOF = 1;
- bstr__free (s);
- return parm;
+ if (s == NULL) return NULL;
+ s->readFnPtr = NULL;
+ if (s->buff) bdestroy (s->buff);
+ s->buff = NULL;
+ parm = s->parm;
+ s->parm = NULL;
+ s->isEOF = 1;
+ bstr__free (s);
+ return parm;
}
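
A bStream couples such a callback with a small pushback buffer; bsopen() never takes ownership of the underlying handle, and bsclose() returns it so the caller can release it. Lifecycle sketch (include name and the fread idiom assumed):

    #include <stdio.h>
    #include "bstrlib.h"

    int main (void) {
        FILE * fp = fopen ("/etc/hostname", "r");
        struct bStream * s;

        if (fp == NULL) return 1;

        s = bsopen ((bNread) fread, fp);      /* stream over the FILE *   */
        if (s != NULL) {
            /* ... bsreadln()/bsread() calls would go here ... */
            fp = (FILE *) bsclose (s);        /* hand the raw handle back */
        }
        fclose (fp);
        return 0;
    }
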
/* int bsreadlna (bstring r, struct bStream * s, char terminator)
@@ -1965,56 +1965,56 @@ int i, l, ret, rlo;
char * b;
struct tagbstring x;
- if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0 ||
- r->slen < 0 || r->mlen < r->slen) return BSTR_ERR;
- l = s->buff->slen;
- if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
- b = (char *) s->buff->data;
- x.data = (unsigned char *) b;
-
- /* First check if the current buffer holds the terminator */
- b[l] = terminator; /* Set sentinel */
- for (i=0; b[i] != terminator; i++) ;
- if (i < l) {
- x.slen = i + 1;
- ret = bconcat (r, &x);
- s->buff->slen = l;
- if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
- return BSTR_OK;
- }
-
- rlo = r->slen;
-
- /* If not then just concatenate the entire buffer to the output */
- x.slen = l;
- if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
-
- /* Perform direct in-place reads into the destination to allow for
- the minimum of data-copies */
- for (;;) {
- if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
- b = (char *) (r->data + r->slen);
- l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
- if (l <= 0) {
- r->data[r->slen] = (unsigned char) '\0';
- s->buff->slen = 0;
- s->isEOF = 1;
- /* If nothing was read return with an error message */
- return BSTR_ERR & -(r->slen == rlo);
- }
- b[l] = terminator; /* Set sentinel */
- for (i=0; b[i] != terminator; i++) ;
- if (i < l) break;
- r->slen += l;
- }
-
- /* Terminator found, push over-read back to buffer */
- i++;
- r->slen += i;
- s->buff->slen = l - i;
- bstr__memcpy (s->buff->data, b + i, l - i);
- r->data[r->slen] = (unsigned char) '\0';
- return BSTR_OK;
+ if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0 ||
+ r->slen < 0 || r->mlen < r->slen) return BSTR_ERR;
+ l = s->buff->slen;
+ if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+ b = (char *) s->buff->data;
+ x.data = (unsigned char *) b;
+
+ /* First check if the current buffer holds the terminator */
+ b[l] = terminator; /* Set sentinel */
+ for (i=0; b[i] != terminator; i++) ;
+ if (i < l) {
+ x.slen = i + 1;
+ ret = bconcat (r, &x);
+ s->buff->slen = l;
+ if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
+ return BSTR_OK;
+ }
+
+ rlo = r->slen;
+
+ /* If not then just concatenate the entire buffer to the output */
+ x.slen = l;
+ if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
+
+ /* Perform direct in-place reads into the destination to allow for
+ the minimum of data-copies */
+ for (;;) {
+ if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
+ b = (char *) (r->data + r->slen);
+ l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
+ if (l <= 0) {
+ r->data[r->slen] = (unsigned char) '\0';
+ s->buff->slen = 0;
+ s->isEOF = 1;
+ /* If nothing was read return with an error message */
+ return BSTR_ERR & -(r->slen == rlo);
+ }
+ b[l] = terminator; /* Set sentinel */
+ for (i=0; b[i] != terminator; i++) ;
+ if (i < l) break;
+ r->slen += l;
+ }
+
+ /* Terminator found, push over-read back to buffer */
+ i++;
+ r->slen += i;
+ s->buff->slen = l - i;
+ bstr__memcpy (s->buff->data, b + i, l - i);
+ r->data[r->slen] = (unsigned char) '\0';
+ return BSTR_OK;
}
/* int bsreadlnsa (bstring r, struct bStream * s, bstring term)
@@ -2030,61 +2030,61 @@ unsigned char * b;
struct tagbstring x;
struct charField cf;
- if (s == NULL || s->buff == NULL || r == NULL || term == NULL ||
- term->data == NULL || r->mlen <= 0 || r->slen < 0 ||
- r->mlen < r->slen) return BSTR_ERR;
- if (term->slen == 1) return bsreadlna (r, s, term->data[0]);
- if (term->slen < 1 || buildCharField (&cf, term)) return BSTR_ERR;
-
- l = s->buff->slen;
- if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
- b = (unsigned char *) s->buff->data;
- x.data = b;
-
- /* First check if the current buffer holds the terminator */
- b[l] = term->data[0]; /* Set sentinel */
- for (i=0; !testInCharField (&cf, b[i]); i++) ;
- if (i < l) {
- x.slen = i + 1;
- ret = bconcat (r, &x);
- s->buff->slen = l;
- if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
- return BSTR_OK;
- }
-
- rlo = r->slen;
-
- /* If not then just concatenate the entire buffer to the output */
- x.slen = l;
- if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
-
- /* Perform direct in-place reads into the destination to allow for
- the minimum of data-copies */
- for (;;) {
- if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
- b = (unsigned char *) (r->data + r->slen);
- l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
- if (l <= 0) {
- r->data[r->slen] = (unsigned char) '\0';
- s->buff->slen = 0;
- s->isEOF = 1;
- /* If nothing was read return with an error message */
- return BSTR_ERR & -(r->slen == rlo);
- }
-
- b[l] = term->data[0]; /* Set sentinel */
- for (i=0; !testInCharField (&cf, b[i]); i++) ;
- if (i < l) break;
- r->slen += l;
- }
-
- /* Terminator found, push over-read back to buffer */
- i++;
- r->slen += i;
- s->buff->slen = l - i;
- bstr__memcpy (s->buff->data, b + i, l - i);
- r->data[r->slen] = (unsigned char) '\0';
- return BSTR_OK;
+ if (s == NULL || s->buff == NULL || r == NULL || term == NULL ||
+ term->data == NULL || r->mlen <= 0 || r->slen < 0 ||
+ r->mlen < r->slen) return BSTR_ERR;
+ if (term->slen == 1) return bsreadlna (r, s, term->data[0]);
+ if (term->slen < 1 || buildCharField (&cf, term)) return BSTR_ERR;
+
+ l = s->buff->slen;
+ if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+ b = (unsigned char *) s->buff->data;
+ x.data = b;
+
+ /* First check if the current buffer holds the terminator */
+ b[l] = term->data[0]; /* Set sentinel */
+ for (i=0; !testInCharField (&cf, b[i]); i++) ;
+ if (i < l) {
+ x.slen = i + 1;
+ ret = bconcat (r, &x);
+ s->buff->slen = l;
+ if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
+ return BSTR_OK;
+ }
+
+ rlo = r->slen;
+
+ /* If not then just concatenate the entire buffer to the output */
+ x.slen = l;
+ if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
+
+ /* Perform direct in-place reads into the destination to allow for
+ the minimum of data-copies */
+ for (;;) {
+ if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
+ b = (unsigned char *) (r->data + r->slen);
+ l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
+ if (l <= 0) {
+ r->data[r->slen] = (unsigned char) '\0';
+ s->buff->slen = 0;
+ s->isEOF = 1;
+ /* If nothing was read return with an error message */
+ return BSTR_ERR & -(r->slen == rlo);
+ }
+
+ b[l] = term->data[0]; /* Set sentinel */
+ for (i=0; !testInCharField (&cf, b[i]); i++) ;
+ if (i < l) break;
+ r->slen += l;
+ }
+
+ /* Terminator found, push over-read back to buffer */
+ i++;
+ r->slen += i;
+ s->buff->slen = l - i;
+ bstr__memcpy (s->buff->data, b + i, l - i);
+ r->data[r->slen] = (unsigned char) '\0';
+ return BSTR_OK;
}
/* int bsreada (bstring r, struct bStream * s, int n)
@@ -2100,56 +2100,56 @@ int l, ret, orslen;
char * b;
struct tagbstring x;
- if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
- || r->slen < 0 || r->mlen < r->slen || n <= 0) return BSTR_ERR;
+ if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
+ || r->slen < 0 || r->mlen < r->slen || n <= 0) return BSTR_ERR;
- n += r->slen;
- if (n <= 0) return BSTR_ERR;
+ n += r->slen;
+ if (n <= 0) return BSTR_ERR;
- l = s->buff->slen;
+ l = s->buff->slen;
- orslen = r->slen;
+ orslen = r->slen;
- if (0 == l) {
- if (s->isEOF) return BSTR_ERR;
- if (r->mlen > n) {
- l = (int) s->readFnPtr (r->data + r->slen, 1, n - r->slen, s->parm);
- if (0 >= l || l > n - r->slen) {
- s->isEOF = 1;
- return BSTR_ERR;
- }
- r->slen += l;
- r->data[r->slen] = (unsigned char) '\0';
- return 0;
- }
- }
+ if (0 == l) {
+ if (s->isEOF) return BSTR_ERR;
+ if (r->mlen > n) {
+ l = (int) s->readFnPtr (r->data + r->slen, 1, n - r->slen, s->parm);
+ if (0 >= l || l > n - r->slen) {
+ s->isEOF = 1;
+ return BSTR_ERR;
+ }
+ r->slen += l;
+ r->data[r->slen] = (unsigned char) '\0';
+ return 0;
+ }
+ }
- if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
- b = (char *) s->buff->data;
- x.data = (unsigned char *) b;
+ if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+ b = (char *) s->buff->data;
+ x.data = (unsigned char *) b;
- do {
- if (l + r->slen >= n) {
- x.slen = n - r->slen;
- ret = bconcat (r, &x);
- s->buff->slen = l;
- if (BSTR_OK == ret) bdelete (s->buff, 0, x.slen);
- return BSTR_ERR & -(r->slen == orslen);
- }
+ do {
+ if (l + r->slen >= n) {
+ x.slen = n - r->slen;
+ ret = bconcat (r, &x);
+ s->buff->slen = l;
+ if (BSTR_OK == ret) bdelete (s->buff, 0, x.slen);
+ return BSTR_ERR & -(r->slen == orslen);
+ }
- x.slen = l;
- if (BSTR_OK != bconcat (r, &x)) break;
+ x.slen = l;
+ if (BSTR_OK != bconcat (r, &x)) break;
- l = n - r->slen;
- if (l > s->maxBuffSz) l = s->maxBuffSz;
+ l = n - r->slen;
+ if (l > s->maxBuffSz) l = s->maxBuffSz;
- l = (int) s->readFnPtr (b, 1, l, s->parm);
+ l = (int) s->readFnPtr (b, 1, l, s->parm);
- } while (l > 0);
- if (l < 0) l = 0;
- if (l == 0) s->isEOF = 1;
- s->buff->slen = l;
- return BSTR_ERR & -(r->slen == orslen);
+ } while (l > 0);
+ if (l < 0) l = 0;
+ if (l == 0) s->isEOF = 1;
+ s->buff->slen = l;
+ return BSTR_ERR & -(r->slen == orslen);
}
/* int bsreadln (bstring r, struct bStream * s, char terminator)
@@ -2160,11 +2160,11 @@ struct tagbstring x;
* returned, but will be retained for subsequent read operations.
*/
int bsreadln (bstring r, struct bStream * s, char terminator) {
- if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0)
- return BSTR_ERR;
- if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
- r->slen = 0;
- return bsreadlna (r, s, terminator);
+ if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0)
+ return BSTR_ERR;
+ if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+ r->slen = 0;
+ return bsreadlna (r, s, terminator);
}
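
bsreadln() resets the target and delegates to bsreadlna(), so consuming a stream line by line is a simple loop. Sketch:

    #include <stdio.h>
    #include "bstrlib.h"

    int main (void) {
        FILE * fp = fopen ("/proc/cpuinfo", "r");
        struct bStream * s;
        bstring line;

        if (fp == NULL) return 1;
        s = bsopen ((bNread) fread, fp);
        if (s == NULL) { fclose (fp); return 1; }

        line = bfromcstr ("");
        /* each successful call replaces line with the next '\n'-terminated chunk */
        while (bsreadln (line, s, '\n') == BSTR_OK) {
            printf ("%s", (char *) bdata (line));
        }

        bdestroy (line);
        fclose ((FILE *) bsclose (s));
        return 0;
    }
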
/* int bsreadlns (bstring r, struct bStream * s, bstring term)
@@ -2175,13 +2175,13 @@ int bsreadln (bstring r, struct bStream * s, char terminator) {
* are not returned, but will be retained for subsequent read operations.
*/
int bsreadlns (bstring r, struct bStream * s, const_bstring term) {
- if (s == NULL || s->buff == NULL || r == NULL || term == NULL
- || term->data == NULL || r->mlen <= 0) return BSTR_ERR;
- if (term->slen == 1) return bsreadln (r, s, term->data[0]);
- if (term->slen < 1) return BSTR_ERR;
- if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
- r->slen = 0;
- return bsreadlnsa (r, s, term);
+ if (s == NULL || s->buff == NULL || r == NULL || term == NULL
+ || term->data == NULL || r->mlen <= 0) return BSTR_ERR;
+ if (term->slen == 1) return bsreadln (r, s, term->data[0]);
+ if (term->slen < 1) return BSTR_ERR;
+ if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+ r->slen = 0;
+ return bsreadlnsa (r, s, term);
}
/* int bsread (bstring r, struct bStream * s, int n)
@@ -2193,11 +2193,11 @@ int bsreadlns (bstring r, struct bStream * s, const_bstring term) {
* additional characters from the core stream beyond virtual stream pointer.
*/
int bsread (bstring r, struct bStream * s, int n) {
- if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
- || n <= 0) return BSTR_ERR;
- if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
- r->slen = 0;
- return bsreada (r, s, n);
+ if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
+ || n <= 0) return BSTR_ERR;
+ if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+ r->slen = 0;
+ return bsreada (r, s, n);
}
/* int bsunread (struct bStream * s, const_bstring b)
@@ -2207,8 +2207,8 @@ int bsread (bstring r, struct bStream * s, int n) {
* stream.
*/
int bsunread (struct bStream * s, const_bstring b) {
- if (s == NULL || s->buff == NULL) return BSTR_ERR;
- return binsert (s->buff, 0, b, (unsigned char) '?');
+ if (s == NULL || s->buff == NULL) return BSTR_ERR;
+ return binsert (s->buff, 0, b, (unsigned char) '?');
}
/* int bspeek (bstring r, const struct bStream * s)
@@ -2217,8 +2217,8 @@ int bsunread (struct bStream * s, const_bstring b) {
* read prior to reads from the core stream.
*/
int bspeek (bstring r, const struct bStream * s) {
- if (s == NULL || s->buff == NULL) return BSTR_ERR;
- return bassign (r, s->buff);
+ if (s == NULL || s->buff == NULL) return BSTR_ERR;
+ return bassign (r, s->buff);
}
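The two calls above pair naturally for look-ahead parsing. A small editorial sketch, assuming the bStream s from the previous example and the bsStatic initializer macro from bstrlib.h:

    struct tagbstring hdr = bsStatic ("HEADER");
    bstring peeked = bfromcstr ("");
    bsunread (s, &hdr);    /* "HEADER" will be returned by the next read from s */
    bspeek (peeked, s);    /* copies the currently buffered bytes without consuming them */
    bdestroy (peeked);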
/* bstring bjoin (const struct bstrList * bl, const_bstring sep);
@@ -2231,46 +2231,46 @@ bstring bjoin (const struct bstrList * bl, const_bstring sep) {
bstring b;
int i, c, v;
- if (bl == NULL || bl->qty < 0) return NULL;
- if (sep != NULL && (sep->slen < 0 || sep->data == NULL)) return NULL;
-
- for (i = 0, c = 1; i < bl->qty; i++) {
- v = bl->entry[i]->slen;
- if (v < 0) return NULL; /* Invalid input */
- c += v;
- if (c < 0) return NULL; /* Wrap around ?? */
- }
-
- if (sep != NULL) c += (bl->qty - 1) * sep->slen;
-
- b = (bstring) bstr__alloc (sizeof (struct tagbstring));
- if (NULL == b) return NULL; /* Out of memory */
- b->data = (unsigned char *) bstr__alloc (c);
- if (b->data == NULL) {
- bstr__free (b);
- return NULL;
- }
-
- b->mlen = c;
- b->slen = c-1;
-
- for (i = 0, c = 0; i < bl->qty; i++) {
- if (i > 0 && sep != NULL) {
- bstr__memcpy (b->data + c, sep->data, sep->slen);
- c += sep->slen;
- }
- v = bl->entry[i]->slen;
- bstr__memcpy (b->data + c, bl->entry[i]->data, v);
- c += v;
- }
- b->data[c] = (unsigned char) '\0';
- return b;
+ if (bl == NULL || bl->qty < 0) return NULL;
+ if (sep != NULL && (sep->slen < 0 || sep->data == NULL)) return NULL;
+
+ for (i = 0, c = 1; i < bl->qty; i++) {
+ v = bl->entry[i]->slen;
+ if (v < 0) return NULL; /* Invalid input */
+ c += v;
+ if (c < 0) return NULL; /* Wrap around ?? */
+ }
+
+ if (sep != NULL) c += (bl->qty - 1) * sep->slen;
+
+ b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+ if (NULL == b) return NULL; /* Out of memory */
+ b->data = (unsigned char *) bstr__alloc (c);
+ if (b->data == NULL) {
+ bstr__free (b);
+ return NULL;
+ }
+
+ b->mlen = c;
+ b->slen = c-1;
+
+ for (i = 0, c = 0; i < bl->qty; i++) {
+ if (i > 0 && sep != NULL) {
+ bstr__memcpy (b->data + c, sep->data, sep->slen);
+ c += sep->slen;
+ }
+ v = bl->entry[i]->slen;
+ bstr__memcpy (b->data + c, bl->entry[i]->data, v);
+ c += v;
+ }
+ b->data[c] = (unsigned char) '\0';
+ return b;
}
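A short editorial example of bjoin used together with bsplit and bstrListDestroy (both defined further below in this file); the bsStatic initializer macro from bstrlib.h is assumed:

    struct tagbstring src = bsStatic ("a,b,c");
    struct tagbstring sep = bsStatic ("-");
    struct bstrList * parts = bsplit (&src, ',');   /* { "a", "b", "c" } */
    bstring joined = bjoin (parts, &sep);           /* "a-b-c" */
    bstrListDestroy (parts);
    bdestroy (joined);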
#define BSSSC_BUFF_LEN (256)
/* int bssplitscb (struct bStream * s, const_bstring splitStr,
- * int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
+ * int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
*
* Iterate the set of disjoint sequential substrings read from a stream
* divided by any of the characters in splitStr. An empty splitStr causes
@@ -2287,56 +2287,56 @@ int i, c, v;
* undefined manner.
*/
int bssplitscb (struct bStream * s, const_bstring splitStr,
- int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
+ int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
struct charField chrs;
bstring buff;
int i, p, ret;
- if (cb == NULL || s == NULL || s->readFnPtr == NULL
- || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
-
- if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
-
- if (splitStr->slen == 0) {
- while (bsreada (buff, s, BSSSC_BUFF_LEN) >= 0) ;
- if ((ret = cb (parm, 0, buff)) > 0)
- ret = 0;
- } else {
- buildCharField (&chrs, splitStr);
- ret = p = i = 0;
- for (;;) {
- if (i >= buff->slen) {
- bsreada (buff, s, BSSSC_BUFF_LEN);
- if (i >= buff->slen) {
- if (0 < (ret = cb (parm, p, buff))) ret = 0;
- break;
- }
- }
- if (testInCharField (&chrs, buff->data[i])) {
- struct tagbstring t;
- unsigned char c;
-
- blk2tbstr (t, buff->data + i + 1, buff->slen - (i + 1));
- if ((ret = bsunread (s, &t)) < 0) break;
- buff->slen = i;
- c = buff->data[i];
- buff->data[i] = (unsigned char) '\0';
- if ((ret = cb (parm, p, buff)) < 0) break;
- buff->data[i] = c;
- buff->slen = 0;
- p += i + 1;
- i = -1;
- }
- i++;
- }
- }
-
- bdestroy (buff);
- return ret;
+ if (cb == NULL || s == NULL || s->readFnPtr == NULL
+ || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+
+ if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
+
+ if (splitStr->slen == 0) {
+ while (bsreada (buff, s, BSSSC_BUFF_LEN) >= 0) ;
+ if ((ret = cb (parm, 0, buff)) > 0)
+ ret = 0;
+ } else {
+ buildCharField (&chrs, splitStr);
+ ret = p = i = 0;
+ for (;;) {
+ if (i >= buff->slen) {
+ bsreada (buff, s, BSSSC_BUFF_LEN);
+ if (i >= buff->slen) {
+ if (0 < (ret = cb (parm, p, buff))) ret = 0;
+ break;
+ }
+ }
+ if (testInCharField (&chrs, buff->data[i])) {
+ struct tagbstring t;
+ unsigned char c;
+
+ blk2tbstr (t, buff->data + i + 1, buff->slen - (i + 1));
+ if ((ret = bsunread (s, &t)) < 0) break;
+ buff->slen = i;
+ c = buff->data[i];
+ buff->data[i] = (unsigned char) '\0';
+ if ((ret = cb (parm, p, buff)) < 0) break;
+ buff->data[i] = c;
+ buff->slen = 0;
+ p += i + 1;
+ i = -1;
+ }
+ i++;
+ }
+ }
+
+ bdestroy (buff);
+ return ret;
}
/* int bssplitstrcb (struct bStream * s, const_bstring splitStr,
- * int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
+ * int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
*
* Iterate the set of disjoint sequential substrings read from a stream
* divided by the entire substring splitStr. An empty splitStr causes
@@ -2353,48 +2353,48 @@ int i, p, ret;
* undefined manner.
*/
int bssplitstrcb (struct bStream * s, const_bstring splitStr,
- int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
+ int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
bstring buff;
int i, p, ret;
- if (cb == NULL || s == NULL || s->readFnPtr == NULL
- || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
-
- if (splitStr->slen == 1) return bssplitscb (s, splitStr, cb, parm);
-
- if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
-
- if (splitStr->slen == 0) {
- for (i=0; bsreada (buff, s, BSSSC_BUFF_LEN) >= 0; i++) {
- if ((ret = cb (parm, 0, buff)) < 0) {
- bdestroy (buff);
- return ret;
- }
- buff->slen = 0;
- }
- return BSTR_OK;
- } else {
- ret = p = i = 0;
- for (i=p=0;;) {
- if ((ret = binstr (buff, 0, splitStr)) >= 0) {
- struct tagbstring t;
- blk2tbstr (t, buff->data, ret);
- i = ret + splitStr->slen;
- if ((ret = cb (parm, p, &t)) < 0) break;
- p += i;
- bdelete (buff, 0, i);
- } else {
- bsreada (buff, s, BSSSC_BUFF_LEN);
- if (bseof (s)) {
- if ((ret = cb (parm, p, buff)) > 0) ret = 0;
- break;
- }
- }
- }
- }
-
- bdestroy (buff);
- return ret;
+ if (cb == NULL || s == NULL || s->readFnPtr == NULL
+ || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+
+ if (splitStr->slen == 1) return bssplitscb (s, splitStr, cb, parm);
+
+ if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
+
+ if (splitStr->slen == 0) {
+ for (i=0; bsreada (buff, s, BSSSC_BUFF_LEN) >= 0; i++) {
+ if ((ret = cb (parm, 0, buff)) < 0) {
+ bdestroy (buff);
+ return ret;
+ }
+ buff->slen = 0;
+ }
+ return BSTR_OK;
+ } else {
+ ret = p = i = 0;
+ for (i=p=0;;) {
+ if ((ret = binstr (buff, 0, splitStr)) >= 0) {
+ struct tagbstring t;
+ blk2tbstr (t, buff->data, ret);
+ i = ret + splitStr->slen;
+ if ((ret = cb (parm, p, &t)) < 0) break;
+ p += i;
+ bdelete (buff, 0, i);
+ } else {
+ bsreada (buff, s, BSSSC_BUFF_LEN);
+ if (bseof (s)) {
+ if ((ret = cb (parm, p, buff)) > 0) ret = 0;
+ break;
+ }
+ }
+ }
+ }
+
+ bdestroy (buff);
+ return ret;
}
/* int bstrListCreate (void)
@@ -2403,17 +2403,17 @@ int i, p, ret;
*/
struct bstrList * bstrListCreate (void) {
struct bstrList * sl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
- if (sl) {
- sl->entry = (bstring *) bstr__alloc (1*sizeof (bstring));
- if (!sl->entry) {
- bstr__free (sl);
- sl = NULL;
- } else {
- sl->qty = 0;
- sl->mlen = 1;
- }
- }
- return sl;
+ if (sl) {
+ sl->entry = (bstring *) bstr__alloc (1*sizeof (bstring));
+ if (!sl->entry) {
+ bstr__free (sl);
+ sl = NULL;
+ } else {
+ sl->qty = 0;
+ sl->mlen = 1;
+ }
+ }
+ return sl;
}
/* int bstrListDestroy (struct bstrList * sl)
@@ -2422,19 +2422,19 @@ struct bstrList * sl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList)
*/
int bstrListDestroy (struct bstrList * sl) {
int i;
- if (sl == NULL || sl->qty < 0) return BSTR_ERR;
- for (i=0; i < sl->qty; i++) {
- if (sl->entry[i]) {
- bdestroy (sl->entry[i]);
- sl->entry[i] = NULL;
- }
- }
- sl->qty = -1;
- sl->mlen = -1;
- bstr__free (sl->entry);
- sl->entry = NULL;
- bstr__free (sl);
- return BSTR_OK;
+ if (sl == NULL || sl->qty < 0) return BSTR_ERR;
+ for (i=0; i < sl->qty; i++) {
+ if (sl->entry[i]) {
+ bdestroy (sl->entry[i]);
+ sl->entry[i] = NULL;
+ }
+ }
+ sl->qty = -1;
+ sl->mlen = -1;
+ bstr__free (sl->entry);
+ sl->entry = NULL;
+ bstr__free (sl);
+ return BSTR_OK;
}
/* int bstrListAlloc (struct bstrList * sl, int msz)
@@ -2446,21 +2446,21 @@ int bstrListAlloc (struct bstrList * sl, int msz) {
bstring * l;
int smsz;
size_t nsz;
- if (!sl || msz <= 0 || !sl->entry || sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
- if (sl->mlen >= msz) return BSTR_OK;
- smsz = snapUpSize (msz);
- nsz = ((size_t) smsz) * sizeof (bstring);
- if (nsz < (size_t) smsz) return BSTR_ERR;
- l = (bstring *) bstr__realloc (sl->entry, nsz);
- if (!l) {
- smsz = msz;
- nsz = ((size_t) smsz) * sizeof (bstring);
- l = (bstring *) bstr__realloc (sl->entry, nsz);
- if (!l) return BSTR_ERR;
- }
- sl->mlen = smsz;
- sl->entry = l;
- return BSTR_OK;
+ if (!sl || msz <= 0 || !sl->entry || sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
+ if (sl->mlen >= msz) return BSTR_OK;
+ smsz = snapUpSize (msz);
+ nsz = ((size_t) smsz) * sizeof (bstring);
+ if (nsz < (size_t) smsz) return BSTR_ERR;
+ l = (bstring *) bstr__realloc (sl->entry, nsz);
+ if (!l) {
+ smsz = msz;
+ nsz = ((size_t) smsz) * sizeof (bstring);
+ l = (bstring *) bstr__realloc (sl->entry, nsz);
+ if (!l) return BSTR_ERR;
+ }
+ sl->mlen = smsz;
+ sl->entry = l;
+ return BSTR_OK;
}
/* int bstrListAllocMin (struct bstrList * sl, int msz)
@@ -2471,20 +2471,20 @@ size_t nsz;
int bstrListAllocMin (struct bstrList * sl, int msz) {
bstring * l;
size_t nsz;
- if (!sl || msz <= 0 || !sl->entry || sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
- if (msz < sl->qty) msz = sl->qty;
- if (sl->mlen == msz) return BSTR_OK;
- nsz = ((size_t) msz) * sizeof (bstring);
- if (nsz < (size_t) msz) return BSTR_ERR;
- l = (bstring *) bstr__realloc (sl->entry, nsz);
- if (!l) return BSTR_ERR;
- sl->mlen = msz;
- sl->entry = l;
- return BSTR_OK;
+ if (!sl || msz <= 0 || !sl->entry || sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
+ if (msz < sl->qty) msz = sl->qty;
+ if (sl->mlen == msz) return BSTR_OK;
+ nsz = ((size_t) msz) * sizeof (bstring);
+ if (nsz < (size_t) msz) return BSTR_ERR;
+ l = (bstring *) bstr__realloc (sl->entry, nsz);
+ if (!l) return BSTR_ERR;
+ sl->mlen = msz;
+ sl->entry = l;
+ return BSTR_OK;
}
/* int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
- * int (* cb) (void * parm, int ofs, int len), void * parm)
+ * int (* cb) (void * parm, int ofs, int len), void * parm)
*
* Iterate the set of disjoint sequential substrings over str divided by the
* character in splitChar.
@@ -2499,25 +2499,25 @@ size_t nsz;
* otherwise bsplitcb will continue in an undefined manner.
*/
int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
- int (* cb) (void * parm, int ofs, int len), void * parm) {
+ int (* cb) (void * parm, int ofs, int len), void * parm) {
int i, p, ret;
- if (cb == NULL || str == NULL || pos < 0 || pos > str->slen)
- return BSTR_ERR;
+ if (cb == NULL || str == NULL || pos < 0 || pos > str->slen)
+ return BSTR_ERR;
- p = pos;
- do {
- for (i=p; i < str->slen; i++) {
- if (str->data[i] == splitChar) break;
- }
- if ((ret = cb (parm, p, i - p)) < 0) return ret;
- p = i + 1;
- } while (p <= str->slen);
- return BSTR_OK;
+ p = pos;
+ do {
+ for (i=p; i < str->slen; i++) {
+ if (str->data[i] == splitChar) break;
+ }
+ if ((ret = cb (parm, p, i - p)) < 0) return ret;
+ p = i + 1;
+ } while (p <= str->slen);
+ return BSTR_OK;
}
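Editorial sketch of the callback convention described above: the callback only receives (offset, length) pairs into the original string, so counting fields needs no allocation. Illustrative only, not part of the patch; bsStatic comes from bstrlib.h:

    static int countField (void * parm, int ofs, int len) {
        (void) ofs; (void) len;        /* offsets unused here */
        (*(int *) parm)++;
        return 0;                      /* non-negative return keeps the iteration going */
    }

    /* ... */
    int nfields = 0;
    struct tagbstring csv = bsStatic ("a,b,,c");
    bsplitcb (&csv, ',', 0, countField, &nfields);   /* nfields == 4, empty field counted */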
/* int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
- * int (* cb) (void * parm, int ofs, int len), void * parm)
+ * int (* cb) (void * parm, int ofs, int len), void * parm)
*
* Iterate the set of disjoint sequential substrings over str divided by any
* of the characters in splitStr. An empty splitStr causes the whole str to
@@ -2533,35 +2533,35 @@ int i, p, ret;
* otherwise bsplitscb will continue in an undefined manner.
*/
int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
- int (* cb) (void * parm, int ofs, int len), void * parm) {
+ int (* cb) (void * parm, int ofs, int len), void * parm) {
struct charField chrs;
int i, p, ret;
- if (cb == NULL || str == NULL || pos < 0 || pos > str->slen
- || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
- if (splitStr->slen == 0) {
- if ((ret = cb (parm, 0, str->slen)) > 0) ret = 0;
- return ret;
- }
+ if (cb == NULL || str == NULL || pos < 0 || pos > str->slen
+ || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+ if (splitStr->slen == 0) {
+ if ((ret = cb (parm, 0, str->slen)) > 0) ret = 0;
+ return ret;
+ }
- if (splitStr->slen == 1)
- return bsplitcb (str, splitStr->data[0], pos, cb, parm);
+ if (splitStr->slen == 1)
+ return bsplitcb (str, splitStr->data[0], pos, cb, parm);
- buildCharField (&chrs, splitStr);
+ buildCharField (&chrs, splitStr);
- p = pos;
- do {
- for (i=p; i < str->slen; i++) {
- if (testInCharField (&chrs, str->data[i])) break;
- }
- if ((ret = cb (parm, p, i - p)) < 0) return ret;
- p = i + 1;
- } while (p <= str->slen);
- return BSTR_OK;
+ p = pos;
+ do {
+ for (i=p; i < str->slen; i++) {
+ if (testInCharField (&chrs, str->data[i])) break;
+ }
+ if ((ret = cb (parm, p, i - p)) < 0) return ret;
+ p = i + 1;
+ } while (p <= str->slen);
+ return BSTR_OK;
}
/* int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
- * int (* cb) (void * parm, int ofs, int len), void * parm)
+ * int (* cb) (void * parm, int ofs, int len), void * parm)
*
* Iterate the set of disjoint sequential substrings over str divided by the
* substring splitStr. An empty splitStr causes the whole str to be
@@ -2577,59 +2577,59 @@ int i, p, ret;
* otherwise bsplitscb will continue in an undefined manner.
*/
int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
- int (* cb) (void * parm, int ofs, int len), void * parm) {
+ int (* cb) (void * parm, int ofs, int len), void * parm) {
int i, p, ret;
- if (cb == NULL || str == NULL || pos < 0 || pos > str->slen
- || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+ if (cb == NULL || str == NULL || pos < 0 || pos > str->slen
+ || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
- if (0 == splitStr->slen) {
- for (i=pos; i < str->slen; i++) {
- if ((ret = cb (parm, i, 1)) < 0) return ret;
- }
- return BSTR_OK;
- }
+ if (0 == splitStr->slen) {
+ for (i=pos; i < str->slen; i++) {
+ if ((ret = cb (parm, i, 1)) < 0) return ret;
+ }
+ return BSTR_OK;
+ }
- if (splitStr->slen == 1)
- return bsplitcb (str, splitStr->data[0], pos, cb, parm);
+ if (splitStr->slen == 1)
+ return bsplitcb (str, splitStr->data[0], pos, cb, parm);
- for (i=p=pos; i <= str->slen - splitStr->slen; i++) {
- if (0 == bstr__memcmp (splitStr->data, str->data + i, splitStr->slen)) {
- if ((ret = cb (parm, p, i - p)) < 0) return ret;
- i += splitStr->slen;
- p = i;
- }
- }
- if ((ret = cb (parm, p, str->slen - p)) < 0) return ret;
- return BSTR_OK;
+ for (i=p=pos; i <= str->slen - splitStr->slen; i++) {
+ if (0 == bstr__memcmp (splitStr->data, str->data + i, splitStr->slen)) {
+ if ((ret = cb (parm, p, i - p)) < 0) return ret;
+ i += splitStr->slen;
+ p = i;
+ }
+ }
+ if ((ret = cb (parm, p, str->slen - p)) < 0) return ret;
+ return BSTR_OK;
}
struct genBstrList {
- bstring b;
- struct bstrList * bl;
+ bstring b;
+ struct bstrList * bl;
};
static int bscb (void * parm, int ofs, int len) {
struct genBstrList * g = (struct genBstrList *) parm;
- if (g->bl->qty >= g->bl->mlen) {
- int mlen = g->bl->mlen * 2;
- bstring * tbl;
+ if (g->bl->qty >= g->bl->mlen) {
+ int mlen = g->bl->mlen * 2;
+ bstring * tbl;
- while (g->bl->qty >= mlen) {
- if (mlen < g->bl->mlen) return BSTR_ERR;
- mlen += mlen;
- }
+ while (g->bl->qty >= mlen) {
+ if (mlen < g->bl->mlen) return BSTR_ERR;
+ mlen += mlen;
+ }
- tbl = (bstring *) bstr__realloc (g->bl->entry, sizeof (bstring) * mlen);
- if (tbl == NULL) return BSTR_ERR;
+ tbl = (bstring *) bstr__realloc (g->bl->entry, sizeof (bstring) * mlen);
+ if (tbl == NULL) return BSTR_ERR;
- g->bl->entry = tbl;
- g->bl->mlen = mlen;
- }
+ g->bl->entry = tbl;
+ g->bl->mlen = mlen;
+ }
- g->bl->entry[g->bl->qty] = bmidstr (g->b, ofs, len);
- g->bl->qty++;
- return BSTR_OK;
+ g->bl->entry[g->bl->qty] = bmidstr (g->b, ofs, len);
+ g->bl->qty++;
+ return BSTR_OK;
}
/* struct bstrList * bsplit (const_bstring str, unsigned char splitChar)
@@ -2640,24 +2640,24 @@ struct genBstrList * g = (struct genBstrList *) parm;
struct bstrList * bsplit (const_bstring str, unsigned char splitChar) {
struct genBstrList g;
- if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
+ if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
- g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
- if (g.bl == NULL) return NULL;
- g.bl->mlen = 4;
- g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
- if (NULL == g.bl->entry) {
- bstr__free (g.bl);
- return NULL;
- }
+ g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+ if (g.bl == NULL) return NULL;
+ g.bl->mlen = 4;
+ g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
+ if (NULL == g.bl->entry) {
+ bstr__free (g.bl);
+ return NULL;
+ }
- g.b = (bstring) str;
- g.bl->qty = 0;
- if (bsplitcb (str, splitChar, 0, bscb, &g) < 0) {
- bstrListDestroy (g.bl);
- return NULL;
- }
- return g.bl;
+ g.b = (bstring) str;
+ g.bl->qty = 0;
+ if (bsplitcb (str, splitChar, 0, bscb, &g) < 0) {
+ bstrListDestroy (g.bl);
+ return NULL;
+ }
+ return g.bl;
}
/* struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr)
@@ -2668,24 +2668,24 @@ struct genBstrList g;
struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr) {
struct genBstrList g;
- if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
+ if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
- g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
- if (g.bl == NULL) return NULL;
- g.bl->mlen = 4;
- g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
- if (NULL == g.bl->entry) {
- bstr__free (g.bl);
- return NULL;
- }
+ g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+ if (g.bl == NULL) return NULL;
+ g.bl->mlen = 4;
+ g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
+ if (NULL == g.bl->entry) {
+ bstr__free (g.bl);
+ return NULL;
+ }
- g.b = (bstring) str;
- g.bl->qty = 0;
- if (bsplitstrcb (str, splitStr, 0, bscb, &g) < 0) {
- bstrListDestroy (g.bl);
- return NULL;
- }
- return g.bl;
+ g.b = (bstring) str;
+ g.bl->qty = 0;
+ if (bsplitstrcb (str, splitStr, 0, bscb, &g) < 0) {
+ bstrListDestroy (g.bl);
+ return NULL;
+ }
+ return g.bl;
}
/* struct bstrList * bsplits (const_bstring str, bstring splitStr)
@@ -2697,26 +2697,26 @@ struct genBstrList g;
struct bstrList * bsplits (const_bstring str, const_bstring splitStr) {
struct genBstrList g;
- if ( str == NULL || str->slen < 0 || str->data == NULL ||
- splitStr == NULL || splitStr->slen < 0 || splitStr->data == NULL)
- return NULL;
+ if ( str == NULL || str->slen < 0 || str->data == NULL ||
+ splitStr == NULL || splitStr->slen < 0 || splitStr->data == NULL)
+ return NULL;
- g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
- if (g.bl == NULL) return NULL;
- g.bl->mlen = 4;
- g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
- if (NULL == g.bl->entry) {
- bstr__free (g.bl);
- return NULL;
- }
- g.b = (bstring) str;
- g.bl->qty = 0;
+ g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+ if (g.bl == NULL) return NULL;
+ g.bl->mlen = 4;
+ g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
+ if (NULL == g.bl->entry) {
+ bstr__free (g.bl);
+ return NULL;
+ }
+ g.b = (bstring) str;
+ g.bl->qty = 0;
- if (bsplitscb (str, splitStr, 0, bscb, &g) < 0) {
- bstrListDestroy (g.bl);
- return NULL;
- }
- return g.bl;
+ if (bsplitscb (str, splitStr, 0, bscb, &g) < 0) {
+ bstrListDestroy (g.bl);
+ return NULL;
+ }
+ return g.bl;
}
#if defined (__TURBOC__) && !defined (__BORLANDC__)
@@ -2772,40 +2772,40 @@ va_list arglist;
bstring buff;
int n, r;
- if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0
- || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+ if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0
+ || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
- /* Since the length is not determinable beforehand, a search is
- performed using the truncating "vsnprintf" call (to avoid buffer
- overflows) on increasing potential sizes for the output result. */
+ /* Since the length is not determinable beforehand, a search is
+ performed using the truncating "vsnprintf" call (to avoid buffer
+ overflows) on increasing potential sizes for the output result. */
- if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
- if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
- n = 1;
- if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
- }
+ if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+ n = 1;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
+ }
- for (;;) {
- va_start (arglist, fmt);
- exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
- va_end (arglist);
+ for (;;) {
+ va_start (arglist, fmt);
+ exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+ va_end (arglist);
- buff->data[n] = (unsigned char) '\0';
- buff->slen = (int) (strlen) ((char *) buff->data);
+ buff->data[n] = (unsigned char) '\0';
+ buff->slen = (int) (strlen) ((char *) buff->data);
- if (buff->slen < n) break;
+ if (buff->slen < n) break;
- if (r > n) n = r; else n += n;
+ if (r > n) n = r; else n += n;
- if (BSTR_OK != balloc (buff, n + 2)) {
- bdestroy (buff);
- return BSTR_ERR;
- }
- }
+ if (BSTR_OK != balloc (buff, n + 2)) {
+ bdestroy (buff);
+ return BSTR_ERR;
+ }
+ }
- r = bconcat (b, buff);
- bdestroy (buff);
- return r;
+ r = bconcat (b, buff);
+ bdestroy (buff);
+ return r;
}
/* int bassignformat (bstring b, const char * fmt, ...)
@@ -2820,40 +2820,40 @@ va_list arglist;
bstring buff;
int n, r;
- if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0
- || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+ if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0
+ || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
- /* Since the length is not determinable beforehand, a search is
- performed using the truncating "vsnprintf" call (to avoid buffer
- overflows) on increasing potential sizes for the output result. */
+ /* Since the length is not determinable beforehand, a search is
+ performed using the truncating "vsnprintf" call (to avoid buffer
+ overflows) on increasing potential sizes for the output result. */
- if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
- if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
- n = 1;
- if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
- }
+ if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+ n = 1;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
+ }
- for (;;) {
- va_start (arglist, fmt);
- exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
- va_end (arglist);
+ for (;;) {
+ va_start (arglist, fmt);
+ exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+ va_end (arglist);
- buff->data[n] = (unsigned char) '\0';
- buff->slen = (int) (strlen) ((char *) buff->data);
+ buff->data[n] = (unsigned char) '\0';
+ buff->slen = (int) (strlen) ((char *) buff->data);
- if (buff->slen < n) break;
+ if (buff->slen < n) break;
- if (r > n) n = r; else n += n;
+ if (r > n) n = r; else n += n;
- if (BSTR_OK != balloc (buff, n + 2)) {
- bdestroy (buff);
- return BSTR_ERR;
- }
- }
+ if (BSTR_OK != balloc (buff, n + 2)) {
+ bdestroy (buff);
+ return BSTR_ERR;
+ }
+ }
- r = bassign (b, buff);
- bdestroy (buff);
- return r;
+ r = bassign (b, buff);
+ bdestroy (buff);
+ return r;
}
/* bstring bformat (const char * fmt, ...)
@@ -2868,37 +2868,37 @@ va_list arglist;
bstring buff;
int n, r;
- if (fmt == NULL) return NULL;
+ if (fmt == NULL) return NULL;
- /* Since the length is not determinable beforehand, a search is
- performed using the truncating "vsnprintf" call (to avoid buffer
- overflows) on increasing potential sizes for the output result. */
+ /* Since the length is not determinable beforehand, a search is
+ performed using the truncating "vsnprintf" call (to avoid buffer
+ overflows) on increasing potential sizes for the output result. */
- if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
- if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
- n = 1;
- if (NULL == (buff = bfromcstralloc (n + 2, ""))) return NULL;
- }
+ if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+ n = 1;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) return NULL;
+ }
- for (;;) {
- va_start (arglist, fmt);
- exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
- va_end (arglist);
+ for (;;) {
+ va_start (arglist, fmt);
+ exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+ va_end (arglist);
- buff->data[n] = (unsigned char) '\0';
- buff->slen = (int) (strlen) ((char *) buff->data);
+ buff->data[n] = (unsigned char) '\0';
+ buff->slen = (int) (strlen) ((char *) buff->data);
- if (buff->slen < n) break;
+ if (buff->slen < n) break;
- if (r > n) n = r; else n += n;
+ if (r > n) n = r; else n += n;
- if (BSTR_OK != balloc (buff, n + 2)) {
- bdestroy (buff);
- return NULL;
- }
- }
+ if (BSTR_OK != balloc (buff, n + 2)) {
+ bdestroy (buff);
+ return NULL;
+ }
+ }
- return buff;
+ return buff;
}
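An editorial one-liner contrasting the three variants above: bformata appends to an existing bstring, bassignformat overwrites it, and bformat allocates a new one.

    bstring msg = bformat ("%s:%d", "cycles", 42);   /* new bstring holding "cycles:42" */
    bformata (msg, " (%s)", "core 0");               /* msg is now "cycles:42 (core 0)" */
    bdestroy (msg);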
/* int bvcformata (bstring b, int count, const char * fmt, va_list arglist)
@@ -2924,32 +2924,32 @@ int n, r;
int bvcformata (bstring b, int count, const char * fmt, va_list arg) {
int n, r, l;
- if (b == NULL || fmt == NULL || count <= 0 || b->data == NULL
- || b->mlen <= 0 || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+ if (b == NULL || fmt == NULL || count <= 0 || b->data == NULL
+ || b->mlen <= 0 || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
- if (count > (n = b->slen + count) + 2) return BSTR_ERR;
- if (BSTR_OK != balloc (b, n + 2)) return BSTR_ERR;
+ if (count > (n = b->slen + count) + 2) return BSTR_ERR;
+ if (BSTR_OK != balloc (b, n + 2)) return BSTR_ERR;
- exvsnprintf (r, (char *) b->data + b->slen, count + 2, fmt, arg);
+ exvsnprintf (r, (char *) b->data + b->slen, count + 2, fmt, arg);
- /* Did the operation complete successfully within bounds? */
+ /* Did the operation complete successfully within bounds? */
- if (n >= (l = b->slen + (int) (strlen) ((const char *) b->data + b->slen))) {
- b->slen = l;
- return BSTR_OK;
- }
+ if (n >= (l = b->slen + (int) (strlen) ((const char *) b->data + b->slen))) {
+ b->slen = l;
+ return BSTR_OK;
+ }
- /* Abort, since the buffer was not large enough. The return value
- tries to help set what the retry length should be. */
+ /* Abort, since the buffer was not large enough. The return value
+ tries to help set what the retry length should be. */
- b->data[b->slen] = '\0';
- if (r > count+1) l = r; else {
- l = count+count;
- if (count > l) l = INT_MAX;
- }
- n = -l;
- if (n > BSTR_ERR-1) n = BSTR_ERR-1;
- return n;
+ b->data[b->slen] = '\0';
+ if (r > count+1) l = r; else {
+ l = count+count;
+ if (count > l) l = INT_MAX;
+ }
+ n = -l;
+ if (n > BSTR_ERR-1) n = BSTR_ERR-1;
+ return n;
}
#endif
diff --git a/src/calculator.c b/src/calculator.c
new file mode 100644
index 0000000..bd73a4d
--- /dev/null
+++ b/src/calculator.c
@@ -0,0 +1,926 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: calculator.c
+ *
+ * Description: Infix calculator
+ *
+ * Author: Brandon Mills (bm), mills.brandont at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 Brandon Mills
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+/*
+ * =======================================================================================
+ *
+ * Some changes done for the integration in LIKWID, see inline comments
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h> // Temporary
+#include <getopt.h>
+#include <calculator_stack.h>
+
+#define bool char
+#define true 1
+#define false 0
+
+#define PI 3.141592653589793
+
+/* Added by Thomas Roehl (Thomas.Roehl at fau.de) to reduce reallocs by allocating a temporary
+ * token for parsing as well as for transforming a number to a string.
+ */
+#define MAXTOKENLENGTH 512
+
+typedef enum
+{
+ addop,
+ multop,
+ expop,
+ lparen,
+ rparen,
+ digit,
+ value,
+ decimal,
+ space,
+ text,
+ function,
+ identifier,
+ argsep,
+ invalid
+} Symbol;
+
+struct Preferences
+{
+ struct Display
+ {
+ bool tokens;
+ bool postfix;
+ } display;
+ struct Mode
+ {
+ bool degrees;
+ } mode;
+} prefs;
+
+typedef enum
+{
+ divZero,
+ overflow,
+ parenMismatch
+} Error;
+
+typedef char* token;
+/* Added by Thomas Roehl (Thomas.Roehl at fau.de) to keep track of the
+ * intermediate calculation results to free them in the end
+ */
+token* calcTokens = NULL;
+int nrCalcTokens = 0;
+
+typedef double number;
+
+void raise(Error err)
+{
+ char* msg;
+ switch(err)
+ {
+ case divZero:
+ msg = "Divide by zero";
+ break;
+ case overflow:
+ msg = "Overflow";
+ break;
+ case parenMismatch:
+ msg = "Mismatched parentheses";
+ break;
+ }
+ printf("\tError: %s\n", msg);
+}
+
+inline unsigned int toDigit(char ch)
+{
+ return ch - '0';
+}
+
+number buildNumber(token str)
+{
+ number result = 0;
+ result = strtod(str, NULL);
+ return result;
+}
+
+token num2Str(number num)
+{
+ /* Increased precision by Thomas Roehl (Thomas.Roehl at fau.de) as required for LIKWID */
+ token str = (token)malloc((MAXTOKENLENGTH+1)*sizeof(char));
+ snprintf(str, 39, "%.20f", num);
+ return str;
+}
+
+
+
+inline number toRadians(number degrees)
+{
+ return degrees * PI / 180.0;
+}
+
+inline number toDegrees(number radians)
+{
+ return radians * 180.0 / PI;
+}
+
+token doFunc(token input, token function)
+{
+ number num = buildNumber(input);
+ number result = num;
+
+ if(strcmp(function, "abs") == 0)
+ result = fabs(num);
+ else if(strcmp(function, "floor") == 0)
+ result = floor(num);
+ else if(strcmp(function, "ceil") == 0)
+ result = ceil(num);
+ else if(strcmp(function, "sin") == 0)
+ result = !prefs.mode.degrees ? sin(num) : sin(toRadians(num));
+ else if(strcmp(function, "cos") == 0)
+ result = !prefs.mode.degrees ? cos(num) : cos(toRadians(num));
+ else if(strcmp(function, "tan") == 0)
+ result = !prefs.mode.degrees ? tan(num) : tan(toRadians(num));
+ else if(strcmp(function, "arcsin") == 0
+ || strcmp(function, "asin") == 0)
+ result = !prefs.mode.degrees ? asin(num) : toDegrees(asin(num));
+ else if(strcmp(function, "arccos") == 0
+ || strcmp(function, "acos") == 0)
+ result = !prefs.mode.degrees ? acos(num) : toDegrees(acos(num));
+ else if(strcmp(function, "arctan") == 0
+ || strcmp(function, "atan") == 0)
+ result = !prefs.mode.degrees ? atan(num) : toDegrees(atan(num));
+ else if(strcmp(function, "sqrt") == 0)
+ result = sqrt(num);
+ else if(strcmp(function, "cbrt") == 0)
+ result = cbrt(num);
+ else if(strcmp(function, "log") == 0)
+ result = log(num);
+ else if(strcmp(function, "exp") == 0)
+ result = exp(num);
+ printf("Free %s\n", function);
+ free(function);
+ return num2Str(result);
+}
+
+int doOp(token loperand, token op, token roperand, token *result)
+{
+ /* Added by Thomas Roehl (Thomas.Roehl at fau.de) to return
+ * errors from calculation like divide-by-zero, ... */
+ int err = 0;
+ number lside = buildNumber(loperand);
+ number rside = buildNumber(roperand);
+ number ret;
+ switch(*op)
+ {
+ case '^':
+ {
+ ret = pow(lside, rside);
+ }
+ break;
+ case '*':
+ {
+ ret = lside * rside;
+ }
+ break;
+ case '/':
+ {
+ if(rside == 0)
+ {
+ /* Changed by Thomas Roehl */
+ //raise(divZero);
+ err = -1;
+ }
+ else
+ ret = lside / rside;
+ }
+ break;
+ case '%':
+ {
+ if(rside == 0)
+ {
+ /* Changed by Thomas Roehl */
+ //raise(divZero);
+ err = -1;
+ }
+ else
+ {
+ ret = (int)(lside / rside);
+ ret = lside - (ret * rside);
+ }
+ }
+ break;
+ case '+':
+ {
+ ret = lside + rside;
+ }
+ break;
+ case '-':
+ {
+ ret = lside - rside;
+ }
+ break;
+ }
+ *result = num2Str(ret);
+ return err;
+}
+
+
+Symbol type(char ch)
+{
+ Symbol result;
+ switch(ch)
+ {
+ case '+':
+ case '-':
+ result = addop;
+ break;
+ case '*':
+ case '/':
+ case '%':
+ result = multop;
+ break;
+ case '^':
+ result = expop;
+ break;
+ case '(':
+ result = lparen;
+ break;
+ case ')':
+ result = rparen;
+ break;
+ case '.':
+ result = decimal;
+ break;
+ case ' ':
+ result = space;
+ break;
+ case ',':
+ result = argsep;
+ break;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ result = digit;
+ break;
+ case 'A':
+ case 'B':
+ case 'C':
+ case 'D':
+ case 'E':
+ case 'F':
+ case 'G':
+ case 'H':
+ case 'I':
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'O':
+ case 'P':
+ case 'Q':
+ case 'R':
+ case 'S':
+ case 'T':
+ case 'U':
+ case 'V':
+ case 'W':
+ case 'X':
+ case 'Y':
+ case 'Z':
+ case 'a':
+ case 'b':
+ case 'c':
+ case 'd':
+ case 'e':
+ case 'f':
+ case 'g':
+ case 'h':
+ case 'i':
+ case 'j':
+ case 'k':
+ case 'l':
+ case 'm':
+ case 'n':
+ case 'o':
+ case 'p':
+ case 'q':
+ case 'r':
+ case 's':
+ case 't':
+ case 'u':
+ case 'v':
+ case 'w':
+ case 'x':
+ case 'y':
+ case 'z':
+ result = text;
+ break;
+ default:
+ result = invalid;
+ break;
+ }
+ return result;
+}
+
+bool isFunction(token tk)
+{
+ return (strcmp(tk, "abs") == 0
+ || strcmp(tk, "floor") == 0
+ || strcmp(tk, "ceil") == 0
+ || strcmp(tk, "sin") == 0
+ || strcmp(tk, "cos") == 0
+ || strcmp(tk, "tan") == 0
+ || strcmp(tk, "arcsin") == 0
+ || strcmp(tk, "arccos") == 0
+ || strcmp(tk, "arctan") == 0
+ || strcmp(tk, "asin") == 0
+ || strcmp(tk, "acos") == 0
+ || strcmp(tk, "atan") == 0
+ || strcmp(tk, "sqrt") == 0
+ || strcmp(tk, "cbrt") == 0
+ || strcmp(tk, "log") == 0
+ || strcmp(tk, "exp") == 0);
+}
+
+Symbol tokenType(token tk)
+{
+ Symbol ret = type(*tk);
+ switch(ret)
+ {
+ case text:
+ if(isFunction(tk))
+ ret = function;
+ else
+ ret = identifier;
+ break;
+ case addop:
+ if(*tk == '-' && strlen(tk) > 1)
+ ret = tokenType(tk+1);
+ break;
+ case decimal:
+ case digit:
+ ret = value;
+ break;
+ }
+ return ret;
+}
+
+int tokenize(char *str, char *(**tokensRef))
+{
+ char** tokens = NULL;
+ char** tmp = NULL;
+ char* ptr = str;
+ char ch = '\0';
+ int numTokens = 0;
+ /* Added by Thomas Roehl (Thomas.Roehl at fau.de) to parse string
+ * in a temporary token to reduce frequent reallocs. newToken
+ * is replaced by tmpToken during parsing. Removed all reallocs
+ * and no longer required mallocs from the original code.
+ */
+ char* tmpToken = malloc((MAXTOKENLENGTH+1) * sizeof(char));
+ if (!tmpToken)
+ {
+ fprintf(stderr, "Malloc of temporary buffer failed\n");
+ return 0;
+ }
+ while(ch = *ptr++)
+ {
+ if(type(ch) == invalid) // Stop tokenizing when we encounter an invalid character
+ break;
+
+ token newToken = NULL;
+ /* Added by Thomas Roehl (Thomas.Roehl at fau.de)
+ * Prepare temporary token for next parsing step */
+ memset(tmpToken, '\0', MAXTOKENLENGTH+1);
+ switch(type(ch))
+ {
+ case addop:
+ {
+ // Check if this is a negative
+ if(ch == '-'
+ && (numTokens == 0
+ || (tokenType(tokens[numTokens-1]) == addop
+ || tokenType(tokens[numTokens-1]) == multop
+ || tokenType(tokens[numTokens-1]) == expop
+ || tokenType(tokens[numTokens-1]) == lparen)))
+ {
+ // Assemble an n-character (plus null-terminator) number token
+ {
+ int len = 1;
+ bool hasDecimal = false;
+ bool hasExponent = false;
+
+ if(type(ch) == decimal) // Allow numbers to start with decimal
+ {
+ //printf("Decimal\n");
+ hasDecimal = true;
+ len++;
+ //newToken = (char*)malloc((len + 1) * sizeof(char));
+ tmpToken[0] = '0';
+ tmpToken[1] = '.';
+ }
+ else // Numbers that do not start with decimal
+ {
+ //newToken = (char*)malloc((len + 1) * sizeof(char)); // Leave room for '\0'
+ tmpToken[len-1] = ch;
+ }
+
+ // Assemble rest of number
+ for(; // Don't change len
+ *ptr // There is a next character and it is not null
+ && len <= MAXTOKENLENGTH
+ && (type(*ptr) == digit // The next character is a digit
+ || ((type(*ptr) == decimal // Or the next character is a decimal
+ && hasDecimal == 0)) // But we have not added a decimal
+ || ((*ptr == 'E' || *ptr == 'e') // Or the next character is an exponent
+ && hasExponent == false) // But we have not added an exponent yet
+ /* Added by Thomas Roehl (Thomas.Roehl at fau.de) to parse scientific notation
+ * with signed exponent correctly
+ */
+ || ((*ptr == '+' || *ptr == '-') && hasExponent == true)); // Exponent with sign
+ ++len)
+ {
+ if(type(*ptr) == decimal)
+ hasDecimal = true;
+ else if(*ptr == 'E' || *ptr == 'e')
+ hasExponent = true;
+ //newToken = (char*)realloc(newToken, (len + 1) * sizeof(char)); // Leave room for '\0'
+ tmpToken[len] = *ptr++;
+ }
+
+ // Append null-terminator
+ tmpToken[len] = '\0';
+ }
+ break;
+ }
+ // If it's not part of a number, it's an op - fall through
+ }
+ case multop:
+ case expop:
+ case lparen:
+ case rparen:
+ case argsep:
+ // Assemble a single-character (plus null-terminator) operation token
+ {
+ //newToken = (char*)malloc(2 * sizeof(char)); // Leave room for '\0'
+ tmpToken[0] = ch;
+ tmpToken[1] = '\0';
+ }
+ break;
+ case digit:
+ case decimal:
+ // Assemble an n-character (plus null-terminator) number token
+ {
+ int len = 1;
+ bool hasDecimal = false;
+ bool hasExponent = false;
+
+ if(type(ch) == decimal) // Allow numbers to start with decimal
+ {
+ //printf("Decimal\n");
+ hasDecimal = true;
+ len++;
+ //newToken = (char*)malloc((len + 1) * sizeof(char));
+ tmpToken[0] = '0';
+ tmpToken[1] = '.';
+ }
+ else // Numbers that do not start with decimal
+ {
+ //newToken = (char*)malloc((len + 1) * sizeof(char)); // Leave room for '\0'
+ tmpToken[len-1] = ch;
+ }
+
+ // Assemble rest of number
+ /* Added support for signed exponents in scientific notation
+ * by Thomas Roehl (Thomas.Roehl at fau.de) as required for LIKWID */
+ for(; // Don't change len
+ *ptr // There is a next character and it is not null
+ && len <= MAXTOKENLENGTH
+ && (type(*ptr) == digit // The next character is a digit
+ || ((type(*ptr) == decimal // Or the next character is a decimal
+ && hasDecimal == false)) // But we have not added a decimal
+ || ((*ptr == 'E' || *ptr == 'e') // Or the next character is an exponent
+ && hasExponent == false) // But we have not added an exponent yet
+ /* Added by Thomas Roehl (Thomas.Roehl at fau.de) to parse scientific notation
+ * with signed exponent correctly
+ */
+ || ((*ptr == '+' || *ptr == '-') && hasExponent == true)); // Exponent with sign
+ ++len)
+ {
+ if(type(*ptr) == decimal)
+ {
+ hasDecimal = true;
+ }
+ else if(*ptr == 'E' || *ptr == 'e')
+ {
+ hasExponent = true;
+ }
+ //newToken = (char*)realloc(newToken, (len + 1) * sizeof(char)); // Leave room for '\0'
+ tmpToken[len] = *ptr++;
+ }
+
+ // Append null-terminator
+ tmpToken[len] = '\0';
+ }
+ break;
+ case text:
+ // Assemble an n-character (plus null-terminator) text token
+ {
+ int len = 1;
+ //newToken = (char*)malloc((len + 1) * sizeof(char)); // Leave room for '\0'
+ tmpToken[0] = ch;
+ for(len = 1; *ptr && type(*ptr) == text && len <= MAXTOKENLENGTH; ++len)
+ {
+ //newToken = (char*)realloc(newToken, (len + 1) * sizeof(char)); // Leave room for '\0'
+ tmpToken[len] = *ptr++;
+ }
+ tmpToken[len] = '\0';
+ }
+ break;
+ }
+ // Add to list of tokens
+ if(tmpToken[0] != '\0')
+ {
+ numTokens++;
+ /*if(tokens == NULL) // First allocation
+ tokens = (char**)malloc(numTokens * sizeof(char*));
+ else*/
+ /* Added by Thomas Roehl (Thomas.Roehl at fau.de)
+ * Allocate new output token and copy temporary token
+ */
+ newToken = malloc((strlen(tmpToken)+1) * sizeof(char));
+ strcpy(newToken, tmpToken);
+ newToken[strlen(tmpToken)] = '\0';
+ tmp = (char**)realloc(tokens, numTokens * sizeof(char*));
+ if (tmp == NULL)
+ {
+ *tokensRef = NULL;
+ free(tmpToken);
+ return 0;
+ }
+ tokens = tmp;
+ tmp = NULL;
+ tokens[numTokens - 1] = newToken;
+ }
+ }
+ *tokensRef = tokens; // Send back out
+ /* Added by Thomas Roehl (Thomas.Roehl at fau.de) */
+ free(tmpToken);
+ return numTokens;
+}
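An editorial sketch of what the tokenizer is expected to produce, including the signed-exponent handling added above (hypothetical driver, not part of the patch; each token and the token array must be freed by the caller, as calculate_infix below does):

    char expr[] = "(2+3e-1)*4";
    char ** toks = NULL;
    int n = tokenize (expr, &toks);
    /* expected: n == 7 and toks holds { "(", "2", "+", "3e-1", ")", "*", "4" } */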
+
+bool leftAssoc(token op)
+{
+ bool ret;
+ switch(tokenType(op))
+ {
+ case addop:
+ case multop:
+ ret = true;
+ break;
+ case expop:
+ ret = false;
+ break;
+ }
+ return ret;
+}
+
+int precedence(token op1, token op2)
+{
+ int ret;
+
+ if(tokenType(op1) == tokenType(op2)) // Equal precedence
+ ret = 0;
+ else if(tokenType(op1) == addop
+ && (tokenType(op2) == multop || tokenType(op2) == expop)) // op1 has lower precedence
+ ret = -1;
+ else if(tokenType(op2) == addop
+ && (tokenType(op1) == multop || tokenType(op1) == expop)) // op1 has higher precedence
+ ret = 1;
+ else if(tokenType(op1) == multop
+ && tokenType(op2) == expop) // op1 has lower precedence
+ ret = -1;
+ else if(tokenType(op1) == expop
+ && tokenType(op2) == multop) // op1 has higher precedence
+ ret = 1;
+
+ return ret;
+}
+
+int evalStackPush(Stack *s, token val)
+{
+ /* Added by Thomas Roehl (Thomas.Roehl at fau.de) to return
+ * calculation errors. Function now returns an int.
+ */
+ int ret = 0;
+ if(prefs.display.postfix)
+ printf("\t%s\n", val);
+
+ switch(tokenType(val))
+ {
+ case function:
+ {
+ token operand, res;
+ operand = (token)stackPop(s);
+ res = doFunc(operand, val);
+ //free(operand);
+ stackPush(s, res);
+ }
+ break;
+ case expop:
+ case multop:
+ case addop:
+ {
+ if(stackSize(s) >= 2)
+ {
+ // Pop two operands
+ token l, r, res;
+ r = (token)stackPop(s);
+ l = (token)stackPop(s);
+
+ // Evaluate
+ /* Added return value by Thomas Roehl (Thomas.Roehl at fau.de) */
+ ret = doOp(l, val, r, &res);
+ // Push result
+ stackPush(s, res);
+ /* Added by Thomas Roehl (Thomas.Roehl at fau.de)
+ * Keeping track of the intermediate results
+ */
+ calcTokens[nrCalcTokens] = res;
+ nrCalcTokens++;
+ }
+ else
+ {
+ stackPush(s, val);
+ }
+ }
+ break;
+ case value:
+ {
+ stackPush(s, val);
+ }
+ break;
+ }
+ /* Return value by Thomas Roehl (Thomas.Roehl at fau.de) */
+ return ret;
+}
+
+int postfix(token *tokens, int numTokens, Stack *output)
+{
+ Stack operators;
+ int i;
+ int err = 0;
+ stackInit(&operators, 2*numTokens);
+ for(i = 0; i < numTokens; i++)
+ {
+ // From Wikipedia/Shunting-yard_algorithm:
+ switch(tokenType(tokens[i]))
+ {
+ case value:
+ {
+ // If the token is a number, then add it to the output queue.
+ //printf("Adding number to output stack\n");
+ err = evalStackPush(output, tokens[i]);
+ }
+ break;
+ case function:
+ {
+ // If the token is a function token, then push it onto the stack.
+ stackPush(&operators, tokens[i]);
+ }
+ break;
+ case argsep:
+ {
+ /*
+ * If the token is a function argument separator (e.g., a comma):
+ * Until the token at the top of the stack is a left
+ * paren, pop operators off the stack onto the output
+ * queue. If no left paren encountered, either separator
+ * was misplaced or parens mismatched.
+ */
+ while(stackSize(&operators) > 0
+ && tokenType((token)stackTop(&operators)) != lparen
+ && stackSize(&operators) > 1
+ && err == 0)
+ {
+ //printf("Moving operator from operator stack to output stack\n");
+ token t = (token)stackPop(&operators);
+ err = evalStackPush(output, t);
+ //free(t);
+ }
+ if(stackSize(&operators) > 0
+ && tokenType((token)stackTop(&operators)) != lparen)
+ {
+ err = -1;
+ /* Changed by Thomas Roehl */
+ //raise(parenMismatch);
+ }
+ //printf("Removing left paren from operator stack\n");
+ token t = stackPop(&operators); // Discard lparen
+ //free(t);
+ }
+ break;
+ case addop:
+ case multop:
+ case expop:
+ {
+ /*
+ * If the token is an operator, op1, then:
+ * while there is an operator token, op2, at the top of the stack, and
+ * either op1 is left-associative and its precedence is less than or equal to that of op2,
+ * or op1 is right-associative and its precedence is less than that of op2,
+ * pop op2 off the stack, onto the output queue
+ * push op1 onto the stack
+ */
+ while(stackSize(&operators) > 0
+ && (tokenType((char*)stackTop(&operators)) == addop || tokenType((char*)stackTop(&operators)) == multop || tokenType((char*)stackTop(&operators)) == expop)
+ && ((leftAssoc(tokens[i]) && precedence(tokens[i], (char*)stackTop(&operators)) <= 0)
+ || (!leftAssoc(tokens[i]) && precedence(tokens[i], (char*)stackTop(&operators)) < 0))
+ && err == 0)
+ {
+ //printf("Moving operator from operator stack to output stack\n");
+ token t = (token)stackPop(&operators);
+ err = evalStackPush(output, t);
+ //free(t);
+ }
+ //printf("Adding operator to operator stack\n");
+ stackPush(&operators, tokens[i]);
+ }
+ break;
+ case lparen:
+ {
+ // If the token is a left paren, then push it onto the stack
+ //printf("Adding left paren to operator stack\n");
+ stackPush(&operators, tokens[i]);
+ }
+ break;
+ case rparen:
+ {
+ /*
+ * If the token is a right paren:
+ * Until the token at the top of the stack is a left paren, pop operators off the stack onto the output queue
+ * Pop the left paren from the stack, but not onto the output queue
+ * If the stack runs out without finding a left paren, then there are mismatched parens
+ */
+ while(stackSize(&operators) > 0
+ && tokenType((token)stackTop(&operators)) != lparen
+ && stackSize(&operators) > 1
+ && err == 0)
+ {
+ //printf("Moving operator from operator stack to output stack\n");
+ token t = (token)stackPop(&operators);
+ err = evalStackPush(output, t);
+ //free(t);
+ }
+ if(stackSize(&operators) > 0
+ && tokenType((token)stackTop(&operators)) != lparen)
+ {
+ err = -1;
+ /* Changed by Thomas Roehl */
+ //raise(parenMismatch);
+ }
+ //printf("Removing left paren from operator stack\n");
+ token t = (token)stackPop(&operators);
+ //stackPop(&operators); // Discard lparen
+ //free(t);
+ }
+ break;
+ }
+ if (err)
+ break;
+ }
+ /*
+ * When there are no more tokens to read:
+ * While there are still operator tokens on the stack:
+ * If the operator token on the top of the stack is a paren, then there are mismatched parens
+ * Pop the operator onto the output queue
+ */
+ while(stackSize(&operators) > 0)
+ {
+ if(tokenType((token)stackTop(&operators)) == lparen)
+ {
+ /* Changed by Thomas Roehl */
+ //raise(parenMismatch);
+ err = -1;
+ }
+ //printf("Moving operator from operator stack to output stack\n");
+ token t = (token)stackPop(&operators);
+ err = evalStackPush(output, t);
+ //free(t);
+ }
+ stackFree(&operators);
+ return err;
+}
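As a quick sanity check of the shunting-yard handling above: conceptually, the infix tokens 3 + 4 * 2 reach the output in the order 3 4 2 * + (because * binds tighter than +), while ( 3 + 4 ) * 2 yields 3 4 + 2 *. In this implementation the output stack evaluates those operations on the fly via evalStackPush rather than storing a postfix string; the postfix order is only printed when prefs.display.postfix is set.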
+
+
+
+/* Added by Thomas Roehl (Thomas.Roehl at fau.de) as interface for LIKWID */
+int calculate_infix(char* finfix, double *result)
+{
+ int i;
+ int ret = 0;
+ *result = 0;
+ token* tokens = NULL;
+ Stack expr;
+ nrCalcTokens = 0;
+ int numTokens = tokenize(finfix, &tokens);
+ calcTokens = (token*)malloc(2 * numTokens * sizeof(token));
+ if (calcTokens == NULL)
+ {
+ ret = -1;
+ *result = NAN;
+ }
+ memset(calcTokens, 0, 2 * numTokens * sizeof(token));
+ stackInit(&expr, 2*numTokens);
+ ret = postfix(tokens, numTokens, &expr);
+ if ((stackSize(&expr) != 1) || (ret < 0))
+ {
+ *result = NAN;
+ goto calcerror;
+ }
+ else
+ {
+ *result = strtod((char*)stackTop(&expr), NULL);
+ }
+ ret = 0;
+calcerror:
+ for (i=0;i<nrCalcTokens; i++)
+ {
+ if (calcTokens[i] != NULL)
+ free(calcTokens[i]);
+ }
+ if (calcTokens)
+ free(calcTokens);
+ calcTokens = NULL;
+ nrCalcTokens = 0;
+ for (i=0;i<numTokens;i++)
+ {
+ if (tokens[i])
+ {
+ free(tokens[i]);
+ }
+ }
+ if (tokens)
+ {
+ free(tokens);
+ tokens = NULL;
+ numTokens = 0;
+ }
+ stackFree(&expr);
+ return ret;
+}
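A minimal editorial usage sketch of the LIKWID-facing entry point defined above:

    char expr[] = "(2.0+3.0)*4.0";
    double result = 0.0;
    calculate_infix (expr, &result);   /* result == 20.0 on success; set to NAN on a calculation error */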
+
+
diff --git a/src/calculator_stack.c b/src/calculator_stack.c
new file mode 100644
index 0000000..e14acee
--- /dev/null
+++ b/src/calculator_stack.c
@@ -0,0 +1,77 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: calculator_stack.c
+ *
+ * Description: Stack implementation for infix calculator
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Brandon Mills (bm), mills.brandont at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <calculator_stack.h>
+
+void stackInit(Stack *s, int size)
+{
+ s->content = malloc(size * sizeof(void*));
+ s->size = size;
+ s->top = -1;
+}
+
+void stackPush(Stack *s, void* val)
+{
+ (s->top)++;
+ s->content[s->top] = val;
+}
+
+void* stackTop(Stack *s)
+{
+ void *ret = NULL;
+ if(s->top >= 0 && s->content != NULL)
+ ret = s->content[s->top];
+ return ret;
+}
+
+void* stackPop(Stack *s)
+{
+ void *ret = NULL;
+ if(s->top >= 0 && s->content != NULL)
+ ret = s->content[(s->top)--];
+ return ret;
+}
+
+int stackSize(Stack *s)
+{
+ return s->top + 1;
+}
+
+void stackFree(Stack *s)
+{
+ if (s->content)
+ free(s->content);
+ s->content = NULL;
+ s->size = 0;
+ s->top = -1;
+}
+
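A minimal editorial sketch of the stack API above. Note that stackPush does not grow the backing array, so the size passed to stackInit must be an upper bound on the number of pushes (the calculator sizes it as 2*numTokens):

    Stack s;
    stackInit (&s, 4);
    stackPush (&s, "a");
    stackPush (&s, "b");
    /* stackSize(&s) == 2; stackTop(&s) returns the pointer pushed last ("b") */
    stackPop (&s);
    stackFree (&s);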
diff --git a/src/configuration.c b/src/configuration.c
new file mode 100644
index 0000000..f7a9357
--- /dev/null
+++ b/src/configuration.c
@@ -0,0 +1,339 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: configuration.c
+ *
+ * Description: Configuration file module.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+
+
+
+#include <configuration.h>
+
+Configuration config = {NULL,NULL,NULL,NULL,-1,MAX_NUM_THREADS,MAX_NUM_NODES};
+int init_config = 0;
+
+static int daemonPath_len = 0;
+static int groupPath_len = 0;
+
+static int default_configuration(void)
+{
+ int ret = 0;
+ char filename[1024] = { [0 ... 1023] = '\0' };
+ char *fptr = NULL;
+ size_t len = 0;
+ filename[0] = '\0';
+ if (ACCESSMODE == 0)
+ {
+ config.daemonMode = ACCESSMODE_DIRECT;
+ init_config = 1;
+ return 0;
+ }
+ config.daemonMode = ACCESSMODE_DAEMON;
+
+ groupPath_len = strlen(TOSTRING(GROUPPATH))+10;
+ config.groupPath = malloc(groupPath_len+1);
+ ret = snprintf(config.groupPath, groupPath_len, "%s", TOSTRING(GROUPPATH));
+ config.groupPath[ret] = '\0';
+
+
+ FILE* fp = popen("which likwid-accessD 2>/dev/null | tr -d '\n'","r");
+ if (fp == NULL)
+ {
+ goto use_hardcoded;
+ }
+ ret = getline(&fptr, &len, fp);
+ if (ret < 0)
+ {
+ fclose(fp);
+ if (fptr)
+ free(fptr);
+ goto use_hardcoded;
+ }
+ if (!access(fptr, X_OK))
+ {
+ config.daemonPath = (char*)malloc((len+1) * sizeof(char));
+ strncpy(config.daemonPath, fptr, len);
+ config.daemonPath[len] = '\0';
+ if (fptr)
+ free(fptr);
+ }
+ else
+ {
+ fprintf(stderr, "Found access daemon at %s but it is not executable, using compiled in daemon path.\n", fptr);
+ fclose(fp);
+ if (fptr)
+ free(fptr);
+ goto use_hardcoded;
+ }
+ init_config = 1;
+ pclose(fp);
+ return 0;
+use_hardcoded:
+ ret = sprintf(filename,"%s", TOSTRING(ACCESSDAEMON));
+ filename[ret] = '\0';
+ if (!access(filename, X_OK))
+ {
+ config.daemonPath = (char*)malloc((strlen(filename)+1) * sizeof(char));
+ strcpy(config.daemonPath, filename);
+ init_config = 1;
+ }
+ else
+ {
+ ERROR_PLAIN_PRINT(Unable to get path to access daemon. Ensure that the folder containing likwid-accessD is in your PATH or that the daemon was installed at the compiled-in location);
+ exit(EXIT_FAILURE);
+ }
+ return 0;
+}
+
+int init_configuration(void)
+{
+ int i;
+ FILE* fp;
+ char line[512];
+ char name[128];
+ char value[256];
+ char filename[1024];
+ filename[0] = '\0';
+ char preconfigured[1024];
+ preconfigured[0] = '\0';
+ if (init_config == 1)
+ {
+ return 0;
+ }
+ sprintf(preconfigured, "%s%s",TOSTRING(INSTALL_PREFIX),TOSTRING(CFGFILE));
+
+ if (access(preconfigured, R_OK) != 0)
+ {
+ if (access(TOSTRING(CFGFILE), R_OK) != 0)
+ {
+ if (!access("/etc/likwid.cfg",R_OK))
+ {
+ sprintf(filename,"%s", "/etc/likwid.cfg");
+ }
+ }
+ else
+ {
+ sprintf(filename,"%s",TOSTRING(CFGFILE));
+ }
+ }
+ else
+ {
+ sprintf(filename, "%s",preconfigured);
+ }
+
+ if ((config.topologyCfgFileName == NULL) && (strlen(filename) == 0))
+ {
+ if (!access(TOSTRING(TOPOFILE), R_OK))
+ {
+ preconfigured[0] = '\0';
+ sprintf(preconfigured,"%s", TOSTRING(TOPOFILE));
+ }
+ else
+ {
+ sprintf(preconfigured, "%s%s",TOSTRING(INSTALL_PREFIX),TOSTRING(TOPOFILE));
+ if (access(preconfigured, R_OK))
+ {
+ preconfigured[0] = '\0';
+ }
+ }
+ if (preconfigured[0] != '\0')
+ {
+ config.topologyCfgFileName = (char*)malloc((strlen(preconfigured)+1) * sizeof(char));
+ strcpy(config.topologyCfgFileName, preconfigured);
+ config.topologyCfgFileName[strlen(preconfigured)] = '\0';
+ }
+ }
+
+ if ((strlen(filename) == 0) || (access(filename, R_OK) != 0))
+ {
+ return default_configuration();
+ }
+ DEBUG_PRINT(DEBUGLEV_INFO, Reading configuration from %s, filename);
+ // Copy determined config filename to struct
+ config.configFileName = malloc((strlen(filename)+1)*sizeof(char));
+ strcpy(config.configFileName, filename);
+ config.configFileName[strlen(filename)] = '\0';
+
+ fp = fopen(config.configFileName, "r");
+ if (fp == NULL)
+ {
+ DEBUG_PLAIN_PRINT(DEBUGLEV_INFO, Using compile-time configuration)
+ return default_configuration();
+ }
+ DEBUG_PRINT(DEBUGLEV_INFO, Reading configuration from %s, filename)
+ while (fgets(line, 512, fp) != NULL) {
+ if (sscanf(line,"%s = %s", name, value) != 2)
+ {
+ continue;
+ }
+ if (strncmp(name, "#", 1) == 0)
+ {
+ continue;
+ }
+ if (strcmp(name, "topology_file") == 0)
+ {
+ config.topologyCfgFileName = (char*)malloc((strlen(value)+1) * sizeof(char));
+ strcpy(config.topologyCfgFileName, value);
+ config.topologyCfgFileName[strlen(value)] = '\0';
+ }
+ else if (strcmp(name, "daemon_path") == 0)
+ {
+ config.daemonPath = (char*)malloc((strlen(value)+1) * sizeof(char));
+ strcpy(config.daemonPath, value);
+ config.daemonPath[strlen(value)] = '\0';
+ if (access(config.daemonPath, R_OK))
+ {
+ if (default_configuration() < 0)
+ {
+ ERROR_PLAIN_PRINT(Unable to get path to access daemon);
+ exit(EXIT_FAILURE);
+ }
+ }
+ }
+ else if (strcmp(name, "groupPath") == 0)
+ {
+ struct stat st;
+ stat(value, &st);
+ if (S_ISDIR(st.st_mode))
+ {
+ config.groupPath = (char*)malloc((strlen(value)+1) * sizeof(char));
+ strcpy(config.groupPath, value);
+ config.groupPath[strlen(value)] = '\0';
+ }
+ else
+ {
+ ERROR_PRINT(Path to group files %s is not a directory, value);
+ exit(EXIT_FAILURE);
+ }
+ }
+ else if (strcmp(name, "daemon_mode") == 0)
+ {
+ if (strcmp(value, "daemon") == 0)
+ {
+ config.daemonMode = ACCESSMODE_DAEMON;
+ }
+ else if (strcmp(value, "direct") == 0)
+ {
+ config.daemonMode = ACCESSMODE_DIRECT;
+ }
+ }
+ else if (strcmp(name, "max_threads") == 0)
+ {
+ config.maxNumThreads = atoi(value);
+ }
+ else if (strcmp(name, "max_nodes") == 0)
+ {
+ config.maxNumNodes = atoi(value);
+ }
+ }
+
+
+ init_config = 1;
+
+ fclose(fp);
+ return 0;
+}
+
+Configuration_t get_configuration(void)
+{
+ if (init_config == 1)
+ {
+ return &config;
+ }
+ return NULL;
+}
+
+int destroy_configuration(void)
+{
+ if (init_config == 0)
+ {
+ return -EFAULT;
+ }
+ if (config.configFileName != NULL)
+ {
+ free(config.configFileName);
+ }
+ if (config.groupPath != NULL)
+ {
+ free(config.groupPath);
+ }
+ if (config.topologyCfgFileName != NULL)
+ {
+ free(config.topologyCfgFileName);
+ }
+ if (config.daemonMode != ACCESSMODE_DIRECT)
+ {
+ if (config.daemonPath != NULL)
+ {
+ free(config.daemonPath);
+ }
+ }
+ init_config = 0;
+ return 0;
+}
+
+int config_setGroupPath(char* path)
+{
+ int ret = 0;
+ struct stat st;
+ char* new;
+ stat(path, &st);
+ if (S_ISDIR(st.st_mode))
+ {
+ if (strlen(path)+1 > groupPath_len)
+ {
+ new = malloc(strlen(path)+1);
+ if (new == NULL)
+ {
+ printf("Cannot allocate space for new group path\n");
+ return -ENOMEM;
+ }
+ ret = sprintf(new, "%s", path);
+ new[ret] = '\0';
+ if (config.groupPath)
+ free(config.groupPath);
+ config.groupPath = new;
+ groupPath_len = strlen(path);
+ }
+ else
+ {
+ ret = snprintf(config.groupPath, groupPath_len, "%s", path);
+ config.groupPath[ret] = '\0';
+ }
+ return 0;
+ }
+ printf("Given path is no directory\n");
+ return -ENOTDIR;
+}
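For orientation, the intended call sequence of the configuration module above is init_configuration() once, get_configuration() to inspect the resolved settings, and destroy_configuration() at shutdown. A minimal sketch, assuming these functions and the Configuration_t handle are exported through likwid.h as in the rest of this commit:

    #include <stdio.h>
    #include <likwid.h>

    int main(void)
    {
        /* Parses likwid.cfg if found, otherwise falls back to the compiled-in defaults. */
        if (init_configuration() != 0)
            return 1;
        Configuration_t cfg = get_configuration();   /* NULL if not initialized */
        if (cfg != NULL)
        {
            printf("access mode : %s\n", cfg->daemonMode == ACCESSMODE_DAEMON ? "accessdaemon" : "direct");
            if (cfg->daemonPath)
                printf("daemon path : %s\n", cfg->daemonPath);
            if (cfg->groupPath)
                printf("group path  : %s\n", cfg->groupPath);
        }
        destroy_configuration();
        return 0;
    }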
diff --git a/src/cpuFeatures.c b/src/cpuFeatures.c
index 4733a82..e3ecfdc 100644
--- a/src/cpuFeatures.c
+++ b/src/cpuFeatures.c
@@ -9,13 +9,13 @@
* Allows to turn on and off the Hardware prefetcher
* available.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -37,17 +37,18 @@
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
-
+#include <errno.h>
#include <types.h>
-#include <msr.h>
-#include <cpuid.h>
+#include <access.h>
+#include <topology.h>
#include <registers.h>
#include <textcolor.h>
-#include <cpuFeatures.h>
+#include <likwid.h>
/* ##### EXPORTED VARIABLES ########################################### */
-CpuFeatureFlags cpuFeatureFlags;
+static uint64_t cpuFeatureMask[MAX_NUM_THREADS] = {0x0ULL};
+static int features_initialized = 0;
/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
@@ -59,341 +60,501 @@ CpuFeatureFlags cpuFeatureFlags;
#define TEST_FLAG(feature,flag) \
if (flags & (1ULL<<(flag))) \
- { \
- cpuFeatureFlags.feature = 1; \
- } \
- else \
- { \
- cpuFeatureFlags.feature = 0; \
+ { \
+ cpuFeatureMask[cpu] |= (1ULL<<feature); \
+ } \
+ else \
+ { \
+ cpuFeatureMask[cpu] &= ~(1ULL<<feature); \
}
-
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-
-void
-cpuFeatures_init(int cpu)
-{
- uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
-
- TEST_FLAG(fastStrings,0);
- TEST_FLAG(thermalControl,3);
- TEST_FLAG(perfMonitoring,7);
- TEST_FLAG(branchTraceStorage,11);
- TEST_FLAG(pebs,12);
- TEST_FLAG(speedstep,16);
- TEST_FLAG(monitor,18);
- TEST_FLAG(cpuidMaxVal,22);
- TEST_FLAG(xdBit,34);
-
- if ((cpuid_info.model == NEHALEM) ||
- (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
- (cpuid_info.model == NEHALEM_LYNNFIELD) ||
- (cpuid_info.model == NEHALEM_WESTMERE) ||
- (cpuid_info.model == NEHALEM_WESTMERE_M) ||
- (cpuid_info.model == NEHALEM_EX))
- {
- /*Nehalem */
- TEST_FLAG(turboMode,38);
- TEST_FLAG(hardwarePrefetcher,9);
- TEST_FLAG(clPrefetcher,19);
- TEST_FLAG(dcuPrefetcher,37);
- TEST_FLAG(ipPrefetcher,39);
- }
- else if ((cpuid_info.model == CORE2_45) ||
- (cpuid_info.model == CORE2_65))
- {
- /*Core 2*/
- TEST_FLAG(hardwarePrefetcher,9);
- TEST_FLAG(ferrMultiplex,10);
- TEST_FLAG(clPrefetcher,19);
- TEST_FLAG(speedstepLock,20);
- TEST_FLAG(dcuPrefetcher,37);
- TEST_FLAG(dynamicAcceleration,38);
- TEST_FLAG(ipPrefetcher,39);
+#define TEST_FLAG_INV(feature,flag) \
+ if (flags & (1ULL<<(flag))) \
+ { \
+ cpuFeatureMask[cpu] &= ~(1ULL<<feature); \
+ } \
+ else \
+ { \
+ cpuFeatureMask[cpu] |= (1ULL<<feature); \
}
- /*
- printf("FLAGS: 0x%llX \n",flags);
- */
-}
+#define IF_FLAG(feature) (cpuFeatureMask[cpu] & (1ULL<<feature))
-void
-cpuFeatures_print(int cpu)
-{
- uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
- printf(HLINE);
- printf("Fast-Strings: \t\t\t");
- if (flags & 1)
- {
- PRINT_VALUE(GREEN,enabled);
- }
- else
+/* ##### FUNCTIONS - LOCAL TO THIS SOURCE FILE ######################### */
+static void cpuFeatures_update(int cpu)
+{
+ int ret;
+ uint64_t flags = 0x0ULL;
+ ret = HPMread(cpu, MSR_DEV, MSR_IA32_MISC_ENABLE, &flags);
+ if (ret != 0)
{
- PRINT_VALUE(RED,disabled);
+ fprintf(stderr, "Cannot read register 0x%X on cpu %d: err %d\n", MSR_IA32_MISC_ENABLE, cpu, ret);
}
- printf("Automatic Thermal Control: \t");
- if (flags & (1ULL<<3))
+ /*cpuFeatureFlags.fastStrings = 0;
+ cpuFeatureFlags.thermalControl = 0;
+ cpuFeatureFlags.perfMonitoring = 0;
+ cpuFeatureFlags.hardwarePrefetcher = 0;
+ cpuFeatureFlags.ferrMultiplex = 0;
+ cpuFeatureFlags.branchTraceStorage = 0;
+ cpuFeatureFlags.pebs = 0;
+ cpuFeatureFlags.speedstep = 0;
+ cpuFeatureFlags.monitor = 0;
+ cpuFeatureFlags.clPrefetcher = 0;
+ cpuFeatureFlags.speedstepLock = 0;
+ cpuFeatureFlags.cpuidMaxVal = 0;
+ cpuFeatureFlags.xdBit = 0;
+ cpuFeatureFlags.dcuPrefetcher = 0;
+ cpuFeatureFlags.dynamicAcceleration = 0;
+ cpuFeatureFlags.turboMode = 0;
+ cpuFeatureFlags.ipPrefetcher = 0;*/
+
+ TEST_FLAG(FEAT_FAST_STRINGS,0);
+ TEST_FLAG(FEAT_THERMAL_CONTROL,3);
+ TEST_FLAG(FEAT_PERF_MON,7);
+ TEST_FLAG_INV(FEAT_BRANCH_TRACE_STORAGE,11);
+ TEST_FLAG_INV(FEAT_PEBS,12);
+ TEST_FLAG(FEAT_SPEEDSTEP,16);
+ TEST_FLAG(FEAT_MONITOR,18);
+ TEST_FLAG(FEAT_CPUID_MAX_VAL,22);
+ TEST_FLAG_INV(FEAT_XTPR_MESSAGE, 23);
+ TEST_FLAG_INV(FEAT_XD_BIT,34);
+
+ if ((cpuid_info.model == CORE2_45) ||
+ (cpuid_info.model == CORE2_65))
{
- PRINT_VALUE(GREEN,enabled);
+ TEST_FLAG_INV(FEAT_HW_PREFETCHER,9);
+ TEST_FLAG(FEAT_FERR_MULTIPLEX,10);
+ TEST_FLAG(FEAT_TM2,13);
+ TEST_FLAG_INV(FEAT_CL_PREFETCHER,19);
+ TEST_FLAG(FEAT_SPEEDSTEP_LOCK,20);
+ TEST_FLAG_INV(FEAT_DCU_PREFETCHER,37);
+ TEST_FLAG_INV(FEAT_DYN_ACCEL,38);
+ TEST_FLAG_INV(FEAT_IP_PREFETCHER,39);
}
- else
+ else if ((cpuid_info.model == NEHALEM) ||
+ (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
+ (cpuid_info.model == NEHALEM_LYNNFIELD) ||
+ (cpuid_info.model == NEHALEM_WESTMERE) ||
+ (cpuid_info.model == NEHALEM_WESTMERE_M) ||
+ (cpuid_info.model == NEHALEM_EX) ||
+ (cpuid_info.model == WESTMERE_EX) ||
+ (cpuid_info.model == ATOM_SILVERMONT_E) ||
+ (cpuid_info.model == ATOM_SILVERMONT_C) ||
+ (cpuid_info.model == ATOM_SILVERMONT_Z1) ||
+ (cpuid_info.model == ATOM_SILVERMONT_Z2) ||
+ (cpuid_info.model == ATOM_SILVERMONT_F) ||
+ (cpuid_info.model == ATOM_SILVERMONT_AIR) ||
+ (cpuid_info.model == SANDYBRIDGE) ||
+ (cpuid_info.model == SANDYBRIDGE_EP) ||
+ (cpuid_info.model == IVYBRIDGE) ||
+ (cpuid_info.model == IVYBRIDGE_EP) ||
+ (cpuid_info.model == HASWELL) ||
+ (cpuid_info.model == HASWELL_M1) ||
+ (cpuid_info.model == HASWELL_M2) ||
+ (cpuid_info.model == HASWELL_EP) ||
+ (cpuid_info.model == BROADWELL) ||
+ (cpuid_info.model == BROADWELL_D) ||
+ (cpuid_info.model == BROADWELL_E) ||
+ (cpuid_info.model == SKYLAKE1) ||
+ (cpuid_info.model == SKYLAKE2))
{
- PRINT_VALUE(RED,disabled);
+ TEST_FLAG_INV(FEAT_TURBO_MODE,38);
}
- printf("Performance monitoring: \t");
- if (flags & (1ULL<<7))
- {
- PRINT_VALUE(GREEN,enabled);
- }
- else
+ if ((cpuid_info.model == NEHALEM) ||
+ (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
+ (cpuid_info.model == NEHALEM_LYNNFIELD) ||
+ (cpuid_info.model == NEHALEM_WESTMERE) ||
+ (cpuid_info.model == NEHALEM_WESTMERE_M) ||
+ (cpuid_info.model == NEHALEM_EX) ||
+ (cpuid_info.model == WESTMERE_EX) ||
+ (cpuid_info.model == SANDYBRIDGE) ||
+ (cpuid_info.model == SANDYBRIDGE_EP) ||
+ (cpuid_info.model == IVYBRIDGE) ||
+ (cpuid_info.model == IVYBRIDGE_EP) ||
+ (cpuid_info.model == HASWELL) ||
+ (cpuid_info.model == HASWELL_M1) ||
+ (cpuid_info.model == HASWELL_M2) ||
+ (cpuid_info.model == HASWELL_EP) ||
+ (cpuid_info.model == BROADWELL) ||
+ (cpuid_info.model == BROADWELL_D) ||
+ (cpuid_info.model == BROADWELL_E) ||
+ (cpuid_info.model == SKYLAKE1) ||
+ (cpuid_info.model == SKYLAKE2))
{
- PRINT_VALUE(RED,disabled);
+ ret = HPMread(cpu, MSR_DEV, MSR_PREFETCH_ENABLE, &flags);
+ if (ret != 0)
+ {
+ fprintf(stderr, "Cannot read register 0x%X on cpu %d: err %d\n", MSR_PREFETCH_ENABLE, cpu, ret);
+ }
+ TEST_FLAG_INV(FEAT_IP_PREFETCHER,3);
+ TEST_FLAG_INV(FEAT_DCU_PREFETCHER,2);
+ TEST_FLAG_INV(FEAT_CL_PREFETCHER,1);
+ TEST_FLAG_INV(FEAT_HW_PREFETCHER,0);
}
- printf("Branch Trace Storage: \t\t");
+}
- if (flags & (1ULL<<11))
- {
- PRINT_VALUE(RED,notsupported);
- }
- else
- {
- PRINT_VALUE(GREEN,supported);
- }
+static char* cpuFeatureNames[CPUFEATURES_MAX] = {
+ [FEAT_HW_PREFETCHER] = "Hardware Prefetcher",
+ [FEAT_IP_PREFETCHER] = "IP Prefetcher",
+ [FEAT_DCU_PREFETCHER] = "DCU Pretecher",
+ [FEAT_CL_PREFETCHER] = "Adjacent Cache Line Prefetcher",
+ [FEAT_FAST_STRINGS] = "Fast-Strings",
+ [FEAT_THERMAL_CONTROL] = "Automatic Thermal Control Circuit",
+ [FEAT_PERF_MON] = "Performance Monitoring",
+ [FEAT_BRANCH_TRACE_STORAGE] = "Branch Trace Storage",
+ [FEAT_PEBS] = "Precise Event Based Sampling (PEBS)",
+ [FEAT_SPEEDSTEP] = "Enhanced Intel SpeedStep Technology",
+ [FEAT_MONITOR] = "MONITOR/MWAIT",
+ [FEAT_CPUID_MAX_VAL] = "Limit CPUID Maxval",
+ [FEAT_XD_BIT] = "Execute Disable Bit",
+ [FEAT_TURBO_MODE] = "Intel Turbo Mode",
+ [FEAT_DYN_ACCEL] = "Intel Dynamic Acceleration",
+ [FEAT_FERR_MULTIPLEX] = "FERR# Multiplexing",
+ [FEAT_XTPR_MESSAGE] = "xTPR Message",
+ [FEAT_TM2] = "Thermal Monitoring 2",
+ [FEAT_SPEEDSTEP_LOCK] = "Enhanced Intel SpeedStep Technology Select Lock",
+};
- printf("PEBS: \t\t\t\t");
- if (flags & (1ULL<<12))
- {
- PRINT_VALUE(RED,notsupported);
- }
- else
- {
- PRINT_VALUE(GREEN,supported);
- }
+/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
- printf("Intel Enhanced SpeedStep: \t");
- if (flags & (1ULL<<16))
- {
- PRINT_VALUE(GREEN,enabled);
- }
- else
+void
+cpuFeatures_init()
+{
+ int i;
+ if (features_initialized)
{
- PRINT_VALUE(RED,disabled);
+ return;
}
- printf("MONITOR/MWAIT: \t\t\t");
- if (flags & (1ULL<<18))
+ topology_init();
+ if (!HPMinitialized())
{
- PRINT_VALUE(GREEN,supported);
+ HPMinit();
+
}
- else
+ for (i = 0; i < cpuid_topology.numHWThreads; i++)
{
- PRINT_VALUE(RED,notsupported);
+ HPMaddThread(cpuid_topology.threadPool[i].apicId);
+ cpuFeatures_update(cpuid_topology.threadPool[i].apicId);
}
- printf("Limit CPUID Maxval: \t\t");
- if (flags & (1ULL<<22))
- {
- PRINT_VALUE(RED,enabled);
- }
- else
- {
- PRINT_VALUE(GREEN,disabled);
- }
+
+ features_initialized = 1;
+}
- printf("XD Bit Disable: \t\t");
- if (flags & (1ULL<<34))
- {
- PRINT_VALUE(RED,disabled);
- }
- else
+void
+cpuFeatures_print(int cpu)
+{
+ int i;
+ uint64_t flags = 0x0ULL;
+ if (!features_initialized)
{
- PRINT_VALUE(GREEN,enabled);
+ return;
}
- if ((cpuid_info.model == NEHALEM) ||
- (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
- (cpuid_info.model == NEHALEM_LYNNFIELD) ||
- (cpuid_info.model == NEHALEM_WESTMERE) ||
- (cpuid_info.model == NEHALEM_WESTMERE_M) ||
- (cpuid_info.model == NEHALEM_EX) ||
- (cpuid_info.model == CORE2_45) ||
- (cpuid_info.model == CORE2_65))
- {
- printf("IP Prefetcher: \t\t\t");
- if (flags & (1ULL<<39))
- {
- PRINT_VALUE(RED,disabled);
- }
- else
- {
- PRINT_VALUE(GREEN,enabled);
- }
+ cpuFeatures_update(cpu);
- printf("Hardware Prefetcher: \t\t");
- if (flags & (1ULL<<9))
- {
- PRINT_VALUE(RED,disabled);
- }
- else
- {
- PRINT_VALUE(GREEN,enabled);
- }
- printf("Adjacent Cache Line Prefetch: \t");
- if (flags & (1ULL<<19))
- {
- PRINT_VALUE(RED,disabled);
- }
- else
- {
- PRINT_VALUE(GREEN,enabled);
- }
-
- printf("DCU Prefetcher: \t\t");
- if (flags & (1ULL<<37))
- {
- PRINT_VALUE(RED,disabled);
- }
- else
- {
- PRINT_VALUE(GREEN,enabled);
- }
- }
-
- if ((cpuid_info.model == NEHALEM) ||
- (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
- (cpuid_info.model == NEHALEM_LYNNFIELD) ||
- (cpuid_info.model == NEHALEM_WESTMERE) ||
- (cpuid_info.model == NEHALEM_WESTMERE_M) ||
- (cpuid_info.model == NEHALEM_EX))
+ printf(HLINE);
+ for (i=0;i<CPUFEATURES_MAX; i++)
{
- printf("Intel Turbo Mode: \t\t");
- if (flags & (1ULL<<38))
+ if ((cpuid_info.model != CORE2_45) &&
+ (cpuid_info.model != CORE2_65) &&
+ ((i == FEAT_FERR_MULTIPLEX) ||
+ (i == FEAT_DYN_ACCEL) ||
+ (i == FEAT_SPEEDSTEP_LOCK) ||
+ (i == FEAT_TM2)))
{
- PRINT_VALUE(RED,disabled);
+ continue;
}
- else
+ printf("%-48s: ",cpuFeatureNames[i]);
+ if (IF_FLAG(i))
{
- PRINT_VALUE(GREEN,enabled);
- }
- }
- else if ((cpuid_info.model == CORE2_45) ||
- (cpuid_info.model == CORE2_65))
- {
-
- printf("Intel Dynamic Acceleration: \t");
- if (flags & (1ULL<<38))
- {
- PRINT_VALUE(RED,disabled);
+ PRINT_VALUE(GREEN, enabled);
}
else
{
- PRINT_VALUE(GREEN,enabled);
+ PRINT_VALUE(RED,disabled);
}
}
-
printf(HLINE);
}
-void
-cpuFeatures_enable(int cpu, CpuFeature type)
+int
+cpuFeatures_enable(int cpu, CpuFeature type, int print)
{
+ int ret;
+ uint64_t flags;
+ uint32_t reg = MSR_IA32_MISC_ENABLE;
+ int newOffsets = 0;
+ if (IF_FLAG(type))
+ {
+ return 0;
+ }
if ((cpuid_info.model == NEHALEM) ||
(cpuid_info.model == NEHALEM_BLOOMFIELD) ||
(cpuid_info.model == NEHALEM_LYNNFIELD) ||
(cpuid_info.model == NEHALEM_WESTMERE) ||
(cpuid_info.model == NEHALEM_WESTMERE_M) ||
(cpuid_info.model == NEHALEM_EX) ||
- (cpuid_info.model == CORE2_45) ||
- (cpuid_info.model == CORE2_65))
+ (cpuid_info.model == WESTMERE_EX) ||
+ (cpuid_info.model == SANDYBRIDGE) ||
+ (cpuid_info.model == SANDYBRIDGE_EP) ||
+ (cpuid_info.model == IVYBRIDGE) ||
+ (cpuid_info.model == IVYBRIDGE_EP) ||
+ (cpuid_info.model == HASWELL) ||
+ (cpuid_info.model == HASWELL_M1) ||
+ (cpuid_info.model == HASWELL_M2) ||
+ (cpuid_info.model == HASWELL_EP) ||
+ (cpuid_info.model == BROADWELL) ||
+ (cpuid_info.model == BROADWELL_D) ||
+ (cpuid_info.model == BROADWELL_E) ||
+ (cpuid_info.model == SKYLAKE1) ||
+ (cpuid_info.model == SKYLAKE2))
{
- uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
- switch ( type )
- {
- case HW_PREFETCHER:
+ reg = MSR_PREFETCH_ENABLE;
+ newOffsets = 1;
+ }
+ ret = HPMread(cpu, MSR_DEV, reg, &flags);
+ if (ret != 0)
+ {
+ fprintf(stderr, "Cannot read register 0x%X for CPU %d to activate feature %s\n", reg, cpu, cpuFeatureNames[type]);
+ return ret;
+ }
+ ret = 0;
+ switch ( type )
+ {
+ case FEAT_HW_PREFETCHER:
+ if (print)
printf("HW_PREFETCHER:\t");
+ if (newOffsets)
+ {
+ flags &= ~(1ULL<<0);
+ }
+ else
+ {
flags &= ~(1ULL<<9);
- break;
+ }
+ break;
- case CL_PREFETCHER:
+ case FEAT_CL_PREFETCHER:
+ if (print)
printf("CL_PREFETCHER:\t");
+ if (newOffsets)
+ {
+ flags &= ~(1ULL<<1);
+ }
+ else
+ {
flags &= ~(1ULL<<19);
- break;
+ }
+ break;
- case DCU_PREFETCHER:
+ case FEAT_DCU_PREFETCHER:
+ if (print)
printf("DCU_PREFETCHER:\t");
+ if (newOffsets)
+ {
+ flags &= ~(1ULL<<2);
+ }
+ else
+ {
flags &= ~(1ULL<<37);
- break;
+ }
+ break;
- case IP_PREFETCHER:
+ case FEAT_IP_PREFETCHER:
+ if (print)
printf("IP_PREFETCHER:\t");
+ if (newOffsets)
+ {
+ flags &= ~(1ULL<<3);
+ }
+ else
+ {
flags &= ~(1ULL<<39);
- break;
+ }
+ break;
+
+ default:
+ printf("\nERROR: Processor feature '%s' cannot be enabled!\n", cpuFeatureNames[type]);
+ ret = -EINVAL;
+ break;
+ }
+ if (ret != 0)
+ {
+ return ret;
+ }
- default:
- printf("ERROR: CpuFeature not supported!\n");
- break;
+ ret = HPMwrite(cpu, MSR_DEV, reg, flags);
+ if (ret == 0)
+ {
+ if (print)
+ {
+ PRINT_VALUE(GREEN,enabled);
}
- PRINT_VALUE(GREEN,enabled);
- printf("\n");
- msr_write(cpu, MSR_IA32_MISC_ENABLE, flags);
}
else
{
- printf("ERROR: Architecture does not support the manipulation of prefetchers\n");
+ if (print)
+ {
+ PRINT_VALUE(RED,failed);
+ }
}
+ cpuFeatures_update(cpu);
+ return ret;
}
-void
-cpuFeatures_disable(int cpu, CpuFeature type)
+int
+cpuFeatures_disable(int cpu, CpuFeature type, int print)
{
+ int ret;
+ uint64_t flags;
+ uint32_t reg = MSR_IA32_MISC_ENABLE;
+ int newOffsets = 0;
+ if (!IF_FLAG(type))
+ {
+ return 0;
+ }
if ((cpuid_info.model == NEHALEM) ||
(cpuid_info.model == NEHALEM_BLOOMFIELD) ||
(cpuid_info.model == NEHALEM_LYNNFIELD) ||
(cpuid_info.model == NEHALEM_WESTMERE) ||
(cpuid_info.model == NEHALEM_WESTMERE_M) ||
(cpuid_info.model == NEHALEM_EX) ||
- (cpuid_info.model == CORE2_45) ||
- (cpuid_info.model == CORE2_65))
+ (cpuid_info.model == WESTMERE_EX) ||
+ (cpuid_info.model == SANDYBRIDGE) ||
+ (cpuid_info.model == SANDYBRIDGE_EP) ||
+ (cpuid_info.model == IVYBRIDGE) ||
+ (cpuid_info.model == IVYBRIDGE_EP) ||
+ (cpuid_info.model == HASWELL) ||
+ (cpuid_info.model == HASWELL_M1) ||
+ (cpuid_info.model == HASWELL_M2) ||
+ (cpuid_info.model == HASWELL_EP) ||
+ (cpuid_info.model == BROADWELL) ||
+ (cpuid_info.model == BROADWELL_D) ||
+ (cpuid_info.model == BROADWELL_E) ||
+ (cpuid_info.model == SKYLAKE1) ||
+ (cpuid_info.model == SKYLAKE2))
{
- uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
-
- switch ( type )
- {
- case HW_PREFETCHER:
+ reg = MSR_PREFETCH_ENABLE;
+ newOffsets = 1;
+ }
+ ret = HPMread(cpu, MSR_DEV, reg, &flags);
+ if (ret != 0)
+ {
+ fprintf(stderr, "Reading register 0x%X on CPU %d failed\n", reg, cpu);
+ return ret;
+ }
+ ret = 0;
+ switch ( type )
+ {
+ case FEAT_HW_PREFETCHER:
+ if (print)
printf("HW_PREFETCHER:\t");
+ if (newOffsets)
+ {
+ flags |= (1ULL<<0);
+ }
+ else
+ {
flags |= (1ULL<<9);
- break;
+ }
+ break;
- case CL_PREFETCHER:
+ case FEAT_CL_PREFETCHER:
+ if (print)
printf("CL_PREFETCHER:\t");
+ if (newOffsets)
+ {
+ flags |= (1ULL<<1);
+ }
+ else
+ {
flags |= (1ULL<<19);
- break;
+ }
+ break;
- case DCU_PREFETCHER:
+ case FEAT_DCU_PREFETCHER:
+ if (print)
printf("DCU_PREFETCHER:\t");
+ if (newOffsets)
+ {
+ flags |= (1ULL<<2);
+ }
+ else
+ {
flags |= (1ULL<<37);
- break;
+ }
+ break;
- case IP_PREFETCHER:
+ case FEAT_IP_PREFETCHER:
+ if (print)
printf("IP_PREFETCHER:\t");
+ if (newOffsets)
+ {
+ flags |= (1ULL<<3);
+ }
+ else
+ {
flags |= (1ULL<<39);
- break;
+ }
+ break;
- default:
- printf("ERROR: CpuFeature not supported!\n");
- break;
- }
- PRINT_VALUE(RED,disabled);
- printf("\n");
+ default:
+ printf("ERROR: Processor feature '%s' cannot be disabled!\n", cpuFeatureNames[type]);
+ ret = -EINVAL;
+ break;
+ }
+ if (ret != 0)
+ {
+ return ret;
+ }
- msr_write(cpu, MSR_IA32_MISC_ENABLE, flags);
+ ret = HPMwrite(cpu, MSR_DEV, reg, flags);
+ if (ret != 0)
+ {
+ if (print)
+ {
+ PRINT_VALUE(RED,failed);
+ }
+ ret = -EFAULT;
}
else
{
- printf("ERROR: Architecture does not support the manipulation of prefetchers\n");
+ if (print)
+ {
+ PRINT_VALUE(RED,disabled);
+ }
+ ret = 0;
}
+ cpuFeatures_update(cpu);
+ return ret;
}
+int cpuFeatures_get(int cpu, CpuFeature type)
+{
+ if ((type >= FEAT_HW_PREFETCHER) && (type < CPUFEATURES_MAX))
+ {
+ if (IF_FLAG(type))
+ {
+ return TRUE;
+ }
+ else
+ {
+ return FALSE;
+ }
+ }
+ return -EINVAL;
+}
+
+char* cpuFeatures_name(CpuFeature type)
+{
+ if ((type >= FEAT_HW_PREFETCHER) && (type < CPUFEATURES_MAX))
+ {
+ return cpuFeatureNames[type];
+ }
+ return NULL;
+}
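The reworked cpuFeatures module is initialized once for all hardware threads and then queried or toggled per CPU through the new three-argument enable/disable calls. A short sketch based on the signatures introduced above, assuming the prototypes and the FEAT_* constants are exported via likwid.h (which this file now includes instead of cpuFeatures.h); toggling prefetchers writes MSRs and therefore needs root privileges or the access daemon:

    #include <likwid.h>

    int main(void)
    {
        int cpu = 0;                     /* hardware thread to inspect */
        cpuFeatures_init();              /* reads MSR_IA32_MISC_ENABLE (and MSR_PREFETCH_ENABLE where available) */
        cpuFeatures_print(cpu);          /* prints the enabled/disabled table for this CPU */

        if (cpuFeatures_get(cpu, FEAT_HW_PREFETCHER) > 0)
        {
            /* third argument selects whether the call prints its own status line */
            cpuFeatures_disable(cpu, FEAT_HW_PREFETCHER, 0);
            cpuFeatures_enable(cpu, FEAT_HW_PREFETCHER, 0);   /* restore previous state */
        }
        return 0;
    }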
diff --git a/src/cpuid.c b/src/cpuid.c
deleted file mode 100644
index 6a9ac47..0000000
--- a/src/cpuid.c
+++ /dev/null
@@ -1,1244 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: cpuid.c
- *
- * Description: Implementation of cpuid module.
- * Provides API to extract cpuid info on x86 processors.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* ##### HEADER FILE INCLUDES ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/wait.h>
-#include <unistd.h>
-#include <sched.h>
-#include <time.h>
-#include <math.h>
-
-#include <error.h>
-#include <cpuid.h>
-#include <tree.h>
-#include <bitUtil.h>
-#include <strUtil.h>
-
-/* ##### EXPORTED VARIABLES ########################################### */
-
-CpuInfo cpuid_info;
-CpuTopology cpuid_topology;
-
-
-/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
-
-static int largest_function = 0;
-
-/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
-
-/* this was taken from the linux kernel */
-#define CPUID \
- __asm__ volatile ("cpuid" \
- : "=a" (eax), \
- "=b" (ebx), \
- "=c" (ecx), \
- "=d" (edx) \
- : "0" (eax), "2" (ecx))
-
-
-/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
-
-static char* pentium_m_b_str = "Intel Pentium M Banias processor";
-static char* pentium_m_d_str = "Intel Pentium M Dothan processor";
-static char* core_duo_str = "Intel Core Duo processor";
-static char* core_2a_str = "Intel Core 2 65nm processor";
-static char* core_2b_str = "Intel Core 2 45nm processor";
-static char* atom_45_str = "Intel Atom 45nm processor";
-static char* atom_32_str = "Intel Atom 32nm processor";
-static char* atom_22_str = "Intel Atom 22nm processor";
-static char* atom_silvermont_str = "Intel Atom (Silvermont) 22nm processor";
-static char* atom_saltwell_str = "Intel Atom (Saltwell) 32nm processor";
-static char* nehalem_bloom_str = "Intel Core Bloomfield processor";
-static char* nehalem_lynn_str = "Intel Core Lynnfield processor";
-static char* nehalem_west_str = "Intel Core Westmere processor";
-static char* sandybridge_str = "Intel Core SandyBridge processor";
-static char* ivybridge_str = "Intel Core IvyBridge processor";
-static char* ivybridge_ep_str = "Intel Core IvyBridge EP processor";
-static char* sandybridge_ep_str = "Intel Core SandyBridge EP processor";
-static char* haswell_str = "Intel Core Haswell processor";
-static char* haswell_ex_str = "Intel Core Haswell EX processor";
-static char* nehalem_ex_str = "Intel Nehalem EX processor";
-static char* westmere_ex_str = "Intel Westmere EX processor";
-static char* xeon_mp_string = "Intel Xeon MP processor";
-static char* xeon_phi_string = "Intel Xeon Phi Coprocessor";
-static char* barcelona_str = "AMD Barcelona processor";
-static char* shanghai_str = "AMD Shanghai processor";
-static char* istanbul_str = "AMD Istanbul processor";
-static char* magnycours_str = "AMD Magny Cours processor";
-static char* interlagos_str = "AMD Interlagos processor";
-static char* kabini_str = "AMD Family 16 model - Kabini processor";
-static char* opteron_sc_str = "AMD Opteron single core 130nm processor";
-static char* opteron_dc_e_str = "AMD Opteron Dual Core Rev E 90nm processor";
-static char* opteron_dc_f_str = "AMD Opteron Dual Core Rev F 90nm processor";
-static char* athlon64_str = "AMD Athlon64 X2 (AM2) Rev F 90nm processor";
-static char* athlon64_f_str = "AMD Athlon64 (AM2) Rev F 90nm processor";
-static char* athlon64_X2_g_str = "AMD Athlon64 X2 (AM2) Rev G 65nm processor";
-static char* athlon64_g_str = "AMD Athlon64 (AM2) Rev G 65nm processor";
-static char* amd_k8_str = "AMD K8 architecture";
-static char* unknown_intel_str = "Unknown Intel Processor";
-static char* unknown_amd_str = "Unknown AMD Processor";
-
-static volatile int init = 0;
-static uint32_t eax, ebx, ecx, edx;
-
-/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
-
-static void initTopology(FILE* file)
-{
- size_t items;
- HWThread* hwThreadPool;
- CacheLevel* cacheLevels;
- TreeNode* currentNode;
-
- items = fread((void*) &cpuid_topology, sizeof(CpuTopology), 1, file);
-
- hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
- items = fread((void*) hwThreadPool, sizeof(HWThread), cpuid_topology.numHWThreads, file);
- cpuid_topology.threadPool = hwThreadPool;
-
- cacheLevels = (CacheLevel*) malloc(cpuid_topology.numCacheLevels * sizeof(CacheLevel));
- items = fread((void*) cacheLevels, sizeof(CacheLevel), cpuid_topology.numCacheLevels, file);
- cpuid_topology.cacheLevels = cacheLevels;
- cpuid_topology.topologyTree = NULL;
-
- tree_init(&cpuid_topology.topologyTree, 0);
-
- for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++)
- {
- if (!tree_nodeExists(cpuid_topology.topologyTree,
- hwThreadPool[i].packageId))
- {
- tree_insertNode(cpuid_topology.topologyTree,
- hwThreadPool[i].packageId);
- }
- currentNode = tree_getNode(cpuid_topology.topologyTree,
- hwThreadPool[i].packageId);
-
- if (!tree_nodeExists(currentNode, hwThreadPool[i].coreId))
- {
- tree_insertNode(currentNode, hwThreadPool[i].coreId);
- }
- currentNode = tree_getNode(currentNode, hwThreadPool[i].coreId);
-
- if (!tree_nodeExists(currentNode, i))
- {
- tree_insertNode(currentNode, i);
- }
- }
-}
-
-static uint32_t amdGetAssociativity(uint32_t flag)
-{
- uint32_t asso= 0;
-
- switch ( flag )
- {
- case 0x0:
- asso = 0;
- break;
-
- case 0x1:
- asso = 1;
- break;
-
- case 0x2:
- asso = 2;
- break;
-
- case 0x4:
- asso = 4;
- break;
-
- case 0x6:
- asso = 8;
- break;
-
- case 0x8:
- asso = 16;
- break;
-
- case 0xA:
- asso = 32;
- break;
-
- case 0xB:
- asso = 48;
- break;
-
- case 0xC:
- asso = 64;
- break;
-
- case 0xD:
- asso = 96;
- break;
-
- case 0xE:
- asso = 128;
- break;
-
- case 0xF:
- asso = 0;
- break;
-
- default:
- break;
- }
- return asso;
-
-}
-
-static int intelCpuidFunc_4(CacheLevel** cachePool)
-{
- int i;
- int level=0;
- int maxNumLevels=0;
- uint32_t valid=1;
- CacheLevel* pool;
- int threadsPerCpu = 0;
- int numThreadsPerSocket = cpuid_topology.numCoresPerSocket *
- cpuid_topology.numThreadsPerCore;
-
- while (valid)
- {
- eax = 0x04;
- ecx = level;
- CPUID;
- valid = extractBitField(eax,5,0);
- if (!valid)
- {
- break;
- }
- level++;
- }
-
- maxNumLevels = level;
- *cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
- pool = *cachePool;
-
- for (i=0; i < maxNumLevels; i++)
- {
- eax = 0x04;
- ecx = i;
- CPUID;
-
- pool[i].level = extractBitField(eax,3,5);
- pool[i].type = (CacheType) extractBitField(eax,5,0);
- pool[i].associativity = extractBitField(ebx,8,22)+1;
- pool[i].sets = ecx+1;
- pool[i].lineSize = extractBitField(ebx,12,0)+1;
- pool[i].size = pool[i].sets *
- pool[i].associativity *
- pool[i].lineSize;
- pool[i].threads = extractBitField(eax,10,14)+1;
- pool[i].inclusive = edx&0x2;
-
- /* WORKAROUND cpuid reports wrong number of threads on SMT processor with SMT
- * turned off */
- if (i < 3)
- {
- if ((cpuid_info.model == NEHALEM_BLOOMFIELD) ||
- (cpuid_info.model == NEHALEM_LYNNFIELD) ||
- (cpuid_info.model == NEHALEM_WESTMERE) ||
- (cpuid_info.model == NEHALEM_WESTMERE_M) ||
- (cpuid_info.model == SANDYBRIDGE) ||
- (cpuid_info.model == SANDYBRIDGE_EP) ||
- (cpuid_info.model == IVYBRIDGE) ||
- (cpuid_info.model == IVYBRIDGE_EP) ||
- (cpuid_info.model == HASWELL) ||
- (cpuid_info.model == HASWELL_EX) ||
- (cpuid_info.model == HASWELL_M1) ||
- (cpuid_info.model == HASWELL_M2) ||
- (cpuid_info.model == WESTMERE_EX) ||
- (cpuid_info.model == NEHALEM_EX))
- {
- if (cpuid_topology.numThreadsPerCore == 1)
- {
- pool[i].threads = 1;
- }
- }
- }
-
- /* :WORKAROUND:08/13/2009 08:34:15 AM:jt: For L3 caches the value is sometimes
- * too large in here.
- * See Documentation: Threads contains maximum number of threads supported
- * by the cache.
- * Limit threads per Socket then to the maximum possible value. If the number
- * of threads supported by the cache does not divide the threads on the socket
- * without remainder, the threads are adjusted to fit the multiple caches.
- */
- if(pool[i].threads > numThreadsPerSocket)
- {
- pool[i].threads = numThreadsPerSocket;
- }
- else if (((double)numThreadsPerSocket)/((double)pool[i].threads) !=
- (double)(numThreadsPerSocket/pool[i].threads))
- {
- pool[i].threads = numThreadsPerSocket/
- (int)ceil(((double)numThreadsPerSocket)/((double)pool[i].threads));
- }
- /* For Intel Silvermont this is not enough. It returns 4 threads and 8 cores
- * for the L2 cache. But according to the data sheet, each 1MB L2 cache slice
- * is shared by 2 threads/cores.
- */
- else if (pool[i].level == 2 &&
- ((cpuid_info.model == ATOM_SILVERMONT_C) ||
- (cpuid_info.model == ATOM_SILVERMONT_E) ||
- (cpuid_info.model == ATOM_SILVERMONT_F1) ||
- (cpuid_info.model == ATOM_SILVERMONT_F2) ||
- (cpuid_info.model == ATOM_SILVERMONT_F3)))
- {
- pool[i].threads = 2;
- }
- }
-
-
-
- return maxNumLevels;
-}
-
-static int recheck_numHWThreads()
-{
- int cpucount = 0;
- char line[1024];
- FILE* fp = fopen("/proc/cpuinfo","r");
- if (fp != NULL)
- {
- while( fgets(line,1024,fp) )
- {
- if (strncmp(line, "processor", 9) == 0)
- {
- cpucount++;
- }
- }
- }
- return cpucount;
-}
-
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-
-int cpuid_init (void)
-{
- int isIntel = 1;
-
- /* FIXME: Race condition??? */
- if (init) return EXIT_SUCCESS;
- init =1;
-
- eax = 0x00;
- CPUID;
-
- largest_function = eax;
- if (ebx == 0x68747541U)
- {
- isIntel = 0;
- }
-
- eax = 0x01;
- CPUID;
- cpuid_info.family = ((eax>>8)&0xFU) + ((eax>>20)&0xFFU);
- cpuid_info.model = (((eax>>16)&0xFU)<<4) + ((eax>>4)&0xFU);
- cpuid_info.stepping = (eax&0xFU);
-
- switch ( cpuid_info.family )
- {
- case P6_FAMILY:
- switch ( cpuid_info.model )
- {
- case PENTIUM_M_BANIAS:
- cpuid_info.name = pentium_m_b_str;
- break;
-
- case PENTIUM_M_DOTHAN:
- cpuid_info.name = pentium_m_d_str;
- break;
-
- case CORE_DUO:
- cpuid_info.name = core_duo_str;
- break;
-
- case CORE2_65:
- cpuid_info.name = core_2a_str;
- break;
-
- case CORE2_45:
- cpuid_info.name = core_2b_str;
- break;
-
- case NEHALEM_BLOOMFIELD:
- cpuid_info.name = nehalem_bloom_str;
- break;
-
- case NEHALEM_LYNNFIELD:
- cpuid_info.name = nehalem_lynn_str;
- break;
-
- case NEHALEM_WESTMERE_M:
-
- case NEHALEM_WESTMERE:
- cpuid_info.name = nehalem_west_str;
- break;
-
- case SANDYBRIDGE:
- cpuid_info.name = sandybridge_str;
- break;
-
- case SANDYBRIDGE_EP:
- cpuid_info.name = sandybridge_ep_str;
- break;
-
- case IVYBRIDGE:
- cpuid_info.name = ivybridge_str;
- break;
-
- case IVYBRIDGE_EP:
- cpuid_info.name = ivybridge_ep_str;
- break;
-
- case HASWELL:
-
- case HASWELL_M1:
-
- case HASWELL_M2:
- cpuid_info.name = haswell_str;
- break;
-
- case HASWELL_EX:
- cpuid_info.name = haswell_ex_str;
- break;
-
- case NEHALEM_EX:
- cpuid_info.name = nehalem_ex_str;
- break;
-
- case WESTMERE_EX:
- cpuid_info.name = westmere_ex_str;
- break;
-
- case XEON_MP:
- cpuid_info.name = xeon_mp_string;
- break;
-
- case ATOM_45:
-
- case ATOM:
- cpuid_info.name = atom_45_str;
- break;
-
- case ATOM_32:
- cpuid_info.name = atom_32_str;
- break;
-
- case ATOM_22:
- cpuid_info.name = atom_22_str;
- break;
-
- case ATOM_SILVERMONT_C:
- case ATOM_SILVERMONT_E:
- case ATOM_SILVERMONT_F1:
- case ATOM_SILVERMONT_F2:
- case ATOM_SILVERMONT_F3:
- cpuid_info.name = atom_silvermont_str;
- break;
-
- default:
- cpuid_info.name = unknown_intel_str;
- break;
- }
- break;
-
- case MIC_FAMILY:
- switch ( cpuid_info.model )
- {
- case XEON_PHI:
- cpuid_info.name = xeon_phi_string;
- break;
-
- }
- break;
-
- case K8_FAMILY:
-
- if (isIntel)
- {
- ERROR_PLAIN_PRINT(Netburst architecture is not supported);
- }
-
- switch ( cpuid_info.model )
- {
- case OPTERON_DC_E:
- cpuid_info.name = opteron_dc_e_str;
- break;
-
- case OPTERON_DC_F:
- cpuid_info.name = opteron_dc_f_str;
- break;
-
- case ATHLON64_X2:
-
- case ATHLON64_X2_F:
- cpuid_info.name = athlon64_str;
- break;
-
- case ATHLON64_F1:
-
- case ATHLON64_F2:
- cpuid_info.name = athlon64_f_str;
- break;
-
- case ATHLON64_X2_G:
- cpuid_info.name = athlon64_X2_g_str;
- break;
-
- case ATHLON64_G1:
-
- case ATHLON64_G2:
- cpuid_info.name = athlon64_g_str;
- break;
-
- case OPTERON_SC_1MB:
- cpuid_info.name = opteron_sc_str;
- break;
-
- default:
- cpuid_info.name = amd_k8_str;
- break;
- }
-
- break;
-
- case K10_FAMILY:
- switch ( cpuid_info.model )
- {
- case BARCELONA:
- cpuid_info.name = barcelona_str;
- break;
-
- case SHANGHAI:
- cpuid_info.name = shanghai_str;
- break;
-
- case ISTANBUL:
- cpuid_info.name = istanbul_str;
- break;
-
- case MAGNYCOURS:
- cpuid_info.name = magnycours_str;
- break;
-
- default:
- cpuid_info.name = unknown_amd_str;
- break;
- }
- break;
-
- case K15_FAMILY:
- cpuid_info.name = interlagos_str;
- break;
-
- case K16_FAMILY:
- cpuid_info.name = kabini_str;
- break;
-
- default:
- return EXIT_FAILURE;
- break;
- }
-
- cpuid_info.featureFlags = 0;
- cpuid_info.features = (char*) malloc(200*sizeof(char));
- cpuid_info.features[0] = 0;
- if (ecx & (1<<0))
- {
- strcat(cpuid_info.features, "SSE3 ");
- cpuid_info.featureFlags |= (1<<SSE3);
- }
- if (ecx & (1<<3))
- {
- strcat(cpuid_info.features, "MONITOR ");
- cpuid_info.featureFlags |= (1<<MONITOR);
- }
- if (ecx & (1<<5))
- {
- strcat(cpuid_info.features, "VMX ");
- cpuid_info.featureFlags |= (1<<VMX);
- }
- if (ecx & (1<<7))
- {
- strcat(cpuid_info.features, "EIST ");
- cpuid_info.featureFlags |= (1<<EIST);
- }
- if (ecx & (1<<8))
- {
- strcat(cpuid_info.features, "TM2 ");
- cpuid_info.featureFlags |= (1<<TM2);
- }
- if (ecx & (1<<9))
- {
- strcat(cpuid_info.features, "SSSE3 ");
- cpuid_info.featureFlags |= (1<<SSSE3);
- }
- if (ecx & (1<<12))
- {
- strcat(cpuid_info.features, "FMA ");
- cpuid_info.featureFlags |= (1<<FMA);
- }
- if (ecx & (1<<19))
- {
- strcat(cpuid_info.features, "SSE4.1 ");
- cpuid_info.featureFlags |= (1<<SSE41);
- }
- if (ecx & (1<<20))
- {
- strcat(cpuid_info.features, "SSE4.2 ");
- cpuid_info.featureFlags |= (1<<SSE42);
- }
- if (ecx & (1<<25))
- {
- strcat(cpuid_info.features, "AES ");
- cpuid_info.featureFlags |= (1<<AES);
- }
- if (ecx & (1<<28))
- {
- strcat(cpuid_info.features, "AVX ");
- cpuid_info.featureFlags |= (1<<AVX);
- }
- if (ecx & (1<<30))
- {
- strcat(cpuid_info.features, "RDRAND ");
- cpuid_info.featureFlags |= (1<<RDRAND);
- }
- if (edx & (1<<22))
- {
- strcat(cpuid_info.features, "ACPI ");
- cpuid_info.featureFlags |= (1<<ACPI);
- }
- if (edx & (1<<23))
- {
- strcat(cpuid_info.features, "MMX ");
- cpuid_info.featureFlags |= (1<<MMX);
- }
- if (edx & (1<<25))
- {
- strcat(cpuid_info.features, "SSE ");
- cpuid_info.featureFlags |= (1<<SSE);
- }
- if (edx & (1<<26))
- {
- strcat(cpuid_info.features, "SSE2 ");
- cpuid_info.featureFlags |= (1<<SSE2);
- }
- if (edx & (1<<29))
- {
- strcat(cpuid_info.features, "TM ");
- cpuid_info.featureFlags |= (1<<TM);
- }
-
- eax = 0x80000001;
- CPUID;
- if (edx & (1<<27))
- {
- strcat(cpuid_info.features, "RDTSCP ");
- cpuid_info.featureFlags |= (1<<RDTSCP);
- }
-
- cpuid_info.perf_version = 0;
- if( cpuid_info.family == P6_FAMILY && 0x0A <= largest_function)
- {
- eax = 0x0A;
- CPUID;
- cpuid_info.perf_version = (eax&0xFFU);
- cpuid_info.perf_num_ctr = ((eax>>8)&0xFFU);
- cpuid_info.perf_width_ctr = ((eax>>16)&0xFFU);
- cpuid_info.perf_num_fixed_ctr = (edx&0xFU);
-
- eax = 0x06;
- CPUID;
- if (eax & (1<<1))
- {
- cpuid_info.turbo = 1;
- }
- else
- {
- cpuid_info.turbo = 0;
- }
- }
-
- FILE *file;
- char *filepath = TOSTRING(CFGFILE);
-
- if ((file = fopen(filepath, "rb")) != NULL)
- {
- //printf("Read config from file\n");
- initTopology(file);
- fclose(file);
- }
- else
- {
- cpuid_topology.numHWThreads = sysconf(_SC_NPROCESSORS_CONF);
- if (recheck_numHWThreads() != cpuid_topology.numHWThreads)
- {
- cpuid_topology.numHWThreads = recheck_numHWThreads();
- }
- cpu_set_t cpuSet;
- CPU_ZERO(&cpuSet);
- sched_getaffinity(0,sizeof(cpu_set_t), &cpuSet);
- cpuid_initTopology();
- cpuid_initCacheTopology();
-
- /* restore affinity mask of process */
- sched_setaffinity(0, sizeof(cpu_set_t), &cpuSet);
- }
-
- return EXIT_SUCCESS;
-}
-
-void cpuid_print (void)
-{
- printf("\nSupported Intel processors:\n");
- printf("\t%s\n",core_2a_str);
- printf("\t%s\n",core_2b_str);
- printf("\t%s\n",xeon_mp_string);
- printf("\t%s\n",atom_45_str);
- printf("\t%s\n",atom_32_str);
- printf("\t%s\n",atom_22_str);
- printf("\t%s\n",nehalem_bloom_str);
- printf("\t%s\n",nehalem_lynn_str);
- printf("\t%s\n",nehalem_west_str);
- printf("\t%s (with Uncore support)\n",nehalem_ex_str);
- printf("\t%s (with Uncore support)\n",westmere_ex_str);
- printf("\t%s\n",sandybridge_str);
- printf("\t%s (with Uncore support)\n",sandybridge_ep_str);
- printf("\t%s\n",ivybridge_str);
- printf("\t%s (with Uncore support)\n",ivybridge_ep_str);
- printf("\t%s (with Uncore support)\n",haswell_str);
- printf("\t%s (no Uncore support)\n",haswell_ex_str);
- printf("\t%s\n",atom_silvermont_str);
- printf("\t%s\n",atom_saltwell_str);
- printf("\t%s\n\n",xeon_phi_string);
-
- printf("Supported AMD processors:\n");
- printf("\t%s\n",opteron_sc_str);
- printf("\t%s\n",opteron_dc_e_str);
- printf("\t%s\n",opteron_dc_f_str);
- printf("\t%s\n",barcelona_str);
- printf("\t%s\n",shanghai_str);
- printf("\t%s\n",istanbul_str);
- printf("\t%s\n",magnycours_str);
- printf("\t%s\n",interlagos_str);
- printf("\t%s\n\n",kabini_str);
-}
-
-
-
-
-
-#define freeStrings \
- bdestroy(filename); \
-bdestroy(grepString); \
-bdestroy(cpulist)
-
-
-int cpuid_isInCpuset(void)
-{
- int pos = 0;
- bstring grepString = bformat("Cpus_allowed_list:");
- bstring filename = bformat("/proc/%d/status",getpid());
- FILE* fp = fopen(bdata(filename),"r");
-
- if (fp == NULL)
- {
- bdestroy(filename);
- bdestroy(grepString);
- return 0;
- }
- else
- {
- bstring cpulist;
- uint32_t tmpThreads[MAX_NUM_THREADS];
- bstring src = bread ((bNread) fread, fp);
- if ((pos = binstr(src,0,grepString)) != BSTR_ERR)
- {
- int end = bstrchrp(src, 10, pos);
- pos = pos+blength(grepString);
- cpulist = bmidstr(src,pos, end-pos);
- btrimws(cpulist);
-
- if (bstr_to_cpuset_physical(tmpThreads, cpulist) < cpuid_topology.numHWThreads)
- {
- freeStrings;
- return 1;
- }
- else
- {
- freeStrings;
- return 0;
- }
- }
- return 0;
- }
-}
-
-void cpuid_initTopology(void)
-{
- uint32_t apicId;
- uint32_t bitField;
- int level;
- int prevOffset = 0;
- int currOffset = 0;
- cpu_set_t set;
- HWThread* hwThreadPool;
- int hasBLeaf = 0;
- int maxNumLogicalProcs;
- int maxNumLogicalProcsPerCore;
- int maxNumCores;
- TreeNode* currentNode;
- int width;
-
- /* check if 0x0B cpuid leaf is supported */
- if (largest_function >= 0x0B)
- {
- eax = 0x0B;
- ecx = 0;
- CPUID;
-
- if (ebx)
- {
- hasBLeaf = 1;
- }
- }
-
- hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
- tree_init(&cpuid_topology.topologyTree, 0);
-
- if (hasBLeaf)
- {
- for (uint32_t i=0; i < cpuid_topology.numHWThreads; i++)
- {
-
- CPU_ZERO(&set);
- CPU_SET(i,&set);
- sched_setaffinity(0, sizeof(cpu_set_t), &set);
- eax = 0x0B;
- ecx = 0;
- CPUID;
- apicId = edx;
- hwThreadPool[i].apicId = apicId;
-
- for (level=0; level < 3; level++)
- {
- eax = 0x0B;
- ecx = level;
- CPUID;
- currOffset = eax&0xFU;
-
- switch ( level ) {
- case 0: /* SMT thread */
- bitField = extractBitField(apicId,
- currOffset,
- 0);
- hwThreadPool[i].threadId = bitField;
- break;
-
- case 1: /* Core */
- bitField = extractBitField(apicId,
- currOffset-prevOffset,
- prevOffset);
- hwThreadPool[i].coreId = bitField;
- break;
-
- case 2: /* Package */
- bitField = extractBitField(apicId,
- 32-prevOffset,
- prevOffset);
- hwThreadPool[i].packageId = bitField;
- break;
-
- }
- prevOffset = currOffset;
- }
- }
- }
- else
- {
- switch ( cpuid_info.family )
- {
-
- case MIC_FAMILY:
-
- case P6_FAMILY:
- eax = 0x01;
- CPUID;
- maxNumLogicalProcs = extractBitField(ebx,8,16);
-
- /* Check number of cores per package */
- eax = 0x04;
- ecx = 0;
- CPUID;
- maxNumCores = extractBitField(eax,6,26)+1;
-
- maxNumLogicalProcsPerCore = maxNumLogicalProcs/maxNumCores;
-
- for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++)
- {
- CPU_ZERO(&set);
- CPU_SET(i,&set);
- sched_setaffinity(0, sizeof(cpu_set_t), &set);
-
- eax = 0x01;
- CPUID;
- hwThreadPool[i].apicId = extractBitField(ebx,8,24);
-
- /* ThreadId is extracted from th apicId using the bit width
- * of the number of logical processors
- * */
- hwThreadPool[i].threadId =
- extractBitField(hwThreadPool[i].apicId,
- getBitFieldWidth(maxNumLogicalProcsPerCore),0);
-
- /* CoreId is extracted from th apicId using the bitWidth
- * of the number of logical processors as offset and the
- * bit width of the number of cores as width
- * */
- hwThreadPool[i].coreId =
- extractBitField(hwThreadPool[i].apicId,
- getBitFieldWidth(maxNumCores),
- getBitFieldWidth(maxNumLogicalProcsPerCore));
-
- hwThreadPool[i].packageId =
- extractBitField(hwThreadPool[i].apicId,
- 8-getBitFieldWidth(maxNumLogicalProcs),
- getBitFieldWidth(maxNumLogicalProcs));
- }
- break;
-
- case K8_FAMILY:
- /* AMD Bios manual Rev. 2.28 section 3.1
- * Legacy method */
- /*FIXME: This is a bit of a hack */
-
- maxNumLogicalProcsPerCore = 1;
- maxNumLogicalProcs = 1;
-
- eax = 0x80000008;
- CPUID;
-
- maxNumCores = extractBitField(ecx,8,0)+1;
-
- for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++)
- {
- CPU_ZERO(&set);
- CPU_SET(i,&set);
- sched_setaffinity(0, sizeof(cpu_set_t), &set);
-
- eax = 0x01;
- CPUID;
- hwThreadPool[i].apicId = extractBitField(ebx,8,24);
-
- /* ThreadId is extracted from th apicId using the bit width
- * of the number of logical processors
- * */
- hwThreadPool[i].threadId =
- extractBitField(hwThreadPool[i].apicId,
- getBitFieldWidth(maxNumLogicalProcsPerCore),0);
-
- /* CoreId is extracted from th apicId using the bitWidth
- * of the number of logical processors as offset and the
- * bit width of the number of cores as width
- * */
- hwThreadPool[i].coreId =
- extractBitField(hwThreadPool[i].apicId,
- getBitFieldWidth(maxNumCores),
- 0);
-
- hwThreadPool[i].packageId =
- extractBitField(hwThreadPool[i].apicId,
- 8-getBitFieldWidth(maxNumCores),
- getBitFieldWidth(maxNumCores));
- }
- break;
-
- case K16_FAMILY:
-
- case K15_FAMILY:
-
- case K10_FAMILY:
- /* AMD Bios manual Rev. 2.28 section 3.2
- * Extended method */
- eax = 0x80000008;
- CPUID;
-
- width = extractBitField(ecx,4,12);
-
- if (width == 0)
- {
- width = extractBitField(ecx,8,0)+1;
- }
-
- eax = 0x01;
- CPUID;
- maxNumLogicalProcs = extractBitField(ebx,8,16);
- maxNumCores = extractBitField(ecx,8,0)+1;
-
-
- for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++)
- {
- CPU_ZERO(&set);
- CPU_SET(i,&set);
- sched_setaffinity(0, sizeof(cpu_set_t), &set);
-
- eax = 0x01;
- CPUID;
- hwThreadPool[i].apicId = extractBitField(ebx,8,24);
- /* AMD only knows cores */
- hwThreadPool[i].threadId = 0;
-
- hwThreadPool[i].coreId =
- extractBitField(hwThreadPool[i].apicId,
- width, 0);
- hwThreadPool[i].packageId =
- extractBitField(hwThreadPool[i].apicId,
- (8-width), width);
- }
-
- break;
- }
- }
-
- for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++)
- {
- /* Add node to Topology tree */
- if (!tree_nodeExists(cpuid_topology.topologyTree,
- hwThreadPool[i].packageId))
- {
- tree_insertNode(cpuid_topology.topologyTree,
- hwThreadPool[i].packageId);
- }
- currentNode = tree_getNode(cpuid_topology.topologyTree,
- hwThreadPool[i].packageId);
-
- if (!tree_nodeExists(currentNode, hwThreadPool[i].coreId))
- {
- tree_insertNode(currentNode, hwThreadPool[i].coreId);
- }
- currentNode = tree_getNode(currentNode, hwThreadPool[i].coreId);
-
- if (!tree_nodeExists(currentNode, i))
- {
- /*
- printf("WARNING: Thread already exists!\n");
- */
- tree_insertNode(currentNode, i);
- }
-
- }
-
- cpuid_topology.threadPool = hwThreadPool;
- cpuid_topology.numSockets = tree_countChildren(cpuid_topology.topologyTree);
- currentNode = tree_getChildNode(cpuid_topology.topologyTree);
- cpuid_topology.numCoresPerSocket = tree_countChildren(currentNode);
- currentNode = tree_getChildNode(currentNode);
- cpuid_topology.numThreadsPerCore = tree_countChildren(currentNode);
-}
-
-void cpuid_initCacheTopology()
-{
- int maxNumLevels=0;
- int id=0;
- CacheLevel* cachePool = NULL;
- CacheType type = DATACACHE;
-
- switch ( cpuid_info.family )
- {
- case MIC_FAMILY:
-
- case P6_FAMILY:
-
- if (largest_function >= 4)
- {
- maxNumLevels = intelCpuidFunc_4(&cachePool);
- }
- else
- {
- // intelCpuidFunc_2(&cachePool);
- }
-
- break;
-
- case K8_FAMILY:
- maxNumLevels = 2;
- cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
-
- eax = 0x80000005;
- CPUID;
- cachePool[0].level = 1;
- cachePool[0].type = DATACACHE;
- cachePool[0].associativity = extractBitField(ecx,8,16);
- cachePool[0].lineSize = extractBitField(ecx,8,0);
- cachePool[0].size = extractBitField(ecx,8,24) * 1024;
- if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
- {
- cachePool[0].sets = cachePool[0].size/
- (cachePool[0].associativity * cachePool[0].lineSize);
- }
- cachePool[0].threads = 1;
- cachePool[0].inclusive = 1;
-
- eax = 0x80000006;
- CPUID;
- cachePool[1].level = 2;
- cachePool[1].type = UNIFIEDCACHE;
- cachePool[1].associativity =
- amdGetAssociativity(extractBitField(ecx,4,12));
- cachePool[1].lineSize = extractBitField(ecx,8,0);
- cachePool[1].size = extractBitField(ecx,16,16) * 1024;
- if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
- {
- cachePool[1].sets = cachePool[1].size/
- (cachePool[1].associativity * cachePool[1].lineSize);
- }
- cachePool[1].threads = 1;
- cachePool[1].inclusive = 1;
-
- break;
-
-
- case K10_FAMILY:
- /* FIXME: Adds one level for the instruction cache on Intel
- * This fixes the level for the cores
- */
- maxNumLevels = 3;
- cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
-
- eax = 0x80000005;
- CPUID;
- cachePool[0].level = 1;
- cachePool[0].type = DATACACHE;
- cachePool[0].associativity = extractBitField(ecx,8,16);
- cachePool[0].lineSize = extractBitField(ecx,8,0);
- cachePool[0].size = extractBitField(ecx,8,24) * 1024;
- if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
- {
- cachePool[0].sets = cachePool[0].size/
- (cachePool[0].associativity * cachePool[0].lineSize);
- }
- cachePool[0].threads = 1;
- cachePool[0].inclusive = 1;
-
- eax = 0x80000006;
- CPUID;
- cachePool[1].level = 2;
- cachePool[1].type = UNIFIEDCACHE;
- cachePool[1].associativity =
- amdGetAssociativity(extractBitField(ecx,4,12));
- cachePool[1].lineSize = extractBitField(ecx,8,0);
- cachePool[1].size = extractBitField(ecx,16,16) * 1024;
- if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
- {
- cachePool[1].sets = cachePool[1].size/
- (cachePool[1].associativity * cachePool[1].lineSize);
- }
- cachePool[1].threads = 1;
- cachePool[1].inclusive = 1;
-
- cachePool[2].level = 3;
- cachePool[2].type = UNIFIEDCACHE;
- cachePool[2].associativity =
- amdGetAssociativity(extractBitField(edx,4,12));
- cachePool[2].lineSize = extractBitField(edx,8,0);
- cachePool[2].size = (extractBitField(edx,14,18)+1) * 524288;
- if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
- {
- cachePool[2].sets = cachePool[1].size/
- (cachePool[1].associativity * cachePool[1].lineSize);
- }
-
- if (cpuid_info.model != MAGNYCOURS)
- {
- cachePool[2].threads = cpuid_topology.numCoresPerSocket;
- }
- else
- {
- cachePool[2].threads = cpuid_topology.numCoresPerSocket/2;
- cachePool[2].size /= 2 ;
- }
-
- cachePool[2].inclusive = 1;
-
- break;
-
- case K16_FAMILY:
-
- case K15_FAMILY:
-
- maxNumLevels = 0;
- cachePool = (CacheLevel*) malloc(3 * sizeof(CacheLevel));
-
- while (type)
- {
- ecx = id;
- eax = 0x8000001D;
- CPUID;
- type = (CacheType) extractBitField(eax,4,0);
-
- if ((type == DATACACHE) || (type == UNIFIEDCACHE))
- {
- cachePool[maxNumLevels].level = extractBitField(eax,3,5);
- cachePool[maxNumLevels].type = type;
- cachePool[maxNumLevels].associativity = extractBitField(ebx,10,22)+1;
- cachePool[maxNumLevels].lineSize = extractBitField(ebx,12,0)+1;
- cachePool[maxNumLevels].sets = extractBitField(ecx,32,0)+1;
- cachePool[maxNumLevels].size = cachePool[maxNumLevels].associativity *
- cachePool[maxNumLevels].lineSize * cachePool[maxNumLevels].sets;
- cachePool[maxNumLevels].threads = extractBitField(eax,12,14)+1;
- cachePool[maxNumLevels].inclusive = (edx & (0x1<<1));
- maxNumLevels++;
- }
- id++;
- }
- break;
-
- default:
- ERROR_PLAIN_PRINT(Processor is not supported);
- break;
- }
-
- cpuid_topology.numCacheLevels = maxNumLevels;
- cpuid_topology.cacheLevels = cachePool;
-}
-
-
-
diff --git a/src/cpustring.c b/src/cpustring.c
new file mode 100644
index 0000000..7b57ed0
--- /dev/null
+++ b/src/cpustring.c
@@ -0,0 +1,577 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: cpustring.c
+ *
+ * Description: Parser for CPU selection strings
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <likwid.h>
+
+
+static int cpulist_sort(int* incpus, int* outcpus, int length)
+{
+ int insert = 0;
+ topology_init();
+ CpuTopology_t cpuid_topology = get_cpuTopology();
+ if (length <= 0)
+ {
+ return -1;
+ }
+ for (int off=0;off < cpuid_topology->numThreadsPerCore;off++)
+ {
+ for (int i=0; i<length/cpuid_topology->numThreadsPerCore;i++)
+ {
+ outcpus[insert] = incpus[(i*cpuid_topology->numThreadsPerCore)+off];
+ insert++;
+ }
+ }
+ return insert;
+}
+
+static int cpulist_concat(int* cpulist, int startidx, int* addlist, int addlength)
+{
+ int count = 0;
+ if (addlength <= 0)
+ {
+ return 0;
+ }
+ for (int i=startidx;i<(startidx+addlength);i++)
+ {
+ cpulist[i] = addlist[i-startidx];
+ count++;
+ }
+ return count;
+}
+
+static int cpu_in_domain(int domainidx, int cpu)
+{
+ affinity_init();
+ AffinityDomains_t affinity = get_affinityDomains();
+ for (int i=0;i<affinity->domains[domainidx].numberOfProcessors; i++)
+ {
+ if (cpu == affinity->domains[domainidx].processorList[i])
+ {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int cpuexpr_to_list(bstring bcpustr, bstring prefix, int* list, int length)
+{
+ topology_init();
+ CpuTopology_t cpuid_topology = get_cpuTopology();
+ affinity_init();
+ AffinityDomains_t affinity = get_affinityDomains();
+ struct bstrList* strlist;
+ strlist = bsplit(bcpustr, ',');
+ int oldinsert = 0;
+ int insert = 0;
+ for (int i=0;i < strlist->qty; i++)
+ {
+ bstring newstr = bstrcpy(prefix);
+ bconcat(newstr, strlist->entry[i]);
+ oldinsert = insert;
+ for (int j = 0; j < affinity->numberOfAffinityDomains; j++)
+ {
+ if (bstrcmp(affinity->domains[j].tag, newstr) == 0)
+ {
+ list[insert] = atoi(bdata(strlist->entry[i]));
+ insert++;
+ if (insert == length)
+ goto list_done;
+ break;
+ }
+ }
+ if (insert == oldinsert)
+ {
+ fprintf(stderr,"Domain %s cannot be found\n", bdata(newstr));
+ }
+ bdestroy(newstr);
+ }
+list_done:
+ bstrListDestroy(strlist);
+ return insert;
+}
+
+static int cpustr_to_cpulist_scatter(bstring bcpustr, int* cpulist, int length)
+{
+ topology_init();
+ CpuTopology_t cpuid_topology = get_cpuTopology();
+ affinity_init();
+ AffinityDomains_t affinity = get_affinityDomains();
+ char* cpustring = bstr2cstr(bcpustr, '\0');
+ if (bstrchrp(bcpustr, ':', 0) != BSTR_ERR)
+ {
+ int insert = 0;
+ int suitidx = 0;
+ int* suitable = (int*)malloc(affinity->numberOfAffinityDomains*sizeof(int));
+ if (!suitable)
+ {
+ bcstrfree(cpustring);
+ return -ENOMEM;
+ }
+ for (int i=0; i<affinity->numberOfAffinityDomains; i++)
+ {
+ if (bstrchrp(affinity->domains[i].tag, cpustring[0], 0) != BSTR_ERR)
+ {
+ suitable[suitidx] = i;
+ suitidx++;
+ }
+ }
+ int* sortedList = (int*) malloc(affinity->domains[suitable[0]].numberOfProcessors * sizeof(int));
+ if (!sortedList)
+ {
+ free(suitable);
+ bcstrfree(cpustring);
+ return -ENOMEM;
+ }
+ for (int off=0;off<affinity->domains[suitable[0]].numberOfProcessors;off++)
+ {
+ for(int i=0;i < suitidx; i++)
+ {
+ cpulist_sort(affinity->domains[suitable[i]].processorList, sortedList, affinity->domains[suitable[i]].numberOfProcessors);
+ cpulist[insert] = sortedList[off];
+ insert++;
+ if (insert == length)
+ goto scatter_done;
+ }
+ }
+scatter_done:
+ bcstrfree(cpustring);
+ free(sortedList);
+ free(suitable);
+ return insert;
+ }
+ bcstrfree(cpustring);
+ return 0;
+}
+
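+/* Expand an "E:<domain>:<count>[:<chunk>:<stride>]" selector, e.g.
+ * "E:S0:4:2:4": take <chunk> threads every <stride> positions from the
+ * domain's processor list until <count> threads are selected. */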
+static int cpustr_to_cpulist_expression(bstring bcpustr, int* cpulist, int length)
+{
+ topology_init();
+ CpuTopology_t cpuid_topology = get_cpuTopology();
+ affinity_init();
+ AffinityDomains_t affinity = get_affinityDomains();
+ bstring bdomain;
+ int domainidx = -1;
+ int count = 0;
+ int stride = 0;
+ int chunk = 0;
+ if (bstrchrp(bcpustr, 'E', 0) != 0)
+ {
+ fprintf(stderr, "Not a valid CPU expression\n");
+ return 0;
+ }
+ struct bstrList* strlist;
+ strlist = bsplit(bcpustr, ':');
+ if (strlist->qty == 3)
+ {
+ bdomain = bstrcpy(strlist->entry[1]);
+ count = atoi(bdata(strlist->entry[2]));
+ stride = 1;
+ chunk = 1;
+ }
+ else if (strlist->qty == 5)
+ {
+ bdomain = bstrcpy(strlist->entry[1]);
+ count = atoi(bdata(strlist->entry[2]));
+ chunk = atoi(bdata(strlist->entry[3]));
+ stride = atoi(bdata(strlist->entry[4]));
+ }
+ else
+ {
+ fprintf(stderr, "Not a valid CPU expression\n");
+ bstrListDestroy(strlist);
+ return 0;
+ }
+ for (int i=0; i<affinity->numberOfAffinityDomains; i++)
+ {
+ if (bstrcmp(bdomain, affinity->domains[i].tag) == 0)
+ {
+ domainidx = i;
+ break;
+ }
+ }
+ if (domainidx < 0)
+ {
+ fprintf(stderr, "Cannot find domain %s\n", bdata(bdomain));
+ bdestroy(bdomain);
+ bstrListDestroy(strlist);
+ return 0;
+ }
+ int offset = 0;
+ int insert = 0;
+ for (int i=0;i<count;i++)
+ {
+ for (int j=0;j<chunk && offset+j<affinity->domains[domainidx].numberOfProcessors;j++)
+ {
+ cpulist[insert] = affinity->domains[domainidx].processorList[offset + j];
+ insert++;
+ if (insert == length)
+ goto expression_done;
+ }
+ offset += stride;
+ if (offset >= affinity->domains[domainidx].numberOfProcessors)
+ {
+ offset = 0;
+ }
+ if (insert >= count)
+ goto expression_done;
+ }
+ bdestroy(bdomain);
+ bstrListDestroy(strlist);
+ return 0;
+expression_done:
+ bdestroy(bdomain);
+ bstrListDestroy(strlist);
+ return insert;
+}
+
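+/* Expand an "L:<domain>:<indexlist>" selector, e.g. "L:S0:0-3". The indices
+ * refer to positions inside the domain after SMT-aware sorting, not to
+ * physical hardware thread ids. */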
+static int cpustr_to_cpulist_logical(bstring bcpustr, int* cpulist, int length)
+{
+ topology_init();
+ CpuTopology_t cpuid_topology = get_cpuTopology();
+ affinity_init();
+ AffinityDomains_t affinity = get_affinityDomains();
+ int domainidx = -1;
+ bstring bdomain;
+ bstring blist;
+ struct bstrList* strlist;
+ if (bstrchrp(bcpustr, 'L', 0) != 0)
+ {
+ fprintf(stderr, "Not a valid CPU expression\n");
+ return 0;
+ }
+
+ strlist = bsplit(bcpustr, ':');
+ if (strlist->qty != 3)
+ {
+ fprintf(stderr, "ERROR: Invalid expression, should look like L:<domain>:<indexlist> or be in a cpuset\n");
+ bstrListDestroy(strlist);
+ return 0;
+ }
+ bdomain = bstrcpy(strlist->entry[1]);
+ blist = bstrcpy(strlist->entry[2]);
+ bstrListDestroy(strlist);
+ for (int i=0; i<affinity->numberOfAffinityDomains; i++)
+ {
+ if (bstrcmp(bdomain, affinity->domains[i].tag) == 0)
+ {
+ domainidx = i;
+ break;
+ }
+ }
+ if (domainidx < 0)
+ {
+ fprintf(stderr, "Cannot find domain %s\n", bdata(bdomain));
+ bdestroy(bdomain);
+ bdestroy(blist);
+ return 0;
+ }
+ int *inlist = malloc(affinity->domains[domainidx].numberOfProcessors * sizeof(int));
+ if (inlist == NULL)
+ {
+ bdestroy(bdomain);
+ bdestroy(blist);
+ return -ENOMEM;
+ }
+ int ret = cpulist_sort(affinity->domains[domainidx].processorList, inlist, affinity->domains[domainidx].numberOfProcessors);
+
+ strlist = bsplit(blist, ',');
+ int insert = 0;
+ for (int i=0; i< strlist->qty; i++)
+ {
+ if (bstrchrp(strlist->entry[i], '-', 0) != BSTR_ERR)
+ {
+ struct bstrList* indexlist;
+ indexlist = bsplit(strlist->entry[i], '-');
+ if (atoi(bdata(indexlist->entry[0])) <= atoi(bdata(indexlist->entry[1])))
+ {
+ for (int j=atoi(bdata(indexlist->entry[0])); j<=atoi(bdata(indexlist->entry[1]));j++)
+ {
+ cpulist[insert] = inlist[j];
+ insert++;
+ if (insert == length)
+ {
+ bstrListDestroy(indexlist);
+ goto logical_done;
+ }
+ }
+ }
+ else
+ {
+ for (int j=atoi(bdata(indexlist->entry[0])); j>=atoi(bdata(indexlist->entry[1]));j--)
+ {
+ cpulist[insert] = inlist[j];
+ insert++;
+ if (insert == length)
+ {
+ bstrListDestroy(indexlist);
+ goto logical_done;
+ }
+ }
+ }
+ bstrListDestroy(indexlist);
+ }
+ else
+ {
+ cpulist[insert] = inlist[atoi(bdata(strlist->entry[i]))];
+ insert++;
+ if (insert == length)
+ {
+ goto logical_done;
+ }
+ }
+ }
+logical_done:
+ bdestroy(bdomain);
+ bdestroy(blist);
+ bstrListDestroy(strlist);
+ free(inlist);
+ return insert;
+}
+
+
+
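+/* Expand a physical selector, either plain ("0,4-7") or domain prefixed
+ * ("S0:0,1"). The ids are physical hardware thread numbers and each one is
+ * checked for membership in the selected domain (default domain "N"). */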
+static int cpustr_to_cpulist_physical(bstring bcpustr, int* cpulist, int length)
+{
+ topology_init();
+ CpuTopology_t cpuid_topology = get_cpuTopology();
+ affinity_init();
+ AffinityDomains_t affinity = get_affinityDomains();
+ bstring bdomain;
+ bstring blist;
+ int domainidx = -1;
+ struct bstrList* strlist;
+ if (bstrchrp(bcpustr, ':', 0) != BSTR_ERR)
+ {
+ strlist = bsplit(bcpustr, ':');
+ bdomain = bstrcpy(strlist->entry[0]);
+ blist = bstrcpy(strlist->entry[1]);
+ bstrListDestroy(strlist);
+ }
+ else
+ {
+ bdomain = bformat("N");
+ blist = bstrcpy(bcpustr);
+ }
+ for (int i=0; i<affinity->numberOfAffinityDomains; i++)
+ {
+ if (bstrcmp(bdomain, affinity->domains[i].tag) == 0)
+ {
+ domainidx = i;
+ break;
+ }
+ }
+ if (domainidx < 0)
+ {
+ fprintf(stderr, "Cannot find domain %s\n", bdata(bdomain));
+ bdestroy(bdomain);
+ bdestroy(blist);
+ return 0;
+ }
+
+ strlist = bsplit(blist, ',');
+ int insert = 0;
+ for (int i=0;i< strlist->qty; i++)
+ {
+ if (bstrchrp(strlist->entry[i], '-', 0) != BSTR_ERR)
+ {
+ struct bstrList* indexlist;
+ indexlist = bsplit(strlist->entry[i], '-');
+ if (atoi(bdata(indexlist->entry[0])) <= atoi(bdata(indexlist->entry[1])))
+ {
+ for (int j=atoi(bdata(indexlist->entry[0])); j<=atoi(bdata(indexlist->entry[1]));j++)
+ {
+ if (cpu_in_domain(domainidx, j))
+ {
+ cpulist[insert] = j;
+ insert++;
+ if (insert == length)
+ {
+ bstrListDestroy(indexlist);
+ goto physical_done;
+ }
+ }
+ else
+ {
+ fprintf(stderr, "CPU %d not in domain %s\n", j, bdata(affinity->domains[domainidx].tag));
+ }
+ }
+ }
+ else
+ {
+ for (int j=atoi(bdata(indexlist->entry[0])); j>=atoi(bdata(indexlist->entry[1]));j--)
+ {
+ if (cpu_in_domain(domainidx, j))
+ {
+ cpulist[insert] = j;
+ insert++;
+ if (insert == length)
+ {
+ bstrListDestroy(indexlist);
+ goto physical_done;
+ }
+ }
+ else
+ {
+ fprintf(stderr, "CPU %d not in domain %s\n", j, bdata(affinity->domains[domainidx].tag));
+ }
+ }
+ }
+ bstrListDestroy(indexlist);
+ }
+ else
+ {
+ int cpu = atoi(bdata(strlist->entry[i]));
+ if (cpu_in_domain(domainidx, cpu))
+ {
+ cpulist[insert] = cpu;
+ insert++;
+ if (insert == length)
+ {
+ goto physical_done;
+ }
+ }
+ else
+ {
+ fprintf(stderr, "CPU %d not in domain %s\n", cpu, bdata(affinity->domains[domainidx].tag));
+ }
+ }
+ }
+physical_done:
+ bstrListDestroy(strlist);
+ bdestroy(bdomain);
+ bdestroy(blist);
+ return insert;
+}
+
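+/* Public entry point: split the selector on '@', dispatch each part to the
+ * scatter/expression/logical/physical resolver and concatenate the results
+ * into cpulist. Inside a cpuset only logical numbering is accepted. */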
+int cpustr_to_cpulist(char* cpustring, int* cpulist, int length)
+{
+ int insert = 0;
+ int len = 0;
+ int ret = 0;
+ bstring bcpustr = bfromcstr(cpustring);
+ struct bstrList* strlist;
+ bstring scattercheck = bformat("scatter");
+ topology_init();
+ CpuTopology_t cpuid_topology = get_cpuTopology();
+ strlist = bsplit(bcpustr, '@');
+
+ int* tmpList = (int*)malloc(length * sizeof(int));
+ if (tmpList == NULL)
+ {
+ bstrListDestroy(strlist);
+ bdestroy(scattercheck);
+ bdestroy(bcpustr);
+ return -ENOMEM;
+ }
+ for (int i=0; i< strlist->qty; i++)
+ {
+ if (binstr(strlist->entry[i], 0, scattercheck) != BSTR_ERR)
+ {
+ ret = cpustr_to_cpulist_scatter(strlist->entry[i], tmpList, length);
+ insert += cpulist_concat(cpulist, insert, tmpList, ret);
+ }
+ else if (bstrchrp(strlist->entry[i], 'E', 0) == 0)
+ {
+ ret = cpustr_to_cpulist_expression(strlist->entry[i], tmpList, length);
+ insert += cpulist_concat(cpulist, insert, tmpList, ret);
+ }
+ else if (bstrchrp(strlist->entry[i], 'L', 0) == 0)
+ {
+ ret = cpustr_to_cpulist_logical(strlist->entry[i], tmpList, length);
+ insert += cpulist_concat(cpulist, insert, tmpList, ret);
+ }
+ else if (cpuid_topology->activeHWThreads < cpuid_topology->numHWThreads)
+ {
+ fprintf(stdout, "INFO: You are running LIKWID in a cpuset with %d CPUs, only logical numbering allowed\n", cpuid_topology->activeHWThreads);
+ if (((bstrchrp(strlist->entry[i], 'N', 0) == 0) ||
+ (bstrchrp(strlist->entry[i], 'S', 0) == 0) ||
+ (bstrchrp(strlist->entry[i], 'C', 0) == 0) ||
+ (bstrchrp(strlist->entry[i], 'M', 0) == 0)) &&
+ (bstrchrp(strlist->entry[i], ':', 0) != BSTR_ERR))
+ {
+ bstring newstr = bformat("L:");
+ bconcat(newstr, strlist->entry[i]);
+ ret = cpustr_to_cpulist_logical(newstr, tmpList, length);
+ insert += cpulist_concat(cpulist, insert, tmpList, ret);
+ bdestroy(newstr);
+ }
+ else
+ {
+ bstring newstr = bformat("L:N:");
+ bconcat(newstr, strlist->entry[i]);
+ ret = cpustr_to_cpulist_logical(newstr, tmpList, length);
+ insert += cpulist_concat(cpulist, insert, tmpList, ret);
+ bdestroy(newstr);
+ }
+ }
+ else if (((bstrchrp(strlist->entry[i], 'N', 0) == 0) ||
+ (bstrchrp(strlist->entry[i], 'S', 0) == 0) ||
+ (bstrchrp(strlist->entry[i], 'C', 0) == 0) ||
+ (bstrchrp(strlist->entry[i], 'M', 0) == 0)) &&
+ (bstrchrp(strlist->entry[i], ':', 0) != BSTR_ERR))
+ {
+ bstring newstr = bformat("L:");
+ bconcat(newstr, strlist->entry[i]);
+ ret = cpustr_to_cpulist_logical(newstr, tmpList, length);
+ insert += cpulist_concat(cpulist, insert, tmpList, ret);
+ bdestroy(newstr);
+ }
+
+ else
+ {
+ ret = cpustr_to_cpulist_physical(strlist->entry[i], tmpList, length);
+ insert += cpulist_concat(cpulist, insert, tmpList, ret);
+ }
+ }
+ free(tmpList);
+ bdestroy(bcpustr);
+ bdestroy(scattercheck);
+ bstrListDestroy(strlist);
+ return insert;
+}
+
+int nodestr_to_nodelist(char* nodestr, int* nodes, int length)
+{
+ int ret = 0;
+ bstring prefix = bformat("M");
+ bstring bnodestr = bfromcstr(nodestr);
+ ret = cpuexpr_to_list(bnodestr, prefix, nodes, length);
+ bdestroy(bnodestr);
+ bdestroy(prefix);
+ return ret;
+}
+
+int sockstr_to_socklist(char* sockstr, int* sockets, int length)
+{
+ int ret = 0;
+ bstring prefix = bformat("S");
+ bstring bsockstr = bfromcstr(sockstr);
+ ret = cpuexpr_to_list(bsockstr, prefix, sockets, length);
+ bdestroy(bsockstr);
+ bdestroy(prefix);
+ return ret;
+}
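The functions above form the new cpustring resolver exported through likwid.h for CPU selector strings. A minimal usage sketch; the selector strings and buffer sizes are illustrative and not taken from this patch, and the resolvers call topology_init()/affinity_init() themselves, so no further setup is required:

    #include <stdio.h>
    #include <likwid.h>

    int main(void)
    {
        int cpus[16];
        int sockets[4];
        /* "S0:0-3@S1:0-3": the first four hardware threads of socket 0 and of
         * socket 1 (logical numbering inside each domain), joined with '@'. */
        char selector[] = "S0:0-3@S1:0-3";
        int ncpus = cpustr_to_cpulist(selector, cpus, 16);
        for (int i = 0; i < ncpus; i++)
            printf("slot %d -> CPU %d\n", i, cpus[i]);

        /* Socket indices, validated against the S<idx> affinity domains. */
        char socksel[] = "0,1";
        int nsock = sockstr_to_socklist(socksel, sockets, 4);
        printf("%d sockets selected\n", nsock);
        return 0;
    }

Compile and link with -llikwid.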
diff --git a/src/daemon.c b/src/daemon.c
deleted file mode 100644
index de5bfa5..0000000
--- a/src/daemon.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: daemon.c
- *
- * Description: C Module implementing a daemon time loop
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <signal.h>
-#include <sys/time.h>
-#include <time.h>
-
-#include <timer.h>
-#include <perfmon.h>
-#include <daemon.h>
-
-static volatile int daemon_run = 0;
-static bstring eventString;
-static TimerData timeData;
-static pid_t daemonpid = 0;
-
-
-void
-daemon_start(bstring str, struct timespec interval)
-{
- daemonpid = fork();
- if (daemonpid == 0)
- {
- eventString = bstrcpy(str);
- signal(SIGINT, daemon_interrupt);
- signal(SIGUSR1, daemon_interrupt);
- daemon_run = 1;
- perfmon_setupEventSet(eventString, NULL);
- perfmon_startCounters();
- timer_start(&timeData);
-
- while (1)
- {
- if (daemon_run)
- {
- timer_stop(&timeData);
- perfmon_readCounters();
- perfmon_logCounterResults( timer_print(&timeData) );
- timer_start(&timeData);
- }
- else
- {
- break;
- }
- nanosleep( &interval, NULL);
- }
- signal(SIGINT, SIG_DFL);
- signal(SIGUSR1, SIG_DFL);
- exit(EXIT_SUCCESS);
- }
-}
-
-void
-daemon_stop(int sig)
-{
- if (daemonpid > 0)
- {
- printf("PARENT: KILL daemon with signal %d\n", sig);
- kill(daemonpid, sig);
- //perfmon_stopCounters();
- }
-}
-
-void
-daemon_interrupt(int sig)
-{
- if (sig == SIGUSR1)
- {
- if (daemon_run)
- {
- perfmon_stopCounters();
- daemon_run = 0;
- printf("DAEMON: STOP on %d\n",sig);
- exit(EXIT_SUCCESS);
- }
- else
- {
- perfmon_setupEventSet(eventString, NULL);
- perfmon_startCounters();
- daemon_run = 1;
- printf("DAEMON: START with events %s\n",bdata(eventString));
- }
- } else
- {
- printf("DAEMON: EXIT on %d\n", sig);
- daemon_run = 0;
- exit(EXIT_SUCCESS);
- }
-}
-
-
diff --git a/src/ghash.c b/src/ghash.c
index 87e0ed0..e385a7b 100644
--- a/src/ghash.c
+++ b/src/ghash.c
@@ -1,19 +1,20 @@
-/*
- * =======================================================================================
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
+/* GLIB - Library of useful routines for C programming
+ * Copyright (C) 1995-1997 Peter Mattis, Spencer Kimball and Josh MacDonald
*
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
*
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
*
- * =======================================================================================
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
*/
/*
@@ -42,30 +43,30 @@
#define HASH_IS_TOMBSTONE(h_) ((h_) == TOMBSTONE_HASH_VALUE)
#define HASH_IS_REAL(h_) ((h_) >= 2)
-#ifndef FALSE
-#define FALSE (0)
+#ifndef FALSE
+#define FALSE (0)
#endif
-#ifndef TRUE
-#define TRUE (!FALSE)
+#ifndef TRUE
+#define TRUE (!FALSE)
#endif
-#undef MAX
+#undef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
-#undef MIN
+#undef MIN
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
-#undef ABS
-#define ABS(a) (((a) < 0) ? -(a) : (a))
+#undef ABS
+#define ABS(a) (((a) < 0) ? -(a) : (a))
#define G_LIKELY(expr) (expr)
#define G_UNLIKELY(expr) (expr)
#define _G_NEW(struct_type, n_structs, func) \
((struct_type *) g_##func##_n ((n_structs), sizeof (struct_type)))
-#define g_new(struct_type, n_structs) _G_NEW (struct_type, n_structs, malloc)
-#define g_new0(struct_type, n_structs) _G_NEW (struct_type, n_structs, malloc0)
+#define g_new(struct_type, n_structs) _G_NEW (struct_type, n_structs, malloc)
+#define g_new0(struct_type, n_structs) _G_NEW (struct_type, n_structs, malloc0)
struct _GHashTable
{
@@ -470,7 +471,10 @@ GHashTable *
g_hash_table_new (GHashFunc hash_func,
GEqualFunc key_equal_func)
{
- return g_hash_table_new_full (hash_func, key_equal_func, NULL, NULL);
+ /* Thomas Roehl added g_free as destructor of hash table keys. This reduces
+ * memory leaks since we know that all key strings are duplicated.
+ */
+ return g_hash_table_new_full (hash_func, key_equal_func, g_free, NULL);
}
diff --git a/src/hashTable.c b/src/hashTable.c
index bf6c3d8..46c0c66 100644
--- a/src/hashTable.c
+++ b/src/hashTable.c
@@ -6,13 +6,13 @@
* Description: Hashtable implementation based on SGLIB.
* Used for Marker API result handling.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -60,6 +60,20 @@ void hashTable_init()
}
}
+void hashTable_initThread(int coreID)
+{
+ ThreadList* resPtr = threadList[coreID];
+ /* check if thread was already initialized */
+ if (resPtr == NULL)
+ {
+ resPtr = (ThreadList*) malloc(sizeof(ThreadList));
+ /* initialize structure */
+ resPtr->tid = pthread_self();
+ resPtr->coreId = coreID;
+ resPtr->hashTable = g_hash_table_new(g_str_hash, g_str_equal);
+ threadList[coreID] = resPtr;
+ }
+}
int hashTable_get(bstring label, LikwidThreadResults** resEntry)
{
@@ -86,7 +100,7 @@ int hashTable_get(bstring label, LikwidThreadResults** resEntry)
(*resEntry)->label = bstrcpy (label);
(*resEntry)->time = 0.0;
(*resEntry)->count = 0;
- for (int i=0; i< NUM_PMC; i++)
+ for (int i=0; i< NUM_PMC; i++)
{
(*resEntry)->PMcounters[i] = 0.0;
(*resEntry)->StartPMcounters[i] = 0.0;
@@ -109,7 +123,6 @@ void hashTable_finalize(int* numThreads, int* numRegions, LikwidResults** result
GHashTable* regionLookup;
regionLookup = g_hash_table_new(g_str_hash, g_str_equal);
-
/* determine number of active threads */
for (int i=0; i<MAX_NUM_THREADS; i++)
{
@@ -128,22 +141,57 @@ void hashTable_finalize(int* numThreads, int* numRegions, LikwidResults** result
/* allocate data structures */
(*results) = (LikwidResults*) malloc(numberOfRegions * sizeof(LikwidResults));
-
- for ( uint32_t i=0; i < numberOfRegions; i++ )
+ if (!(*results))
{
- (*results)[i].time = (double*) malloc(numberOfThreads * sizeof(double));
- (*results)[i].count = (uint32_t*) malloc(numberOfThreads * sizeof(uint32_t));
- (*results)[i].counters = (double**) malloc(numberOfThreads * sizeof(double*));
-
- for ( uint32_t j=0; j < numberOfThreads; j++ )
+ fprintf(stderr, "Failed to allocate %lu bytes for the results\n", numberOfRegions * sizeof(LikwidResults));
+ }
+ else
+ {
+ for ( uint32_t i=0; i < numberOfRegions; i++ )
{
- (*results)[i].time[j] = 0.0;
- (*results)[i].count[j] = 0;
- (*results)[i].counters[j] = (double*) malloc(NUM_PMC * sizeof(double));
+ (*results)[i].time = (double*) malloc(numberOfThreads * sizeof(double));
+ if (!(*results)[i].time)
+ {
+ fprintf(stderr, "Failed to allocate %lu bytes for the time storage\n", numberOfThreads * sizeof(double));
+ break;
+ }
+ (*results)[i].count = (uint32_t*) malloc(numberOfThreads * sizeof(uint32_t));
+ if (!(*results)[i].count)
+ {
+ fprintf(stderr, "Failed to allocate %lu bytes for the count storage\n", numberOfThreads * sizeof(uint32_t));
+ break;
+ }
+ (*results)[i].cpulist = (int*) malloc(numberOfThreads * sizeof(int));
+ if (!(*results)[i].cpulist)
+ {
+ fprintf(stderr, "Failed to allocate %lu bytes for the cpulist storage\n", numberOfThreads * sizeof(int));
+ break;
+ }
+ (*results)[i].counters = (double**) malloc(numberOfThreads * sizeof(double*));
+ if (!(*results)[i].counters)
+ {
+ fprintf(stderr, "Failed to allocate %lu bytes for the counter result storage\n", numberOfThreads * sizeof(double*));
+ break;
+ }
- for ( uint32_t k=0; k < NUM_PMC; k++ )
+ for ( uint32_t j=0; j < numberOfThreads; j++ )
{
- (*results)[i].counters[j][k] = 0.0;
+ (*results)[i].time[j] = 0.0;
+ (*results)[i].count[j] = 0;
+ (*results)[i].cpulist[j] = -1;
+ (*results)[i].counters[j] = (double*) malloc(NUM_PMC * sizeof(double));
+ if (!(*results)[i].counters[j])
+ {
+ fprintf(stderr, "Failed to allocate %lu bytes for the counter result storage for thread %d\n", NUM_PMC * sizeof(double), j);
+ break;
+ }
+ else
+ {
+ for ( uint32_t k=0; k < NUM_PMC; k++ )
+ {
+ (*results)[i].counters[j][k] = 0.0;
+ }
+ }
}
}
}
@@ -174,6 +222,7 @@ void hashTable_finalize(int* numThreads, int* numRegions, LikwidResults** result
if ( regionId == NULL )
{
(*results)[currentRegion].tag = bstrcpy (threadResult->label);
+ (*results)[currentRegion].groupID = threadResult->groupID;
regionIds[currentRegion] = currentRegion;
regionId = regionIds + currentRegion;
g_hash_table_insert(regionLookup, g_strdup(key), (regionIds+currentRegion));
@@ -182,17 +231,24 @@ void hashTable_finalize(int* numThreads, int* numRegions, LikwidResults** result
(*results)[*regionId].count[threadId] = threadResult->count;
(*results)[*regionId].time[threadId] = threadResult->time;
+ (*results)[*regionId].cpulist[threadId] = threadResult->cpuID;
for ( int j=0; j < NUM_PMC; j++ )
{
(*results)[*regionId].counters[threadId][j] = threadResult->PMcounters[j];
}
+ bdestroy(threadResult->label);
+ free(threadResult);
}
threadId++;
+ g_hash_table_destroy(resPtr->hashTable);
+ free(resPtr);
+ threadList[core] = NULL;
}
}
-
+ g_hash_table_destroy(regionLookup);
+ regionLookup = NULL;
(*numThreads) = numberOfThreads;
(*numRegions) = numberOfRegions;
}
diff --git a/src/includes/access.h b/src/includes/access.h
new file mode 100644
index 0000000..b81beb8
--- /dev/null
+++ b/src/includes/access.h
@@ -0,0 +1,44 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: access.h
+ *
+ * Description: Header File HPM access Module
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef ACCESS_H
+#define ACCESS_H
+
+void HPMmode(int mode);
+int HPMinit(void);
+int HPMinitialized(void);
+int HPMaddThread(int cpu_id);
+void HPMfinalize();
+int HPMread(int cpu_id, PciDeviceIndex dev, uint32_t reg, uint64_t* data);
+int HPMwrite(int cpu_id, PciDeviceIndex dev, uint32_t reg, uint64_t data);
+int HPMcheck(PciDeviceIndex dev, int cpu_id);
+
+
+#endif
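access.h declares the unified HPM access layer that replaces the old accessClient and raw msr/pci modules removed below. A rough sketch of the intended call sequence for reading one MSR; ACCESSMODE_DAEMON, MSR_DEV and the 0-on-success return convention are assumptions based on likwid.h and pci_types.h, not spelled out in this header:

    #include <stdio.h>
    #include <stdint.h>
    #include <likwid.h>

    int main(void)
    {
        uint64_t tsc = 0;
        HPMmode(ACCESSMODE_DAEMON);   /* route accesses through the access daemon */
        HPMinit();
        HPMaddThread(0);              /* open the access handle for CPU 0 */
        /* 0x10 is the architectural IA32_TIME_STAMP_COUNTER MSR. */
        if (HPMread(0, MSR_DEV, 0x10, &tsc) == 0)
            printf("TSC on CPU 0: %llu\n", (unsigned long long)tsc);
        HPMfinalize();
        return 0;
    }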
diff --git a/src/includes/accessClient.h b/src/includes/accessClient.h
deleted file mode 100644
index 0058182..0000000
--- a/src/includes/accessClient.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: accessClient.h
- *
- * Description: Header File accessClient Module.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ACCESSCLIENT_H
-#define ACCESSCLIENT_H
-
-#include <types.h>
-
-extern int accessClient_mode;
-
-/* This needs to be called BEFORE msr_init and
- * sets how the module tries to access the MSR registers. */
-extern void accessClient_setaccessmode(int mode);
-
-/* This needs to be called BEFORE msr_init and
- * sets the priority the module reports to the daemon.
- * This is a noop in any msr access mode except sysmsrd. */
-extern void accessClient_setlowaccesspriority(void);
-
-/* Initializes the MSR module, trying to open either the MSR files or
- * the connection to the msr daemon. */
-extern void accessClient_init(int* socket_fd);
-extern void accessClient_initThread(int* socket_fd);
-extern void accessClient_finalize(int socket_fd);
-extern uint64_t accessClient_read(int socket_fd, int cpu, int device, uint32_t reg);
-extern void accessClient_write(int socket_fd, int cpu, int device, uint32_t reg, uint64_t data);
-
-#endif /* ACCESSCLIENT_H */
diff --git a/src/includes/accessClient_types.h b/src/includes/accessClient_types.h
deleted file mode 100644
index a0c7a84..0000000
--- a/src/includes/accessClient_types.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: accessClient_types.h
- *
- * Description: Types file for accessClient module.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ACCESSCLIENT_TYPES_H
-#define ACCESSCLIENT_TYPES_H
-
-#include <stdint.h>
-
-/* This naming with AccessType and AccessMode is admittedly a bit confusing */
-typedef enum {
- DAEMON_AM_DIRECT = 0,
- DAEMON_AM_ACCESS_D
-} AccessMode;
-
-typedef enum {
- DAEMON_READ = 0,
- DAEMON_WRITE,
- DAEMON_EXIT
-} AccessType;
-
-typedef enum {
- DAEMON_AD_PCI_R3QPI_LINK_0 = 0,
- DAEMON_AD_PCI_R3QPI_LINK_1,
- DAEMON_AD_PCI_R2PCIE,
- DAEMON_AD_PCI_IMC_CH_0,
- DAEMON_AD_PCI_IMC_CH_1,
- DAEMON_AD_PCI_IMC_CH_2,
- DAEMON_AD_PCI_IMC_CH_3,
- DAEMON_AD_PCI_HA,
- DAEMON_AD_PCI_QPI_PORT_0,
- DAEMON_AD_PCI_QPI_PORT_1,
- DAEMON_AD_PCI_QPI_MASK_PORT_0,
- DAEMON_AD_PCI_QPI_MASK_PORT_1,
- DAEMON_AD_PCI_QPI_MISC_PORT_0,
- DAEMON_AD_PCI_QPI_MISC_PORT_1,
- DAEMON_AD_MSR
-} AccessDevice;
-
-typedef enum {
- ERR_NOERROR = 0, /* no error */
- ERR_UNKNOWN, /* unknown command */
- ERR_RESTREG, /* attempt to access restricted MSR */
- ERR_OPENFAIL, /* failure to open msr files */
- ERR_RWFAIL, /* failure to read/write msr */
- ERR_DAEMONBUSY, /* daemon already has another client */
- ERR_LOCKED, /* access to HPM is locked */
- ERR_UNSUPPORTED, /* unsupported processor */
- ERR_NODEV /* No such device */
-} AccessErrorType;
-
-typedef struct {
- uint32_t cpu;
- uint32_t reg;
- uint64_t data;
- AccessDevice device;
- AccessType type;
- AccessErrorType errorcode; /* Only in replies - 0 if no error. */
-} AccessDataRecord;
-
-#endif /*ACCESSCLIENT_TYPES_H*/
diff --git a/src/includes/access_client.h b/src/includes/access_client.h
new file mode 100644
index 0000000..46f1dbb
--- /dev/null
+++ b/src/includes/access_client.h
@@ -0,0 +1,11 @@
+#ifndef LIKWID_ACCESS_CLIENT_H
+#define LIKWID_ACCESS_CLIENT_H
+
+
+int access_client_init(int cpu_id);
+int access_client_read(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t *data);
+int access_client_write(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t data);
+void access_client_finalize(int cpu_id);
+int access_client_check(PciDeviceIndex dev, int cpu_id);
+
+#endif
diff --git a/src/includes/access_client_types.h b/src/includes/access_client_types.h
new file mode 100644
index 0000000..214aae8
--- /dev/null
+++ b/src/includes/access_client_types.h
@@ -0,0 +1,65 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: access_client_types.h
+ *
+ * Description: Types file for access_client access module.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef ACCESSCLIENT_TYPES_H
+#define ACCESSCLIENT_TYPES_H
+
+#include <stdint.h>
+#include <pci_types.h>
+
+typedef enum {
+ DAEMON_READ = 0,
+ DAEMON_WRITE,
+ DAEMON_CHECK,
+ DAEMON_EXIT
+} AccessType;
+
+typedef enum {
+ ERR_NOERROR = 0, /* no error */
+ ERR_UNKNOWN, /* unknown command */
+ ERR_RESTREG, /* attempt to access restricted MSR */
+ ERR_OPENFAIL, /* failure to open msr files */
+ ERR_RWFAIL, /* failure to read/write msr */
+ ERR_DAEMONBUSY, /* daemon already has another client */
+ ERR_NODEV /* No such device */
+} AccessErrorType;
+
+typedef struct {
+ uint32_t cpu;
+ uint32_t reg;
+ uint64_t data;
+ PciDeviceIndex device;
+ AccessType type;
+ AccessErrorType errorcode; /* Only in replies - 0 if no error. */
+} AccessDataRecord;
+
+extern int accessClient_mode;
+
+#endif /*ACCESSCLIENT_TYPES_H*/
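AccessDataRecord is the request/reply record exchanged with the access daemon. A rough sketch of how a read request could be assembled and sent over an already connected daemon socket; the transport details and the MSR_DEV constant are assumptions, the real client code lives in access_client.c:

    #include <stdint.h>
    #include <unistd.h>
    #include <pci_types.h>
    #include <access_client_types.h>

    /* Sketch: ask the daemon to read one MSR on a given CPU over an
     * already-connected UNIX socket (connection setup omitted). */
    static int request_msr_read(int socket_fd, uint32_t cpu, uint32_t reg, uint64_t *out)
    {
        AccessDataRecord rec = {0};
        rec.cpu = cpu;
        rec.reg = reg;
        rec.device = MSR_DEV;        /* PciDeviceIndex value for plain MSR access */
        rec.type = DAEMON_READ;
        if (write(socket_fd, &rec, sizeof(rec)) != sizeof(rec))
            return -1;
        if (read(socket_fd, &rec, sizeof(rec)) != sizeof(rec))
            return -1;
        if (rec.errorcode != ERR_NOERROR)
            return -(int)rec.errorcode;
        *out = rec.data;
        return 0;
    }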
diff --git a/src/includes/access_x86.h b/src/includes/access_x86.h
new file mode 100644
index 0000000..1628bee
--- /dev/null
+++ b/src/includes/access_x86.h
@@ -0,0 +1,13 @@
+#ifndef LIKWID_ACCESS_X86_H
+#define LIKWID_ACCESS_X86_H
+
+#include <types.h>
+
+int access_x86_init(int cpu_id);
+int access_x86_read(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t *data);
+int access_x86_write(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t data);
+void access_x86_finalize(int cpu_id);
+int access_x86_check(PciDeviceIndex dev, int cpu_id);
+
+
+#endif
diff --git a/src/includes/access_x86_msr.h b/src/includes/access_x86_msr.h
new file mode 100644
index 0000000..a00c45b
--- /dev/null
+++ b/src/includes/access_x86_msr.h
@@ -0,0 +1,12 @@
+#ifndef LIKWID_ACCESS_X86_MSR_H
+#define LIKWID_ACCESS_X86_MSR_H
+
+#include <types.h>
+
+int access_x86_msr_init(const int cpu_id);
+void access_x86_msr_finalize(const int cpu_id);
+int access_x86_msr_read(const int cpu, uint32_t reg, uint64_t *data);
+int access_x86_msr_write(const int cpu, uint32_t reg, uint64_t data);
+int access_x86_msr_check(PciDeviceIndex dev, int cpu_id);
+
+#endif
diff --git a/src/includes/access_x86_pci.h b/src/includes/access_x86_pci.h
new file mode 100644
index 0000000..e932e57
--- /dev/null
+++ b/src/includes/access_x86_pci.h
@@ -0,0 +1,12 @@
+#ifndef LIKWID_ACCESS_X86_PCI_H
+#define LIKWID_ACCESS_X86_PCI_H
+
+#include <types.h>
+
+int access_x86_pci_init(const int socket);
+void access_x86_pci_finalize(const int socket);
+int access_x86_pci_read(PciDeviceIndex dev, const int socket, uint32_t reg, uint64_t *data);
+int access_x86_pci_write(PciDeviceIndex dev, const int socket, uint32_t reg, uint64_t data);
+int access_x86_pci_check(PciDeviceIndex dev, int socket);
+
+#endif
diff --git a/src/includes/affinity.h b/src/includes/affinity.h
index f347e64..6f2215c 100644
--- a/src/includes/affinity.h
+++ b/src/includes/affinity.h
@@ -5,13 +5,14 @@
*
* Description: Header File affinity Module
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -32,17 +33,18 @@
#define AFFINITY_H
#include <types.h>
+#include <likwid.h>
+
+int socket_lock[MAX_NUM_NODES];
+int tile_lock[MAX_NUM_THREADS];
+extern AffinityDomains affinityDomains;
extern int affinity_core2node_lookup[MAX_NUM_THREADS];
-extern void affinity_init();
-extern void affinity_finalize();
-extern int affinity_processGetProcessorId();
-extern int affinity_threadGetProcessorId();
-extern void affinity_pinProcess(int processorId);
-extern void affinity_pinThread(int processorId);
+extern int affinity_processGetProcessorId();
+extern int affinity_threadGetProcessorId();
extern const AffinityDomain* affinity_getDomain(bstring domain);
-extern void affinity_printDomains(FILE* OUTSTREAM);
+
#endif /*AFFINITY_H*/
diff --git a/src/includes/affinity_types.h b/src/includes/affinity_types.h
deleted file mode 100644
index 2b08bfe..0000000
--- a/src/includes/affinity_types.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: affinity_types.h
- *
- * Description: Type Definitions for affinity Module
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef AFFINITY_TYPES_H
-#define AFFINITY_TYPES_H
-
-typedef struct {
- bstring tag;
- uint32_t numberOfProcessors;
- uint32_t numberOfCores;
- int* processorList;
-} AffinityDomain;
-
-
-#endif /*AFFINITY_TYPES_H*/
diff --git a/src/includes/allocator.h b/src/includes/allocator.h
deleted file mode 100644
index a21555c..0000000
--- a/src/includes/allocator.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: allocator.h
- *
- * Description: Header File allocator Module.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: none
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ALLOCATOR_H
-#define ALLOCATOR_H
-
-#include <types.h>
-#include <bstrlib.h>
-
-extern void allocator_init(int numVectors);
-extern void allocator_finalize();
-extern void allocator_allocateVector(FILE* OUTSTREAM,
- void** ptr,
- int alignment,
- uint64_t size,
- int offset,
- DataType type,
- bstring domain);
-
-#endif /*ALLOCATOR_H*/
-
diff --git a/src/includes/asciiBoxes.h b/src/includes/asciiBoxes.h
deleted file mode 100644
index dd37a05..0000000
--- a/src/includes/asciiBoxes.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: asciiBoxes.h
- *
- * Description: Module to draw nested ascii art boxes.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ASCIIBOXES_H
-#define ASCIIBOXES_H
-
-#include <types.h>
-#include <bstrlib.h>
-
-extern BoxContainer* asciiBoxes_allocateContainer(int numLines,int numColumns);
-extern void asciiBoxes_addBox(BoxContainer* container, int line, int column, bstring label);
-extern void asciiBoxes_addJoinedBox(BoxContainer* container, int line, int startColumn, int endColumn, bstring label);
-extern void asciiBoxes_print(FILE* OUTSTREAM, BoxContainer* container);
-
-#endif /*ASCIIBOXES_H*/
diff --git a/src/includes/asciiBoxes_types.h b/src/includes/asciiBoxes_types.h
deleted file mode 100644
index f09c4b3..0000000
--- a/src/includes/asciiBoxes_types.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: asciiBoxes_types.h
- *
- * Description: Types file for asciiBoxes module.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ASCIIBOXES_TYPES_H
-#define ASCIIBOXES_TYPES_H
-
-#include <bstrlib.h>
-
-typedef struct box {
- int width;
- bstring label;
-} Box;
-
-typedef struct boxContainer {
- int numLines;
- int numColumns;
- Box** boxes;
-} BoxContainer;
-
-#endif /*ASCIIBOXES_TYPES_H*/
diff --git a/src/includes/asciiTable.h b/src/includes/asciiTable.h
deleted file mode 100644
index 6096c4a..0000000
--- a/src/includes/asciiTable.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: asciiTable.h
- *
- * Description: Module to create and print a ascii table
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ASCIITABLE_H
-#define ASCIITABLE_H
-
-#include <types.h>
-#include <bstrlib.h>
-
-extern TableContainer* asciiTable_allocate(int numRows,int numColumns, bstrList* headerLabels);
-extern void asciiTable_free(TableContainer* container);
-extern void asciiTable_insertRow(TableContainer* container, int row, bstrList* fields);
-extern void asciiTable_appendRow(TableContainer* container, bstrList* fields);
-extern void asciiTable_setCurrentRow(TableContainer* container, int row);
-extern void asciiTable_print(TableContainer* container);
-extern void asciiTable_setOutput(FILE* stream);
-
-#endif /*ASCIITABLE_H*/
diff --git a/src/includes/asciiTable_types.h b/src/includes/asciiTable_types.h
deleted file mode 100644
index 986a8a2..0000000
--- a/src/includes/asciiTable_types.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: asciiTable_types.h
- *
- * Description: Types file for asciiTable module.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ASCIITABLE_TYPES_H
-#define ASCIITABLE_TYPES_H
-
-#include <bstrlib.h>
-
-typedef struct bstrList bstrList;
-
-typedef struct {
- int numRows;
- int numColumns;
- int currentRow;
- int printed;
- bstrList* header;
- bstrList** rows;
-} TableContainer;
-
-
-#endif /*ASCIITABLE_TYPES_H*/
diff --git a/src/includes/barrier.h b/src/includes/barrier.h
deleted file mode 100644
index 5f4142d..0000000
--- a/src/includes/barrier.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: barrier.h
- *
- * Description: Header File barrier Module
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef BARRIER_H
-#define BARRIER_H
-
-#include <types.h>
-
-/**
- * @brief Initialize the barrier module
- * @param numberOfThreads The total number of threads in the barrier
- */
-extern void barrier_init(int numberOfGroups);
-
-/**
- * @brief Destroy data structures of the barrier module
- */
-extern void barrier_destroy(void);
-
-/**
- * @brief Register a thread for a barrier
- * @param threadId The id of the thread to register
- */
-extern int barrier_registerGroup(int numThreads);
-extern void barrier_registerThread(BarrierData* barr, int groupsId, int threadId);
-
-/**
- * @brief Synchronize threads
- * @param threadId The id of the calling thread
- * @param numberOfThreads Total number of threads in the barrier
- */
-extern void barrier_synchronize(BarrierData* barr);
-
-
-#endif /*BARRIER_H*/
diff --git a/src/includes/barrier_types.h b/src/includes/barrier_types.h
deleted file mode 100644
index d0abb55..0000000
--- a/src/includes/barrier_types.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: barrier_types.h
- *
- * Description: Type Definitions for barrier Module
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef BARRIER_TYPES_H
-#define BARRIER_TYPES_H
-
-#include <stdint.h>
-
-typedef struct {
- int numberOfThreads;
- int offset;
- int val;
- int* index;
- volatile int* bval;
-} BarrierData;
-
-typedef struct {
- int* groupBval;
- int numberOfThreads;
-} BarrierGroup;
-
-#endif /*BARRIER_TYPES_H*/
diff --git a/src/includes/bitUtil.h b/src/includes/bitUtil.h
index c876eea..e10ad65 100644
--- a/src/includes/bitUtil.h
+++ b/src/includes/bitUtil.h
@@ -6,13 +6,13 @@
* Description: Header File bitUtil Module.
* Helper routines for dealing with bit manipulations
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/bstrlib.h b/src/includes/bstrlib.h
index abdbef3..a1160b6 100644
--- a/src/includes/bstrlib.h
+++ b/src/includes/bstrlib.h
@@ -113,11 +113,11 @@ extern struct bstrList * bsplits (const_bstring str, const_bstring splitStr);
extern struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr);
extern bstring bjoin (const struct bstrList * bl, const_bstring sep);
extern int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
- int (* cb) (void * parm, int ofs, int len), void * parm);
+ int (* cb) (void * parm, int ofs, int len), void * parm);
extern int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
- int (* cb) (void * parm, int ofs, int len), void * parm);
+ int (* cb) (void * parm, int ofs, int len), void * parm);
extern int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
- int (* cb) (void * parm, int ofs, int len), void * parm);
+ int (* cb) (void * parm, int ofs, int len), void * parm);
/* Miscellaneous functions */
extern int bpattern (bstring b, int len);
@@ -137,21 +137,21 @@ extern int bvcformata (bstring b, int count, const char * fmt, va_list arglist);
bstring bstrtmp_b = (b); \
const char * bstrtmp_fmt = (fmt); \
int bstrtmp_r = BSTR_ERR, bstrtmp_sz = 16; \
- for (;;) { \
- va_list bstrtmp_arglist; \
- va_start (bstrtmp_arglist, lastarg); \
- bstrtmp_r = bvcformata (bstrtmp_b, bstrtmp_sz, bstrtmp_fmt, bstrtmp_arglist); \
- va_end (bstrtmp_arglist); \
- if (bstrtmp_r >= 0) { /* Everything went ok */ \
- bstrtmp_r = BSTR_OK; \
- break; \
- } else if (-bstrtmp_r <= bstrtmp_sz) { /* A real error? */ \
- bstrtmp_r = BSTR_ERR; \
- break; \
- } \
- bstrtmp_sz = -bstrtmp_r; /* Doubled or target size */ \
- } \
- ret = bstrtmp_r; \
+ for (;;) { \
+ va_list bstrtmp_arglist; \
+ va_start (bstrtmp_arglist, lastarg); \
+ bstrtmp_r = bvcformata (bstrtmp_b, bstrtmp_sz, bstrtmp_fmt, bstrtmp_arglist); \
+ va_end (bstrtmp_arglist); \
+ if (bstrtmp_r >= 0) { /* Everything went ok */ \
+ bstrtmp_r = BSTR_OK; \
+ break; \
+ } else if (-bstrtmp_r <= bstrtmp_sz) { /* A real error? */ \
+ bstrtmp_r = BSTR_ERR; \
+ break; \
+ } \
+ bstrtmp_sz = -bstrtmp_r; /* Doubled or target size */ \
+ } \
+ ret = bstrtmp_r; \
}
#endif
@@ -179,15 +179,15 @@ extern int bsreada (bstring b, struct bStream * s, int n);
extern int bsunread (struct bStream * s, const_bstring b);
extern int bspeek (bstring r, const struct bStream * s);
extern int bssplitscb (struct bStream * s, const_bstring splitStr,
- int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
+ int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
extern int bssplitstrcb (struct bStream * s, const_bstring splitStr,
- int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
+ int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
extern int bseof (const struct bStream * s);
struct tagbstring {
- int mlen;
- int slen;
- unsigned char * data;
+ int mlen;
+ int slen;
+ unsigned char * data;
};
/* Accessor macros */
diff --git a/src/includes/calculator.h b/src/includes/calculator.h
new file mode 100644
index 0000000..67ca564
--- /dev/null
+++ b/src/includes/calculator.h
@@ -0,0 +1,38 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: calculator.h
+ *
+ * Description: Header file for infix calculator
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef CALCULATOR_H
+#define CALCULATOR_H
+
+
+int calculate_infix(char* finfix, double *result);
+
+#endif
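calculate_infix() evaluates a formula string and is what the perfmon code uses to compute derived metrics. A small sketch, assuming the usual 0-on-success return convention; the formula is illustrative:

    #include <stdio.h>
    #include <calculator.h>

    int main(void)
    {
        double result = 0.0;
        /* Formula of the kind the metric definitions produce once counter
         * names have been replaced by measured values. */
        char formula[] = "(2500000.0*64.0)/(0.5*1000000.0)";
        if (calculate_infix(formula, &result) == 0)
            printf("derived metric: %f\n", result);
        return 0;
    }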
diff --git a/src/includes/calculator_stack.h b/src/includes/calculator_stack.h
new file mode 100644
index 0000000..670f317
--- /dev/null
+++ b/src/includes/calculator_stack.h
@@ -0,0 +1,48 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: calculator_stack.h
+ *
+ * Description: Stack implementation for infix calculator
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Brandon Mills (bm), mills.brandont at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) Brandon Mills
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef CALCULATOR_STACK_H
+#define CALCULATOR_STACK_H
+
+typedef struct
+{
+ void **content;
+ int size;
+ int top;
+} Stack;
+
+void stackInit(Stack *s, int size);
+void stackPush(Stack *s, void* val);
+void* stackTop(Stack *s);
+void* stackPop(Stack *s);
+int stackSize(Stack *s);
+void stackFree(Stack *s);
+
+#endif /* CALCULATOR_STACK_H */
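The Stack type backs the infix calculator's evaluation. A short sketch built only from the declarations above; the values are illustrative:

    #include <stdio.h>
    #include <calculator_stack.h>

    int main(void)
    {
        Stack s;
        double a = 1.5, b = 2.5;
        stackInit(&s, 4);        /* capacity of four void* slots */
        stackPush(&s, &a);
        stackPush(&s, &b);
        printf("size=%d top=%f\n", stackSize(&s), *(double*)stackTop(&s));
        /* LIFO order: b comes off before a */
        double *first = (double*)stackPop(&s);
        double *second = (double*)stackPop(&s);
        printf("popped %f then %f\n", *first, *second);
        stackFree(&s);
        return 0;
    }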
diff --git a/src/includes/configuration.h b/src/includes/configuration.h
new file mode 100644
index 0000000..a6a3334
--- /dev/null
+++ b/src/includes/configuration.h
@@ -0,0 +1,46 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: configuration.h
+ *
+ * Description: Header File of Module configuration.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef CONFIGURATION_H
+#define CONFIGURATION_H
+
+#include <types.h>
+#include <likwid.h>
+#include <error.h>
+
+
+extern Configuration config;
+extern int init_config;
+
+
+
+
+
+#endif
diff --git a/src/includes/cpuFeatures.h b/src/includes/cpuFeatures.h
index 9274e40..af4d7c2 100644
--- a/src/includes/cpuFeatures.h
+++ b/src/includes/cpuFeatures.h
@@ -5,13 +5,13 @@
*
* Description: Header File of Module cpuFeatures.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/cpuFeatures_types.h b/src/includes/cpuFeatures_types.h
index 3e7ec5d..87ed2a2 100644
--- a/src/includes/cpuFeatures_types.h
+++ b/src/includes/cpuFeatures_types.h
@@ -5,13 +5,13 @@
*
* Description: Types file for CpuFeature module.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -38,23 +38,23 @@ typedef enum {
IP_PREFETCHER} CpuFeature;
typedef struct {
- unsigned int fastStrings:1;
- unsigned int thermalControl:1;
- unsigned int perfMonitoring:1;
- unsigned int hardwarePrefetcher:1;
- unsigned int ferrMultiplex:1;
- unsigned int branchTraceStorage:1;
- unsigned int pebs:1;
- unsigned int speedstep:1;
- unsigned int monitor:1;
- unsigned int clPrefetcher:1;
- unsigned int speedstepLock:1;
- unsigned int cpuidMaxVal:1;
- unsigned int xdBit:1;
- unsigned int dcuPrefetcher:1;
- unsigned int dynamicAcceleration:1;
- unsigned int turboMode:1;
- unsigned int ipPrefetcher:1;
+ unsigned int fastStrings:1;
+ unsigned int thermalControl:1;
+ unsigned int perfMonitoring:1;
+ unsigned int hardwarePrefetcher:1;
+ unsigned int ferrMultiplex:1;
+ unsigned int branchTraceStorage:1;
+ unsigned int pebs:1;
+ unsigned int speedstep:1;
+ unsigned int monitor:1;
+ unsigned int clPrefetcher:1;
+ unsigned int speedstepLock:1;
+ unsigned int cpuidMaxVal:1;
+ unsigned int xdBit:1;
+ unsigned int dcuPrefetcher:1;
+ unsigned int dynamicAcceleration:1;
+ unsigned int turboMode:1;
+ unsigned int ipPrefetcher:1;
} CpuFeatureFlags;
diff --git a/src/includes/cpuid.h b/src/includes/cpuid.h
index 80c426a..7970ced 100644
--- a/src/includes/cpuid.h
+++ b/src/includes/cpuid.h
@@ -1,19 +1,17 @@
/*
* =======================================================================================
*
- * Filename: cpuid.h
+ * Filename: cpuid.h
*
- * Description: Header File cpuid Module.
- * Reads out cpuid information and initilaizes a global
- * data structure cpuid_info.
+ * Description: Common macro definition for CPUID instruction
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Thomas Roehl (tr), thomas.roehl at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -30,101 +28,32 @@
* =======================================================================================
*/
-#ifndef CPUID_H
-#define CPUID_H
-
-#include <types.h>
-
-/* Intel P6 */
-#define PENTIUM_M_BANIAS 0x09U
-#define PENTIUM_M_DOTHAN 0x0DU
-#define CORE_DUO 0x0EU
-#define CORE2_65 0x0FU
-#define CORE2_45 0x17U
-#define ATOM 0x1CU
-#define ATOM_45 0x26U
-#define ATOM_32 0x36U
-#define ATOM_22 0x27U
-#define ATOM_SILVERMONT_E 0x37U
-#define ATOM_SILVERMONT_C 0x4DU
-#define ATOM_SILVERMONT_F1 0x4AU
-#define ATOM_SILVERMONT_F2 0x5AU
-#define ATOM_SILVERMONT_F3 0x5DU
-#define NEHALEM 0x1AU
-#define NEHALEM_BLOOMFIELD 0x1AU
-#define NEHALEM_LYNNFIELD 0x1EU
-#define NEHALEM_LYNNFIELD_M 0x1FU
-#define NEHALEM_WESTMERE 0x2CU
-#define NEHALEM_WESTMERE_M 0x25U
-#define SANDYBRIDGE 0x2AU
-#define SANDYBRIDGE_EP 0x2DU
-#define HASWELL 0x3CU
-#define HASWELL_EX 0x3FU
-#define HASWELL_M1 0x45U
-#define HASWELL_M2 0x46U
-#define IVYBRIDGE 0x3AU
-#define IVYBRIDGE_EP 0x3EU
-#define NEHALEM_EX 0x2EU
-#define WESTMERE_EX 0x2FU
-#define XEON_MP 0x1DU
-
-/* Intel MIC */
-#define XEON_PHI 0x01U
-
-/* AMD K10 */
-#define BARCELONA 0x02U
-#define SHANGHAI 0x04U
-#define ISTANBUL 0x08U
-#define MAGNYCOURS 0x09U
-
-/* AMD K8 */
-#define OPTERON_SC_1MB 0x05U
-#define OPTERON_DC_E 0x21U
-#define OPTERON_DC_F 0x41U
-#define ATHLON64_X2 0x43U
-#define ATHLON64_X2_F 0x4BU
-#define ATHLON64_F1 0x4FU
-#define ATHLON64_F2 0x5FU
-#define ATHLON64_X2_G 0x6BU
-#define ATHLON64_G1 0x6FU
-#define ATHLON64_G2 0x7FU
-
-
-#define P6_FAMILY 0x6U
-#define MIC_FAMILY 0xBU
-#define NETBURST_FAMILY 0xFFU
-#define K15_FAMILY 0x15U
-#define K16_FAMILY 0x16U
-#define K10_FAMILY 0x10U
-#define K8_FAMILY 0xFU
-
-/** Structure holding cpuid information
- *
- */
-extern CpuInfo cpuid_info;
-extern CpuTopology cpuid_topology;
-
-/** Init routine to intialize global structure.
- *
- * Determines:
- * - cpu family
- * - cpu model
- * - cpu stepping
- * - cpu clock
- * - Instruction Set Extension Flags
- * - Performance counter features (Intel P6 only)
- *
- */
-extern int cpuid_init (void);
-extern void cpuid_print (void);
-extern void cpuid_initTopology (void);
-extern void cpuid_initCacheTopology (void);
-extern int cpuid_isInCpuset(void);
-
-static inline int cpuid_hasFeature(FeatureBit bit)
-{
- return (cpuid_info.featureFlags & (1<<bit));
-}
-
-
-#endif /*CPUID_H*/
+#ifndef LIKWID_CPUID_H
+#define LIKWID_CPUID_H
+
+/* This was taken from the linux kernel
+ * Kernel version 3.19
+ * File: arch/x86/boot/cpuflags.c
+*/
+
+
+#if defined(__i386__) && defined(__PIC__)
+# define EBX_REG "=r"
+#else
+# define EBX_REG "=b"
+#endif
+
+#ifndef __clang__
+#define CPUID(eax,ebx,ecx,edx) \
+ __asm__ volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t" \
+ "cpuid \n\t" \
+ ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t" \
+ : "=a" (eax), "=c" (ecx), "=d" (edx), EBX_REG (ebx) \
+ : "a" (eax), "c" (ecx) \
+ )
+#else
+#define CPUID(eax,ebx,ecx,edx) \
+ __asm__ volatile("cpuid" : "=a" (eax), "=c" (ecx), "=d" (edx), EBX_REG (ebx) : "a" (eax), "c" (ecx) );
+#endif
+
+#endif
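
A short sketch of how the CPUID macro above can be used to read the x86 vendor
string (editorial illustration, not part of the patch; x86 only, and likwid's
cpuid.h shown above is assumed to be found instead of the compiler's own):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>
    #include "cpuid.h"

    int main(void)
    {
        uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
        char vendor[13];

        CPUID(eax, ebx, ecx, edx);    /* leaf 0: max leaf in eax, vendor in ebx/edx/ecx */
        memcpy(vendor + 0, &ebx, 4);
        memcpy(vendor + 4, &edx, 4);
        memcpy(vendor + 8, &ecx, 4);
        vendor[12] = '\0';
        printf("Vendor: %s, max CPUID leaf: %u\n", vendor, eax);
        return 0;
    }
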
diff --git a/src/includes/cpuid_types.h b/src/includes/cpuid_types.h
deleted file mode 100644
index cccc22d..0000000
--- a/src/includes/cpuid_types.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: cpuid_types.h
- *
- * Description: Types file for cpuid module.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef CPUID_TYPES_H
-#define CPUID_TYPES_H
-
-typedef enum {
- NOCACHE=0,
- DATACACHE,
- INSTRUCTIONCACHE,
- UNIFIEDCACHE,
- ITLB,
- DTLB} CacheType;
-
-typedef enum {
- NODE=0,
- SOCKET,
- CORE,
- THREAD} NodeLevel;
-
-typedef enum {
- SSE3=0,
- VSX,
- MMX,
- SSE,
- SSE2,
- MONITOR,
- ACPI,
- RDTSCP,
- VMX,
- EIST,
- TM,
- TM2,
- AES,
- RDRAND,
- SSSE3,
- SSE41,
- SSE42,
- AVX,
- FMA} FeatureBit;
-
-typedef struct {
- uint32_t family;
- uint32_t model;
- uint32_t stepping;
- uint64_t clock;
- int turbo;
- char* name;
- char* features;
- uint32_t featureFlags;
- uint32_t perf_version;
- uint32_t perf_num_ctr;
- uint32_t perf_width_ctr;
- uint32_t perf_num_fixed_ctr;
- int supportUncore;
-} CpuInfo;
-
-typedef struct {
- uint32_t threadId;
- uint32_t coreId;
- uint32_t packageId;
- uint32_t apicId;
-} HWThread;
-
-typedef struct {
- int level;
- CacheType type;
- int associativity;
- int sets;
- int lineSize;
- int size;
- int threads;
- int inclusive;
-} CacheLevel;
-
-typedef struct {
- uint32_t numHWThreads;
- uint32_t numSockets;
- uint32_t numCoresPerSocket;
- uint32_t numThreadsPerCore;
- uint32_t numCacheLevels;
- HWThread* threadPool;
- CacheLevel* cacheLevels;
- TreeNode* topologyTree;
-} CpuTopology;
-
-
-#endif /*CPUID_TYPES_H*/
diff --git a/src/includes/daemon.h b/src/includes/daemon.h
deleted file mode 100644
index 3272636..0000000
--- a/src/includes/daemon.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: daemon.h
- *
- * Description: Header File daemon Module.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef DAEMON_H
-#define DAEMON_H
-
-#include <types.h>
-#include <time.h>
-
-extern void daemon_init();
-extern void daemon_start(bstring str, struct timespec interval);
-extern void daemon_stop(int sig);
-extern void daemon_interrupt(int sig);
-
-#endif /* DAEMON_H */
diff --git a/src/includes/error.h b/src/includes/error.h
index 3c1526f..faabb2e 100644
--- a/src/includes/error.h
+++ b/src/includes/error.h
@@ -5,13 +5,14 @@
*
* Description: Central error handling macros
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -31,60 +32,81 @@
#ifndef ERROR_H
#define ERROR_H
-#include <errno.h>
-#include <string.h>
-#include <msr.h>
+
+#include <likwid.h>
+
+
#define str(x) #x
-#define FINALIZE msr_finalize()
#define ERRNO_PRINT fprintf(stderr, "ERROR - [%s:%d] %s\n", __FILE__, __LINE__, strerror(errno))
#define ERROR \
ERRNO_PRINT; \
- FINALIZE; \
exit(EXIT_FAILURE)
#define ERROR_PLAIN_PRINT(msg) \
- fprintf(stderr, "ERROR - [%s:%d] " str(msg) "\n", __FILE__, __LINE__); \
- FINALIZE; \
- exit(EXIT_FAILURE)
+ fprintf(stderr, "ERROR - [%s:%s:%d] " str(msg) "\n", __FILE__, __func__,__LINE__);
#define ERROR_PRINT(fmt, ...) \
- fprintf(stderr, "ERROR - [%s:%d] " str(fmt) "\n", __FILE__, __LINE__, __VA_ARGS__); \
- FINALIZE; \
- exit(EXIT_FAILURE)
+ fprintf(stderr, "ERROR - [%s:%s:%d] %s.\n" str(fmt) "\n", __FILE__, __func__,__LINE__, strerror(errno), __VA_ARGS__);
#define CHECK_ERROR(func, msg) \
if ((func) < 0) { \
fprintf(stderr, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
}
+#define CHECK_AND_RETURN_ERROR(func, msg) \
+ if ((func) < 0) { \
+ fprintf(stderr, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
+ return errno; \
+ }
+
#define EXIT_IF_ERROR(func, msg) \
if ((func) < 0) { \
fprintf(stderr,"ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
- FINALIZE; \
exit(EXIT_FAILURE); \
}
-#ifndef DEBUGLEV
-#define DEBUGLEV 0
-#endif
+
#define VERBOSEPRINTREG(cpuid,reg,flags,msg) \
- if (perfmon_verbose) { \
+ if (perfmon_verbosity >= DEBUGLEV_DETAIL) \
+ { \
printf("DEBUG - [%s:%d] " str(msg) " [%d] Register 0x%llX , Flags: 0x%llX \n", \
- __FILE__, __LINE__, (cpuid), LLU_CAST (reg), LLU_CAST (flags)); \
+ __func__, __LINE__, (cpuid), LLU_CAST (reg), LLU_CAST (flags)); \
fflush(stdout); \
- }
+ }
+
+#define VERBOSEPRINTPCIREG(cpuid,dev,reg,flags,msg) \
+ if (perfmon_verbosity >= DEBUGLEV_DETAIL) \
+ { \
+ printf("DEBUG - [%s:%d] " str(msg) " [%d] Device %d Register 0x%llX , Flags: 0x%llX \n", \
+ __func__, __LINE__, (cpuid), dev, LLU_CAST (reg), LLU_CAST (flags)); \
+ fflush(stdout); \
+ }
#define DEBUG_PRINT(lev, fmt, ...) \
- if (DEBUGLEV > lev) { \
- printf(fmt, __VA_ARGS__); \
+ if ((lev >= 0) && (lev <= perfmon_verbosity)) { \
+ fprintf(stdout, "DEBUG - [%s:%d] " str(fmt) "\n", __func__, __LINE__,__VA_ARGS__); \
+ fflush(stdout); \
+ }
+
+#define DEBUG_PLAIN_PRINT(lev, msg) \
+ if ((lev >= 0) && (lev <= perfmon_verbosity)) { \
+ fprintf(stdout, "DEBUG - [%s:%d] " str(msg) "\n",__func__, __LINE__); \
fflush(stdout); \
}
+
+#define CHECK_MSR_WRITE_ERROR(func) CHECK_AND_RETURN_ERROR(func, MSR write operation failed);
+#define CHECK_MSR_READ_ERROR(func) CHECK_AND_RETURN_ERROR(func, MSR read operation failed);
+#define CHECK_PCI_WRITE_ERROR(func) CHECK_AND_RETURN_ERROR(func, PCI write operation failed);
+#define CHECK_PCI_READ_ERROR(func) CHECK_AND_RETURN_ERROR(func, PCI read operation failed);
+#define CHECK_POWER_READ_ERROR(func) CHECK_AND_RETURN_ERROR(func, Power register read operation failed);
+#define CHECK_TEMP_READ_ERROR(func) CHECK_AND_RETURN_ERROR(func, Temperature register read operation failed);
+
#endif /*ERROR_H*/
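
For reference, a sketch of how the CHECK_ERROR and EXIT_IF_ERROR macros above
are typically wrapped around system calls (editorial illustration, not part of
the patch; it assumes likwid's include directory is on the include path so
that this error.h, and the likwid.h it includes, are found):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <errno.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include "error.h"

    int main(void)
    {
        int fd = open("/dev/cpu/0/msr", O_RDONLY);
        CHECK_ERROR(fd, Opening MSR device file failed);   /* prints the error and continues */
        if (fd >= 0)
        {
            EXIT_IF_ERROR(close(fd), Closing MSR device file failed);  /* prints and exits on failure */
        }
        return 0;
    }
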
diff --git a/src/includes/ghash.h b/src/includes/ghash.h
index f33e9fb..75a17fd 100644
--- a/src/includes/ghash.h
+++ b/src/includes/ghash.h
@@ -1,20 +1,20 @@
-/*
- * =======================================================================================
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
+/* GLIB - Library of useful routines for C programming
+ * Copyright (C) 1995-1997 Peter Mattis, Spencer Kimball and Josh MacDonald
*
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
*
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
*
- * =======================================================================================
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
*/
/*
@@ -59,13 +59,13 @@ typedef struct _GHashTableIter GHashTableIter;
struct _GHashTableIter
{
- /*< private >*/
- gpointer dummy1;
- gpointer dummy2;
- gpointer dummy3;
- int dummy4;
- gboolean dummy5;
- gpointer dummy6;
+ /*< private >*/
+ gpointer dummy1;
+ gpointer dummy2;
+ gpointer dummy3;
+ int dummy4;
+ gboolean dummy5;
+ gpointer dummy6;
};
char* g_strdup (const char *str);
diff --git a/src/includes/hashTable.h b/src/includes/hashTable.h
index 078fff9..4da4cbf 100644
--- a/src/includes/hashTable.h
+++ b/src/includes/hashTable.h
@@ -3,17 +3,17 @@
*
* Filename: hashTable.h
*
- * Description: Header File hashtable Module.
- * Wrapper for HAshTable data structure holding thread
+ * Description: Header File hashtable Module.
+ * Wrapper for HashTable data structure holding thread
* specific region information.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -37,6 +37,7 @@
#include <types.h>
extern void hashTable_init();
+void hashTable_initThread(int coreID);
extern int hashTable_get(bstring regionTag, LikwidThreadResults** result);
extern void hashTable_finalize(int* numberOfThreads, int* numberOfRegions, LikwidResults** results);
diff --git a/src/includes/libperfctr_types.h b/src/includes/libperfctr_types.h
index 99a38dc..6e375b6 100644
--- a/src/includes/libperfctr_types.h
+++ b/src/includes/libperfctr_types.h
@@ -5,13 +5,13 @@
*
* Description: Types file for libperfctr module.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -37,15 +37,22 @@ typedef struct LikwidThreadResults{
bstring label;
double time;
TimerData startTime;
+ int groupID;
+ int cpuID;
uint32_t count;
double StartPMcounters[NUM_PMC];
+ int StartOverflows[NUM_PMC];
double PMcounters[NUM_PMC];
} LikwidThreadResults;
typedef struct {
bstring tag;
+ int groupID;
+ int threadCount;
+ int eventCount;
double* time;
uint32_t* count;
+ int* cpulist;
double** counters;
} LikwidResults;
diff --git a/src/includes/likwid.h b/src/includes/likwid.h
index dd4cdfd..d900a0d 100644
--- a/src/includes/likwid.h
+++ b/src/includes/likwid.h
@@ -3,15 +3,16 @@
*
* Filename: likwid.h
*
- * Description: Header File of likwid marker API
+ * Description: Header File of likwid API
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Authors: Thomas Roehl (tr), thomas.roehl at googlemail.com
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -31,35 +32,1387 @@
#ifndef LIKWID_H
#define LIKWID_H
+#include <stdint.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bstrlib.h>
+
+#define DEBUGLEV_ONLY_ERROR 0
+#define DEBUGLEV_INFO 1
+#define DEBUGLEV_DETAIL 2
+#define DEBUGLEV_DEVELOP 3
+
+extern int perfmon_verbosity;
+
+/** \addtogroup MarkerAPI Marker API module
+* @{
+*/
+/*!
+\def LIKWID_MARKER_INIT
+Shortcut for likwid_markerInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_THREADINIT
+Shortcut for likwid_markerThreadInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_REGISTER(regionTag)
+Shortcut for likwid_markerRegisterRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_START(regionTag)
+Shortcut for likwid_markerStartRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_STOP(regionTag)
+Shortcut for likwid_markerStopRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
+Shortcut for likwid_markerGetResults() for \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_SWITCH
+Shortcut for likwid_markerNextGroup() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_CLOSE
+Shortcut for likwid_markerClose() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/** @}*/
+
#ifdef LIKWID_PERFMON
#define LIKWID_MARKER_INIT likwid_markerInit()
#define LIKWID_MARKER_THREADINIT likwid_markerThreadInit()
-#define LIKWID_MARKER_START(reg) likwid_markerStartRegion(reg)
-#define LIKWID_MARKER_STOP(reg) likwid_markerStopRegion(reg)
+#define LIKWID_MARKER_SWITCH likwid_markerNextGroup()
+#define LIKWID_MARKER_REGISTER(regionTag) likwid_markerRegisterRegion(regionTag)
+#define LIKWID_MARKER_START(regionTag) likwid_markerStartRegion(regionTag)
+#define LIKWID_MARKER_STOP(regionTag) likwid_markerStopRegion(regionTag)
#define LIKWID_MARKER_CLOSE likwid_markerClose()
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) likwid_markerGetRegion(regionTag, nevents, events, time, count)
#else
#define LIKWID_MARKER_INIT
#define LIKWID_MARKER_THREADINIT
-#define LIKWID_MARKER_START(reg)
-#define LIKWID_MARKER_STOP(reg)
+#define LIKWID_MARKER_SWITCH
+#define LIKWID_MARKER_REGISTER(regionTag)
+#define LIKWID_MARKER_START(regionTag)
+#define LIKWID_MARKER_STOP(regionTag)
#define LIKWID_MARKER_CLOSE
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
#endif
#ifdef __cplusplus
extern "C" {
#endif
-/* marker API routines */
-extern void likwid_markerInit(void);
-extern void likwid_markerThreadInit(void);
-extern void likwid_markerClose(void);
-extern void likwid_markerStartRegion(const char* regionTag);
-extern void likwid_markerStopRegion(const char* regionTag);
+
+/*
+################################################################################
+# Marker API related functions
+################################################################################
+*/
+/** \addtogroup MarkerAPI Marker API module
+* @{
+*/
+/*! \brief Initialize LIKWID's marker API
+
+Must be called in serial region of the application to set up basic data structures
+of LIKWID.
+Reads environment variables:
+- LIKWID_MODE (access mode)
+- LIKWID_MASK (event bitmask)
+- LIKWID_EVENTS (event string)
+- LIKWID_THREADS (cpu list separated by ,)
+- LIKWID_GROUPS (amount of groups)
+*/
+extern void likwid_markerInit(void) __attribute__ ((visibility ("default") ));
+/*! \brief Initialize LIKWID's marker API for the current thread
+
+Must be called in parallel region of the application to set up basic data structures
+of LIKWID. Before you can call likwid_markerThreadInit() you have to call likwid_markerInit().
+
+*/
+extern void likwid_markerThreadInit(void) __attribute__ ((visibility ("default") ));
+/*! \brief Select next group to measure
+
+Must be called in parallel region of the application to switch group on every CPU.
+*/
+extern void likwid_markerNextGroup(void) __attribute__ ((visibility ("default") ));
+/*! \brief Close LIKWID's marker API
+
+Must be called in serial region of the application. It gathers all data of regions and
+writes them out to a file (filepath in env variable LIKWID_FILEPATH).
+*/
+extern void likwid_markerClose(void) __attribute__ ((visibility ("default") ));
+/*! \brief Register a measurement region
+
+Initializes the hashTable entry in order to reduce execution time of likwid_markerStartRegion()
+ at param regionTag [in] Initialize data using this string
+ at return Error code
+*/
+extern int likwid_markerRegisterRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+/*! \brief Start a measurement region
+
+Reads the values of all configured counters and saves the results under the name given
+in regionTag.
+ at param regionTag [in] Store data using this string
+ at return Error code of start operation
+*/
+extern int likwid_markerStartRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+/*! \brief Stop a measurement region
+
+Reads the values of all configured counters and saves the results under the name given
+in regionTag. The measurement data of the stopped region gets summed up in global region counters.
+ at param regionTag [in] Store data using this string
+ at return Error code of stop operation
+*/
+extern int likwid_markerStopRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get accumulated data of a code region
+
+Get the accumulated data of the current thread for the given regionTag.
+ at param regionTag [in] Print data using this string
+ at param nr_events [in,out] Length of events array
+ at param events [out] Events array for the intermediate results
+ at param time [out] Accumulated measurement time
+ at param count [out] Call count of the code region
+*/
+extern void likwid_markerGetRegion(const char* regionTag, int* nr_events, double* events, double *time, int *count) __attribute__ ((visibility ("default") ));
/* utility routines */
-extern int likwid_getProcessorId();
-extern int likwid_pinProcess(int processorId);
-extern int likwid_pinThread(int processorId);
+/*! \brief Get CPU ID of the current process/thread
+
+Returns the ID of the CPU the current process or thread is running on.
+ at return current CPU ID
+*/
+extern int likwid_getProcessorId() __attribute__ ((visibility ("default") ));
+/*! \brief Pin the current process to given CPU
+
+Pin the current process to the given CPU ID. The process cannot be scheduled to
+another CPU after pinning but the pinning can be changed anytime with this function.
+ at param [in] processorId CPU ID to pin the current process to
+ at return error code (1 for success, 0 for error)
+*/
+extern int likwid_pinProcess(int processorId) __attribute__ ((visibility ("default") ));
+/*! \brief Pin the current thread to given CPU
+
+Pin the current thread to the given CPU ID. The thread cannot be scheduled to
+another CPU after pinning but the pinning can be changed anytime with this function
+ at param [in] processorId CPU ID to pin the current thread to
+ at return error code (1 for success, 0 for error)
+*/
+extern int likwid_pinThread(int processorId) __attribute__ ((visibility ("default") ));
+/** @}*/
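
As a usage reference for the marker macros and functions documented above, a
minimal instrumented OpenMP loop (editorial illustration, not part of the
patch; the usual workflow is to compile with -DLIKWID_PERFMON -fopenmp, link
with -llikwid and run the binary under likwid-perfctr with the -m switch):

    #include <stdio.h>
    #include <likwid.h>

    int main(void)
    {
        double sum = 0.0;
        LIKWID_MARKER_INIT;
        #pragma omp parallel
        {
            LIKWID_MARKER_THREADINIT;
            LIKWID_MARKER_START("calc");
            #pragma omp for reduction(+:sum)
            for (int i = 0; i < 100000000; i++)
                sum += i * 1.000001;
            LIKWID_MARKER_STOP("calc");
        }
        LIKWID_MARKER_CLOSE;
        printf("sum = %f\n", sum);
        return 0;
    }
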
+
+/*
+################################################################################
+# Access client related functions
+################################################################################
+*/
+/** \addtogroup Access Access module
+ * @{
+ */
+
+/*! \brief Enum for the access modes
+
+LIKWID supports multiple access modes to the MSR and PCI performance monitoring
+registers. For direct access the user must have enough priviledges to access the
+MSR and PCI devices. The daemon mode forwards the operations to a daemon with
+higher priviledges.
+*/
+typedef enum {
+ ACCESSMODE_DIRECT = 0, /*!< \brief Access performance monitoring registers directly */
+ ACCESSMODE_DAEMON = 1 /*!< \brief Use the access daemon to access the registers */
+} AccessMode;
+
+/*! \brief Set access mode
+
+Sets how the MSR and PCI registers should be accessed: 0 for direct access (probably requiring root privileges) and 1 for accesses through the access daemon. It must be called before HPMinit().
+ at param [in] mode (0=direct, 1=daemon)
+*/
+extern void HPMmode(int mode) __attribute__ ((visibility ("default") ));
+/*! \brief Initialize access module
+
+Initialize the module internals to either the MSR/PCI files or the access daemon
+ at return error code (0 for success)
+*/
+extern int HPMinit() __attribute__ ((visibility ("default") ));
+/*! \brief Add CPU to access module
+
+Add the given CPU to the access module. This opens the communication to either the MSR/PCI files or the access daemon.
+ at param [in] cpu_id CPU that should be enabled for measurements
+ at return error code (0 for success, -ENODEV if access cannot be initialized)
+*/
+extern int HPMaddThread(int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Close connections
+
+Close the connections to the MSR/PCI files or the access daemon
+*/
+extern void HPMfinalize() __attribute__ ((visibility ("default") ));
+/** @}*/
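
A minimal sketch of the access-module calls documented above (editorial
illustration, not part of the patch):

    #include <likwid.h>

    int main(void)
    {
        HPMmode(ACCESSMODE_DAEMON);    /* must be chosen before HPMinit() */
        if (HPMinit() != 0)
            return 1;
        if (HPMaddThread(0) != 0)      /* open register access for CPU 0 */
        {
            HPMfinalize();
            return 1;
        }
        /* ... register accesses, e.g. through the perfmon module ... */
        HPMfinalize();
        return 0;
    }
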
+
+/*
+################################################################################
+# Config file related functions
+################################################################################
+*/
+/** \addtogroup Config Config file module
+* @{
+*/
+/*! \brief Structure holding values of the configuration file
+
+LIKWID supports the definition of runtime values in a configuration file. The
+most important configurations in most cases are the path to the access daemon and
+the corresponding access mode. In order to avoid reading in the system topology
+at each start, a path to a topology file can be set. The other values are mostly
+used internally.
+*/
+typedef struct {
+ char* configFileName; /*!< \brief Path to the configuration file */
+ char* topologyCfgFileName; /*!< \brief Path to the topology file */
+ char* daemonPath; /*!< \brief Path of the access daemon */
+ char* groupPath; /*!< \brief Path of default performance group directory */
+ AccessMode daemonMode; /*!< \brief Access mode to the MSR and PCI registers */
+ int maxNumThreads; /*!< \brief Maximum number of HW threads */
+ int maxNumNodes; /*!< \brief Maximum number of NUMA nodes */
+} Configuration;
+
+/** \brief Pointer for exporting the Configuration data structure */
+typedef Configuration* Configuration_t;
+/*! \brief Read the config file of LIKWID, if it exists
+
+Searches for the LIKWID config file and reads the values in.
+Currently the paths /usr/local/etc/likwid.cfg, /etc/likwid.cfg and the path
+defined in config.mk are checked.
+ at return error code (0 for success, -EFAULT if no file can be found)
+*/
+extern int init_configuration(void) __attribute__ ((visibility ("default") ));
+/*! \brief Destroy the config structure
+
+Destroys the current config structure and frees all allocated memory for path names
+ at return error code (0 for success, -EFAULT if config structure not initialized)
+*/
+extern int destroy_configuration(void) __attribute__ ((visibility ("default") ));
+
+
+/*! \brief Retrieve the config structure
+
+Get the initialized configuration
+\sa Configuration_t
+ at return Configuration_t (pointer to internal Configuration structure)
+*/
+extern Configuration_t get_configuration(void) __attribute__ ((visibility ("default") ));
+
+/*! \brief Set the group path in the config structure
+
+Set the group path in the config structure. The path must be a directory.
+ at param [in] path
+ at return error code (0 for success, -ENOMEM if reallocation failed, -ENOTDIR if not a directory)
+*/
+extern int config_setGroupPath(char* path) __attribute__ ((visibility ("default") ));
+
+/** @}*/
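
A short sketch of the configuration-file API documented above (editorial
illustration, not part of the patch; daemonPath may be unset when the direct
access mode is configured):

    #include <stdio.h>
    #include <likwid.h>

    int main(void)
    {
        if (init_configuration() != 0)
            return 1;
        Configuration_t cfg = get_configuration();
        printf("access mode: %d, max threads: %d, daemon: %s\n",
               (int)cfg->daemonMode, cfg->maxNumThreads,
               cfg->daemonPath ? cfg->daemonPath : "(none)");
        destroy_configuration();
        return 0;
    }
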
+/*
+################################################################################
+# CPU topology related functions
+################################################################################
+*/
+/** \addtogroup CPUTopology CPU information module
+* @{
+*/
+/*! \brief Structure with general CPU information
+
+General information covers CPU family, model, name and current clock and vendor
+specific information like the version of Intel's performance monitoring facility.
+*/
+typedef struct {
+ uint32_t family; /*!< \brief CPU family ID*/
+ uint32_t model; /*!< \brief CPU model ID */
+ uint32_t stepping; /*!< \brief Stepping (version) of the CPU */
+ uint64_t clock; /*!< \brief Current clock frequency of the executing CPU*/
+ int turbo; /*!< \brief Flag if CPU has a turbo mode */
+ char* osname; /*!< \brief Name of the CPU reported by OS */
+ char* name; /*!< \brief Name of the CPU as identified by LIKWID */
+ char* short_name; /*!< \brief Short name of the CPU*/
+ char* features; /*!< \brief String with all features supported by the CPU*/
+ int isIntel; /*!< \brief Flag if it is an Intel CPU*/
+ int supportUncore; /*!< \brief Flag if system has Uncore performance monitors */
+ uint32_t featureFlags; /*!< \brief Mask of all features supported by the CPU*/
+ uint32_t perf_version; /*!< \brief Version of Intel's performance monitoring facility */
+ uint32_t perf_num_ctr; /*!< \brief Number of general purpose core-local performance monitoring counters */
+ uint32_t perf_width_ctr; /*!< \brief Bit width of fixed and general purpose counters */
+ uint32_t perf_num_fixed_ctr; /*!< \brief Number of fixed purpose core-local performance monitoring counters */
+} CpuInfo;
+
+/*! \brief Structure with IDs of a HW thread
+
+For each HW thread this structure stores the ID of the thread inside a CPU, the
+CPU core ID of the HW thread and the CPU socket ID.
+\extends CpuTopology
+*/
+typedef struct {
+ uint32_t threadId; /*!< \brief ID of HW thread inside the CPU core */
+ uint32_t coreId; /*!< \brief ID of CPU core that executes the HW thread */
+ uint32_t packageId; /*!< \brief ID of CPU socket containing the HW thread */
+ uint32_t apicId; /*!< \brief ID of HW thread retrieved through the Advanced Programmable Interrupt Controller */
+ uint32_t inCpuSet; /*!< \brief Flag if the HW thread is contained in the calling process' cpuset */
+} HWThread;
+
+/*! \brief Enum of possible caches
+
+CPU caches can have different tasks and hold different kinds of data. This enum lists all cache types used in the supported CPUs.
+\extends CacheLevel
+*/
+typedef enum {
+ NOCACHE=0, /*!< \brief No cache used as undef value */
+ DATACACHE, /*!< \brief Cache holding data cache lines */
+ INSTRUCTIONCACHE, /*!< \brief Cache holding instruction cache lines */
+ UNIFIEDCACHE, /*!< \brief Cache holding both instruction and data cache lines */
+ ITLB, /*!< \brief Translation Lookaside Buffer cache for instruction pages */
+ DTLB /*!< \brief Translation Lookaside Buffer cache for data pages */
+} CacheType;
+
+/*! \brief Structure describing a cache level
+
+CPUs are connected to a cache hierarchy with different amount of caches at each level. The CacheLevel structure holds general information about the cache.
+\extends CpuTopology
+*/
+typedef struct {
+ uint32_t level; /*!< \brief Level of the cache in the hierarchy */
+ CacheType type; /*!< \brief Type of the cache */
+ uint32_t associativity; /*!< \brief Amount of cache lines held by each set */
+ uint32_t sets; /*!< \brief Amount of sets */
+ uint32_t lineSize; /*!< \brief Size in bytes of one cache line */
+ uint32_t size; /*!< \brief Size in bytes of the cache */
+ uint32_t threads; /*!< \brief Number of HW threads connected to the cache */
+ uint32_t inclusive; /*!< \brief Flag if cache is inclusive (holds also cache lines available in caches nearer to the CPU) or exclusive */
+} CacheLevel;
+
+/*! \brief Structure describing the topology of the HW threads in the system
+
+This structure describes the topology at HW thread level like the amount of HW threads, how they are distributed over the CPU sockets/packages and how the caching hierarchy is assembled.
+*/
+typedef struct {
+ uint32_t numHWThreads; /*!< \brief Amount of HW threads in the system and length of \a threadPool */
+ uint32_t activeHWThreads; /*!< \brief Amount of HW threads in the system that are currently usable */
+ uint32_t numSockets; /*!< \brief Amount of CPU sockets/packages in the system */
+ uint32_t numCoresPerSocket; /*!< \brief Amount of physical cores in one CPU socket/package */
+ uint32_t numThreadsPerCore; /*!< \brief Amount of HW threads in one physical CPU core */
+ uint32_t numCacheLevels; /*!< \brief Amount of caches for each HW thread and length of \a cacheLevels */
+ HWThread* threadPool; /*!< \brief List of all HW thread descriptions */
+ CacheLevel* cacheLevels; /*!< \brief List of all caches in the hierarchy */
+ struct treeNode* topologyTree; /*!< \brief Anchor for a tree structure describing the system topology */
+} CpuTopology;
+
+/*! \brief Variable holding the global cpu information structure */
+extern CpuInfo cpuid_info;
+/*! \brief Variable holding the global cpu topology structure */
+extern CpuTopology cpuid_topology;
+
+/** \brief Pointer for exporting the CpuInfo data structure */
+typedef CpuInfo* CpuInfo_t;
+/** \brief Pointer for exporting the CpuTopology data structure */
+typedef CpuTopology* CpuTopology_t;
+/*! \brief Initialize topology information
+
+CpuInfo_t and CpuTopology_t are initialized by either HWLOC, CPUID/ProcFS or topology file if present. The topology file name can be configured in the configuration file. Furthermore, the paths /etc/likwid_topo.cfg and <PREFIX>/etc/likwid_topo.cfg are checked.
+\sa CpuInfo_t and CpuTopology_t
+ at return always 0
+*/
+extern int topology_init(void) __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve CPU topology of the current machine
+
+\sa CpuTopology_t
+ at return CpuTopology_t (pointer to internal cpuid_topology structure)
+*/
+extern CpuTopology_t get_cpuTopology(void) __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve CPU information of the current machine
+
+Get the previously initialized CPU info structure containing number of CPUs/Threads
+\sa CpuInfo_t
+ at return CpuInfo_t (pointer to internal cpuid_info structure)
+*/
+extern CpuInfo_t get_cpuInfo(void) __attribute__ ((visibility ("default") ));
+/*! \brief Destroy topology structures CpuInfo_t and CpuTopology_t.
+
+Retrieved pointers to the structures are not valid anymore after this function call
+\sa CpuInfo_t and CpuTopology_t
+*/
+extern void topology_finalize(void) __attribute__ ((visibility ("default") ));
+/*! \brief Print all supported architectures
+*/
+extern void print_supportedCPUs(void) __attribute__ ((visibility ("default") ));
+/** @}*/
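
A minimal sketch of the topology API documented above (editorial illustration,
not part of the patch):

    #include <stdio.h>
    #include <likwid.h>

    int main(void)
    {
        topology_init();
        CpuInfo_t info = get_cpuInfo();
        CpuTopology_t topo = get_cpuTopology();
        printf("%s: %u sockets, %u cores per socket, %u threads per core\n",
               info->name, topo->numSockets, topo->numCoresPerSocket,
               topo->numThreadsPerCore);
        topology_finalize();
        return 0;
    }
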
+/*
+################################################################################
+# NUMA related functions
+################################################################################
+*/
+/** \addtogroup NumaTopology NUMA memory topology module
+ * @{
+ */
+/*! \brief CPUs in NUMA node and general information about a NUMA domain
+
+The NumaNode structure describes the topology and holds general information of a
+NUMA node. The structure is filled by calling numa_init() by either the HWLOC
+library or by evaluating the /proc filesystem.
+\extends NumaTopology
+*/
+typedef struct {
+ uint32_t id; /*!< \brief ID of the NUMA node */
+ uint64_t totalMemory; /*!< \brief Amount of memory in the NUMA node */
+ uint64_t freeMemory; /*!< \brief Amount of free memory in the NUMA node */
+ uint32_t numberOfProcessors; /*!< \brief Number of processors covered by the NUMA node and length of \a processors */
+ uint32_t* processors; /*!< \brief List of HW threads in the NUMA node */
+ uint32_t numberOfDistances; /*!< \brief Amount of distances to the other NUMA nodes in the system and self */
+ uint32_t* distances; /*!< \brief List of distances to the other NUMA nodes and self */
+} NumaNode;
+
+
+/*! \brief The NumaTopology structure describes all NUMA nodes in the current system.
+*/
+typedef struct {
+ uint32_t numberOfNodes; /*!< \brief Number of NUMA nodes in the system and length of \a nodes */
+ NumaNode* nodes; /*!< \brief List of NUMA nodes */
+} NumaTopology;
+
+/*! \brief Variable holding the global NUMA information structure */
+extern NumaTopology numa_info;
+
+/** \brief Pointer for exporting the NumaTopology data structure */
+typedef NumaTopology* NumaTopology_t;
+
+/*! \brief Initialize NUMA information
+
+Initialize NUMA information NumaTopology_t using either HWLOC or CPUID/ProcFS. If
+a topology config file is present it is read at topology_init() and fills \a NumaTopology_t
+\sa NumaTopology_t
+ at return error code (0 for success, -1 if initialization failed)
+*/
+extern int numa_init(void) __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve NUMA information of the current machine
+
+Get the previously initialized NUMA info structure
+\sa NumaTopology_t
+ at return NumaTopology_t (pointer to internal numa_info structure)
+*/
+extern NumaTopology_t get_numaTopology(void) __attribute__ ((visibility ("default") ));
+/*! \brief Set memory allocation policy to interleaved
+
+Set the memory allocation policy to interleaved for given list of CPUs
+ at param [in] processorList List of processors
+ at param [in] numberOfProcessors Length of processor list
+*/
+extern void numa_setInterleaved(int* processorList, int numberOfProcessors) __attribute__ ((visibility ("default") ));
+/*! \brief Allocate memory from a specific NUMA node
+ at param [in,out] ptr Start pointer of memory
+ at param [in] size Size for the allocation
+ at param [in] domainId ID of NUMA node for the allocation
+*/
+extern void numa_membind(void* ptr, size_t size, int domainId) __attribute__ ((visibility ("default") ));
+/*! \brief Destroy NUMA information structure
+
+Destroys the NUMA information structure NumaTopology_t. Retrieved pointers
+to the structures are not valid anymore after this function call
+\sa NumaTopology_t
+*/
+extern void numa_finalize(void) __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve the number of NUMA nodes
+
+Returns the number of NUMA nodes of the current machine. Can also be read out of
+NumaTopology_t
+\sa NumaTopology_t
+ at return Number of NUMA nodes
+*/
+extern int likwid_getNumberOfNodes(void) __attribute__ ((visibility ("default") ));
+/** @}*/
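
A sketch of the NUMA API documented above (editorial illustration, not part of
the patch; topology_init() is assumed to be required before numa_init()):

    #include <stdio.h>
    #include <likwid.h>

    int main(void)
    {
        topology_init();
        if (numa_init() != 0)
            return 1;
        NumaTopology_t numa = get_numaTopology();
        for (uint32_t i = 0; i < numa->numberOfNodes; i++)
            printf("node %u: %u processors, total memory %llu\n",
                   numa->nodes[i].id, numa->nodes[i].numberOfProcessors,
                   (unsigned long long)numa->nodes[i].totalMemory);
        numa_finalize();
        topology_finalize();
        return 0;
    }
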
+/*
+################################################################################
+# Affinity domains related functions
+################################################################################
+*/
+/** \addtogroup AffinityDomains Thread affinity module
+ * @{
+ */
+
+/*! \brief The AffinityDomain data structure describes a single domain in the current system
+
+The AffinityDomain data structure describes a single domain in the current system. Example domains are NUMA nodes, CPU sockets/packages or LLC (Last Level Cache) cache domains.
+\extends AffinityDomains
+*/
+typedef struct {
+ bstring tag; /*!< \brief Bstring with the ID for the affinity domain. Currently possible values: N (node), SX (socket/package X), CX (LLC cache domain X) and MX (memory domain X) */
+ uint32_t numberOfProcessors; /*!< \brief Number of HW threads in the domain and length of \a processorList */
+ uint32_t numberOfCores; /*!< \brief Number of CPU cores in the domain */
+ int* processorList; /*!< \brief List of HW thread IDs in the domain */
+} AffinityDomain;
+
+/*! \brief The AffinityDomains data structure holds different count variables describing the
+various system layers
+
+Affinity domains are, for example, the NUMA domains, CPU sockets/packages or LLC
+(Last Level Cache) cache domains of the current machine. Moreover, a list of
+\a domains holds the processor lists for each domain that are used for
+scheduling processes to domain-specific HW threads. Some counts are duplicates
+of or derived from values in \a CpuInfo, \a CpuTopology and \a NumaTopology.
+*/
+typedef struct {
+ uint32_t numberOfSocketDomains; /*!< \brief Number of CPU sockets/packages in the system */
+ uint32_t numberOfNumaDomains; /*!< \brief Number of NUMA nodes in the system */
+ uint32_t numberOfProcessorsPerSocket; /*!< \brief Number of HW threads per socket/package in the system */
+ uint32_t numberOfCacheDomains; /*!< \brief Number of LLC caches in the system */
+ uint32_t numberOfCoresPerCache; /*!< \brief Number of CPU cores per LLC cache in the system */
+ uint32_t numberOfProcessorsPerCache; /*!< \brief Number of HW threads per LLC cache in the system */
+ uint32_t numberOfAffinityDomains; /*!< \brief Number of affinity domains in the current system and length of \a domains array */
+ AffinityDomain* domains; /*!< \brief List of all domains in the system */
+} AffinityDomains;
+
+/** \brief Pointer for exporting the AffinityDomains data structure */
+typedef AffinityDomains* AffinityDomains_t;
+
+/*! \brief Initialize affinity information
+
+Initialize affinity information AffinityDomains_t using the data of the structures
+\a CpuInfo_t, CpuTopology_t and NumaTopology_t
+\sa AffinityDomains_t
+*/
+extern void affinity_init() __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve affinity structure
+
+Get the previously initialized affinity info structure
+\sa AffinityDomains_t
+ at return AffinityDomains_t (pointer to internal affinityDomains structure)
+*/
+extern AffinityDomains_t get_affinityDomains(void) __attribute__ ((visibility ("default") ));
+/*! \brief Pin process to a CPU
+
+Pin process to a CPU. Duplicate of likwid_pinProcess()
+ at param [in] processorId CPU ID for pinning
+*/
+extern void affinity_pinProcess(int processorId) __attribute__ ((visibility ("default") ));
+/*! \brief Pin processes to a CPU
+
+Pin processes to a CPU. Creates a cpuset with the given processor IDs
+ at param [in] cpu_count Number of processors in processorIds
+ at param [in] processorIds Array of processor IDs
+*/
+extern void affinity_pinProcesses(int cpu_count, int* processorIds) __attribute__ ((visibility ("default") ));
+/*! \brief Pin thread to a CPU
+
+Pin thread to a CPU. Duplicate of likwid_pinThread()
+ at param [in] processorId CPU ID for pinning
+*/
+extern void affinity_pinThread(int processorId) __attribute__ ((visibility ("default") ));
+/*! \brief Return the CPU ID where the current process runs.
+
+ at return CPU ID
+*/
+extern int affinity_processGetProcessorId() __attribute__ ((visibility ("default") ));
+/*! \brief Return the CPU ID where the current thread runs.
+
+ at return CPU ID
+*/
+extern int affinity_threadGetProcessorId() __attribute__ ((visibility ("default") ));
+/*! \brief Destroy affinity information structure
+
+Destroys the affinity information structure AffinityDomains_t. Retrieved pointers
+to the structures are not valid anymore after this function call
+\sa AffinityDomains_t
+*/
+extern void affinity_finalize() __attribute__ ((visibility ("default") ));
+/** @}*/
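
A sketch of the affinity-domain API documented above (editorial illustration,
not part of the patch; topology_init() and numa_init() are assumed
prerequisites of affinity_init()):

    #include <stdio.h>
    #include <likwid.h>

    int main(void)
    {
        topology_init();
        numa_init();
        affinity_init();
        AffinityDomains_t doms = get_affinityDomains();
        for (uint32_t i = 0; i < doms->numberOfAffinityDomains; i++)
            printf("domain %s: %u HW threads\n",
                   bdata(doms->domains[i].tag),
                   doms->domains[i].numberOfProcessors);
        /* pin the calling thread to the first HW thread of the first domain */
        affinity_pinThread(doms->domains[0].processorList[0]);
        affinity_finalize();
        numa_finalize();
        topology_finalize();
        return 0;
    }
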
+
+/*
+################################################################################
+# CPU string parsing related functions
+################################################################################
+*/
+/** \addtogroup CPUParse CPU string parser module
+ * @{
+ */
+
+/*! \brief Read CPU selection string and resolve to available CPU numbers
+
+Reads the CPU selection string and fills the given list with the CPU numbers
+defined in the selection string. This function is an interface function for the
+different selection modes: scatter, expression, logical and physical.
+ at param [in] cpustring Selection string
+ at param [in,out] cpulist List of CPUs
+ at param [in] length Length of cpulist
+ at return error code (>0 on success for the returned list length, -ERRORCODE on failure)
+*/
+extern int cpustr_to_cpulist(char* cpustring, int* cpulist, int length) __attribute__ ((visibility ("default") ));
+/*! \brief Read NUMA node selection string and resolve to available NUMA node numbers
+
+Reads the NUMA node selection string and fills the given list with the NUMA node numbers
+defined in the selection string.
+ at param [in] nodestr Selection string
+ at param [out] nodes List of available NUMA nodes
+ at param [in] length Length of NUMA node list
+ at return error code (>0 on success for the returned list length, -ERRORCODE on failure)
+*/
+extern int nodestr_to_nodelist(char* nodestr, int* nodes, int length) __attribute__ ((visibility ("default") ));
+/*! \brief Read CPU socket selection string and resolve to available CPU socket numbers
+
+Reads the CPU socket selection string and fills the given list with the CPU socket numbers
+defined in the selection string.
+ at param [in] sockstr Selection string
+ at param [out] sockets List of available CPU sockets
+ at param [in] length Length of CPU socket list
+ at return error code (>0 on success for the returned list length, -ERRORCODE on failure)
+*/
+extern int sockstr_to_socklist(char* sockstr, int* sockets, int length) __attribute__ ((visibility ("default") ));
+
+/** @}*/
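
A small sketch of the CPU string parser documented above (editorial
illustration, not part of the patch; "S0:0-3" is an example expression that
selects the first four HW threads of socket 0, and topology_init() is assumed
to be required beforehand):

    #include <stdio.h>
    #include <likwid.h>

    int main(void)
    {
        int cpus[16];
        char cpustr[] = "S0:0-3";

        topology_init();
        int n = cpustr_to_cpulist(cpustr, cpus, 16);
        if (n < 0)
            return 1;
        for (int i = 0; i < n; i++)
            printf("cpu[%d] = %d\n", i, cpus[i]);
        topology_finalize();
        return 0;
    }
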
+
+/*
+################################################################################
+# Performance monitoring related functions
+################################################################################
+*/
+/** \addtogroup PerfMon Performance monitoring module
+ * @{
+ */
+/*! \brief Get all groups
+
+Checks the configured performance group path for the current architecture and
+returns all found group names
+ at return Amount of found performance groups
+*/
+extern int perfmon_getGroups(char*** groups, char*** shortinfos, char*** longinfos) __attribute__ ((visibility ("default") ));
+
+/*! \brief Free all group information
+
+ at param [in] nrgroups Number of groups
+ at param [in] groups List of group names
+ at param [in] shortinfos List of short information string about group
+ at param [in] longinfos List of long information string about group
+*/
+extern void perfmon_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) __attribute__ ((visibility ("default") ));
+
+/*! \brief Initialize performance monitoring facility
+
+Initialize the performance monitoring feature by creating basic data structures.
+The access mode must already be set when calling perfmon_init()
+ at param [in] nrThreads Amount of threads
+ at param [in] threadsToCpu List of CPUs
+ at return error code (0 on success, -ERRORCODE on failure)
+*/
+extern int perfmon_init(int nrThreads, int threadsToCpu[]) __attribute__ ((visibility ("default") ));
+
+/*! \brief Initialize performance monitoring maps
+
+Initialize the performance monitoring maps for counters, events and Uncore boxes
+for the current architecture. topology_init() and numa_init() must be called before calling
+perfmon_init_maps()
+\sa RegisterMap list, PerfmonEvent list and BoxMap list
+*/
+extern void perfmon_init_maps(void) __attribute__ ((visibility ("default") ));
+/*! \brief Add an event string to LIKWID
+
+An event string looks like Eventname:Countername(:Option1:Option2:...),...
+The eventname, countername and options are checked if they are available.
+ at param [in] eventCString Event string
+ at return Returns the ID of the new eventSet
+*/
+extern int perfmon_addEventSet(char* eventCString) __attribute__ ((visibility ("default") ));
+/*! \brief Setup all performance monitoring counters of an eventSet
+
+An event string looks like Eventname:Countername(:Option1:Option2:...),...
+The eventname, countername and options are checked if they are available.
+ at param [in] groupId Group ID as returned by perfmon_addEventSet()
+ at return error code (-ENOENT if groupId is invalid and -1 if the counters of one CPU cannot be set up)
+*/
+extern int perfmon_setupCounters(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Start performance monitoring counters
+
+Start the counters that have been previously set up by perfmon_setupCounters().
+The counter registers are zeroed before the counters are enabled.
+ at return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_startCounters(void) __attribute__ ((visibility ("default") ));
+/*! \brief Stop performance monitoring counters
+
+Stop the counters that have been previously started by perfmon_startCounters().
+All config registers get zeroed before reading the counter register.
+ at return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_stopCounters(void) __attribute__ ((visibility ("default") ));
+/*! \brief Read the performance monitoring counters on all CPUs
+
+Read the counters that have been previously started by perfmon_startCounters().
+The counters are stopped directly to avoid interference of LIKWID with the measured
+code. Before returning, the counters are started again.
+ at return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_readCounters(void) __attribute__ ((visibility ("default") ));
+/*! \brief Read the performance monitoring counters on one CPU
+
+Read the counters that have been previously started by perfmon_startCounters().
+The counters are stopped directly to avoid interference of LIKWID with the measured
+code. Before returning, the counters are started again. Only one CPU is read.
+ at param [in] cpu_id CPU ID of the CPU that should be read
+ at return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_readCountersCpu(int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Read the performance monitoring counters of all threads in a group
+
+Read the counters that have been previously started by perfmon_startCounters().
+The counters are stopped directly to avoid interference of LIKWID with the measured
+code. Before returning, the counters are started again.
+ at param [in] groupId Read the counters for all threads taking part in group
+ at return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_readGroupCounters(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Read the performance monitoring counters of one thread in a group
+
+Read the counters that have been previously started by perfmon_startCounters().
+The counters are stopped directly to avoid interference of LIKWID with the measured
+code. Before returning, the counters are started again. Only one thread's CPU is read.
+ at param [in] groupId Read the counters for one thread taking part in the group
+ at return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_readGroupThreadCounters(int groupId, int threadId) __attribute__ ((visibility ("default") ));
+/*! \brief Switch the active eventSet to a new one
+
+Stops the currently running counters, switches the eventSet by setting up the
+counters and start the counters.
+ at param [in] new_group ID of group that should be switched to.
+ at return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_switchActiveGroup(int new_group) __attribute__ ((visibility ("default") ));
+/*! \brief Close the performance monitoring facility of LIKWID
+
+Deallocates all internal data that is used during performance monitoring. The
+counter values are no longer accessible after this function returns.
+*/
+extern void perfmon_finalize(void) __attribute__ ((visibility ("default") ));
+/*! \brief Get the results of the specified group, counter and thread
+
+Get the result of all measurement cycles. The function accounts for counter
+overflows and applies multipliers to the counter values where needed.
+ at param [in] groupId ID of the group that should be read
+ at param [in] eventId ID of the event that should be read
+ at param [in] threadId ID of the thread/cpu that should be read
+ at return The counter result
+*/
+extern double perfmon_getResult(int groupId, int eventId, int threadId) __attribute__ ((visibility ("default") ));
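
Putting the perfmon calls above together, a minimal measurement sketch
(editorial illustration, not part of the patch; the event set string is an
example for Intel CPUs and error handling is shortened):

    #include <stdio.h>
    #include <likwid.h>

    int main(void)
    {
        int cpus[] = {0};
        char estr[] = "INSTR_RETIRED_ANY:FIXC0";

        topology_init();
        perfmon_init(1, cpus);
        int gid = perfmon_addEventSet(estr);
        if (gid < 0 || perfmon_setupCounters(gid) < 0)
            return 1;
        perfmon_startCounters();
        /* ... code region to measure ... */
        perfmon_stopCounters();
        printf("result: %f\n", perfmon_getResult(gid, 0, 0));
        perfmon_finalize();
        topology_finalize();
        return 0;
    }
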
+/*! \brief Get the last results of the specified group, counter and thread
+
+Get the result of the last measurement cycle. The function accounts for counter
+overflows and applies multipliers to the counter values where needed.
+ at param [in] groupId ID of the group that should be read
+ at param [in] eventId ID of the event that should be read
+ at param [in] threadId ID of the thread/cpu that should be read
+ at return The counter result
+*/
+extern double perfmon_getLastResult(int groupId, int eventId, int threadId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the metric result of the specified group, counter and thread
+
+Get the metric result of all measurement cycles. It reads all raw results for the given groupId and threadId.
+ at param [in] groupId ID of the group that should be read
+ at param [in] metricId ID of the metric that should be calculated
+ at param [in] threadId ID of the thread/cpu that should be read
+ at return The metric result
+*/
+extern double perfmon_getMetric(int groupId, int metricId, int threadId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the last metric result of the specified group, counter and thread
+
+Get the metric result of the last measurement cycle. It reads all raw results for the given groupId and threadId.
+ at param [in] groupId ID of the group that should be read
+ at param [in] metricId ID of the metric that should be calculated
+ at param [in] threadId ID of the thread/cpu that should be read
+ at return The metric result
+*/
+extern double perfmon_getLastMetric(int groupId, int metricId, int threadId) __attribute__ ((visibility ("default") ));
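To make the getter interface above concrete, here is a minimal usage sketch (not part of the patch), assuming likwid.h and stdio.h are included and that the counters were already set up, started and stopped with the perfmon functions declared earlier in this header:

    /* Sketch: print every event result of the active group for every
     * measured thread. */
    static void print_active_group_results(void)
    {
        int gid = perfmon_getIdOfActiveGroup();
        for (int t = 0; t < perfmon_getNumberOfThreads(); t++)
        {
            for (int e = 0; e < perfmon_getNumberOfEvents(gid); e++)
            {
                printf("%s (thread %d): %f\n", perfmon_getEventName(gid, e),
                       t, perfmon_getResult(gid, e, t));
            }
        }
    }

The query functions used in the loop bounds are declared directly below.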
+
+/*! \brief Get the number of configured event groups
+
+@return Number of groups
+*/
+extern int perfmon_getNumberOfGroups(void) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of configured events in a group
+
+@param [in] groupId ID of group
+@return Number of events
+*/
+extern int perfmon_getNumberOfEvents(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the accumulated measurement time of a group
+
+@param [in] groupId ID of group
+@return Time in seconds the event group was measured
+*/
+extern double perfmon_getTimeOfGroup(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the ID of the currently set up event group
+
+@return ID of the active group
+*/
+extern int perfmon_getIdOfActiveGroup(void) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of threads specified at perfmon_init()
+
+@return Number of threads
+*/
+extern int perfmon_getNumberOfThreads(void) __attribute__ ((visibility ("default") ));
+
+
+/*! \brief Set the verbosity of the LIKWID library
+
+@param [in] verbose Verbosity level
+*/
+extern void perfmon_setVerbosity(int verbose) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the event name of the specified group and event
+
+Get the event name as defined in the performance group file
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event that should be returned
+@return The event name or NULL in case of failure
+*/
+extern char* perfmon_getEventName(int groupId, int eventId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the counter name of the specified group and event
+
+Get the counter name as defined in the performance group file
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event of which the counter should be returned
+@return The counter name or NULL in case of failure
+*/
+extern char* perfmon_getCounterName(int groupId, int eventId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the name of a group
+
+Get the name of a group. It is either the name of the performance group or "Custom"
+@param [in] groupId ID of the group that should be read
+@return The group name or NULL in case of failure
+*/
+extern char* perfmon_getGroupName(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the metric name of the specified group and metric
+
+Get the metric name as defined in the performance group file
+@param [in] groupId ID of the group that should be read
+@param [in] metricId ID of the metric that should be calculated
+@return The metric name or NULL in case of failure
+*/
+extern char* perfmon_getMetricName(int groupId, int metricId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the short informational string of the specified group
+
+Returns the short information string as defined by performance groups or "Custom"
+in case of custom event sets
+@param [in] groupId ID of the group that should be read
+@return The short information or NULL in case of failure
+*/
+extern char* perfmon_getGroupInfoShort(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the long descriptive string of the specified group
+
+Returns the long descriptive string as defined by performance groups or NULL
+in case of custom event sets
+@param [in] groupId ID of the group that should be read
+@return The long description or NULL in case of failure
+*/
+extern char* perfmon_getGroupInfoLong(int groupId) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the number of configured metrics for a group
+
+@param [in] groupId ID of group
+@return Number of metrics
+*/
+extern int perfmon_getNumberOfMetrics(int groupId) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the last measurement time of a group
+
+@param [in] groupId ID of group
+@return Time in seconds the event group was measured the last time
+*/
+extern double perfmon_getLastTimeOfGroup(int groupId) __attribute__ ((visibility ("default") ));
+
+/*! \brief Read the output file of the Marker API
+@param [in] filename Filename with Marker API results
+@return 0 or negative error number
+*/
+extern int perfmon_readMarkerFile(const char* filename) __attribute__ ((visibility ("default") ));
+/*! \brief Free the memory allocated for the read-in Marker API results
+*/
+extern void perfmon_destroyMarkerResults() __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of regions listed in the Marker API result file
+
+@return Number of regions
+*/
+extern int perfmon_getNumberOfRegions() __attribute__ ((visibility ("default") ));
+/*! \brief Get the groupID of a region
+
+@param [in] region ID of region
+@return Group ID of region
+*/
+extern int perfmon_getGroupOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the tag of a region
+@param [in] region ID of region
+@return Tag of region
+*/
+extern char* perfmon_getTagOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of events of a region
+@param [in] region ID of region
+@return Number of events of region
+*/
+extern int perfmon_getEventsOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of metrics of a region
+@param [in] region ID of region
+@return Number of metrics of region
+*/
+extern int perfmon_getMetricsOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of threads of a region
+@param [in] region ID of region
+@return Number of threads of region
+*/
+extern int perfmon_getThreadsOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the cpulist of a region
+@param [in] region ID of region
+@param [in] count Length of cpulist array
+@param [out] cpulist cpulist array
+@return Number of threads of region or count, whichever is lower
+*/
+extern int perfmon_getCpulistOfRegion(int region, int count, int* cpulist) __attribute__ ((visibility ("default") ));
+/*! \brief Get the accumulated measurement time of a region for a thread
+@param [in] region ID of region
+@param [in] thread ID of thread
+@return Measurement time of a region for a thread
+*/
+extern double perfmon_getTimeOfRegion(int region, int thread) __attribute__ ((visibility ("default") ));
+/*! \brief Get the call count of a region for a thread
+@param [in] region ID of region
+@param [in] thread ID of thread
+@return Call count of a region for a thread
+*/
+extern int perfmon_getCountOfRegion(int region, int thread) __attribute__ ((visibility ("default") ));
+/*! \brief Get the event result of a region for an event and thread
+@param [in] region ID of region
+@param [in] event ID of event
+@param [in] thread ID of thread
+@return Result of a region for an event and thread
+*/
+extern double perfmon_getResultOfRegionThread(int region, int event, int thread) __attribute__ ((visibility ("default") ));
+/*! \brief Get the metric result of a region for a metric and thread
+@param [in] region ID of region
+@param [in] metricId ID of metric
+@param [in] threadId ID of thread
+@return Metric result of a region for a thread
+*/
+extern double perfmon_getMetricOfRegionThread(int region, int metricId, int threadId) __attribute__ ((visibility ("default") ));
+
+/** @}*/
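For readers unfamiliar with this part of the API, the following hedged sketch shows the Marker API post-processing flow using only the accessor functions declared above; the return-value convention of perfmon_readMarkerFile() (non-negative on success) is an assumption, likwid.h and stdio.h are assumed to be included, and the caller supplies the path of a result file produced by the Marker API.

    /* Sketch: walk all regions of a Marker API result file. */
    static void print_marker_regions(const char* markerfile)
    {
        if (perfmon_readMarkerFile(markerfile) < 0)
            return;
        for (int r = 0; r < perfmon_getNumberOfRegions(); r++)
        {
            printf("Region %s (group %d)\n", perfmon_getTagOfRegion(r),
                   perfmon_getGroupOfRegion(r));
            for (int t = 0; t < perfmon_getThreadsOfRegion(r); t++)
            {
                printf("  thread %d: %f s, %d calls\n", t,
                       perfmon_getTimeOfRegion(r, t),
                       perfmon_getCountOfRegion(r, t));
            }
        }
        perfmon_destroyMarkerResults();
    }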
+
+/*
+################################################################################
+# Time measurements related functions
+################################################################################
+*/
+
+/** \addtogroup TimerMon Time measurement module
+ * @{
+ */
+
+/*! \brief Union for accessing a cycle (TSC) count either as one 64 bit value or as two 32 bit halves
+\extends TimerData
+*/
+typedef union
+{
+ uint64_t int64; /*!< \brief Cycle count in 64 bit */
+ struct {uint32_t lo, hi;} int32; /*!< \brief Cycle count stored in two 32 bit fields */
+} TscCounter;
+
+/*! \brief Struct defining the start and stop time of a time interval
+*/
+typedef struct {
+ TscCounter start; /*!< \brief Cycles at start */
+ TscCounter stop; /*!< \brief Cycles at stop */
+} TimerData;
+
+/*! \brief Initialize timer by retrieving baseline frequency and cpu clock
+*/
+extern void timer_init( void ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the measured interval in seconds
+
+@param [in] time Structure holding the cycle count at start and stop
+@return Time in seconds
+*/
+extern double timer_print( TimerData* time) __attribute__ ((visibility ("default") ));
+/*! \brief Return the measured interval in cycles
+
+@param [in] time Structure holding the cycle count at start and stop
+@return Time in cycles
+*/
+extern uint64_t timer_printCycles( TimerData* time) __attribute__ ((visibility ("default") ));
+/*! \brief Reset values in TimerData
+
+@param [in] time Structure holding the cycle count at start and stop
+*/
+extern void timer_reset( TimerData* time ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the CPU clock determined at timer_init
+
+@return CPU clock
+*/
+extern uint64_t timer_getCpuClock( void ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the current CPU clock read from sysfs
+
+@param [in] cpu_id ID of the CPU whose current clock should be read
+@return CPU clock
+*/
+extern uint64_t timer_getCpuClockCurrent( int cpu_id ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the cycle clock determined at timer_init
+
+@return Cycle clock
+*/
+extern uint64_t timer_getCycleClock( void ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the baseline CPU clock determined at timer_init
+
+@return Baseline CPU clock
+*/
+extern uint64_t timer_getBaseline( void ) __attribute__ ((visibility ("default") ));
+/*! \brief Start time measurement
+
+@param [in,out] time Structure holding the cycle count at start
+*/
+extern void timer_start( TimerData* time ) __attribute__ ((visibility ("default") ));
+/*! \brief Stop time measurement
+
+@param [in,out] time Structure holding the cycle count at stop
+*/
+extern void timer_stop ( TimerData* time) __attribute__ ((visibility ("default") ));
+/*! \brief Sleep for the specified number of microseconds
+
+@param [in] usec Number of microseconds to sleep
+*/
+extern int timer_sleep(unsigned long usec) __attribute__ ((visibility ("default") ));
+
+/*! \brief Finalize timer module
+
+*/
+extern void timer_finalize(void) __attribute__ ((visibility ("default") ));
+
+/** @}*/
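A brief, hedged sketch of the intended call sequence for this module follows; the measured work is a placeholder loop, and likwid.h plus stdio.h are assumed to be included.

    /* Sketch: measure a code section with the timer module. */
    static void time_section(void)
    {
        TimerData t;
        volatile double x = 0.0;
        timer_init();                       /* determine CPU clock and baseline once */
        timer_start(&t);
        for (int i = 0; i < 10000000; i++)  /* placeholder workload */
            x += i * 0.5;
        timer_stop(&t);
        printf("%f s (%llu cycles)\n", timer_print(&t),
               (unsigned long long)timer_printCycles(&t));
        timer_finalize();
    }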
+
+/*
+################################################################################
+# Power measurements related functions
+################################################################################
+*/
+/** \addtogroup PowerMon Power and Energy monitoring module
+ * @{
+ */
+
+/*!
+\def NUM_POWER_DOMAINS
+Number of currently supported RAPL domains
+*/
+#define NUM_POWER_DOMAINS 4
+/*! \brief List of all RAPL domain names
+*/
+extern const char* power_names[NUM_POWER_DOMAINS] __attribute__ ((visibility ("default") ));
+
+/*!
+\def POWER_DOMAIN_SUPPORT_STATUS
+Flag to check in PowerDomain's supportFlag if the status msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_STATUS (1ULL<<0)
+/*!
+\def POWER_DOMAIN_SUPPORT_LIMIT
+Flag to check in PowerDomain's supportFlag if the limit msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_LIMIT (1ULL<<1)
+/*!
+\def POWER_DOMAIN_SUPPORT_POLICY
+Flag to check in PowerDomain's supportFlag if the policy msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_POLICY (1ULL<<2)
+/*!
+\def POWER_DOMAIN_SUPPORT_PERF
+Flag to check in PowerDomain's supportFlag if the perf msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_PERF (1ULL<<3)
+/*!
+\def POWER_DOMAIN_SUPPORT_INFO
+Flag to check in PowerDomain's supportFlag if the info msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_INFO (1ULL<<4)
+
+
+/*! \brief Information structure of CPU's turbo mode
+\extends PowerInfo
+*/
+typedef struct {
+ int numSteps; /*!< \brief Number of turbo mode steps/frequencies */
+ double* steps; /*!< \brief List of turbo mode steps */
+} TurboBoost;
+
+/*! \brief Enum for all supported RAPL domains
+\extends PowerDomain
+*/
+typedef enum {
+ PKG = 0, /*!< \brief PKG domain, mostly one CPU socket/package */
+ PP0 = 1, /*!< \brief PP0 domain, not clearly defined by Intel */
+ PP1 = 2, /*!< \brief PP1 domain, not clearly defined by Intel */
+ DRAM = 3 /*!< \brief DRAM domain, the memory modules */
+} PowerType;
+
+/*! \brief Structure describing a RAPL power domain
+\extends PowerInfo
+*/
+typedef struct {
+ PowerType type; /*!< \brief Identifier which RAPL domain is managed by this struct */
+ uint32_t supportFlags; /*!< \brief Bitmask which features are supported by the power domain */
+ double energyUnit; /*!< \brief Multiplier for energy measurements */
+ double tdp; /*!< \brief Thermal Design Power (maximum amount of heat generated by the CPU) */
+ double minPower; /*!< \brief Minimal power consumption of the CPU */
+ double maxPower; /*!< \brief Maximal power consumption of the CPU */
+ double maxTimeWindow; /*!< \brief Maximal time window for the RAPL power limit */
+} PowerDomain;
+
+/*! \brief Information structure of CPU's power measurement facility
+*/
+typedef struct {
+ double baseFrequency; /*!< \brief Base frequency of the CPU */
+ double minFrequency; /*!< \brief Minimal frequency of the CPU */
+ TurboBoost turbo; /*!< \brief Turbo boost information */
+ int hasRAPL; /*!< \brief RAPL support flag */
+ double powerUnit; /*!< \brief Multiplier for power measurements */
+ double timeUnit; /*!< \brief Multiplier for time information */
+ PowerDomain domains[NUM_POWER_DOMAINS]; /*!< \brief List of power domains */
+} PowerInfo;
+
+/*! \brief Power measurement data for start/stop measurements
+*/
+typedef struct {
+ int domain; /*!< \brief RAPL domain identifier */
+ uint32_t before; /*!< \brief Counter state at start */
+ uint32_t after; /*!< \brief Counter state at stop */
+} PowerData;
+
+/*! \brief Variable holding the global power information structure */
+extern PowerInfo power_info;
+
+/** \brief Pointer for exporting the PowerInfo data structure */
+typedef PowerInfo* PowerInfo_t;
+/** \brief Pointer for exporting the PowerData data structure */
+typedef PowerData* PowerData_t;
+
+/*! \brief Initialize energy measurements on a specific CPU
+
+Additionally, it reads basic information about the energy measurements like
+the minimal measurement time.
+@param [in] cpuId Initialize energy facility for this CPU
+@return error code
+*/
+extern int power_init(int cpuId) __attribute__ ((visibility ("default") ));
+/*! \brief Get a pointer to the energy facility information
+
+@return PowerInfo_t pointer
+\sa PowerInfo_t
+*/
+extern PowerInfo_t get_powerInfo(void) __attribute__ ((visibility ("default") ));
+/*! \brief Read the current energy value
+
+@param [in] cpuId Read energy facility for this CPU
+@param [in] reg Energy register
+@param [out] data Energy data
+*/
+extern int power_read(int cpuId, uint64_t reg, uint32_t *data) __attribute__ ((visibility ("default") ));
+/*! \brief Read the current energy value using a specific communication socket
+
+@param [in] socket_fd Communication socket for the read operation
+@param [in] cpuId Read energy facility for this CPU
+@param [in] reg Energy register
+@param [out] data Energy data
+*/
+extern int power_tread(int socket_fd, int cpuId, uint64_t reg, uint32_t *data) __attribute__ ((visibility ("default") ));
+/*! \brief Start energy measurements
+
+@param [in,out] data Data structure holding start and stop values for energy measurements
+@param [in] cpuId Start energy measurements for this CPU
+@param [in] type Which RAPL domain should be measured
+@return error code
+*/
+extern int power_start(PowerData_t data, int cpuId, PowerType type) __attribute__ ((visibility ("default") ));
+/*! \brief Stop energy measurements
+
+@param [in,out] data Data structure holding start and stop values for energy measurements
+@param [in] cpuId Stop energy measurements for this CPU
+@param [in] type Which RAPL domain should be measured
+@return error code
+*/
+extern int power_stop(PowerData_t data, int cpuId, PowerType type) __attribute__ ((visibility ("default") ));
+/*! \brief Print energy measurements gathered by power_start() and power_stop()
+
+@param [in] data Data structure holding start and stop values for energy measurements
+@return Consumed energy in Joules
+*/
+extern double power_printEnergy(PowerData* data) __attribute__ ((visibility ("default") ));
+/*! \brief Get the energy unit
+
+@param [in] domain RAPL domain ID
+@return Energy unit of the given RAPL domain
+*/
+extern double power_getEnergyUnit(int domain) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the values of the limit register of a domain
+NOT IMPLEMENTED
+
+@param [in] cpuId CPU ID
+@param [in] domain RAPL domain ID
+@param [out] power Power limit
+@param [out] time Time limit
+@return error code
+*/
+int power_limitGet(int cpuId, PowerType domain, double* power, double* time) __attribute__ ((visibility ("default") ));
+
+/*! \brief Set the values of the limit register of a domain
+NOT IMPLEMENTED
+
+@param [in] cpuId CPU ID
+@param [in] domain RAPL domain ID
+@param [in] power Power limit
+@param [in] time Time limit
+@param [in] doClamping Activate clamping (going below OS-requested power level)
+@return error code
+*/
+int power_limitSet(int cpuId, PowerType domain, double power, double time, int doClamping) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the state of an energy limit, activated or deactivated
+NOT IMPLEMENTED
+
+@param [in] cpuId CPU ID
+@param [in] domain RAPL domain ID
+@return State, 1 for active, 0 for inactive
+*/
+int power_limitState(int cpuId, PowerType domain) __attribute__ ((visibility ("default") ));
+
+/*! \brief Free the resources allocated by power_init()
+*/
+extern void power_finalize(void) __attribute__ ((visibility ("default") ));
+/** @}*/
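As a hedged illustration of how the declarations above fit together, the following sketch measures PKG energy around a placeholder workload; it checks the hasRAPL flag instead of relying on the exact return convention of power_init(), and assumes likwid.h and stdio.h are included.

    /* Sketch: measure PKG energy on CPU 0 around a placeholder workload. */
    static void measure_pkg_energy(void)
    {
        PowerData pd;
        volatile double x = 0.0;
        power_init(0);
        if (get_powerInfo()->hasRAPL)
        {
            power_start(&pd, 0, PKG);
            for (int i = 0; i < 10000000; i++)   /* placeholder workload */
                x += i * 0.5;
            power_stop(&pd, 0, PKG);
            printf("PKG energy: %f J\n", power_printEnergy(&pd));
        }
        power_finalize();
    }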
+
+/*
+################################################################################
+# Thermal measurements related functions
+################################################################################
+*/
+/** \addtogroup ThermalMon Thermal monitoring module
+ * @{
+ */
+/*! \brief Initialize thermal measurements on specific CPU
+
+@param [in] cpuId Initialize thermal facility for this CPU
+*/
+extern void thermal_init(int cpuId) __attribute__ ((visibility ("default") ));
+/*! \brief Read the current thermal value
+
+@param [in] cpuId Read thermal facility for this CPU
+@param [out] data Thermal data
+*/
+extern int thermal_read(int cpuId, uint32_t *data) __attribute__ ((visibility ("default") ));
+/*! \brief Read the current thermal value using a specific communication socket
+
+@param [in] socket_fd Communication socket for the read operation
+@param [in] cpuId Read thermal facility for this CPU
+@param [out] data Thermal data
+*/
+extern int thermal_tread(int socket_fd, int cpuId, uint32_t *data) __attribute__ ((visibility ("default") ));
+/** @}*/
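A tiny, hedged sketch of the thermal interface; a return value of 0 is assumed to indicate success, and what the raw value encodes is not spelled out here, so it is only printed.

    /* Sketch: read the raw thermal value of CPU 0. */
    static void print_thermal(void)
    {
        uint32_t data = 0;
        thermal_init(0);
        if (thermal_read(0, &data) == 0)
            printf("CPU 0 thermal reading: %u\n", data);
    }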
+
+
+/*
+################################################################################
+# Memory sweeping related functions
+################################################################################
+*/
+/** \addtogroup MemSweep Memory sweeping module
+ * @{
+ */
+/*! \brief Sweeping the memory of a NUMA node
+
+Sweeps (zeros) the memory of NUMA node with ID \a domainId
+@param [in] domainId NUMA node ID
+*/
+extern void memsweep_domain(int domainId) __attribute__ ((visibility ("default") ));
+/*! \brief Sweeping the memory of all NUMA nodes covered by CPU list
+
+Sweeps (zeros) the memory of all NUMA nodes containing the CPUs in \a processorList
+@param [in] processorList List of CPU IDs
+@param [in] numberOfProcessors Number of CPUs in list
+*/
+extern void memsweep_threadGroup(int* processorList, int numberOfProcessors) __attribute__ ((visibility ("default") ));
+/** @}*/
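A short sketch of the sweeping interface; the CPU list is an arbitrary example.

    /* Sketch: sweep NUMA node 0, then all nodes containing CPUs 0-3. */
    static void sweep_example(void)
    {
        int cpus[] = {0, 1, 2, 3};
        memsweep_domain(0);
        memsweep_threadGroup(cpus, 4);
    }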
+
+/*
+################################################################################
+# CPU feature related functions
+################################################################################
+*/
+/** \addtogroup CpuFeatures Retrieval and manipulation of processor features
+ * @{
+ */
+
+typedef enum {
+ FEAT_HW_PREFETCHER=0, /*!< \brief Hardware prefetcher */
+ FEAT_CL_PREFETCHER, /*!< \brief Adjacent cache line prefetcher */
+ FEAT_DCU_PREFETCHER, /*!< \brief DCU L1 data cache prefetcher */
+ FEAT_IP_PREFETCHER, /*!< \brief IP L1 data cache prefetcher */
+ FEAT_FAST_STRINGS, /*!< \brief Fast-strings feature */
+ FEAT_THERMAL_CONTROL, /*!< \brief Automatic Thermal Control Circuit */
+ FEAT_PERF_MON, /*!< \brief Hardware performance monitoring */
+ FEAT_FERR_MULTIPLEX, /*!< \brief FERR# Multiplexing, must be 1 for XAPIC interrupt model */
+ FEAT_BRANCH_TRACE_STORAGE, /*!< \brief Branch Trace Storage */
+ FEAT_XTPR_MESSAGE, /*!< \brief xTPR Message to set processor priority */
+ FEAT_PEBS, /*!< \brief Precise Event Based Sampling (PEBS) */
+ FEAT_SPEEDSTEP, /*!< \brief Enhanced Intel SpeedStep Technology to reduce energy consumption*/
+ FEAT_MONITOR, /*!< \brief MONITOR/MWAIT feature to monitor write-back stores*/
+ FEAT_SPEEDSTEP_LOCK, /*!< \brief Enhanced Intel SpeedStep Technology Select Lock */
+ FEAT_CPUID_MAX_VAL, /*!< \brief Limit CPUID Maxval */
+ FEAT_XD_BIT, /*!< \brief Execute Disable Bit */
+ FEAT_DYN_ACCEL, /*!< \brief Intel Dynamic Acceleration */
+ FEAT_TURBO_MODE, /*!< \brief Intel Turbo Mode */
+ FEAT_TM2, /*!< \brief Thermal Monitoring 2 */
+ CPUFEATURES_MAX
+} CpuFeature;
+
+/*! \brief Initialize the internal feature variables for all CPUs
+
+Initialize the internal feature variables for all CPUs
+*/
+extern void cpuFeatures_init() __attribute__ ((visibility ("default") ));
+/*! \brief Print the state of all CPU features for a given CPU
+
+Print the state of all CPU features for a given CPU
+@param [in] cpu CPU ID
+*/
+extern void cpuFeatures_print(int cpu) __attribute__ ((visibility ("default") ));
+/*! \brief Get the state of a CPU feature for a given CPU
+
+Get the state of a CPU feature for a given CPU
+@param [in] cpu CPU ID
+@param [in] type CPU feature
+@return State of the CPU feature (1=enabled, 0=disabled)
+*/
+extern int cpuFeatures_get(int cpu, CpuFeature type) __attribute__ ((visibility ("default") ));
+/*! \brief Get the name of a CPU feature
+
+Get the name of a CPU feature
+@param [in] type CPU feature
+@return Name of the CPU feature or NULL if the feature is not available
+*/
+extern char* cpuFeatures_name(CpuFeature type) __attribute__ ((visibility ("default") ));
+/*! \brief Enable a CPU feature for a specific CPU
+
+Enable a CPU feature for a specific CPU. Only the state of the prefetchers can be changed; all other features return -EINVAL.
+@param [in] cpu CPU ID
+@param [in] type CPU feature
+@return Status of the operation (0=success, all others are errors, either by MSR access or invalid feature)
+*/
+extern int cpuFeatures_enable(int cpu, CpuFeature type, int print) __attribute__ ((visibility ("default") ));
+/*! \brief Disable a CPU feature for a specific CPU
+
+Disable a CPU feature for a specific CPU. Only the state of the prefetchers can be changed; all other features return -EINVAL.
+@param [in] cpu CPU ID
+@param [in] type CPU feature
+@return Status of the operation (0=success, all others are errors, either by MSR access or invalid feature)
+*/
+extern int cpuFeatures_disable(int cpu, CpuFeature type, int print) __attribute__ ((visibility ("default") ));
+/** @}*/
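A hedged sketch of how the feature interface is meant to be used; the choice of FEAT_HW_PREFETCHER is arbitrary, and the trailing 1 passed to cpuFeatures_enable() is its print flag (assumed to request printing of the resulting state).

    /* Sketch: enable the hardware prefetcher on CPU 0 if it is disabled. */
    static void enable_hw_prefetcher(void)
    {
        cpuFeatures_init();
        if (cpuFeatures_get(0, FEAT_HW_PREFETCHER) == 0)
            cpuFeatures_enable(0, FEAT_HW_PREFETCHER, 1);
        cpuFeatures_print(0);
    }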
#ifdef __cplusplus
}
diff --git a/src/includes/lock.h b/src/includes/lock.h
index 87d1593..93f3d9b 100644
--- a/src/includes/lock.h
+++ b/src/includes/lock.h
@@ -5,13 +5,13 @@
*
* Description: Header File Locking primitive Module
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/memsweep.h b/src/includes/memsweep.h
index e29d4d8..de7a7b0 100644
--- a/src/includes/memsweep.h
+++ b/src/includes/memsweep.h
@@ -3,15 +3,16 @@
*
* Filename: memsweep.h
*
- * Description: Header File memsweep Module.
+ * Description: Header File memsweep module for internal use. External functions are
+ * defined in likwid.h
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -34,9 +35,7 @@
#include <types.h>
extern void memsweep_setMemoryFraction(uint64_t fraction);
-extern void memsweep_node(FILE* OUTSTREAM);
-extern void memsweep_domain(FILE* OUTSTREAM, int domainId);
-extern void memsweep_threadGroup(FILE* OUTSTREAM, int* processorList, int numberOfProcessors);
+extern void memsweep_node(void);
#endif /* MEMSWEEP_H */
diff --git a/src/includes/msr.h b/src/includes/msr.h
deleted file mode 100644
index 45f8069..0000000
--- a/src/includes/msr.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: msr.h
- *
- * Description: Header File msr Module.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef MSR_H
-#define MSR_H
-
-#include <types.h>
-
-/* Initializes the MSR module, trying to open either the MSR files or
- * the connection to the msr daemon. */
-extern void msr_init(int socket_fd);
-extern void msr_finalize(void);
-extern uint64_t msr_read(int cpu, uint32_t reg);
-extern void msr_write(int cpu, uint32_t reg, uint64_t data);
-
-/* variants for thread safe execution with a per thread socket */
-extern uint64_t msr_tread(int socket_fd, int cpu, uint32_t reg);
-extern void msr_twrite(int socket_fd, int cpu, uint32_t reg, uint64_t data);
-
-#endif /* MSR_H */
diff --git a/src/includes/multiplex.h b/src/includes/multiplex.h
deleted file mode 100644
index c34cac8..0000000
--- a/src/includes/multiplex.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: multiplex.h
- *
- * Description: Header File multiplex Module.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef MULTIPLEX_H
-#define MULTIPLEX_H
-
-#include <types.h>
-
-extern void multiplex_init(MultiplexCollections* set);
-extern void multiplex_start();
-extern void multiplex_stop();
-
-#endif /* MULTIPLEX_H */
diff --git a/src/includes/multiplex_types.h b/src/includes/multiplex_types.h
deleted file mode 100644
index 8578a8f..0000000
--- a/src/includes/multiplex_types.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: multiplex_types.h
- *
- * Description: Types file for multiplex module.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef MULTIPLEX_TYPES_H
-#define MULTIPLEX_TYPES_H
-
-typedef struct {
- PerfmonEventSet* collections;
- int numberOfCollections;
- double time;
-} MultiplexCollections;
-
-
-
-#endif /* MULTIPLEX_TYPES_H */
diff --git a/src/includes/numa.h b/src/includes/numa.h
index 3a2d0f1..3ca582f 100644
--- a/src/includes/numa.h
+++ b/src/includes/numa.h
@@ -3,15 +3,16 @@
*
* Filename: numa.h
*
- * Description: Header File numa Module.
+ * Description: Header File NUMA module for internal use. External functions are
+ * defined in likwid.h
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -28,18 +29,30 @@
* =======================================================================================
*/
-#ifndef NUMA_H
-#define NUMA_H
+#ifndef LIKWID_NUMA
+#define LIKWID_NUMA
+
+#include <stdlib.h>
+#include <stdio.h>
#include <types.h>
+#include <likwid.h>
+#include <numa_hwloc.h>
+#include <numa_proc.h>
+
+
+
+
+extern int str2int(const char* str);
+
+struct numa_functions {
+ int (*numa_init) (void);
+ void (*numa_setInterleaved) (int*, int);
+ void (*numa_membind) (void*, size_t, int);
+};
+
+
-/** Structure holding numa information
- *
- */
-extern NumaTopology numa_info;
-extern int numa_init (void);
-extern void numa_setInterleaved(int* processorList, int numberOfProcessors);
-extern void numa_membind(void* ptr, size_t size, int domainId);
-#endif /*NUMA_H*/
+#endif
diff --git a/src/includes/numa_hwloc.h b/src/includes/numa_hwloc.h
new file mode 100644
index 0000000..cf74238
--- /dev/null
+++ b/src/includes/numa_hwloc.h
@@ -0,0 +1,40 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: numa_hwloc.h
+ *
+ * Description: Header File hwloc NUMA backend
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+
+#ifndef LIKWID_NUMA_HWLOC
+#define LIKWID_NUMA_HWLOC
+
+extern int hwloc_numa_init(void);
+extern void hwloc_numa_membind(void* ptr, size_t size, int domainId);
+extern void hwloc_numa_setInterleaved(int* processorList, int numberOfProcessors);
+
+
+#endif
diff --git a/src/includes/numa_proc.h b/src/includes/numa_proc.h
new file mode 100644
index 0000000..71af378
--- /dev/null
+++ b/src/includes/numa_proc.h
@@ -0,0 +1,39 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: numa_proc.h
+ *
+ * Description: Header File procfs/sysfs NUMA backend
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef LIKWID_NUMA_PROC
+#define LIKWID_NUMA_PROC
+
+extern int proc_numa_init(void);
+extern void proc_numa_membind(void* ptr, size_t size, int domainId);
+extern void proc_numa_setInterleaved(int* processorList, int numberOfProcessors);
+
+
+#endif
diff --git a/src/includes/numa_types.h b/src/includes/numa_types.h
deleted file mode 100644
index bd4afda..0000000
--- a/src/includes/numa_types.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: numa_types.h
- *
- * Description: Types file for numa module.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef NUMA_TYPES_H
-#define NUMA_TYPES_H
-
-
-typedef struct {
- int id;
- uint64_t totalMemory;
- uint64_t freeMemory;
- int numberOfProcessors;
- uint32_t* processors;
- uint32_t* processorsCompact;
- int numberOfDistances;
- uint32_t* distances;
-} NumaNode;
-
-typedef struct {
- uint32_t numberOfNodes;
- NumaNode* nodes;
-} NumaTopology;
-
-
-#endif /*NUMA_TYPES_H*/
diff --git a/src/includes/pci.h b/src/includes/pci.h
deleted file mode 100644
index 1672f1c..0000000
--- a/src/includes/pci.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: pci.h
- *
- * Description: Header File pci Module.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef PCI_H
-#define PCI_H
-
-#include <types.h>
-
-
-/* PCI config memory space access is addressed
- * BUS - DEVICE - FUNCTION
- * Listing for Uncore devices DEVICE.FUNCTION
- */
-
-extern void pci_init();
-extern void pci_finalize();
-extern uint32_t pci_read(int cpu, PciDeviceIndex index, uint32_t reg);
-extern void pci_write(int cpu, PciDeviceIndex index, uint32_t reg, uint32_t data);
-extern uint32_t pci_tread(int socket_fd, int cpu, PciDeviceIndex index, uint32_t reg);
-extern void pci_twrite(int socket_fd, int cpu, PciDeviceIndex index, uint32_t reg, uint32_t data);
-
-#endif /* PCI_H */
diff --git a/src/includes/pci_hwloc.h b/src/includes/pci_hwloc.h
new file mode 100644
index 0000000..fd7db29
--- /dev/null
+++ b/src/includes/pci_hwloc.h
@@ -0,0 +1,37 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: pci_hwloc.h
+ *
+ * Description: Header File hwloc based PCI lookup backend
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef PCI_HWLOC_H
+#define PCI_HWLOC_H
+
+extern int hwloc_pci_init(uint16_t testDevice, char** socket_bus, int* nrSockets);
+
+
+#endif
diff --git a/src/includes/pci_proc.h b/src/includes/pci_proc.h
new file mode 100644
index 0000000..062daa9
--- /dev/null
+++ b/src/includes/pci_proc.h
@@ -0,0 +1,37 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: pci_proc.h
+ *
+ * Description: Header File procfs based PCI lookup backend
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef PCI_PROC_H
+#define PCI_PROC_H
+
+extern int proc_pci_init(uint16_t testDevice, char** socket_bus, int* nrSockets);
+
+
+#endif
diff --git a/src/includes/pci_types.h b/src/includes/pci_types.h
index cfb9657..7e8495b 100644
--- a/src/includes/pci_types.h
+++ b/src/includes/pci_types.h
@@ -5,13 +5,14 @@
*
* Description: Types file for pci module.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -34,23 +35,69 @@
#include <stdint.h>
+
+
+typedef enum {
+ NODEVTYPE = 0,
+ R3QPI,
+ R2PCIE,
+ IMC,
+ HA,
+ QPI,
+ IRP,
+ MAX_NUM_PCI_TYPES
+} PciDeviceType;
+
typedef enum {
- PCI_R3QPI_DEVICE_LINK_0 = 0,
+ MSR_DEV = 0,
+ PCI_R3QPI_DEVICE_LINK_0,
PCI_R3QPI_DEVICE_LINK_1,
+ PCI_R3QPI_DEVICE_LINK_2,
PCI_R2PCIE_DEVICE,
- PCI_IMC_DEVICE_CH_0,
- PCI_IMC_DEVICE_CH_1,
- PCI_IMC_DEVICE_CH_2,
- PCI_IMC_DEVICE_CH_3,
- PCI_HA_DEVICE,
+ PCI_IMC_DEVICE_0_CH_0,
+ PCI_IMC_DEVICE_0_CH_1,
+ PCI_IMC_DEVICE_0_CH_2,
+ PCI_IMC_DEVICE_0_CH_3,
+ PCI_HA_DEVICE_0,
+ PCI_HA_DEVICE_1,
PCI_QPI_DEVICE_PORT_0,
PCI_QPI_DEVICE_PORT_1,
+ PCI_QPI_DEVICE_PORT_2,
PCI_QPI_MASK_DEVICE_PORT_0,
PCI_QPI_MASK_DEVICE_PORT_1,
+ PCI_QPI_MASK_DEVICE_PORT_2,
PCI_QPI_MISC_DEVICE_PORT_0,
PCI_QPI_MISC_DEVICE_PORT_1,
- MAX_NUM_DEVICES
+ PCI_QPI_MISC_DEVICE_PORT_2,
+ PCI_IMC_DEVICE_1_CH_0,
+ PCI_IMC_DEVICE_1_CH_1,
+ PCI_IMC_DEVICE_1_CH_2,
+ PCI_IMC_DEVICE_1_CH_3,
+ PCI_IRP_DEVICE,
+ MAX_NUM_PCI_DEVICES
} PciDeviceIndex;
+typedef struct {
+ PciDeviceType type;
+ char *path;
+ char *name;
+ char *likwid_name;
+ uint32_t devid;
+ int online;
+} PciDevice;
+
+typedef struct {
+ char* name;
+ char* desc;
+} PciType;
+
+static PciType pci_types[MAX_NUM_PCI_TYPES] = {
+ [R3QPI] = {"R3QPI", "R3QPI is the interface between the Intel QPI Link Layer and the Ring."},
+ [R2PCIE] = {"R2PCIE", "R2PCIe represents the interface between the Ring and IIO traffic to/from PCIe."},
+ [IMC] = {"IMC", "The integrated Memory Controller provides the interface to DRAM and communicates to the rest of the uncore through the Home Agent."},
+ [HA] = {"HA", "The HA is responsible for the protocol side of memory interactions."},
+ [QPI] = {"QPI", "The Intel QPI Link Layer is responsible for packetizing requests from the caching agent on the way out to the system interface."},
+ [IRP] = {"IRP", "IRP is responsible for maintaining coherency for IIO traffic, e.g. cross-socket P2P."}
+};
#endif /*PCI_TYPES_H*/
diff --git a/src/includes/perfgroup.h b/src/includes/perfgroup.h
new file mode 100644
index 0000000..c4f25ec
--- /dev/null
+++ b/src/includes/perfgroup.h
@@ -0,0 +1,94 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: perfgroup.h
+ *
+ * Description: Header File of performance group and event set handler
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef PERFGROUP_H
+#define PERFGROUP_H
+
+
+ /*! \brief The GroupInfo data structure describes a performance group
+
+Groups can either be read in from a file or be a group with a custom event set. For
+performance groups, commonly all values are set. For groups with a custom event set,
+the fields groupname and shortinfo are set to 'Custom', longinfo is NULL and in
+general the nmetrics value is 0.
+*/
+typedef struct {
+ char* groupname; /*!< \brief Name of the group: performance group name or 'Custom' */
+ char* shortinfo; /*!< \brief Short info string for the group or 'Custom' */
+ int nevents; /*!< \brief Number of event/counter combinations */
+ char** events; /*!< \brief List of events */
+ char** counters; /*!< \brief List of counter registers */
+ int nmetrics; /*!< \brief Number of metrics */
+ char** metricnames; /*!< \brief Metric names */
+ char** metricformulas; /*!< \brief Metric formulas */
+ char* longinfo; /*!< \brief Descriptive text about the group or empty */
+} GroupInfo;
+
+typedef struct {
+ int counters; /*!< \brief Number of entries in the list */
+ char** cnames; /*!< \brief List of counter names */
+ double* cvalues; /*!< \brief List of counter values */
+} CounterList;
+
+typedef enum {
+ GROUP_NONE = 0,
+ GROUP_SHORT,
+ GROUP_EVENTSET,
+ GROUP_METRICS,
+ GROUP_LONG
+} GroupFileSections;
+
+static char* groupFileSectionNames[5] = {
+ "NONE",
+ "SHORT",
+ "EVENTSET",
+ "METRICS",
+ "LONG"
+};
+
+extern int get_groups(char* grouppath, char* architecture, char*** groupnames, char*** groupshort, char*** grouplong);
+extern void return_groups(int groups, char** groupnames, char** groupshort, char** grouplong);
+extern int read_group(char* grouppath, char* architecture, char* groupname, GroupInfo* ginfo);
+extern int custom_group(char* eventStr, GroupInfo* ginfo);
+extern char* get_eventStr(GroupInfo* ginfo);
+void put_eventStr(char* eventset);
+extern char* get_shortInfo(GroupInfo* ginfo);
+void put_shortInfo(char* sinfo);
+extern char* get_longInfo(GroupInfo* ginfo);
+void put_longInfo(char* linfo);
+extern void return_group(GroupInfo* ginfo);
+
+extern void init_clist(CounterList* clist);
+extern int add_to_clist(CounterList* clist, char* counter, double result);
+extern void destroy_clist(CounterList* clist);
+
+extern int calc_metric(char* formula, CounterList* clist, double *result);
+
+#endif
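Since perfgroup.h only lists declarations, here is a hedged sketch of how the counter-list helpers and calc_metric() appear to be meant to work together; the formula string, the counter names and the 0-on-success return convention of calc_metric() are illustrative assumptions, not taken from a real performance group file (stdio.h assumed included).

    /* Sketch: evaluate a metric formula against two hypothetical counter values. */
    static void metric_example(void)
    {
        CounterList clist;
        double result = 0.0;
        init_clist(&clist);
        add_to_clist(&clist, "PMC0", 2000.0);
        add_to_clist(&clist, "PMC1", 500.0);
        if (calc_metric("PMC0/PMC1", &clist, &result) == 0)
            printf("metric = %f\n", result);   /* 4.0 under these inputs */
        destroy_clist(&clist);
    }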
diff --git a/src/includes/perfmon.h b/src/includes/perfmon.h
index 6e9d9f9..37058c1 100644
--- a/src/includes/perfmon.h
+++ b/src/includes/perfmon.h
@@ -7,13 +7,14 @@
* Configures and reads out performance counters
* on x86 based architectures. Supports multi threading.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -33,76 +34,27 @@
#ifndef PERFMON_H
#define PERFMON_H
-#include <bstrlib.h>
-#include <types.h>
-
-extern int perfmon_verbose;
-
-extern void (*perfmon_startCountersThread) (int thread_id);
-extern void (*perfmon_stopCountersThread) (int thread_id);
-extern int (*perfmon_getIndex) (bstring reg, PerfmonCounterIndex* index);
-extern void (*perfmon_setupCounterThread) (int thread_id, PerfmonEvent* event , PerfmonCounterIndex index);
-
-extern void perfmon_initEventSet(StrUtilEventSet* eventSetConfig, PerfmonEventSet* set);
-extern void perfmon_setCSVMode(int v);
-extern void perfmon_printAvailableGroups(void);
-extern void perfmon_printGroupHelp(bstring group);
-extern void perfmon_init(int numThreads, int threads[],FILE* outstream);
-extern void perfmon_finalize(void);
-extern void perfmon_setupEventSet(bstring eventString, BitMask* mask);
-extern double perfmon_getEventResult(int thread, int index);
-extern int perfmon_setupEventSetC(char* eventCString, const char*** eventnames);
-
-
-/*
-The following structure and set of functions provide an efficient and easy interface to
-access counters from different groups and switch between them.
-
-TODO: The internals need some cleanup, but the interface should remain rather stable.
-
-Usage:
-setup = perfmon_prepareEventSetup("VIEW"), etc..
-Whenever you want to use one of the prepared setups call:
-perfmon_setupCountersForEventSet(setup)
-then you can startCounters, stopCounters and then
-perfmon_getEventCounterValues() and/or
-perfmon_getDerivedCounterValues()
- */
-typedef struct {
- const char* groupName;
- int numberOfEvents;
- const char** eventNames;
- int numberOfDerivedCounters;
- const char** derivedNames;
-
- // Internal structures DO NOT ACCESS THEM, they need cleanup.
- StrUtilEventSet* eventSetConfig;
- PerfmonEventSet* perfmon_set;
- PerfmonGroup groupSet;
- int groupIndex;
-} EventSetup;
+#include <types.h>
+#include <likwid.h>
+#define FREEZE_FLAG_ONLYFREEZE 0x0ULL
+#define FREEZE_FLAG_CLEAR_CTR (1ULL<<1)
+#define FREEZE_FLAG_CLEAR_CTL (1ULL<<0)
-extern EventSetup perfmon_prepareEventSetup(char* eventGroupString);
-extern void perfmon_setupCountersForEventSet(EventSetup * setup);
+extern uint64_t currentConfig[MAX_NUM_THREADS][NUM_PMC];
-// obtain values for all cores, average, min and max for the cores.
-extern void perfmon_getEventCounterValues(uint64_t* avg_values, uint64_t* max, uint64_t* min);
-extern void perfmon_getDerivedCounterValues(float* avg_values, float* max, float* min);
-/////////////////////////
+extern int (*perfmon_startCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+extern int (*perfmon_stopCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+extern int (*perfmon_setupCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+extern int (*perfmon_readCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+extern int (*perfmon_finalizeCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+extern int (*initThreadArch) (int cpu_id);
-extern void perfmon_setupCounters(void);
-extern void perfmon_startCounters(void);
-extern void perfmon_stopCounters(void);
-extern void perfmon_readCounters(void);
-extern double perfmon_getResult(int threadId, char* counterString);
-extern void perfmon_printMarkerResults(bstring filepath);
-extern void perfmon_logCounterResults(double time);
-extern void perfmon_printCounterResults(void);
+/* Internal helpers */
+extern int getCounterTypeOffset(int index);
+extern uint64_t perfmon_getMaxCounterValue(RegisterType type);
-extern void perfmon_printCounters(void);
-extern void perfmon_printEvents(void);
#endif /*PERFMON_H*/
diff --git a/src/includes/perfmon_atom.h b/src/includes/perfmon_atom.h
index 201cea6..73cc9f9 100644
--- a/src/includes/perfmon_atom.h
+++ b/src/includes/perfmon_atom.h
@@ -5,13 +5,13 @@
*
* Description: Header file of perfmon module for Atom
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -29,8 +29,7 @@
*/
#include <perfmon_atom_events.h>
-#include <perfmon_atom_groups.h>
+#include <error.h>
-static int perfmon_numGroupsAtom = NUM_GROUPS_ATOM;
static int perfmon_numArchEventsAtom = NUM_ARCH_EVENTS_ATOM;
diff --git a/src/includes/perfmon_atom_events.txt b/src/includes/perfmon_atom_events.txt
index 4ca18e4..cb4e2fc 100644
--- a/src/includes/perfmon_atom_events.txt
+++ b/src/includes/perfmon_atom_events.txt
@@ -1,16 +1,17 @@
# =======================================================================================
-#
+#
# Filename: perfmon_atom_events.txt
-#
+#
# Description: Event list for Intel Atom
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/perfmon_broadwell.h b/src/includes/perfmon_broadwell.h
new file mode 100644
index 0000000..8e5fc2a
--- /dev/null
+++ b/src/includes/perfmon_broadwell.h
@@ -0,0 +1,1793 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: perfmon_broadwell.h
+ *
+ * Description: Header File of perfmon module for Intel Broadwell.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <perfmon_broadwell_events.h>
+#include <perfmon_broadwell_counters.h>
+#include <perfmon_broadwelld_events.h>
+#include <perfmon_broadwelld_counters.h>
+#include <perfmon_broadwellEP_events.h>
+#include <perfmon_broadwellEP_counters.h>
+#include <error.h>
+#include <affinity.h>
+#include <limits.h>
+#include <topology.h>
+#include <access.h>
+
+
+static int perfmon_numCountersBroadwell = NUM_COUNTERS_BROADWELL;
+static int perfmon_numCoreCountersBroadwell = NUM_COUNTERS_CORE_BROADWELL;
+static int perfmon_numArchEventsBroadwell = NUM_ARCH_EVENTS_BROADWELL;
+
+static int perfmon_numCountersBroadwellD = NUM_COUNTERS_BROADWELLD;
+static int perfmon_numCoreCountersBroadwellD = NUM_COUNTERS_CORE_BROADWELLD;
+static int perfmon_numArchEventsBroadwellD = NUM_ARCH_EVENTS_BROADWELLD;
+
+static int perfmon_numCountersBroadwellEP = NUM_COUNTERS_BROADWELLEP;
+static int perfmon_numCoreCountersBroadwellEP = NUM_COUNTERS_CORE_BROADWELLEP;
+static int perfmon_numArchEventsBroadwellEP = NUM_ARCH_EVENTS_BROADWELLEP;
+
+int bdw_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
+int bdwep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
+int (*broadwell_cbox_setup)(int, RegisterIndex, PerfmonEvent *);
+
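+/* Acquire the tile and socket locks for this CPU, disable PEBS, and select the C-box setup routine matching the detected Broadwell variant. */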
+int perfmon_init_broadwell(int cpu_id)
+{
+ lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+ lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+ if ((cpuid_info.model == BROADWELL_E) || (cpuid_info.model == BROADWELL_D))
+ {
+ broadwell_cbox_setup = bdwep_cbox_setup;
+ }
+ else
+ {
+ broadwell_cbox_setup = bdw_cbox_setup;
+ }
+ return 0;
+}
+
+
+uint32_t bdw_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint32_t flags = (1ULL<<(1+(index*4)));
+ for(j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<(index*4));
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<(2+(index*4)));
+                break;
+ default:
+ break;
+ }
+ }
+ return flags;
+}
+
+int bdw_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ uint64_t offcore_flags = 0x0ULL;
+
+ flags = (1ULL<<22)|(1ULL<<16);
+ /* Intel with standard 8 bit event mask: [7:0] */
+ flags |= (event->umask<<8) + event->eventId;
+
+ /* set custom cfg and cmask */
+ if ((event->cfgBits != 0) &&
+ (event->eventId != 0xB7) &&
+ (event->eventId != 0xBB))
+ {
+ flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+ }
+
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<17);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+ flags |= (1ULL<<21);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ case EVENT_OPTION_IN_TRANS:
+ flags |= (1ULL<<32);
+ break;
+ case EVENT_OPTION_IN_TRANS_ABORT:
+ flags |= (1ULL<<33);
+ break;
+ case EVENT_OPTION_MATCH0:
+ offcore_flags |= (event->options[j].value & 0x8FFFULL);
+ break;
+ case EVENT_OPTION_MATCH1:
+ offcore_flags |= (event->options[j].value<<16);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ if (event->eventId == 0xB7)
+ {
+ if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+ {
+ offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+ }
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+ }
+ else if (event->eventId == 0xBB)
+ {
+ if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+ {
+ offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+ }
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister , flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int bdw_ubox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+
+ flags = (1ULL<<22)|(1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0x1FULL) << 24;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_UBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int bdw_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ flags = (1ULL<<22)|(1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0x1FULL) << 24;
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_CBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
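+/* Server (BDW-D/EP) C-box setup: besides the control register, the per-box filter registers are programmed (opcode, node ID, state, TID and match filters). */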
+int bdwep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ uint64_t filter_flags0 = 0x0ULL;
+ uint64_t filter_flags1 = 0x0ULL;
+ uint32_t filter0 = box_map[counter_map[index].type].filterRegister1;
+ uint32_t filter1 = box_map[counter_map[index].type].filterRegister2;
+ int set_state_all = 0;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+
+ flags = (1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->eventId == 0x34)
+ {
+ set_state_all = 1;
+ }
+ if ((event->eventId == 0x13 || event->eventId == 0x11) && (event->umask & 0x2ULL))
+ {
+        fprintf(stderr, "IRQ_REJECTED should not be ORed with the other umasks.\n");
+ }
+
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ case EVENT_OPTION_OPCODE:
+ filter_flags1 |= (0x3<<27);
+ filter_flags1 |= (extractBitField(event->options[j].value,5,0) << 20);
+ break;
+ case EVENT_OPTION_NID:
+ filter_flags1 |= (extractBitField(event->options[j].value,16,0));
+ break;
+ case EVENT_OPTION_STATE:
+ filter_flags0 |= (extractBitField(event->options[j].value,6,0) << 17);
+ set_state_all = 0;
+ break;
+ case EVENT_OPTION_TID:
+ filter_flags0 |= (extractBitField(event->options[j].value,6,0));
+ flags |= (1ULL<<19);
+ break;
+ case EVENT_OPTION_MATCH0:
+ filter_flags1 |= (extractBitField(event->options[j].value,2,0) << 30);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ if (filter_flags0 != 0x0ULL)
+ {
+ VERBOSEPRINTREG(cpu_id, filter0, filter_flags0, SETUP_CBOX_FILTER0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, filter_flags0));
+ }
+ else
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, 0x0ULL));
+ }
+ if (filter_flags1 != 0x0ULL)
+ {
+ VERBOSEPRINTREG(cpu_id, filter1, filter_flags1, SETUP_CBOX_FILTER1);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter1, filter_flags1));
+ }
+ else
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter1, 0x0ULL));
+ }
+
+ if (set_state_all)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter0, &filter_flags0));
+ filter_flags0 |= (0x1F << 17);
+ VERBOSEPRINTREG(cpu_id, filter0, filter_flags0, SETUP_CBOX_DEF_FILTER_STATE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, filter_flags0));
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_CBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int bdw_wbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ uint64_t filter = box_map[counter_map[index].type].filterRegister1;
+ int clean_filter = 1;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+
+ flags = (1ULL<<22)|(1ULL<<20);
+ flags |= event->eventId;
+ if ((event->umask > 0x00) && (event->umask <= 0x3))
+ {
+ flags |= (event->umask << 14);
+ }
+ else if (event->umask == 0xFF)
+ {
+ flags = (1ULL<<21);
+ }
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0x1FULL) << 24;
+ break;
+ case EVENT_OPTION_OCCUPANCY:
+ flags |= ((event->options[j].value & 0x3ULL)<<14);
+ break;
+ case EVENT_OPTION_OCCUPANCY_FILTER:
+ clean_filter = 0;
+ VERBOSEPRINTREG(cpu_id, filter, (event->options[j].value & 0xFFFFFFFFULL), SETUP_WBOX_FILTER);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter, (event->options[j].value & 0xFFFFFFFFULL)));
+ break;
+ case EVENT_OPTION_OCCUPANCY_EDGE:
+ flags |= (1ULL<<31);
+ break;
+ case EVENT_OPTION_OCCUPANCY_INVERT:
+ flags |= (1ULL<<30);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ if (clean_filter)
+ {
+        VERBOSEPRINTREG(cpu_id, filter, 0x0ULL, CLEAN_WBOX_FILTER);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter, 0x0ULL));
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_WBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int bdw_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ uint64_t filter = 0x0ULL;
+ int opcode_flag = 0;
+ int match_flag = 0;
+ PciDeviceIndex dev = counter_map[index].device;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(dev, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags = (1ULL<<22)|(1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ case EVENT_OPTION_OPCODE:
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH,
+ (event->options[j].value & 0x3FULL), SETUP_BBOX_OPCODE);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH,
+ (event->options[j].value & 0x3FULL)));
+ opcode_flag = 1;
+ break;
+ case EVENT_OPTION_MATCH0:
+ filter = ((event->options[j].value & 0xFFFFFFC0ULL));
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, filter, SETUP_ADDR0_FILTER);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, filter));
+ filter = (((event->options[j].value>>32) & 0x3FFFULL));
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, filter, SETUP_ADDR1_FILTER);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, filter));
+ match_flag = 1;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (!opcode_flag)
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH, 0x0ULL, CLEAR_BBOX_OPCODE);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH, 0x0ULL));
+ }
+ if (!match_flag)
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, 0x0ULL, CLEAR_BBOX_MATCH0);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, 0x0ULL));
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, 0x0ULL, CLEAR_BBOX_MATCH1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, 0x0ULL));
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_BBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int bdw_mbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ PciDeviceIndex dev = counter_map[index].device;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(dev, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags = (1ULL<<20)|(1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_MBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int bdw_mboxfix_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ PciDeviceIndex dev = counter_map[index].device;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(dev, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags = (1ULL<<20)|(1ULL<<22);
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_MBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int bdw_ibox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ PciDeviceIndex dev = counter_map[index].device;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(counter_map[index].device, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags = (1ULL<<20)|(1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_IBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int bdw_pbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ PciDeviceIndex dev = counter_map[index].device;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(counter_map[index].device, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags = (1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_PBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+ flags |= (1ULL<<22);
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_PBOX_TWICE);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int bdw_rbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ PciDeviceIndex dev = counter_map[index].device;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(counter_map[index].device, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags = (1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])
+ {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_RBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+ flags |= (1ULL<<22);
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_RBOX_TWICE);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int bdw_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(counter_map[index].device, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_TID:
+ flags |= (1ULL<<19);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0xFFULL)<<24);
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+ if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_SBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device, counter_map[index].configRegister, flags));
+ flags |= (1ULL<<22);
+ /* Due to an issue found with the Intel® Xeon® Processor E5 and E7 v4 Product Families
+ * hardware, it will be necessary to write each control register twice in a row in order for
+ * the Event Select field to take hold. It is recommended that SW perform the first write
+ * with the enable bit set to 0 followed by a write of the same control register value but
+ * with the enable bit set to 1.*/
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_SBOX_TWICE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
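+/* QPI (Q-box) setup: the extra filterdev argument identifies the PCI device that holds the QPI match/mask filter registers. */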
+int bdw_qbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event, PciDeviceIndex filterdev)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ uint64_t filterreg;
+ uint64_t filterval = 0x0ULL;
+ PciDeviceIndex dev = counter_map[index].device;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(counter_map[index].device, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags = (1ULL<<20)|(1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->cfgBits == 0x01)
+ {
+ flags |= (1ULL<<21);
+ }
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ case EVENT_OPTION_MATCH0:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_V3_QPI_PMON_RX_MATCH_0;
+ filterval = event->options[j].value & 0x8003FFF8ULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_QBOX_RX_MATCH0);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MATCH1:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_V3_QPI_PMON_RX_MATCH_1;
+ filterval = event->options[j].value & 0x000F000FULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_QBOX_RX_MATCH1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MATCH2:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_V3_QPI_PMON_TX_MATCH_0;
+ filterval = event->options[j].value & 0x8003FFF8ULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_QBOX_TX_MATCH0);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MATCH3:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_V3_QPI_PMON_TX_MATCH_1;
+ filterval = event->options[j].value & 0x000F000FULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_QBOX_TX_MATCH1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MASK0:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_V3_QPI_PMON_RX_MASK_0;
+ filterval = event->options[j].value & 0x8003FFF8ULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_QBOX_RX_MASK0);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MASK1:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_V3_QPI_PMON_RX_MASK_1;
+ filterval = event->options[j].value & 0x000F000FULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_QBOX_RX_MASK1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MASK2:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_V3_QPI_PMON_TX_MASK_0;
+ filterval = event->options[j].value & 0x8003FFF8ULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_QBOX_TX_MASK0);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MASK3:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+                        filterreg = PCI_UNC_V3_QPI_PMON_TX_MASK_1;
+ filterval = event->options[j].value & 0x000F000FULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_QBOX_TX_MASK1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_QBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
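+/* Freeze all uncore counters via the freeze bit in the U-box global control MSR; only done on the socket's designated core and only if uncore counters are in the event set. */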
+#define BDW_FREEZE_UNCORE \
+ if (haveLock && eventSet->regTypeMask & ~(0xFULL)) \
+ { \
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<31), FREEZE_UNCORE); \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, (1ULL<<31))); \
+ }
+
+#define BDW_UNFREEZE_UNCORE \
+ if (haveLock && eventSet->regTypeMask & ~(0xFULL)) \
+ { \
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<29), UNFREEZE_UNCORE); \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, (1ULL<<29))); \
+ }
+
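+/* Manually zero the uncore counter registers of the event set, then release the uncore freeze. */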
+#define BDW_UNFREEZE_UNCORE_AND_RESET_CTR \
+ if (haveLock && (eventSet->regTypeMask & ~(0xFULL))) \
+ { \
+ for (int i=0;i < eventSet->numberOfEvents;i++) \
+ { \
+ RegisterIndex index = eventSet->events[i].index; \
+ RegisterType type = counter_map[index].type; \
+ if ((type < UNCORE) || (type == WBOX0FIX)) \
+ { \
+ continue; \
+ } \
+ PciDeviceIndex dev = counter_map[index].device; \
+ if (HPMcheck(dev, cpu_id)) { \
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR_MANUAL); \
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL)); \
+ if (counter_map[index].counterRegister2 != 0x0) \
+ { \
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL, CLEAR_CTR_MANUAL); \
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL)); \
+ } \
+ } \
+ } \
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<29), UNFREEZE_UNCORE); \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, (1ULL<<29))); \
+ }
+
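+/* Program the configuration registers for every counter in the event set; uncore boxes are only touched on the socket's designated core while the uncore is frozen. */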
+int perfmon_setupCounterThread_broadwell(
+ int thread_id,
+ PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ uint64_t flags;
+ uint64_t fixed_flags = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+ BDW_FREEZE_UNCORE;
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0xC00000070000000F));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+ }
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ uint64_t reg = counter_map[index].configRegister;
+ PciDeviceIndex dev = counter_map[index].device;
+ eventSet->events[i].threadCounter[thread_id].init = TRUE;
+ flags = 0x0ULL;
+ switch (type)
+ {
+ case PMC:
+ bdw_pmc_setup(cpu_id, index, event);
+ break;
+
+ case FIXED:
+ fixed_flags |= bdw_fixed_setup(cpu_id, index, event);
+ break;
+
+ case POWER:
+ case THERMAL:
+ break;
+
+ case UBOX:
+ bdw_ubox_setup(cpu_id, index, event);
+ break;
+ case UBOXFIX:
+ if (haveLock)
+ {
+ flags = (1ULL<<22)|(1ULL<<20);
+ VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_UBOXFIX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+ }
+ break;
+
+ case CBOX0:
+ case CBOX1:
+ case CBOX2:
+ case CBOX3:
+ case CBOX4:
+ case CBOX5:
+ case CBOX6:
+ case CBOX7:
+ case CBOX8:
+ case CBOX9:
+ case CBOX10:
+ case CBOX11:
+ case CBOX12:
+ case CBOX13:
+ case CBOX14:
+ case CBOX15:
+ broadwell_cbox_setup(cpu_id, index, event);
+ break;
+
+ case BBOX0:
+ case BBOX1:
+ bdw_bbox_setup(cpu_id, index, event);
+ break;
+
+ case WBOX:
+ bdw_wbox_setup(cpu_id, index, event);
+ break;
+ case WBOX0FIX:
+ break;
+
+ case MBOX0:
+ case MBOX1:
+ case MBOX2:
+ case MBOX3:
+ case MBOX4:
+ case MBOX5:
+ case MBOX6:
+ case MBOX7:
+ bdw_mbox_setup(cpu_id, index, event);
+ break;
+ case MBOX0FIX:
+ case MBOX1FIX:
+ case MBOX2FIX:
+ case MBOX3FIX:
+ case MBOX4FIX:
+ case MBOX5FIX:
+ case MBOX6FIX:
+ case MBOX7FIX:
+ bdw_mboxfix_setup(cpu_id, index, event);
+ break;
+
+ case PBOX:
+ bdw_pbox_setup(cpu_id, index, event);
+ break;
+
+ case IBOX0:
+ case IBOX1:
+ bdw_ibox_setup(cpu_id, index, event);
+ break;
+
+ case RBOX0:
+ case RBOX1:
+ bdw_rbox_setup(cpu_id, index, event);
+ break;
+
+ case SBOX0:
+ case SBOX1:
+ case SBOX2:
+ case SBOX3:
+ bdw_sbox_setup(cpu_id, index, event);
+ break;
+
+ case QBOX0:
+ bdw_qbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_0);
+ break;
+ case QBOX1:
+ bdw_qbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_1);
+ break;
+ case QBOX2:
+ bdw_qbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_2);
+ break;
+
+ default:
+ break;
+ }
+ }
+ if (fixed_flags > 0x0ULL)
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
+ }
+ return 0;
+}
+
+int perfmon_startCountersThread_broadwell(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ uint64_t flags = 0x0ULL;
+ uint64_t tmp = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ PciDeviceIndex dev = counter_map[index].device;
+ eventSet->events[i].threadCounter[thread_id].startData = 0;
+ eventSet->events[i].threadCounter[thread_id].counterData = 0;
+ switch (type)
+ {
+ case PMC:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+ flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr)); /* enable counter */
+ break;
+
+ case FIXED:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+ flags |= (1ULL<<(index+32)); /* enable fixed counter */
+ break;
+
+ case POWER:
+ if (haveLock)
+ {
+ tmp = 0x0ULL;
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1,(uint32_t*)&tmp));
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST tmp, START_POWER)
+ eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
+ }
+ break;
+
+ case WBOX0FIX:
+ if (haveLock)
+ {
+ tmp = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, counter1, &tmp));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST tmp, START_WBOXFIX);
+ eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
+ }
+ break;
+ case QBOX0FIX:
+ case QBOX1FIX:
+ if (haveLock && HPMcheck(dev, cpu_id))
+ {
+ if (eventSet->events[i].event.eventId != 0x00)
+ {
+ CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &tmp));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST tmp, START_QBOXFIX);
+ eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = eventSet->events[i].threadCounter[thread_id].startData;
+ }
+ }
+
+ BDW_UNFREEZE_UNCORE_AND_RESET_CTR;
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST (1ULL<<63)|(1ULL<<62)|flags, CLEAR_PMC_AND_FIXED_OVERFLOW)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|flags));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_AND_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+ }
+
+ return 0;
+}
+
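+/* Generic uncore read helper: reads the (possibly split 64-bit) counter, optionally clears it, and detects overflows via the global and per-box status registers. */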
+int bdw_uncore_read(int cpu_id, RegisterIndex index, PerfmonEvent *event,
+ uint64_t* cur_result, int* overflows, int flags,
+ int global_offset, int box_offset)
+{
+ uint64_t result = 0x0ULL;
+ uint64_t tmp = 0x0ULL;
+ RegisterType type = counter_map[index].type;
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ uint64_t counter2 = counter_map[index].counterRegister2;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+
+ CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &result));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST result, READ_REG_1);
+ if (flags & FREEZE_FLAG_CLEAR_CTR)
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST 0x0U, CLEAR_PCI_REG_1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0U));
+ }
+ if (counter2 != 0x0)
+ {
+ result <<= 32;
+ CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter2, &tmp));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter2, LLU_CAST tmp, READ_REG_2);
+ result += tmp;
+ if (flags & FREEZE_FLAG_CLEAR_CTR)
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter2, LLU_CAST 0x0U, CLEAR_PCI_REG_2);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter2, 0x0U));
+ }
+ }
+ result = field64(result, 0, box_map[type].regWidth);
+ if (result < *cur_result)
+ {
+ uint64_t ovf_values = 0x0ULL;
+ int global_offset = box_map[type].ovflOffset;
+ int test_local = 0;
+ if (global_offset != -1)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV,
+ MSR_UNC_V3_U_PMON_GLOBAL_STATUS,
+ &ovf_values));
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST ovf_values, READ_GLOBAL_OVFL);
+ if (ovf_values & (1<<global_offset))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST (1<<global_offset), CLEAR_GLOBAL_OVFL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+ MSR_UNC_V3_U_PMON_GLOBAL_STATUS,
+ (1<<global_offset)));
+ test_local = 1;
+ }
+ }
+ else
+ {
+ test_local = 1;
+ }
+
+ if (test_local)
+ {
+ ovf_values = 0x0ULL;
+ CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev,
+ box_map[type].statusRegister,
+ &ovf_values));
+ VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].statusRegister, LLU_CAST ovf_values, READ_BOX_OVFL);
+ if (ovf_values & (1<<box_offset))
+ {
+ (*overflows)++;
+ VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].statusRegister, LLU_CAST (1<<box_offset), RESET_BOX_OVFL);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev,
+ box_map[type].statusRegister,
+ (1<<box_offset)));
+ }
+ }
+ }
+ *cur_result = result;
+ return 0;
+}
+
+
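+/* If a core counter wrapped since the last read, account for the overflow and clear the corresponding bit via the global overflow-control MSR. */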
+#define BDW_CHECK_CORE_OVERFLOW(offset) \
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+ { \
+ uint64_t ovf_values = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values)); \
+ if (ovf_values & (1ULL<<offset)) \
+ { \
+ eventSet->events[i].threadCounter[thread_id].overflows++; \
+ } \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<offset))); \
+ }
+
+int perfmon_stopCountersThread_broadwell(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ uint64_t counter_result = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+ BDW_FREEZE_UNCORE;
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+ int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+ int ovf_offset = box_map[type].ovflOffset;
+ switch (type)
+ {
+ case PMC:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ BDW_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+ break;
+
+ case FIXED:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ BDW_CHECK_CORE_OVERFLOW(index+32);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
+ break;
+
+ case POWER:
+ if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
+ {
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ }
+ }
+ break;
+
+ case THERMAL:
+ CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
+ break;
+
+ case BBOX0:
+ case BBOX1:
+ bdw_uncore_read(cpu_id, index, event, &counter_result, overflows,
+ FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index));
+ break;
+
+ case MBOX0:
+ case MBOX1:
+ case MBOX2:
+ case MBOX3:
+ case MBOX4:
+ case MBOX5:
+ case MBOX6:
+ case MBOX7:
+ bdw_uncore_read(cpu_id, index, event, &counter_result, overflows,
+ FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index)+1);
+ break;
+
+ case MBOX0FIX:
+ case MBOX1FIX:
+ case MBOX2FIX:
+ case MBOX3FIX:
+ case MBOX4FIX:
+ case MBOX5FIX:
+ case MBOX6FIX:
+ case MBOX7FIX:
+ bdw_uncore_read(cpu_id, index, event, &counter_result, overflows,
+ FREEZE_FLAG_CLEAR_CTR, ovf_offset, 0);
+ break;
+
+ case IBOX1:
+ bdw_uncore_read(cpu_id, index, event, &counter_result, overflows,
+ FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index)+2);
+ break;
+
+ case PBOX:
+ case IBOX0:
+ case WBOX:
+ case UBOX:
+ case UBOXFIX:
+ case CBOX0:
+ case CBOX1:
+ case CBOX2:
+ case CBOX3:
+ case CBOX4:
+ case CBOX5:
+ case CBOX6:
+ case CBOX7:
+ case CBOX8:
+ case CBOX9:
+ case CBOX10:
+ case CBOX11:
+ case CBOX12:
+ case CBOX13:
+ case CBOX14:
+ case CBOX15:
+ case CBOX16:
+ case CBOX17:
+ case CBOX18:
+ case CBOX19:
+ case CBOX20:
+ case CBOX21:
+ case CBOX22:
+ case CBOX23:
+ case RBOX0:
+ case RBOX1:
+ case SBOX0:
+ case SBOX1:
+ case SBOX2:
+ case SBOX3:
+ case QBOX0:
+ case QBOX1:
+ case QBOX2:
+ bdw_uncore_read(cpu_id, index, event, &counter_result, overflows,
+ FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index));
+ break;
+
+ case QBOX0FIX:
+ case QBOX1FIX:
+ case QBOX2FIX:
+ if (eventSet->events[i].event.eventId == 0x00)
+ {
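+                        /* Event 0x00 does not count; it reports the configured QPI link speed, so decode the 3-bit rate field into the matching transfer rate. */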
+ HPMread(cpu_id, dev, counter1, &counter_result);
+ switch(extractBitField(counter_result, 3, 0))
+ {
+ case 0x2:
+ counter_result = 5.6E9;
+ break;
+ case 0x3:
+ counter_result = 6.4E9;
+ break;
+ case 0x4:
+ counter_result = 7.2E9;
+ break;
+ case 0x5:
+ counter_result = 8.0E9;
+ break;
+ case 0x6:
+ counter_result = 8.8E9;
+ break;
+ case 0x7:
+ counter_result = 9.6E9;
+ break;
+ default:
+ counter_result = 0;
+ break;
+ }
+
+ }
+ else if ((eventSet->events[i].event.eventId == 0x01) ||
+ (eventSet->events[i].event.eventId == 0x02))
+ {
+ HPMread(cpu_id, dev, counter1, &counter_result);
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, STOP_QBOXFIX);
+ counter_result = field64(counter_result, 0, box_map[type].regWidth);
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = counter_result;
+ break;
+
+ default:
+ break;
+ }
+ *current = field64(counter_result, 0, box_map[type].regWidth);
+ }
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
+ }
+
+
+ return 0;
+}
+
+
+int perfmon_readCountersThread_broadwell(int thread_id, PerfmonEventSet* eventSet)
+{
+ uint64_t flags = 0x0ULL;
+ int haveLock = 0;
+ uint64_t counter_result = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &flags));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, SAFE_PMC_FLAGS)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, RESET_PMC_FLAGS)
+ }
+ BDW_FREEZE_UNCORE;
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ counter_result= 0x0ULL;
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+ int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+ int ovf_offset = box_map[type].ovflOffset;
+ switch (type)
+ {
+ case PMC:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ BDW_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+ *current = field64(counter_result, 0, box_map[type].regWidth);
+ break;
+
+ case FIXED:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ BDW_CHECK_CORE_OVERFLOW(index+32);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
+ *current = field64(counter_result, 0, box_map[type].regWidth);
+ break;
+
+ case POWER:
+ if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
+ {
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ }
+ *current = field64(counter_result, 0, box_map[type].regWidth);
+ }
+ break;
+
+ case THERMAL:
+ CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
+ *current = field64(counter_result, 0, box_map[type].regWidth);
+ break;
+
+ case MBOX0:
+ case MBOX1:
+ case MBOX2:
+ case MBOX3:
+ case MBOX4:
+ case MBOX5:
+ case MBOX6:
+ case MBOX7:
+ bdw_uncore_read(cpu_id, index, event, current, overflows,
+ 0, ovf_offset, getCounterTypeOffset(index)+1);
+ break;
+
+ case MBOX0FIX:
+ case MBOX1FIX:
+ case MBOX2FIX:
+ case MBOX3FIX:
+ case MBOX4FIX:
+ case MBOX5FIX:
+ case MBOX6FIX:
+ case MBOX7FIX:
+ bdw_uncore_read(cpu_id, index, event, current, overflows,
+ 0, ovf_offset, 0);
+ break;
+
+ case IBOX1:
+ bdw_uncore_read(cpu_id, index, event, current, overflows,
+ 0, ovf_offset, getCounterTypeOffset(index)+2);
+ break;
+
+ case BBOX0:
+ case BBOX1:
+ case PBOX:
+ case IBOX0:
+ case WBOX:
+ case UBOX:
+ case UBOXFIX:
+ case CBOX0:
+ case CBOX1:
+ case CBOX2:
+ case CBOX3:
+ case CBOX4:
+ case CBOX5:
+ case CBOX6:
+ case CBOX7:
+ case CBOX8:
+ case CBOX9:
+ case CBOX10:
+ case CBOX11:
+ case CBOX12:
+ case CBOX13:
+ case CBOX14:
+ case CBOX15:
+ case RBOX0:
+ case RBOX1:
+ case SBOX0:
+ case SBOX1:
+ case SBOX2:
+ case SBOX3:
+ case QBOX0:
+ case QBOX1:
+ case QBOX2:
+ bdw_uncore_read(cpu_id, index, event, current, overflows,
+ 0, ovf_offset, getCounterTypeOffset(index));
+ break;
+
+ case QBOX0FIX:
+ case QBOX1FIX:
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_QBOXFIX)
+ if (eventSet->events[i].event.eventId == 0x00)
+ {
+ HPMread(cpu_id, dev, counter1, &counter_result);
+ switch(extractBitField(counter_result, 3, 0))
+ {
+ case 0x2:
+ counter_result = 5.6E9;
+ break;
+ case 0x3:
+ counter_result = 6.4E9;
+ break;
+ case 0x4:
+ counter_result = 7.2E9;
+ break;
+ case 0x5:
+ counter_result = 8.0E9;
+ break;
+ case 0x6:
+ counter_result = 8.8E9;
+ break;
+ case 0x7:
+ counter_result = 9.6E9;
+ break;
+ default:
+ counter_result = 0;
+ break;
+ }
+
+ }
+ else if ((eventSet->events[i].event.eventId == 0x01) ||
+ (eventSet->events[i].event.eventId == 0x02))
+ {
+ HPMread(cpu_id, dev, counter1, &counter_result);
+ counter_result = field64(counter_result, 0, box_map[type].regWidth);
+ }
+ *current = counter_result;
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+ BDW_UNFREEZE_UNCORE;
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, RESTORE_PMC_FLAGS)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+ }
+
+ return 0;
+}
+
+int perfmon_finalizeCountersThread_broadwell(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ int haveTileLock = 0;
+ uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62);
+ uint64_t ovf_values_uncore = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+ if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+ {
+ haveTileLock = 1;
+ }
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t reg = counter_map[index].configRegister;
+ switch (type)
+ {
+ case PMC:
+ ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+ if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+ }
+ else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+ }
+ break;
+ case FIXED:
+ ovf_values_core |= (1ULL<<(index+32));
+ break;
+ default:
+ break;
+ }
+ if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock))))
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, reg, &ovf_values_uncore));
+ VERBOSEPRINTPCIREG(cpu_id, dev, reg, ovf_values_uncore, SHOW_CTL);
+ ovf_values_uncore = 0x0ULL;
+ VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+ }
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
+ }
+
+ if (haveLock && eventSet->regTypeMask & ~(0xFULL))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST ovf_values_uncore, CLEAR_UNCORE_OVF)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, ovf_values_uncore));
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST 0x0ULL, CLEAR_UNCORE_CTRL)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, 0x0ULL));
+ }
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+ return 0;
+}
diff --git a/src/includes/perfmon_broadwellEP_counters.h b/src/includes/perfmon_broadwellEP_counters.h
new file mode 100644
index 0000000..d37c871
--- /dev/null
+++ b/src/includes/perfmon_broadwellEP_counters.h
@@ -0,0 +1,362 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: perfmon_broadwellEP_counters.h
+ *
+ * Description: Counter Header File of perfmon module for Broadwell EP/EN/EX.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+
+#define NUM_COUNTERS_BROADWELLEP 216
+#define NUM_COUNTERS_CORE_BROADWELLEP 8
+#define NUM_COUNTERS_UNCORE_BROADWELLEP 85
+
+#define BDW_EP_VALID_OPTIONS_FIXED EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK
+#define BDW_EP_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+ EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define BDW_EP_VALID_OPTIONS_UBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK
+#define BDW_EP_VALID_OPTIONS_CBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+ EVENT_OPTION_TID_MASK|EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_STATE_MASK|\
+ EVENT_OPTION_MATCH0_MASK
+#define BDW_EP_VALID_OPTIONS_WBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+ EVENT_OPTION_OCCUPANCY_MASK|EVENT_OPTION_OCCUPANCY_FILTER_MASK|EVENT_OPTION_OCCUPANCY_EDGE_MASK|\
+ EVENT_OPTION_OCCUPANCY_INVERT_MASK
+#define BDW_EP_VALID_OPTIONS_BBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define BDW_EP_VALID_OPTIONS_MBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define BDW_EP_VALID_OPTIONS_IBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define BDW_EP_VALID_OPTIONS_PBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define BDW_EP_VALID_OPTIONS_RBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define BDW_EP_VALID_OPTIONS_SBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_TID_MASK
+#define BDW_EP_VALID_OPTIONS_QBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+
+static RegisterMap broadwellEP_counter_map[NUM_COUNTERS_BROADWELLEP] = {
+ /* Fixed Counters: instructions retired, cycles unhalted core */
+ {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_FIXED},
+ {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_FIXED},
+ {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_FIXED},
+ /* PMC Counters: 4 48bit wide */
+ {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, BDW_EP_VALID_OPTIONS_PMC},
+ {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, BDW_EP_VALID_OPTIONS_PMC},
+ {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, BDW_EP_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK},
+ {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, BDW_EP_VALID_OPTIONS_PMC},
+ /* Temperature Sensor*/
+ {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ /* RAPL counters */
+ {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"UBOX0", PMC12, UBOX, MSR_UNC_V3_U_PMON_CTL0, MSR_UNC_V3_U_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_UBOX},
+ {"UBOX1", PMC13, UBOX, MSR_UNC_V3_U_PMON_CTL1, MSR_UNC_V3_U_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_UBOX},
+ {"UBOXFIX", PMC14, UBOXFIX, MSR_UNC_V3_U_UCLK_FIXED_CTL, MSR_UNC_V3_U_UCLK_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"CBOX0C0", PMC15, CBOX0, MSR_UNC_V3_C0_PMON_CTL0, MSR_UNC_V3_C0_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX0C1", PMC16, CBOX0, MSR_UNC_V3_C0_PMON_CTL1, MSR_UNC_V3_C0_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX0C2", PMC17, CBOX0, MSR_UNC_V3_C0_PMON_CTL2, MSR_UNC_V3_C0_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX0C3", PMC18, CBOX0, MSR_UNC_V3_C0_PMON_CTL3, MSR_UNC_V3_C0_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX1C0", PMC19, CBOX1, MSR_UNC_V3_C1_PMON_CTL0, MSR_UNC_V3_C1_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX1C1", PMC20, CBOX1, MSR_UNC_V3_C1_PMON_CTL1, MSR_UNC_V3_C1_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX1C2", PMC21, CBOX1, MSR_UNC_V3_C1_PMON_CTL2, MSR_UNC_V3_C1_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX1C3", PMC22, CBOX1, MSR_UNC_V3_C1_PMON_CTL3, MSR_UNC_V3_C1_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX2C0", PMC23, CBOX2, MSR_UNC_V3_C2_PMON_CTL0, MSR_UNC_V3_C2_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX2C1", PMC24, CBOX2, MSR_UNC_V3_C2_PMON_CTL1, MSR_UNC_V3_C2_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX2C2", PMC25, CBOX2, MSR_UNC_V3_C2_PMON_CTL2, MSR_UNC_V3_C2_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX2C3", PMC26, CBOX2, MSR_UNC_V3_C2_PMON_CTL3, MSR_UNC_V3_C2_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX3C0", PMC27, CBOX3, MSR_UNC_V3_C3_PMON_CTL0, MSR_UNC_V3_C3_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX3C1", PMC28, CBOX3, MSR_UNC_V3_C3_PMON_CTL1, MSR_UNC_V3_C3_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX3C2", PMC29, CBOX3, MSR_UNC_V3_C3_PMON_CTL2, MSR_UNC_V3_C3_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX3C3", PMC30, CBOX3, MSR_UNC_V3_C3_PMON_CTL3, MSR_UNC_V3_C3_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX4C0", PMC31, CBOX4, MSR_UNC_V3_C4_PMON_CTL0, MSR_UNC_V3_C4_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX4C1", PMC32, CBOX4, MSR_UNC_V3_C4_PMON_CTL1, MSR_UNC_V3_C4_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX4C2", PMC33, CBOX4, MSR_UNC_V3_C4_PMON_CTL2, MSR_UNC_V3_C4_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX4C3", PMC34, CBOX4, MSR_UNC_V3_C4_PMON_CTL3, MSR_UNC_V3_C4_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX5C0", PMC35, CBOX5, MSR_UNC_V3_C5_PMON_CTL0, MSR_UNC_V3_C5_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX5C1", PMC36, CBOX5, MSR_UNC_V3_C5_PMON_CTL1, MSR_UNC_V3_C5_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX5C2", PMC37, CBOX5, MSR_UNC_V3_C5_PMON_CTL2, MSR_UNC_V3_C5_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX5C3", PMC38, CBOX5, MSR_UNC_V3_C5_PMON_CTL3, MSR_UNC_V3_C5_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX6C0", PMC39, CBOX6, MSR_UNC_V3_C6_PMON_CTL0, MSR_UNC_V3_C6_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX6C1", PMC40, CBOX6, MSR_UNC_V3_C6_PMON_CTL1, MSR_UNC_V3_C6_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX6C2", PMC41, CBOX6, MSR_UNC_V3_C6_PMON_CTL2, MSR_UNC_V3_C6_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX6C3", PMC42, CBOX6, MSR_UNC_V3_C6_PMON_CTL3, MSR_UNC_V3_C6_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX7C0", PMC43, CBOX7, MSR_UNC_V3_C7_PMON_CTL0, MSR_UNC_V3_C7_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX7C1", PMC44, CBOX7, MSR_UNC_V3_C7_PMON_CTL1, MSR_UNC_V3_C7_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX7C2", PMC45, CBOX7, MSR_UNC_V3_C7_PMON_CTL2, MSR_UNC_V3_C7_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX7C3", PMC46, CBOX7, MSR_UNC_V3_C7_PMON_CTL3, MSR_UNC_V3_C7_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX8C0", PMC47, CBOX8, MSR_UNC_V3_C8_PMON_CTL0, MSR_UNC_V3_C8_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX8C1", PMC48, CBOX8, MSR_UNC_V3_C8_PMON_CTL1, MSR_UNC_V3_C8_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX8C2", PMC49, CBOX8, MSR_UNC_V3_C8_PMON_CTL2, MSR_UNC_V3_C8_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX8C3", PMC50, CBOX8, MSR_UNC_V3_C8_PMON_CTL3, MSR_UNC_V3_C8_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX9C0", PMC51, CBOX9, MSR_UNC_V3_C9_PMON_CTL0, MSR_UNC_V3_C9_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX9C1", PMC52, CBOX9, MSR_UNC_V3_C9_PMON_CTL1, MSR_UNC_V3_C9_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX9C2", PMC53, CBOX9, MSR_UNC_V3_C9_PMON_CTL2, MSR_UNC_V3_C9_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX9C3", PMC54, CBOX9, MSR_UNC_V3_C9_PMON_CTL3, MSR_UNC_V3_C9_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX10C0", PMC55, CBOX10, MSR_UNC_V3_C10_PMON_CTL0, MSR_UNC_V3_C10_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX10C1", PMC56, CBOX10, MSR_UNC_V3_C10_PMON_CTL1, MSR_UNC_V3_C10_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX10C2", PMC57, CBOX10, MSR_UNC_V3_C10_PMON_CTL2, MSR_UNC_V3_C10_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX10C3", PMC58, CBOX10, MSR_UNC_V3_C10_PMON_CTL3, MSR_UNC_V3_C10_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX11C0", PMC59, CBOX11, MSR_UNC_V3_C11_PMON_CTL0, MSR_UNC_V3_C11_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX11C1", PMC60, CBOX11, MSR_UNC_V3_C11_PMON_CTL1, MSR_UNC_V3_C11_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX11C2", PMC61, CBOX11, MSR_UNC_V3_C11_PMON_CTL2, MSR_UNC_V3_C11_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX11C3", PMC62, CBOX11, MSR_UNC_V3_C11_PMON_CTL3, MSR_UNC_V3_C11_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX12C0", PMC63, CBOX12, MSR_UNC_V3_C12_PMON_CTL0, MSR_UNC_V3_C12_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX12C1", PMC64, CBOX12, MSR_UNC_V3_C12_PMON_CTL1, MSR_UNC_V3_C12_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX12C2", PMC65, CBOX12, MSR_UNC_V3_C12_PMON_CTL2, MSR_UNC_V3_C12_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX12C3", PMC66, CBOX12, MSR_UNC_V3_C12_PMON_CTL3, MSR_UNC_V3_C12_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX13C0", PMC67, CBOX13, MSR_UNC_V3_C13_PMON_CTL0, MSR_UNC_V3_C13_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX13C1", PMC68, CBOX13, MSR_UNC_V3_C13_PMON_CTL1, MSR_UNC_V3_C13_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX13C2", PMC69, CBOX13, MSR_UNC_V3_C13_PMON_CTL2, MSR_UNC_V3_C13_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX13C3", PMC70, CBOX13, MSR_UNC_V3_C13_PMON_CTL3, MSR_UNC_V3_C13_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX14C0", PMC71, CBOX14, MSR_UNC_V3_C14_PMON_CTL0, MSR_UNC_V3_C14_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX14C1", PMC72, CBOX14, MSR_UNC_V3_C14_PMON_CTL1, MSR_UNC_V3_C14_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX14C2", PMC73, CBOX14, MSR_UNC_V3_C14_PMON_CTL2, MSR_UNC_V3_C14_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX14C3", PMC74, CBOX14, MSR_UNC_V3_C14_PMON_CTL3, MSR_UNC_V3_C14_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX15C0", PMC75, CBOX15, MSR_UNC_V3_C15_PMON_CTL0, MSR_UNC_V3_C15_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX15C1", PMC76, CBOX15, MSR_UNC_V3_C15_PMON_CTL1, MSR_UNC_V3_C15_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX15C2", PMC77, CBOX15, MSR_UNC_V3_C15_PMON_CTL2, MSR_UNC_V3_C15_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX15C3", PMC78, CBOX15, MSR_UNC_V3_C15_PMON_CTL3, MSR_UNC_V3_C15_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX16C0", PMC79, CBOX16, MSR_UNC_V3_C16_PMON_CTL0, MSR_UNC_V3_C16_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX16C1", PMC80, CBOX16, MSR_UNC_V3_C16_PMON_CTL1, MSR_UNC_V3_C16_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX16C2", PMC81, CBOX16, MSR_UNC_V3_C16_PMON_CTL2, MSR_UNC_V3_C16_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX16C3", PMC82, CBOX16, MSR_UNC_V3_C16_PMON_CTL3, MSR_UNC_V3_C16_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX17C0", PMC83, CBOX17, MSR_UNC_V3_C17_PMON_CTL0, MSR_UNC_V3_C17_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX17C1", PMC84, CBOX17, MSR_UNC_V3_C17_PMON_CTL1, MSR_UNC_V3_C17_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX17C2", PMC85, CBOX17, MSR_UNC_V3_C17_PMON_CTL2, MSR_UNC_V3_C17_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX17C3", PMC86, CBOX17, MSR_UNC_V3_C17_PMON_CTL3, MSR_UNC_V3_C17_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX18C0", PMC87, CBOX18, MSR_UNC_V3_C18_PMON_CTL0, MSR_UNC_V3_C18_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX18C1", PMC88, CBOX18, MSR_UNC_V3_C18_PMON_CTL1, MSR_UNC_V3_C18_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX18C2", PMC89, CBOX18, MSR_UNC_V3_C18_PMON_CTL2, MSR_UNC_V3_C18_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX18C3", PMC90, CBOX18, MSR_UNC_V3_C18_PMON_CTL3, MSR_UNC_V3_C18_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX19C0", PMC91, CBOX19, MSR_UNC_V3_C19_PMON_CTL0, MSR_UNC_V3_C19_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX19C1", PMC92, CBOX19, MSR_UNC_V3_C19_PMON_CTL1, MSR_UNC_V3_C19_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX19C2", PMC93, CBOX19, MSR_UNC_V3_C19_PMON_CTL2, MSR_UNC_V3_C19_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX19C3", PMC94, CBOX19, MSR_UNC_V3_C19_PMON_CTL3, MSR_UNC_V3_C19_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX20C0", PMC95, CBOX20, MSR_UNC_V3_C20_PMON_CTL0, MSR_UNC_V3_C20_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX20C1", PMC96, CBOX20, MSR_UNC_V3_C20_PMON_CTL1, MSR_UNC_V3_C20_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX20C2", PMC97, CBOX20, MSR_UNC_V3_C20_PMON_CTL2, MSR_UNC_V3_C20_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX20C3", PMC98, CBOX20, MSR_UNC_V3_C20_PMON_CTL3, MSR_UNC_V3_C20_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX21C0", PMC99, CBOX21, MSR_UNC_V3_C21_PMON_CTL0, MSR_UNC_V3_C21_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX21C1", PMC100, CBOX21, MSR_UNC_V3_C21_PMON_CTL1, MSR_UNC_V3_C21_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX21C2", PMC101, CBOX21, MSR_UNC_V3_C21_PMON_CTL2, MSR_UNC_V3_C21_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX21C3", PMC102, CBOX21, MSR_UNC_V3_C21_PMON_CTL3, MSR_UNC_V3_C21_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX22C0", PMC103, CBOX22, MSR_UNC_V3_C22_PMON_CTL0, MSR_UNC_V3_C22_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX22C1", PMC104, CBOX22, MSR_UNC_V3_C22_PMON_CTL1, MSR_UNC_V3_C22_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX22C2", PMC105, CBOX22, MSR_UNC_V3_C22_PMON_CTL2, MSR_UNC_V3_C22_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX22C3", PMC106, CBOX22, MSR_UNC_V3_C22_PMON_CTL3, MSR_UNC_V3_C22_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX23C0", PMC107, CBOX23, MSR_UNC_V3_C23_PMON_CTL0, MSR_UNC_V3_C23_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX23C1", PMC108, CBOX23, MSR_UNC_V3_C23_PMON_CTL1, MSR_UNC_V3_C23_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX23C2", PMC109, CBOX23, MSR_UNC_V3_C23_PMON_CTL2, MSR_UNC_V3_C23_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"CBOX23C3", PMC110, CBOX23, MSR_UNC_V3_C23_PMON_CTL3, MSR_UNC_V3_C23_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+ {"WBOX0", PMC111, WBOX, MSR_UNC_V3_PCU_PMON_CTL0, MSR_UNC_V3_PCU_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_WBOX},
+ {"WBOX1", PMC112, WBOX, MSR_UNC_V3_PCU_PMON_CTL1, MSR_UNC_V3_PCU_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_WBOX},
+ {"WBOX2", PMC113, WBOX, MSR_UNC_V3_PCU_PMON_CTL2, MSR_UNC_V3_PCU_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_WBOX},
+ {"WBOX3", PMC114, WBOX, MSR_UNC_V3_PCU_PMON_CTL3, MSR_UNC_V3_PCU_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_WBOX},
+ {"WBOX0FIX", PMC115, WBOX0FIX, 0, MSR_UNC_V3_PCU_CC3_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"WBOX1FIX", PMC116, WBOX0FIX, 0, MSR_UNC_V3_PCU_CC6_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"BBOX0C0", PMC117, BBOX0, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_0, BDW_EP_VALID_OPTIONS_BBOX},
+ {"BBOX0C1", PMC118, BBOX0, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_0, BDW_EP_VALID_OPTIONS_BBOX},
+ {"BBOX0C2", PMC119, BBOX0, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_0, BDW_EP_VALID_OPTIONS_BBOX},
+ {"BBOX0C3", PMC120, BBOX0, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_0, BDW_EP_VALID_OPTIONS_BBOX},
+ {"BBOX1C0", PMC121, BBOX1, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_1, BDW_EP_VALID_OPTIONS_BBOX},
+ {"BBOX1C1", PMC122, BBOX1, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_1, BDW_EP_VALID_OPTIONS_BBOX},
+ {"BBOX1C2", PMC123, BBOX1, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_1, BDW_EP_VALID_OPTIONS_BBOX},
+ {"BBOX1C3", PMC124, BBOX1, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_1, BDW_EP_VALID_OPTIONS_BBOX},
+ {"MBOX0C0", PMC125, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_0, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX0C1", PMC126, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_0, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX0C2", PMC127, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_0, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX0FIX", PMC128, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_INVERT_MASK},
+ {"MBOX0C3", PMC129, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_0, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX1C0", PMC130, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_1, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX1C1", PMC131, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_1, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX1C2", PMC132, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_1, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX1C3", PMC133, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_1, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX1FIX", PMC134, MBOX1FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_1, EVENT_OPTION_INVERT_MASK},
+ {"MBOX2C0", PMC135, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_2, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX2C1", PMC136, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_2, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX2C2", PMC137, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_2, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX2C3", PMC138, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_2, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX2FIX", PMC139, MBOX2FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_2, EVENT_OPTION_INVERT_MASK},
+ {"MBOX3C0", PMC140, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_3, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX3C1", PMC141, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_3, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX3C2", PMC142, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_3, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX3C3", PMC143, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_3, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX3FIX", PMC144, MBOX3FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_3, EVENT_OPTION_INVERT_MASK},
+ {"MBOX4C0", PMC145, MBOX4, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_0, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX4C1", PMC146, MBOX4, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_0, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX4C2", PMC147, MBOX4, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_0, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX4C3", PMC148, MBOX4, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_0, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX4FIX", PMC149, MBOX4FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_0, EVENT_OPTION_INVERT_MASK},
+ {"MBOX5C0", PMC150, MBOX5, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_1, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX5C1", PMC151, MBOX5, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_1, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX5C2", PMC152, MBOX5, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_1, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX5C3", PMC153, MBOX5, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_1, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX5FIX", PMC154, MBOX5FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_1, EVENT_OPTION_INVERT_MASK},
+ {"MBOX6C0", PMC155, MBOX6, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_2, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX6C1", PMC156, MBOX6, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_2, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX6C2", PMC157, MBOX6, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_2, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX6C3", PMC158, MBOX6, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_2, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX6FIX", PMC159, MBOX6FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_2, EVENT_OPTION_INVERT_MASK},
+ {"MBOX7C0", PMC160, MBOX7, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_3, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX7C1", PMC161, MBOX7, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_3, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX7C2", PMC162, MBOX7, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_3, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX7C3", PMC163, MBOX7, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_3, BDW_EP_VALID_OPTIONS_MBOX},
+ {"MBOX7FIX", PMC164, MBOX7FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_3, EVENT_OPTION_INVERT_MASK},
+ {"IBOX0C0", PMC165, IBOX0, PCI_UNC_IRP0_PMON_CTL_0, PCI_UNC_IRP0_PMON_CTR_0, 0, PCI_IRP_DEVICE, BDW_EP_VALID_OPTIONS_IBOX},
+ {"IBOX0C1", PMC166, IBOX0, PCI_UNC_IRP0_PMON_CTL_1, PCI_UNC_IRP0_PMON_CTR_1, 0, PCI_IRP_DEVICE, BDW_EP_VALID_OPTIONS_IBOX},
+ {"IBOX1C0", PMC167, IBOX1, PCI_UNC_IRP1_PMON_CTL_0, PCI_UNC_IRP1_PMON_CTR_0, 0, PCI_IRP_DEVICE, BDW_EP_VALID_OPTIONS_IBOX},
+ {"IBOX1C1", PMC168, IBOX1, PCI_UNC_IRP1_PMON_CTL_1, PCI_UNC_IRP1_PMON_CTR_1, 0, PCI_IRP_DEVICE, BDW_EP_VALID_OPTIONS_IBOX},
+ {"PBOX0", PMC169, PBOX, PCI_UNC_R2PCIE_PMON_CTL_0, PCI_UNC_R2PCIE_PMON_CTR_0_A, PCI_UNC_R2PCIE_PMON_CTR_0_B, PCI_R2PCIE_DEVICE, BDW_EP_VALID_OPTIONS_PBOX},
+ {"PBOX1", PMC170, PBOX, PCI_UNC_R2PCIE_PMON_CTL_1, PCI_UNC_R2PCIE_PMON_CTR_1_A, PCI_UNC_R2PCIE_PMON_CTR_1_B, PCI_R2PCIE_DEVICE, BDW_EP_VALID_OPTIONS_PBOX},
+ {"PBOX2", PMC171, PBOX, PCI_UNC_R2PCIE_PMON_CTL_2, PCI_UNC_R2PCIE_PMON_CTR_2_A, PCI_UNC_R2PCIE_PMON_CTR_2_B, PCI_R2PCIE_DEVICE, BDW_EP_VALID_OPTIONS_PBOX},
+ {"PBOX3", PMC172, PBOX, PCI_UNC_R2PCIE_PMON_CTL_3, PCI_UNC_R2PCIE_PMON_CTR_3_A, PCI_UNC_R2PCIE_PMON_CTR_3_B, PCI_R2PCIE_DEVICE, BDW_EP_VALID_OPTIONS_PBOX},
+ {"RBOX0C0", PMC173, RBOX0, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_0, BDW_EP_VALID_OPTIONS_RBOX},
+ {"RBOX0C1", PMC174, RBOX0, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_0, BDW_EP_VALID_OPTIONS_RBOX},
+ {"RBOX0C2", PMC175, RBOX0, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_0, BDW_EP_VALID_OPTIONS_RBOX},
+ {"RBOX1C0", PMC176, RBOX1, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_1, BDW_EP_VALID_OPTIONS_RBOX},
+ {"RBOX1C1", PMC177, RBOX1, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_1, BDW_EP_VALID_OPTIONS_RBOX},
+ {"RBOX1C2", PMC178, RBOX1, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_1, BDW_EP_VALID_OPTIONS_RBOX},
+ {"SBOX0C0", PMC179, SBOX0, MSR_UNC_V3_S0_PMON_CTL_0, MSR_UNC_V3_S0_PMON_CTR_0, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+ {"SBOX0C1", PMC180, SBOX0, MSR_UNC_V3_S0_PMON_CTL_1, MSR_UNC_V3_S0_PMON_CTR_1, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+ {"SBOX0C2", PMC181, SBOX0, MSR_UNC_V3_S0_PMON_CTL_2, MSR_UNC_V3_S0_PMON_CTR_2, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+ {"SBOX0C3", PMC182, SBOX0, MSR_UNC_V3_S0_PMON_CTL_3, MSR_UNC_V3_S0_PMON_CTR_3, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+ {"SBOX1C0", PMC183, SBOX1, MSR_UNC_V3_S1_PMON_CTL_0, MSR_UNC_V3_S1_PMON_CTR_0, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+ {"SBOX1C1", PMC184, SBOX1, MSR_UNC_V3_S1_PMON_CTL_1, MSR_UNC_V3_S1_PMON_CTR_1, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+ {"SBOX1C2", PMC185, SBOX1, MSR_UNC_V3_S1_PMON_CTL_2, MSR_UNC_V3_S1_PMON_CTR_2, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+ {"SBOX1C3", PMC186, SBOX1, MSR_UNC_V3_S1_PMON_CTL_3, MSR_UNC_V3_S1_PMON_CTR_3, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+ {"SBOX2C0", PMC187, SBOX2, MSR_UNC_V3_S2_PMON_CTL_0, MSR_UNC_V3_S2_PMON_CTR_0, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+ {"SBOX2C1", PMC188, SBOX2, MSR_UNC_V3_S2_PMON_CTL_1, MSR_UNC_V3_S2_PMON_CTR_1, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+ {"SBOX2C2", PMC189, SBOX2, MSR_UNC_V3_S2_PMON_CTL_2, MSR_UNC_V3_S2_PMON_CTR_2, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+ {"SBOX2C3", PMC190, SBOX2, MSR_UNC_V3_S2_PMON_CTL_3, MSR_UNC_V3_S2_PMON_CTR_3, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+ {"SBOX3C0", PMC191, SBOX3, MSR_UNC_V3_S3_PMON_CTL_0, MSR_UNC_V3_S3_PMON_CTR_0, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+ {"SBOX3C1", PMC192, SBOX3, MSR_UNC_V3_S3_PMON_CTL_1, MSR_UNC_V3_S3_PMON_CTR_1, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+ {"SBOX3C2", PMC193, SBOX3, MSR_UNC_V3_S3_PMON_CTL_2, MSR_UNC_V3_S3_PMON_CTR_2, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+ {"SBOX3C3", PMC194, SBOX3, MSR_UNC_V3_S3_PMON_CTL_3, MSR_UNC_V3_S3_PMON_CTR_3, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+ {"QBOX0C0", PMC195, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_0, PCI_UNC_V3_QPI_PMON_CTR_0_A, PCI_UNC_V3_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_0, BDW_EP_VALID_OPTIONS_QBOX},
+ {"QBOX0C1", PMC196, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_1, PCI_UNC_V3_QPI_PMON_CTR_1_A, PCI_UNC_V3_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_0, BDW_EP_VALID_OPTIONS_QBOX},
+ {"QBOX0C2", PMC197, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_2, PCI_UNC_V3_QPI_PMON_CTR_2_A, PCI_UNC_V3_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_0, BDW_EP_VALID_OPTIONS_QBOX},
+ {"QBOX0C3", PMC198, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_3, PCI_UNC_V3_QPI_PMON_CTR_3_A, PCI_UNC_V3_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_0, BDW_EP_VALID_OPTIONS_QBOX},
+ {"QBOX1C0", PMC199, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_0, PCI_UNC_V3_QPI_PMON_CTR_0_A, PCI_UNC_V3_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_1, BDW_EP_VALID_OPTIONS_QBOX},
+ {"QBOX1C1", PMC200, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_1, PCI_UNC_V3_QPI_PMON_CTR_1_A, PCI_UNC_V3_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_1, BDW_EP_VALID_OPTIONS_QBOX},
+ {"QBOX1C2", PMC201, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_2, PCI_UNC_V3_QPI_PMON_CTR_2_A, PCI_UNC_V3_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_1, BDW_EP_VALID_OPTIONS_QBOX},
+ {"QBOX1C3", PMC202, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_3, PCI_UNC_V3_QPI_PMON_CTR_3_A, PCI_UNC_V3_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_1, BDW_EP_VALID_OPTIONS_QBOX},
+ {"QBOX2C0", PMC203, QBOX2, PCI_UNC_V3_QPI_PMON_CTL_0, PCI_UNC_V3_QPI_PMON_CTR_0_A, PCI_UNC_V3_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_2, BDW_EP_VALID_OPTIONS_QBOX},
+ {"QBOX2C1", PMC204, QBOX2, PCI_UNC_V3_QPI_PMON_CTL_1, PCI_UNC_V3_QPI_PMON_CTR_1_A, PCI_UNC_V3_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_2, BDW_EP_VALID_OPTIONS_QBOX},
+ {"QBOX2C2", PMC205, QBOX2, PCI_UNC_V3_QPI_PMON_CTL_2, PCI_UNC_V3_QPI_PMON_CTR_2_A, PCI_UNC_V3_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_2, BDW_EP_VALID_OPTIONS_QBOX},
+ {"QBOX2C3", PMC206, QBOX2, PCI_UNC_V3_QPI_PMON_CTL_3, PCI_UNC_V3_QPI_PMON_CTR_3_A, PCI_UNC_V3_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_2, BDW_EP_VALID_OPTIONS_QBOX},
+ {"QBOX0FIX0", PMC207, QBOX0FIX, 0x0, PCI_UNC_V3_QPI_RATE_STATUS, 0x0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+ {"QBOX0FIX1", PMC208, QBOX0FIX, 0x0, PCI_UNC_V3_QPI_LINK_IDLE, 0x0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+ {"QBOX0FIX2", PMC209, QBOX0FIX, 0x0, PCI_UNC_V3_QPI_LINK_LLR, 0x0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+ {"QBOX1FIX0", PMC210, QBOX1FIX, 0x0, PCI_UNC_V3_QPI_RATE_STATUS, 0x0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+ {"QBOX1FIX1", PMC211, QBOX1FIX, 0x0, PCI_UNC_V3_QPI_LINK_IDLE, 0x0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+ {"QBOX1FIX2", PMC212, QBOX1FIX, 0x0, PCI_UNC_V3_QPI_LINK_LLR, 0x0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+ {"QBOX2FIX0", PMC213, QBOX2FIX, 0x0, PCI_UNC_V3_QPI_RATE_STATUS, 0x0, PCI_QPI_MISC_DEVICE_PORT_2, EVENT_OPTION_NONE_MASK},
+ {"QBOX2FIX1", PMC214, QBOX2FIX, 0x0, PCI_UNC_V3_QPI_LINK_IDLE, 0x0, PCI_QPI_MISC_DEVICE_PORT_2, EVENT_OPTION_NONE_MASK},
+ {"QBOX2FIX2", PMC215, QBOX2FIX, 0x0, PCI_UNC_V3_QPI_LINK_LLR, 0x0, PCI_QPI_MISC_DEVICE_PORT_2, EVENT_OPTION_NONE_MASK},
+};
+
+static BoxMap broadwellEP_box_map[NUM_UNITS] = {
+ [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+ [THERMAL] = {0, 0, 0, 0, 0, 0, 8},
+ [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+ [POWER] = {0, 0, 0, 0, 0, 0, 32},
+ [UBOX] = {0, MSR_UNC_V3_U_PMON_BOX_STATUS, MSR_UNC_V3_U_PMON_BOX_STATUS, 1, 0, 0, 48},
+ [UBOXFIX] = {0, MSR_UNC_V3_U_PMON_BOX_STATUS, MSR_UNC_V3_U_PMON_BOX_STATUS, 0, 0, 0, 48},
+ [CBOX0] = {MSR_UNC_V3_C0_PMON_BOX_CTL, MSR_UNC_V3_C0_PMON_BOX_STATUS, MSR_UNC_V3_C0_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C0_PMON_BOX_FILTER0, MSR_UNC_V3_C0_PMON_BOX_FILTER1},
+ [CBOX1] = {MSR_UNC_V3_C1_PMON_BOX_CTL, MSR_UNC_V3_C1_PMON_BOX_STATUS, MSR_UNC_V3_C1_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C1_PMON_BOX_FILTER0, MSR_UNC_V3_C1_PMON_BOX_FILTER1},
+ [CBOX2] = {MSR_UNC_V3_C2_PMON_BOX_CTL, MSR_UNC_V3_C2_PMON_BOX_STATUS, MSR_UNC_V3_C2_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C2_PMON_BOX_FILTER0, MSR_UNC_V3_C2_PMON_BOX_FILTER1},
+ [CBOX3] = {MSR_UNC_V3_C3_PMON_BOX_CTL, MSR_UNC_V3_C3_PMON_BOX_STATUS, MSR_UNC_V3_C3_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C3_PMON_BOX_FILTER0, MSR_UNC_V3_C3_PMON_BOX_FILTER1},
+ [CBOX4] = {MSR_UNC_V3_C4_PMON_BOX_CTL, MSR_UNC_V3_C4_PMON_BOX_STATUS, MSR_UNC_V3_C4_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C4_PMON_BOX_FILTER0, MSR_UNC_V3_C4_PMON_BOX_FILTER1},
+ [CBOX5] = {MSR_UNC_V3_C5_PMON_BOX_CTL, MSR_UNC_V3_C5_PMON_BOX_STATUS, MSR_UNC_V3_C5_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C5_PMON_BOX_FILTER0, MSR_UNC_V3_C5_PMON_BOX_FILTER1},
+ [CBOX6] = {MSR_UNC_V3_C6_PMON_BOX_CTL, MSR_UNC_V3_C6_PMON_BOX_STATUS, MSR_UNC_V3_C6_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C6_PMON_BOX_FILTER0, MSR_UNC_V3_C6_PMON_BOX_FILTER1},
+ [CBOX7] = {MSR_UNC_V3_C7_PMON_BOX_CTL, MSR_UNC_V3_C7_PMON_BOX_STATUS, MSR_UNC_V3_C7_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C7_PMON_BOX_FILTER0, MSR_UNC_V3_C7_PMON_BOX_FILTER1},
+ [CBOX8] = {MSR_UNC_V3_C8_PMON_BOX_CTL, MSR_UNC_V3_C8_PMON_BOX_STATUS, MSR_UNC_V3_C8_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C8_PMON_BOX_FILTER0, MSR_UNC_V3_C8_PMON_BOX_FILTER1},
+ [CBOX9] = {MSR_UNC_V3_C9_PMON_BOX_CTL, MSR_UNC_V3_C9_PMON_BOX_STATUS, MSR_UNC_V3_C9_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C9_PMON_BOX_FILTER0, MSR_UNC_V3_C9_PMON_BOX_FILTER1},
+ [CBOX10] = {MSR_UNC_V3_C10_PMON_BOX_CTL, MSR_UNC_V3_C10_PMON_BOX_STATUS, MSR_UNC_V3_C10_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C10_PMON_BOX_FILTER0, MSR_UNC_V3_C10_PMON_BOX_FILTER1},
+ [CBOX11] = {MSR_UNC_V3_C11_PMON_BOX_CTL, MSR_UNC_V3_C11_PMON_BOX_STATUS, MSR_UNC_V3_C11_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C11_PMON_BOX_FILTER0, MSR_UNC_V3_C11_PMON_BOX_FILTER1},
+ [CBOX12] = {MSR_UNC_V3_C12_PMON_BOX_CTL, MSR_UNC_V3_C12_PMON_BOX_STATUS, MSR_UNC_V3_C12_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C12_PMON_BOX_FILTER0, MSR_UNC_V3_C12_PMON_BOX_FILTER1},
+ [CBOX13] = {MSR_UNC_V3_C13_PMON_BOX_CTL, MSR_UNC_V3_C13_PMON_BOX_STATUS, MSR_UNC_V3_C13_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C13_PMON_BOX_FILTER0, MSR_UNC_V3_C13_PMON_BOX_FILTER1},
+ [CBOX14] = {MSR_UNC_V3_C14_PMON_BOX_CTL, MSR_UNC_V3_C14_PMON_BOX_STATUS, MSR_UNC_V3_C14_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C14_PMON_BOX_FILTER0, MSR_UNC_V3_C14_PMON_BOX_FILTER1},
+ [CBOX15] = {MSR_UNC_V3_C15_PMON_BOX_CTL, MSR_UNC_V3_C15_PMON_BOX_STATUS, MSR_UNC_V3_C15_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C15_PMON_BOX_FILTER0, MSR_UNC_V3_C15_PMON_BOX_FILTER1},
+ [CBOX16] = {MSR_UNC_V3_C16_PMON_BOX_CTL, MSR_UNC_V3_C16_PMON_BOX_STATUS, MSR_UNC_V3_C16_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C16_PMON_BOX_FILTER0, MSR_UNC_V3_C16_PMON_BOX_FILTER1},
+ [CBOX17] = {MSR_UNC_V3_C17_PMON_BOX_CTL, MSR_UNC_V3_C17_PMON_BOX_STATUS, MSR_UNC_V3_C17_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C17_PMON_BOX_FILTER0, MSR_UNC_V3_C17_PMON_BOX_FILTER1},
+ [CBOX18] = {MSR_UNC_V3_C18_PMON_BOX_CTL, MSR_UNC_V3_C18_PMON_BOX_STATUS, MSR_UNC_V3_C18_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C18_PMON_BOX_FILTER0, MSR_UNC_V3_C18_PMON_BOX_FILTER1},
+ [CBOX19] = {MSR_UNC_V3_C19_PMON_BOX_CTL, MSR_UNC_V3_C19_PMON_BOX_STATUS, MSR_UNC_V3_C19_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C19_PMON_BOX_FILTER0, MSR_UNC_V3_C19_PMON_BOX_FILTER1},
+ [CBOX20] = {MSR_UNC_V3_C20_PMON_BOX_CTL, MSR_UNC_V3_C20_PMON_BOX_STATUS, MSR_UNC_V3_C20_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C20_PMON_BOX_FILTER0, MSR_UNC_V3_C20_PMON_BOX_FILTER1},
+ [CBOX21] = {MSR_UNC_V3_C21_PMON_BOX_CTL, MSR_UNC_V3_C21_PMON_BOX_STATUS, MSR_UNC_V3_C21_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C21_PMON_BOX_FILTER0, MSR_UNC_V3_C21_PMON_BOX_FILTER1},
+ [CBOX22] = {MSR_UNC_V3_C22_PMON_BOX_CTL, MSR_UNC_V3_C22_PMON_BOX_STATUS, MSR_UNC_V3_C22_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C22_PMON_BOX_FILTER0, MSR_UNC_V3_C22_PMON_BOX_FILTER1},
+ [CBOX23] = {MSR_UNC_V3_C23_PMON_BOX_CTL, MSR_UNC_V3_C23_PMON_BOX_STATUS, MSR_UNC_V3_C23_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C23_PMON_BOX_FILTER0, MSR_UNC_V3_C23_PMON_BOX_FILTER1},
+ [WBOX] = {MSR_UNC_V3_PCU_PMON_BOX_CTL, MSR_UNC_V3_PCU_PMON_BOX_STATUS, MSR_UNC_V3_PCU_PMON_BOX_STATUS, 2, 0, 0, 48, MSR_UNC_V3_PCU_PMON_BOX_FILTER},
+ [WBOX0FIX] = {0, 0, 0, -1, 0, 0, 64},
+ [BBOX0] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 21, 1, PCI_HA_DEVICE_0, 48},
+ [BBOX1] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 22, 1, PCI_HA_DEVICE_1, 48},
+ [MBOX0] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+ [MBOX1] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+ [MBOX2] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+ [MBOX3] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+ [MBOX4] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_0, 48},
+ [MBOX5] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_1, 48},
+ [MBOX6] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_2, 48},
+ [MBOX7] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_3, 48},
+ [IBOX0] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, 34, 1, PCI_IRP_DEVICE, 48},
+ [IBOX1] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, 34, 1, PCI_IRP_DEVICE, 48},
+ [PBOX] = {PCI_UNC_R2PCIE_PMON_BOX_CTL, PCI_UNC_R2PCIE_PMON_BOX_STATUS, PCI_UNC_R2PCIE_PMON_BOX_STATUS, 29, 1, PCI_R2PCIE_DEVICE, 48},
+ [RBOX0] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 27, 1, PCI_R3QPI_DEVICE_LINK_0, 44},
+ [RBOX1] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 28, 1, PCI_R3QPI_DEVICE_LINK_1, 44},
+ [SBOX0] = {MSR_UNC_V3_S0_PMON_BOX_CTL, MSR_UNC_V3_S0_PMON_BOX_STATUS, MSR_UNC_V3_S0_PMON_BOX_STATUS, -1, 0, 0, 48},
+ [SBOX1] = {MSR_UNC_V3_S1_PMON_BOX_CTL, MSR_UNC_V3_S1_PMON_BOX_STATUS, MSR_UNC_V3_S1_PMON_BOX_STATUS, -1, 0, 0, 48},
+ [SBOX2] = {MSR_UNC_V3_S2_PMON_BOX_CTL, MSR_UNC_V3_S2_PMON_BOX_STATUS, MSR_UNC_V3_S2_PMON_BOX_STATUS, -1, 0, 0, 48},
+ [SBOX3] = {MSR_UNC_V3_S3_PMON_BOX_CTL, MSR_UNC_V3_S3_PMON_BOX_STATUS, MSR_UNC_V3_S3_PMON_BOX_STATUS, -1, 0, 0, 48},
+ [QBOX0] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, 25, 1, PCI_QPI_DEVICE_PORT_0, 48},
+ [QBOX1] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, 26, 1, PCI_QPI_DEVICE_PORT_1, 48},
+ [QBOX2] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, -1, 1, PCI_QPI_DEVICE_PORT_2, 48},
+ [QBOX0FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_0, 32},
+ [QBOX1FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_1, 32},
+};
+
+static PciDevice broadwellEP_pci_devices[MAX_NUM_PCI_DEVICES] = {
+ [MSR_DEV] = {NODEVTYPE, "", "MSR", ""},
+ [PCI_HA_DEVICE_0] = {HA, "12.1", "PCI_HA_DEVICE_0", "BBOX0", 0x6F30},
+ [PCI_HA_DEVICE_1] = {HA, "12.5", "PCI_HA_DEVICE_1", "BBOX1", 0x6F38},
+ [PCI_IMC_DEVICE_0_CH_0] = {IMC, "14.0", "PCI_IMC_DEVICE_0_CH_0", "MBOX0", 0x6FB4},
+ [PCI_IMC_DEVICE_0_CH_1] = {IMC, "14.1", "PCI_IMC_DEVICE_0_CH_1", "MBOX1", 0x6FB5},
+ [PCI_IMC_DEVICE_0_CH_2] = {IMC, "15.0", "PCI_IMC_DEVICE_0_CH_2", "MBOX2", 0x6FB0},
+ [PCI_IMC_DEVICE_0_CH_3] = {IMC, "15.1", "PCI_IMC_DEVICE_0_CH_3", "MBOX3", 0x6FB1},
+ [PCI_IMC_DEVICE_1_CH_0] = {IMC, "17.0", "PCI_IMC_DEVICE_1_CH_0", "MBOX4", 0x6FD4},
+ [PCI_IMC_DEVICE_1_CH_1] = {IMC, "17.1", "PCI_IMC_DEVICE_1_CH_1", "MBOX5", 0x6FD5},
+ [PCI_IMC_DEVICE_1_CH_2] = {IMC, "18.0", "PCI_IMC_DEVICE_1_CH_2", "MBOX6", 0x6FD0},
+ [PCI_IMC_DEVICE_1_CH_3] = {IMC, "18.1", "PCI_IMC_DEVICE_1_CH_3", "MBOX7", 0x6FD1},
+ [PCI_IRP_DEVICE] = {IRP, "05.6", "PCI_IRP_DEVICE", "IBOX0", 0x6F39},
+ [PCI_R2PCIE_DEVICE] = {R2PCIE, "10.1", "PCI_R2PCIE_DEVICE", "PBOX0", 0x6F34},
+ [PCI_QPI_DEVICE_PORT_0] = {QPI, "08.2", "PCI_QPI_DEVICE_PORT_0", "QBOX0", 0x6F32},
+ [PCI_QPI_DEVICE_PORT_1] = {QPI, "09.2", "PCI_QPI_DEVICE_PORT_1", "QBOX1", 0x6F33},
+ [PCI_QPI_DEVICE_PORT_2] = {QPI, "0a.2", "PCI_QPI_DEVICE_PORT_2", "QBOX2", 0x6F3A},
+ [PCI_QPI_MASK_DEVICE_PORT_0] = {QPI, "08.6", "PCI_QPI_MASK_DEVICE_PORT_0", NULL, 0x6F86},
+ [PCI_QPI_MASK_DEVICE_PORT_1] = {QPI, "09.6", "PCI_QPI_MASK_DEVICE_PORT_1", NULL, 0x6F96},
+ [PCI_QPI_MASK_DEVICE_PORT_2] = {QPI, "0a.6", "PCI_QPI_MASK_DEVICE_PORT_2", NULL, 0x6F46},
+ [PCI_QPI_MISC_DEVICE_PORT_0] = {QPI, "08.0", "PCI_QPI_MISC_DEVICE_PORT_0", "QBOX0FIX", 0x6F80},
+ [PCI_QPI_MISC_DEVICE_PORT_1] = {QPI, "08.0", "PCI_QPI_MISC_DEVICE_PORT_1", "QBOX1FIX", 0x6F80},
+ [PCI_QPI_MISC_DEVICE_PORT_2] = {QPI, "0a.0", "PCI_QPI_MISC_DEVICE_PORT_2", "QBOX2FIX", 0x6F40},
+ [PCI_R3QPI_DEVICE_LINK_0] = {R3QPI, "0b.1", "PCI_R3QPI_DEVICE_LINK_0", "RBOX0", 0x6F36},
+ [PCI_R3QPI_DEVICE_LINK_1] = {R3QPI, "0b.2", "PCI_R3QPI_DEVICE_LINK_1", "RBOX1", 0x6F37},
+ [PCI_R3QPI_DEVICE_LINK_2] = {R3QPI, "0b.5", "PCI_R3QPI_DEVICE_LINK_2", "RBOX1", 0x6F3E},
+};
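
For orientation before the next file: each row of the counter map above pairs a counter name (e.g. "CBOX0C0") with a logical index, the unit it belongs to, its control (CTL) and counter (CTR) registers, an optional second counter register, a PCI device handle (0 for pure MSR counters) and a mask of valid event options; the box map and PCI device table then supply the per-unit control/status registers, what are presumably the counter bit widths (48, 44, 32, ...) and the PCI bus locations. The sketch below shows, very roughly, how such a table can be searched when an event string names a counter. The struct and function names (bdw_counter_entry, find_counter) and the two sample rows are illustrative stand-ins, not likwid's actual types or data.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Illustrative mirror of the eight columns in the table above; the
     * real likwid register map type may differ in naming and layout. */
    typedef struct {
        const char *key;         /* counter name, e.g. "CBOX0C0"          */
        int         index;       /* logical counter index                 */
        int         unit;        /* owning unit (CBOX0, MBOX3, ...)       */
        uint64_t    configReg;   /* control register (MSR or PCI offset)  */
        uint64_t    counterReg;  /* counter register                      */
        uint64_t    counterReg2; /* second counter register, 0 if unused  */
        int         device;      /* PCI device handle, 0 for MSR counters */
        uint64_t    optionMask;  /* valid EVENT_OPTION_* bits             */
    } bdw_counter_entry;

    /* Two made-up rows standing in for the real table. */
    static const bdw_counter_entry demo_map[] = {
        { "UBOX0",   0, 1, 0x705, 0x709, 0, 0, 0 },
        { "CBOX0C0", 1, 2, 0xE01, 0xE08, 0, 0, 0 },
    };

    /* Linear lookup by name, roughly what a perfmon backend has to do
     * when it resolves an event string such as "LLC_LOOKUP_ANY:CBOX0C0". */
    static const bdw_counter_entry *find_counter(const char *name)
    {
        for (size_t i = 0; i < sizeof(demo_map) / sizeof(demo_map[0]); i++)
            if (strcmp(demo_map[i].key, name) == 0)
                return &demo_map[i];
        return NULL;
    }

    int main(void)
    {
        const bdw_counter_entry *c = find_counter("CBOX0C0");
        if (c)
            printf("%s -> ctl 0x%llx, ctr 0x%llx\n", c->key,
                   (unsigned long long)c->configReg,
                   (unsigned long long)c->counterReg);
        return 0;
    }

In the real code the owning unit's entry in broadwellEP_box_map above supplies the unit-level box control, status and filter registers that go with the selected counter.
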
diff --git a/src/includes/perfmon_broadwellEP_events.txt b/src/includes/perfmon_broadwellEP_events.txt
new file mode 100644
index 0000000..0781ebe
--- /dev/null
+++ b/src/includes/perfmon_broadwellEP_events.txt
@@ -0,0 +1,2569 @@
+# =======================================================================================
+#
+# Filename: perfmon_broadwellEP_events.txt
+#
+# Description: Event list for Intel Broadwell EP/EN/EX.
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+# Project: likwid
+#
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, either version 3 of the License, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
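
The event list that follows is line oriented: an EVENT_<NAME> line carries the event code and the counter register(s) the event may be programmed on, each subsequent UMASK_<NAME> line defines one derived event via its umask (the OFFCORE_RESPONSE umasks additionally carry two match fields), and optional DEFAULT_OPTIONS_<NAME> / OPTIONS_<NAME> lines attach default or permitted options. A rough reader for this layout is sketched below; it is only an illustration of the format, not likwid's actual event-file parser, and it ignores the extra match fields and the OPTIONS_ lines.

    #include <stdio.h>
    #include <string.h>

    /* Toy reader for the EVENT_/UMASK_/DEFAULT_OPTIONS_ layout of this
     * file (operating on the installed events file, not on this diff). */
    static void read_events(FILE *fp)
    {
        char line[512], name[256], a[128], b[128];
        while (fgets(line, sizeof(line), fp)) {
            if (line[0] == '#' || line[0] == '\n' || line[0] == '\r')
                continue;                              /* comment / blank */
            if (sscanf(line, "EVENT_%255s %127s %127s", name, a, b) == 3)
                printf("event %s: code %s, counters %s\n", name, a, b);
            else if (sscanf(line, "DEFAULT_OPTIONS_%255s %127s", name, a) == 2)
                printf("  defaults for %s: %s\n", name, a);
            else if (sscanf(line, "UMASK_%255s %127s", name, a) == 2)
                printf("  umask %s = %s\n", name, a);
        }
    }

    int main(int argc, char **argv)
    {
        FILE *fp = argc > 1 ? fopen(argv[1], "r") : NULL;
        if (!fp) {
            fprintf(stderr, "usage: %s perfmon_broadwellEP_events.txt\n", argv[0]);
            return 1;
        }
        read_events(fp);
        fclose(fp);
        return 0;
    }

Run against the installed copy of this file, such a reader prints one line per event and one per umask, which is handy for eyeballing the definitions below.
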
+EVENT_TEMP_CORE 0x00 TMP0
+UMASK_TEMP_CORE 0x00
+
+EVENT_PWR_PKG_ENERGY 0x00 PWR0
+UMASK_PWR_PKG_ENERGY 0x00
+
+EVENT_PWR_PP0_ENERGY 0x00 PWR1
+UMASK_PWR_PP0_ENERGY 0x00
+
+EVENT_PWR_PP1_ENERGY 0x00 PWR2
+UMASK_PWR_PP1_ENERGY 0x00
+
+EVENT_PWR_DRAM_ENERGY 0x00 PWR3
+UMASK_PWR_DRAM_ENERGY 0x00
+
+EVENT_INSTR_RETIRED 0x00 FIXC0
+UMASK_INSTR_RETIRED_ANY 0x00
+
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE 0x00
+
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC2
+UMASK_CPU_CLK_UNHALTED_REF 0x00
+
+EVENT_LD_BLOCKS 0x03 PMC
+UMASK_LD_BLOCKS_STORE_FORWARD 0x02
+UMASK_LD_BLOCKS_NO_SR 0x08
+
+EVENT_MISALIGN_MEM_REF 0x05 PMC
+UMASK_MISALIGN_MEM_REF_LOADS 0x01
+UMASK_MISALIGN_MEM_REF_STORES 0x02
+UMASK_MISALIGN_MEM_REF_ANY 0x03
+
+EVENT_LD_BLOCKS_PARTIAL 0x07 PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS 0x01
+
+EVENT_DTLB_LOAD_MISSES 0x08 PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK 0x01
+UMASK_DTLB_LOAD_MISSES_STLB_HIT 0x60
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED 0x0E
+UMASK_DTLB_LOAD_MISSES_STLB_HIT_4K 0x20
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_4K 0x02
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION 0x10
+
+EVENT_INT_MISC 0x0D PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RECOVERY_CYCLES 0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RECOVERY_COUNT 0x03
+DEFAULT_OPTIONS_INT_MISC_RAT_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RAT_STALL_CYCLES 0x08
+DEFAULT_OPTIONS_INT_MISC_RAT_STALL_COUNT EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RAT_STALL_COUNT 0x08
+
+EVENT_UOPS_ISSUED 0x0E PMC
+UMASK_UOPS_ISSUED_ANY 0x01
+UMASK_UOPS_ISSUED_FLAGS_MERGE 0x10
+UMASK_UOPS_ISSUED_SLOW_LEA 0x20
+UMASK_UOPS_ISSUED_SINGLE_MUL 0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ANY 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_FLAGS_MERGE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_FLAGS_MERGE 0x10
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SLOW_LEA EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SLOW_LEA 0x20
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SINGLE_MUL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SINGLE_MUL 0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_ARITH_FPU_DIV_ACTIVE 0x14 PMC
+UMASK_ARITH_FPU_DIV_ACTIVE 0x01
+
+EVENT_L2_RQSTS 0x24 PMC
+UMASK_L2_RQSTS_DEMAND_DATA_RD_MISS 0x21
+UMASK_L2_RQSTS_DEMAND_DATA_RD_HIT 0x41
+UMASK_L2_RQSTS_RFO_MISS 0x22
+UMASK_L2_RQSTS_RFO_HIT 0x42
+UMASK_L2_RQSTS_CODE_RD_MISS 0x24
+UMASK_L2_RQSTS_CODE_RD_HIT 0x44
+UMASK_L2_RQSTS_L2_PF_HIT 0x50
+UMASK_L2_RQSTS_L2_PF_MISS 0x30
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD 0xE1
+UMASK_L2_RQSTS_ALL_DEMAND_MISS 0x27
+UMASK_L2_RQSTS_ALL_RFO 0xE2
+UMASK_L2_RQSTS_ALL_CODE_RD 0xE4
+UMASK_L2_RQSTS_ALL_DEMAND_REFERENCES 0xE7
+UMASK_L2_RQSTS_ALL_PF 0xF8
+UMASK_L2_RQSTS_MISS 0x3F
+UMASK_L2_RQSTS_REFERENCES 0xFF
+
+EVENT_L2_DEMAND_RQST_WB_HIT 0x27 PMC
+UMASK_L2_DEMAND_RQST_WB_HIT 0x50
+
+EVENT_LONGEST_LAT_CACHE 0x2E PMC
+UMASK_LONGEST_LAT_CACHE_REFERENCE 0x4F
+UMASK_LONGEST_LAT_CACHE_MISS 0x41
+
+EVENT_CPU_CLOCK_UNHALTED 0x3C PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P 0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK 0x01
+UMASK_CPU_CLOCK_UNHALTED_ONE_THREAD_ACTIVE 0x02
+
+EVENT_L1D_PEND_MISS 0x48 PMC2
+UMASK_L1D_PEND_MISS_PENDING 0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_L1D_PEND_MISS_PENDING_CYCLES 0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_OCCURRENCES EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_L1D_PEND_MISS_OCCURRENCES 0x01
+
+EVENT_DTLB_STORE_MISSES 0x49 PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK 0x01
+UMASK_DTLB_STORE_MISSES_STLB_HIT 0x60
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED 0x0E
+UMASK_DTLB_STORE_MISSES_STLB_HIT_4K 0x20
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_4K 0x02
+UMASK_DTLB_STORE_MISSES_WALK_DURATION 0x10
+
+EVENT_LOAD_HIT_PRE 0x4C PMC
+UMASK_LOAD_HIT_PRE_HW_PF 0x02
+
+EVENT_EPT_WALK_CYCLES 0x4F PMC
+UMASK_EPT_WALK_CYCLES 0x10
+
+EVENT_L1D 0x51 PMC
+UMASK_L1D_REPLACEMENT 0x01
+
+EVENT_TX_MEM 0x54 PMC
+UMASK_TX_MEM_ABORT_CONFLICT 0x01
+UMASK_TX_MEM_ABORT_CAPACITY_WRITE 0x02
+UMASK_TX_MEM_ABORT_HLE_STORE_TO_ELIDED_LOCK 0x04
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_NOT_EMPTY 0x08
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_MISMATCH 0x10
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGNMENT 0x20
+UMASK_TX_MEM_HLE_ELISION_BUFFER_FULL 0x40
+
+EVENT_MOVE_ELIMINATION 0x58 PMC
+UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED 0x04
+UMASK_MOVE_ELIMINATION_SIMD_NOT_ELIMINATED 0x08
+UMASK_MOVE_ELIMINATION_INT_ELIMINATED 0x01
+UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED 0x02
+
+EVENT_CPL_CYCLES 0x5C PMC
+UMASK_CPL_CYCLES_RING0 0x01
+UMASK_CPL_CYCLES_RING123 0x02
+DEFAULT_OPTIONS_CPL_CYCLES_RING0_TRANS EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_CPL_CYCLES_RING0_TRANS 0x01
+
+EVENT_TX_EXEC 0x5D PMC
+UMASK_TX_EXEC_MISC1 0x01
+UMASK_TX_EXEC_MISC2 0x02
+UMASK_TX_EXEC_MISC3 0x04
+UMASK_TX_EXEC_MISC4 0x08
+UMASK_TX_EXEC_MISC5 0x10
+
+EVENT_RS_EVENTS 0x5E PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES 0x01
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING 0x60 PMC
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD EVENT_OPTION_THRESHOLD=0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD 0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD 0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO 0x04
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD EVENT_OPTION_THRESHOLD=0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD 0x08
+
+EVENT_LOCK_CYCLES 0x63 PMC
+UMASK_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
+UMASK_LOCK_CYCLES_CACHE_LOCK_DURATION 0x02
+
+EVENT_IDQ 0x79 PMC
+UMASK_IDQ_EMPTY 0x02
+UMASK_IDQ_MITE_UOPS 0x04
+DEFAULT_OPTIONS_IDQ_MITE_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MITE_CYCLES 0x04
+UMASK_IDQ_DSB_UOPS 0x08
+DEFAULT_OPTIONS_IDQ_DSB_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_DSB_CYCLES 0x08
+UMASK_IDQ_MS_DSB_UOPS 0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_DSB_CYCLES 0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_OCCUR EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_DSB_OCCUR 0x10
+UMASK_IDQ_MS_MITE_UOPS 0x20
+DEFAULT_OPTIONS_IDQ_MS_MITE_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_MITE_CYCLES 0x20
+UMASK_IDQ_MS_UOPS 0x30
+DEFAULT_OPTIONS_IDQ_MS_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_CYCLES 0x30
+DEFAULT_OPTIONS_IDQ_MS_SWITCHES EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_SWITCHES 0x30
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS 0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS 0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS 0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS 0x24
+UMASK_IDQ_MITE_ALL_UOPS 0x3C
+
+EVENT_ICACHE 0x80 PMC
+UMASK_ICACHE_HIT 0x01
+UMASK_ICACHE_MISSES 0x02
+UMASK_ICACHE_ACCESSES 0x03
+
+EVENT_ITLB_MISSES 0x85 PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK 0x01
+UMASK_ITLB_MISSES_STLB_HIT 0x60
+UMASK_ITLB_MISSES_WALK_COMPLETED 0x0E
+UMASK_ITLB_MISSES_STLB_HIT_4K 0x20
+UMASK_ITLB_MISSES_WALK_COMPLETED_4K 0x02
+UMASK_ITLB_MISSES_WALK_DURATION 0x10
+
+EVENT_ILD_STALL 0x87 PMC
+UMASK_ILD_STALL_LCP 0x01
+
+EVENT_BR_INST_EXEC 0x88 PMC
+UMASK_BR_INST_EXEC_COND_TAKEN 0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN 0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN 0x82
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN 0x88
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_INST_EXEC_ALL_CONDITIONAL 0xC1
+UMASK_BR_INST_EXEC_ALL_DIRECT_JMP 0xC2
+UMASK_BR_INST_EXEC_ALL_DIRECT_NEAR_CALL 0xD0
+UMASK_BR_INST_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET 0xC4
+UMASK_BR_INST_EXEC_ALL_INDIRECT_NEAR_RETURN 0xC8
+UMASK_BR_INST_EXEC_ALL_BRANCHES 0xFF
+
+EVENT_BR_MISP_EXEC 0x89 PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN 0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN 0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN 0x88
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_MISP_EXEC_ALL_CONDITIONAL 0xC1
+UMASK_BR_MISP_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET 0xC4
+UMASK_BR_MISP_EXEC_ALL_BRANCHES 0xFF
+
+EVENT_IDQ_UOPS_NOT_DELIVERED 0x9C PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x03
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x02
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_INVERT=1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK 0x01
+
+EVENT_UOP_DISPATCHES_CANCELLED_SIMD_PRF 0xA0 PMC
+UMASK_UOP_DISPATCHES_CANCELLED_SIMD_PRF 0x03
+
+EVENT_UOPS_EXECUTED_PORT 0xA1 PMC
+UMASK_UOPS_EXECUTED_PORT_PORT_0 0x01
+UMASK_UOPS_EXECUTED_PORT_PORT_1 0x02
+UMASK_UOPS_EXECUTED_PORT_PORT_2 0x04
+UMASK_UOPS_EXECUTED_PORT_PORT_3 0x08
+UMASK_UOPS_EXECUTED_PORT_PORT_4 0x10
+UMASK_UOPS_EXECUTED_PORT_PORT_5 0x20
+UMASK_UOPS_EXECUTED_PORT_PORT_6 0x40
+UMASK_UOPS_EXECUTED_PORT_PORT_7 0x80
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_0_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_0_CORE 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_1_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_1_CORE 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_2_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_2_CORE 0x04
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_3_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_3_CORE 0x08
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_4_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_4_CORE 0x10
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_5_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_5_CORE 0x20
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_6_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_6_CORE 0x40
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_7_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_7_CORE 0x80
+
+EVENT_RESOURCE_STALLS 0xA2 PMC
+UMASK_RESOURCE_STALLS_ANY 0x01
+UMASK_RESOURCE_STALLS_RS 0x04
+UMASK_RESOURCE_STALLS_SB 0x08
+UMASK_RESOURCE_STALLS_ROB 0x10
+
+EVENT_CYCLE_ACTIVITY_CYCLES_L1D_MISS 0xA3 PMC2
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L1D_MISS EVENT_OPTION_THRESHOLD=0x08
+UMASK_CYCLE_ACTIVITY_CYCLES_L1D_MISS 0x08
+
+EVENT_CYCLE_ACTIVITY_CYCLES 0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L2_MISS EVENT_OPTION_THRESHOLD=0x01
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_MISS 0x01
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_MEM_ANY EVENT_OPTION_THRESHOLD=0x02
+UMASK_CYCLE_ACTIVITY_CYCLES_MEM_ANY 0x02
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE EVENT_OPTION_THRESHOLD=0x04
+UMASK_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE 0x04
+
+EVENT_CYCLE_ACTIVITY_STALLS_L1D_MISS 0xA3 PMC2
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L1D_MISS EVENT_OPTION_THRESHOLD=0x0C
+UMASK_CYCLE_ACTIVITY_STALLS_L1D_MISS 0x0C
+
+EVENT_CYCLE_ACTIVITY_STALLS 0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L2_MISS EVENT_OPTION_THRESHOLD=0x05
+UMASK_CYCLE_ACTIVITY_STALLS_L2_MISS 0x05
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_MEM_ANY EVENT_OPTION_THRESHOLD=0x06
+UMASK_CYCLE_ACTIVITY_STALLS_MEM_ANY 0x06
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_TOTAL EVENT_OPTION_THRESHOLD=0x04
+UMASK_CYCLE_ACTIVITY_STALLS_TOTAL 0x04
+
+EVENT_LSD_UOPS 0xA8 PMC
+UMASK_LSD_UOPS 0x01
+DEFAULT_OPTIONS_LSD_CYCLES_ACTIVE EVENT_OPTION_THRESHOLD=0x01
+UMASK_LSD_CYCLES_ACTIVE 0x01
+DEFAULT_OPTIONS_LSD_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_LSD_CYCLES_4_UOPS 0x01
+
+EVENT_DSB2MITE_SWITCHES_PENALTY_CYCLES 0xAB PMC
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
+
+EVENT_ITLB 0xAE PMC
+UMASK_ITLB_ITLB_FLUSH 0x01
+
+EVENT_OFFCORE_REQUESTS 0xB0 PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD 0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD 0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO 0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD 0x08
+
+EVENT_UOPS_EXECUTED 0xB1 PMC
+UMASK_UOPS_EXECUTED_THREAD 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0xB2 PMC
+UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0x01
+
+EVENT_PAGE_WALKER_LOADS 0xBC PMC
+UMASK_PAGE_WALKER_LOADS_DTLB_L1 0x11
+UMASK_PAGE_WALKER_LOADS_ITLB_L1 0x21
+UMASK_PAGE_WALKER_LOADS_DTLB_L2 0x12
+UMASK_PAGE_WALKER_LOADS_ITLB_L2 0x22
+UMASK_PAGE_WALKER_LOADS_DTLB_L3 0x14
+UMASK_PAGE_WALKER_LOADS_ITLB_L3 0x24
+UMASK_PAGE_WALKER_LOADS_DTLB_MEMORY 0x18
+
+EVENT_INST_RETIRED_ANY_P 0xC0 PMC
+UMASK_INST_RETIRED_ANY_P 0x00
+UMASK_INST_RETIRED_X87 0x02
+
+EVENT_INST_RETIRED_PREC 0xC0 PMC1
+UMASK_INST_RETIRED_PREC_DIST 0x01
+
+EVENT_OTHER_ASSISTS 0xC1 PMC
+UMASK_OTHER_ASSISTS_AVX_TO_SSE 0x08
+UMASK_OTHER_ASSISTS_SSE_TO_AVX 0x10
+UMASK_OTHER_ASSISTS_ANY_WB_ASSIST 0x40
+
+EVENT_UOPS_RETIRED 0xC2 PMC
+UMASK_UOPS_RETIRED_ALL 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL 0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_MACHINE_CLEARS 0xC3 PMC
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_MACHINE_CLEARS_COUNT 0x01
+UMASK_MACHINE_CLEARS_CYCLES 0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING 0x02
+UMASK_MACHINE_CLEARS_SMC 0x04
+UMASK_MACHINE_CLEARS_MASKMOV 0x20
+
+EVENT_BR_INST_RETIRED 0xC4 PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL 0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL 0x02
+UMASK_BR_INST_RETIRED_NEAR_RETURN 0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN 0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN 0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH 0x40
+
+EVENT_BR_MISP_RETIRED 0xC5 PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL 0x01
+UMASK_BR_MISP_RETIRED_NEAR_TAKEN 0x20
+
+EVENT_FP_ARITH_INST_RETIRED 0xC7 PMC
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_DOUBLE 0x01
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_SINGLE 0x02
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE 0x04
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE 0x08
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE 0x10
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE 0x20
+UMASK_FP_ARITH_INST_RETIRED_SCALAR 0x03
+UMASK_FP_ARITH_INST_RETIRED_PACKED 0x3C
+UMASK_FP_ARITH_INST_RETIRED_DOUBLE 0x15
+UMASK_FP_ARITH_INST_RETIRED_SINGLE 0x2A
+
+EVENT_HLE_RETIRED 0xC8 PMC
+UMASK_HLE_RETIRED_START 0x01
+UMASK_HLE_RETIRED_COMMIT 0x02
+UMASK_HLE_RETIRED_ABORTED 0x04
+UMASK_HLE_RETIRED_ABORTED_MISC1 0x08
+UMASK_HLE_RETIRED_ABORTED_MISC2 0x10
+UMASK_HLE_RETIRED_ABORTED_MISC3 0x20
+UMASK_HLE_RETIRED_ABORTED_MISC4 0x40
+UMASK_HLE_RETIRED_ABORTED_MISC5 0x80
+
+EVENT_RTM_RETIRED 0xC9 PMC
+UMASK_RTM_RETIRED_START 0x01
+UMASK_RTM_RETIRED_COMMIT 0x02
+UMASK_RTM_RETIRED_ABORTED 0x04
+UMASK_RTM_RETIRED_ABORTED_MISC1 0x08
+UMASK_RTM_RETIRED_ABORTED_MISC2 0x10
+UMASK_RTM_RETIRED_ABORTED_MISC3 0x20
+UMASK_RTM_RETIRED_ABORTED_MISC4 0x40
+UMASK_RTM_RETIRED_ABORTED_MISC5 0x80
+
+EVENT_FP_ASSIST 0xCA PMC
+UMASK_FP_ASSIST_X87_OUTPUT 0x02
+UMASK_FP_ASSIST_X87_INPUT 0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT 0x08
+UMASK_FP_ASSIST_SIMD_INPUT 0x10
+UMASK_FP_ASSIST_ANY 0x1E
+
+EVENT_ROB_MISC_EVENT_LBR_INSERTS 0xCC PMC
+UMASK_ROB_MISC_EVENT_LBR_INSERTS 0x20
+
+EVENT_MEM_UOPS_RETIRED 0xD0 PMC
+UMASK_MEM_UOPS_RETIRED_LOADS_ALL 0x81
+UMASK_MEM_UOPS_RETIRED_STORES_ALL 0x82
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK 0x21
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS 0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT 0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT 0x42
+
+EVENT_MEM_LOAD_UOPS_RETIRED 0xD1 PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT 0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS 0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL 0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT 0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS 0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL 0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT 0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS 0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL 0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB 0x40
+
+EVENT_MEM_LOAD_UOPS_L3_HIT_RETIRED 0xD2 PMC
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_MISS 0x01
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HIT 0x02
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM 0x04
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_NONE 0x08
+
+EVENT_MEM_LOAD_UOPS_L3_MISS_RETIRED 0xD3 PMC
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_LOCAL_DRAM 0x01
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_DRAM 0x04
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM 0x10
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_FWD 0x20
+
+EVENT_BACLEARS 0xE6 PMC
+UMASK_BACLEARS_ANY 0x1F
+
+EVENT_L2_TRANS 0xF0 PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO 0x02
+UMASK_L2_TRANS_CODE_RD 0x04
+UMASK_L2_TRANS_ALL_PF 0x08
+UMASK_L2_TRANS_L1D_WB 0x10
+UMASK_L2_TRANS_L2_FILL 0x20
+UMASK_L2_TRANS_L2_WB 0x40
+UMASK_L2_TRANS_ALL_REQUESTS 0x80
+
+EVENT_L2_LINES_IN 0xF1 PMC
+UMASK_L2_LINES_IN_I 0x01
+UMASK_L2_LINES_IN_S 0x02
+UMASK_L2_LINES_IN_E 0x04
+UMASK_L2_LINES_IN_ALL 0x07
+
+EVENT_L2_LINES_OUT 0xF2 PMC
+UMASK_L2_LINES_OUT_DEMAND_CLEAN 0x05
+UMASK_L2_LINES_OUT_DEMAND_DIRTY 0x06
+
+EVENT_OFFCORE_RESPONSE_0 0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1 0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_EVENT_MSG 0x42 UBOX
+UMASK_EVENT_MSG_DOORBELL_RCVD 0x08
+
+EVENT_PHOLD_CYCLES 0x45 UBOX
+UMASK_PHOLD_CYCLES_ASSERT_TO_ACK 0x01
+
+EVENT_RACU_REQUESTS 0x46 UBOX
+UMASK_RACU_REQUESTS 0x00
+
+EVENT_UNCORE_CLOCK 0x00 UBOXFIX
+UMASK_UNCORE_CLOCK 0x00
+
+EVENT_CBOX_CLOCKTICKS 0x00 CBOX
+UMASK_CBOX_CLOCKTICKS 0x00
+
+EVENT_TXR_INSERTS 0x02 CBOX
+UMASK_TXR_INSERTS_AD_CACHE 0x01
+UMASK_TXR_INSERTS_AK_CACHE 0x02
+UMASK_TXR_INSERTS_BL_CACHE 0x04
+UMASK_TXR_INSERTS_IV_CACHE 0x08
+UMASK_TXR_INSERTS_AD_CORE 0x10
+UMASK_TXR_INSERTS_AK_CORE 0x20
+UMASK_TXR_INSERTS_BL_CORE 0x40
+
+EVENT_TXR_ADS_USED 0x04 CBOX
+UMASK_TXR_ADS_USED_AD 0x01
+UMASK_TXR_ADS_USED_AK 0x02
+UMASK_TXR_ADS_USED_BL 0x04
+
+EVENT_RING_BOUNCES 0x05 CBOX
+UMASK_RING_BOUNCES_AD 0x01
+UMASK_RING_BOUNCES_AK 0x02
+UMASK_RING_BOUNCES_BL 0x04
+UMASK_RING_BOUNCES_IV 0x10
+
+EVENT_RING_SRC_THRTL 0x07 CBOX
+UMASK_RING_SRC_THRTL 0x00
+
+EVENT_FAST_ASSERTED 0x09 CBOX0C0|CBOX0C1|CBOX1C0|CBOX1C1|CBOX2C0|CBOX2C1|CBOX3C0|CBOX3C1|CBOX4C0|CBOX4C1|CBOX5C0|CBOX5C1|CBOX6C0|CBOX6C1|CBOX7C0|CBOX7C1|CBOX8C0|CBOX8C1|CBOX9C0|CBOX9C1|CBOX10C0|CBOX10C1|CBOX11C0|CBOX11C1|CBOX12C0|CBOX12C1|CBOX13C0|CBOX13C1|CBOX14C0|CBOX14C1|CBOX15C0|CBOX15C1|CBOX16C0|CBOX16C1|CBOX17C0|CBOX17C1|CBOX18C0|CBOX18C1|CBOX19C0|CBOX19C1|CBOX20C0|CBOX20C1|CBOX21C0|CBOX21C1|CBOX22C0|CBOX22C1|CBOX23C0|CBOX23C1
+UMASK_FAST_ASSERTED 0x00
+
+EVENT_BOUNCE_CONTROL 0x0A CBOX
+UMASK_BOUNCE_CONTROL 0x00
+
+EVENT_RING_AD_USED 0x1B CBOX
+UMASK_RING_AD_USED_UP_EVEN 0x01
+UMASK_RING_AD_USED_UP_ODD 0x02
+UMASK_RING_AD_USED_UP 0x03
+UMASK_RING_AD_USED_DOWN_EVEN 0x04
+UMASK_RING_AD_USED_DOWN_ODD 0x08
+UMASK_RING_AD_USED_DOWN 0x0C
+UMASK_RING_AD_USED_ANY 0x0F
+
+EVENT_RING_AK_USED 0x1C CBOX
+UMASK_RING_AK_USED_UP_EVEN 0x01
+UMASK_RING_AK_USED_UP_ODD 0x02
+UMASK_RING_AK_USED_UP 0x03
+UMASK_RING_AK_USED_DOWN_EVEN 0x04
+UMASK_RING_AK_USED_DOWN_ODD 0x08
+UMASK_RING_AK_USED_DOWN 0x0C
+UMASK_RING_AK_USED_ANY 0x0F
+
+EVENT_RING_BL_USED 0x1D CBOX
+UMASK_RING_BL_USED_UP_EVEN 0x01
+UMASK_RING_BL_USED_UP_ODD 0x02
+UMASK_RING_BL_USED_UP 0x03
+UMASK_RING_BL_USED_DOWN_EVEN 0x04
+UMASK_RING_BL_USED_DOWN_ODD 0x08
+UMASK_RING_BL_USED_DOWN 0x0C
+UMASK_RING_BL_USED_ANY 0x0F
+
+EVENT_RING_IV_USED 0x1E CBOX
+UMASK_RING_IV_USED_UP 0x03
+UMASK_RING_IV_USED_DN 0x0C
+UMASK_RING_IV_USED_ANY 0x0F
+UMASK_RING_IV_USED_DOWN 0x33
+
+EVENT_COUNTER0_OCCUPANCY 0x1F CBOX
+UMASK_COUNTER0_OCCUPANCY 0x00
+DEFAULT_OPTIONS_COUNTER0_OCCUPANCY_COUNT EVENT_OPTION_THRESHOLD=0x01
+UMASK_COUNTER0_OCCUPANCY_COUNT 0x00
+
+EVENT_RXR_OCCUPANCY 0x11 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0|CBOX16C0|CBOX17C0|CBOX18C0|CBOX19C0|CBOX20C0|CBOX21C0|CBOX22C0|CBOX23C0
+UMASK_RXR_OCCUPANCY_IRQ 0x01
+UMASK_RXR_OCCUPANCY_IRQ_REJ 0x02
+UMASK_RXR_OCCUPANCY_IPQ 0x04
+UMASK_RXR_OCCUPANCY_PRQ_REJ 0x20
+
+EVENT_RXR_EXT_STARVED 0x12 CBOX
+UMASK_RXR_EXT_STARVED_IRQ 0x01
+UMASK_RXR_EXT_STARVED_IPQ 0x02
+UMASK_RXR_EXT_STARVED_PRQ 0x04
+UMASK_RXR_EXT_STARVED_ISMQ_BIDS 0x08
+
+EVENT_RXR_INSERTS 0x13 CBOX
+UMASK_RXR_INSERTS_IRQ 0x01
+UMASK_RXR_INSERTS_IRQ_REJ 0x02
+UMASK_RXR_INSERTS_IPQ 0x04
+UMASK_RXR_INSERTS_PRQ 0x10
+UMASK_RXR_INSERTS_PRQ_REJ 0x20
+
+EVENT_RXR_IPQ_RETRY 0x31 CBOX
+UMASK_RXR_IPQ_RETRY_ANY 0x01
+UMASK_RXR_IPQ_RETRY_FULL 0x02
+UMASK_RXR_IPQ_RETRY_ADDR_CONFLICT 0x04
+UMASK_RXR_IPQ_RETRY_QPI_CREDITS 0x10
+
+EVENT_RXR_IPQ_RETRY2 0x28 CBOX
+UMASK_RXR_IPQ_RETRY2_AD_SBO 0x01
+OPTIONS_RXR_IPQ_RETRY2_TARGET EVENT_OPTION_NID_MASK
+UMASK_RXR_IPQ_RETRY2_TARGET 0x40
+
+EVENT_RXR_IRQ_RETRY 0x32 CBOX
+UMASK_RXR_IRQ_RETRY_ANY 0x01
+UMASK_RXR_IRQ_RETRY_FULL 0x02
+UMASK_RXR_IRQ_RETRY_ADDR_CONFLICT 0x04
+UMASK_RXR_IRQ_RETRY_RTID 0x08
+UMASK_RXR_IRQ_RETRY_QPI_CREDITS 0x10
+UMASK_RXR_IRQ_RETRY_IIO_CREDITS 0x20
+OPTIONS_RXR_IRQ_RETRY_NID EVENT_OPTION_NID_MASK
+UMASK_RXR_IRQ_RETRY_NID 0x40
+
+EVENT_RXR_IRQ_RETRY2 0x29 CBOX
+UMASK_RXR_IRQ_RETRY2_AD_SBO 0x01
+UMASK_RXR_IRQ_RETRY2_BL_SBO 0x02
+OPTIONS_RXR_IRQ_RETRY2_TARGET EVENT_OPTION_NID_MASK
+UMASK_RXR_IRQ_RETRY2_TARGET 0x40
+
+EVENT_RXR_ISMQ_RETRY 0x33 CBOX
+UMASK_RXR_ISMQ_RETRY_ANY 0x01
+UMASK_RXR_ISMQ_RETRY_FULL 0x02
+UMASK_RXR_ISMQ_RETRY_RTID 0x08
+UMASK_RXR_ISMQ_RETRY_QPI_CREDITS 0x10
+UMASK_RXR_ISMQ_RETRY_IIO_CREDITS 0x20
+OPTIONS_RXR_ISMQ_RETRY_NID EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY_NID 0x40
+OPTIONS_RXR_ISMQ_RETRY_WB_CREDITS EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY_WB_CREDITS 0x80
+
+EVENT_RXR_ISMQ_RETRY2 0x2A CBOX
+UMASK_RXR_ISMQ_RETRY2_AD_SBO 0x01
+UMASK_RXR_ISMQ_RETRY2_BL_SBO 0x02
+OPTIONS_RXR_ISMQ_RETRY2_TARGET EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY2_TARGET 0x40
+
+EVENT_LLC_LOOKUP 0x34 CBOX
+OPTIONS_LLC_LOOKUP_DATA_READ EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_DATA_READ 0x03
+OPTIONS_LLC_LOOKUP_WRITE EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_WRITE 0x05
+OPTIONS_LLC_LOOKUP_REMOTE_SNOOP EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_REMOTE_SNOOP 0x09
+OPTIONS_LLC_LOOKUP_ANY EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_ANY 0x11
+OPTIONS_LLC_LOOKUP_READ EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_READ 0x21
+OPTIONS_LLC_LOOKUP_NID EVENT_OPTION_STATE_MASK|EVENT_OPTION_NID_MASK
+UMASK_LLC_LOOKUP_NID 0x41
+
+EVENT_LLC_VICTIMS 0x37 CBOX
+UMASK_LLC_VICTIMS_M 0x01
+UMASK_LLC_VICTIMS_E 0x02
+UMASK_LLC_VICTIMS_S 0x04
+UMASK_LLC_VICTIMS_F 0x08
+UMASK_LLC_VICTIMS_MISS 0x10
+OPTIONS_LLC_VICTIMS_NID EVENT_OPTION_STATE_MASK|EVENT_OPTION_NID_MASK
+UMASK_LLC_VICTIMS_NID 0x40
+
+EVENT_TOR_INSERTS 0x35 CBOX
+OPTIONS_TOR_INSERTS_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_OPCODE 0x01
+OPTIONS_TOR_INSERTS_MISS_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_OPCODE 0x03
+UMASK_TOR_INSERTS_EVICTION 0x04
+UMASK_TOR_INSERTS_ALL 0x08
+UMASK_TOR_INSERTS_WB 0x10
+OPTIONS_TOR_INSERTS_LOCAL_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_LOCAL_OPCODE 0x21
+OPTIONS_TOR_INSERTS_MISS_LOCAL_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_LOCAL_OPCODE 0x23
+UMASK_TOR_INSERTS_LOCAL 0x28
+UMASK_TOR_INSERTS_MISS_LOCAL 0x2A
+OPTIONS_TOR_INSERTS_NID_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_OPCODE 0x41
+OPTIONS_TOR_INSERTS_NID_MISS_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_MISS_OPCODE 0x43
+OPTIONS_TOR_INSERTS_NID_EVICTION EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_EVICTION 0x44
+DEFAULT_OPTIONS_TOR_INSERTS_NID_ALL EVENT_OPTION_STATE=0x01
+OPTIONS_TOR_INSERTS_NID_ALL EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_ALL 0x48
+OPTIONS_TOR_INSERTS_NID_MISS_ALL EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_MISS_ALL 0x4A
+OPTIONS_TOR_INSERTS_NID_WB EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_WB 0x50
+OPTIONS_TOR_INSERTS_REMOTE_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_REMOTE_OPCODE 0x81
+OPTIONS_TOR_INSERTS_MISS_REMOTE_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_REMOTE_OPCODE 0x83
+UMASK_TOR_INSERTS_REMOTE 0x88
+UMASK_TOR_INSERTS_MISS_REMOTE 0x8A
+
+EVENT_TOR_OCCUPANCY 0x36 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0|CBOX16C0|CBOX17C0|CBOX18C0|CBOX19C0|CBOX20C0|CBOX21C0|CBOX22C0|CBOX23C0
+OPTIONS_TOR_OCCUPANCY_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_OPCODE 0x01
+OPTIONS_TOR_OCCUPANCY_MISS_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_OPCODE 0x03
+UMASK_TOR_OCCUPANCY_EVICTION 0x04
+UMASK_TOR_OCCUPANCY_ALL 0x08
+UMASK_TOR_OCCUPANCY_MISS_ALL 0x0A
+UMASK_TOR_OCCUPANCY_WB 0x10
+UMASK_TOR_OCCUPANCY_LOCAL_OPCODE 0x21
+UMASK_TOR_OCCUPANCY_MISS_LOCAL_OPCODE 0x23
+UMASK_TOR_OCCUPANCY_LOCAL 0x28
+UMASK_TOR_OCCUPANCY_MISS_LOCAL 0x2A
+OPTIONS_TOR_OCCUPANCY_NID_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_OPCODE 0x41
+OPTIONS_TOR_OCCUPANCY_NID_MISS_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_OPCODE 0x43
+OPTIONS_TOR_OCCUPANCY_NID_EVICTION EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_EVICTION 0x44
+DEFAULT_OPTIONS_TOR_OCCUPANCY_NID_ALL EVENT_OPTION_STATE=0x01
+OPTIONS_TOR_OCCUPANCY_NID_ALL EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_ALL 0x48
+OPTIONS_TOR_OCCUPANCY_NID_MISS_ALL EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_ALL 0x4A
+OPTIONS_TOR_OCCUPANCY_NID_WB EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_WB 0x50
+OPTIONS_TOR_OCCUPANCY_REMOTE_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_REMOTE_OPCODE 0x81
+OPTIONS_TOR_OCCUPANCY_MISS_REMOTE_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_REMOTE_OPCODE 0x83
+UMASK_TOR_OCCUPANCY_REMOTE 0x88
+UMASK_TOR_OCCUPANCY_MISS_REMOTE 0x8A
+
+EVENT_MISC 0x39 CBOX
+UMASK_MISC_RSPI_WAS_FSE 0x01
+UMASK_MISC_WC_ALIASING 0x02
+UMASK_MISC_STARTED 0x04
+UMASK_MISC_RFO_HIT_S 0x08
+UMASK_MISC_CVZERO_PREFETCH_VICTIM 0x10
+UMASK_MISC_CVZERO_PREFETCH_MISS 0x20
+
+EVENT_SBO_CREDITS_ACQUIRED 0x3D CBOX
+UMASK_SBO_CREDITS_ACQUIRED_AD 0x01
+UMASK_SBO_CREDITS_ACQUIRED_BL 0x02
+UMASK_SBO_CREDITS_ACQUIRED_ANY 0x03
+
+EVENT_SBO_CREDIT_OCCUPANCY 0x3E CBOX
+UMASK_SBO_CREDIT_OCCUPANCY_AD 0x01
+UMASK_SBO_CREDIT_OCCUPANCY_BL 0x02
+UMASK_SBO_CREDIT_OCCUPANCY_ANY 0x03
+
+EVENT_WBOX_CLOCKTICKS 0x00 WBOX
+UMASK_WBOX_CLOCKTICKS 0x00
+
+EVENT_CORE0_TRANSITION_CYCLES 0x60 WBOX
+UMASK_CORE0_TRANSITION_CYCLES 0x00
+
+EVENT_CORE1_TRANSITION_CYCLES 0x61 WBOX
+UMASK_CORE1_TRANSITION_CYCLES 0x00
+
+EVENT_CORE2_TRANSITION_CYCLES 0x62 WBOX
+UMASK_CORE2_TRANSITION_CYCLES 0x00
+
+EVENT_CORE3_TRANSITION_CYCLES 0x63 WBOX
+UMASK_CORE3_TRANSITION_CYCLES 0x00
+
+EVENT_CORE4_TRANSITION_CYCLES 0x64 WBOX
+UMASK_CORE4_TRANSITION_CYCLES 0x00
+
+EVENT_CORE5_TRANSITION_CYCLES 0x65 WBOX
+UMASK_CORE5_TRANSITION_CYCLES 0x00
+
+EVENT_CORE6_TRANSITION_CYCLES 0x66 WBOX
+UMASK_CORE6_TRANSITION_CYCLES 0x00
+
+EVENT_CORE7_TRANSITION_CYCLES 0x67 WBOX
+UMASK_CORE7_TRANSITION_CYCLES 0x00
+
+EVENT_CORE8_TRANSITION_CYCLES 0x68 WBOX
+UMASK_CORE8_TRANSITION_CYCLES 0x00
+
+EVENT_CORE9_TRANSITION_CYCLES 0x69 WBOX
+UMASK_CORE9_TRANSITION_CYCLES 0x00
+
+EVENT_CORE10_TRANSITION_CYCLES 0x6A WBOX
+UMASK_CORE10_TRANSITION_CYCLES 0x00
+
+EVENT_CORE11_TRANSITION_CYCLES 0x6B WBOX
+UMASK_CORE11_TRANSITION_CYCLES 0x00
+
+EVENT_CORE12_TRANSITION_CYCLES 0x6C WBOX
+UMASK_CORE12_TRANSITION_CYCLES 0x00
+
+EVENT_CORE13_TRANSITION_CYCLES 0x6D WBOX
+UMASK_CORE13_TRANSITION_CYCLES 0x00
+
+EVENT_CORE14_TRANSITION_CYCLES 0x6E WBOX
+UMASK_CORE14_TRANSITION_CYCLES 0x00
+
+EVENT_CORE15_TRANSITION_CYCLES 0x6F WBOX
+UMASK_CORE15_TRANSITION_CYCLES 0x00
+
+EVENT_CORE16_TRANSITION_CYCLES 0x70 WBOX
+UMASK_CORE16_TRANSITION_CYCLES 0x00
+
+EVENT_CORE17_TRANSITION_CYCLES 0x71 WBOX
+UMASK_CORE17_TRANSITION_CYCLES 0x00
+
+EVENT_FIVR_PS_PS0_CYCLES 0x75 WBOX
+UMASK_FIVR_PS_PS0_CYCLES 0x00
+
+EVENT_FIVR_PS_PS1_CYCLES 0x75 WBOX
+UMASK_FIVR_PS_PS1_CYCLES 0x00
+
+EVENT_FIVR_PS_PS2_CYCLES 0x75 WBOX
+UMASK_FIVR_PS_PS2_CYCLES 0x00
+
+EVENT_FIVR_PS_PS3_CYCLES 0x75 WBOX
+UMASK_FIVR_PS_PS3_CYCLES 0x00
+
+EVENT_DEMOTIONS_CORE0 0x30 WBOX
+UMASK_DEMOTIONS_CORE0 0x00
+
+EVENT_DEMOTIONS_CORE1 0x31 WBOX
+UMASK_DEMOTIONS_CORE1 0x00
+
+EVENT_DEMOTIONS_CORE2 0x32 WBOX
+UMASK_DEMOTIONS_CORE2 0x00
+
+EVENT_DEMOTIONS_CORE3 0x33 WBOX
+UMASK_DEMOTIONS_CORE3 0x00
+
+EVENT_DEMOTIONS_CORE4 0x34 WBOX
+UMASK_DEMOTIONS_CORE4 0x00
+
+EVENT_DEMOTIONS_CORE5 0x35 WBOX
+UMASK_DEMOTIONS_CORE5 0x00
+
+EVENT_DEMOTIONS_CORE6 0x36 WBOX
+UMASK_DEMOTIONS_CORE6 0x00
+
+EVENT_DEMOTIONS_CORE7 0x37 WBOX
+UMASK_DEMOTIONS_CORE7 0x00
+
+EVENT_DEMOTIONS_CORE8 0x38 WBOX
+UMASK_DEMOTIONS_CORE8 0x00
+
+EVENT_DEMOTIONS_CORE9 0x39 WBOX
+UMASK_DEMOTIONS_CORE9 0x00
+
+EVENT_DEMOTIONS_CORE10 0x3A WBOX
+UMASK_DEMOTIONS_CORE10 0x00
+
+EVENT_DEMOTIONS_CORE11 0x3B WBOX
+UMASK_DEMOTIONS_CORE11 0x00
+
+EVENT_DEMOTIONS_CORE12 0x3C WBOX
+UMASK_DEMOTIONS_CORE12 0x00
+
+EVENT_DEMOTIONS_CORE13 0x3D WBOX
+UMASK_DEMOTIONS_CORE13 0x00
+
+EVENT_DEMOTIONS_CORE14 0x3E WBOX
+UMASK_DEMOTIONS_CORE14 0x00
+
+EVENT_DEMOTIONS_CORE15 0x3F WBOX
+UMASK_DEMOTIONS_CORE15 0x00
+
+EVENT_DEMOTIONS_CORE16 0x40 WBOX
+UMASK_DEMOTIONS_CORE16 0x00
+
+EVENT_DEMOTIONS_CORE17 0x41 WBOX
+UMASK_DEMOTIONS_CORE17 0x00
+
+EVENT_FREQ_BAND0_CYCLES 0x0B WBOX
+OPTIONS_FREQ_BAND0_CYCLES EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND0_CYCLES 0x00
+
+EVENT_FREQ_BAND1_CYCLES 0x0C WBOX
+OPTIONS_FREQ_BAND1_CYCLES EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND1_CYCLES 0x00
+
+EVENT_FREQ_BAND2_CYCLES 0x0D WBOX
+OPTIONS_FREQ_BAND2_CYCLES EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND2_CYCLES 0x00
+
+EVENT_FREQ_BAND3_CYCLES 0x0E WBOX
+OPTIONS_FREQ_BAND3_CYCLES EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND3_CYCLES 0x00
+
+EVENT_FREQ_MAX_LIMIT_THERMAL_CYCLES 0x04 WBOX
+UMASK_FREQ_MAX_LIMIT_THERMAL_CYCLES 0x00
+
+EVENT_FREQ_MAX_OS_CYCLES 0x06 WBOX
+UMASK_FREQ_MAX_OS_CYCLES 0x00
+
+EVENT_FREQ_MAX_POWER_CYCLES 0x05 WBOX
+UMASK_FREQ_MAX_POWER_CYCLES 0x00
+
+EVENT_FREQ_MIN_IO_P_CYCLES 0x73 WBOX
+UMASK_FREQ_MIN_IO_P_CYCLES 0x00
+
+EVENT_FREQ_TRANS_CYCLES 0x74 WBOX
+UMASK_FREQ_TRANS_CYCLES 0x00
+
+EVENT_MEMORY_PHASE_SHEDDING_CYCLES 0x2F WBOX
+UMASK_MEMORY_PHASE_SHEDDING_CYCLES 0x00
+
+EVENT_POWER_STATE_OCCUPANCY 0x80 WBOX
+UMASK_POWER_STATE_OCCUPANCY_CORES_C0 0x40
+UMASK_POWER_STATE_OCCUPANCY_CORES_C3 0x80
+UMASK_POWER_STATE_OCCUPANCY_CORES_C6 0xC0
+
+EVENT_PROCHOT_EXTERNAL_CYCLES 0x0A WBOX
+UMASK_PROCHOT_EXTERNAL_CYCLES 0x00
+
+EVENT_PROCHOT_INTERNAL_CYCLES 0x09 WBOX
+UMASK_PROCHOT_INTERNAL_CYCLES 0x00
+
+EVENT_TOTAL_TRANSITION_CYCLES 0x72 WBOX
+UMASK_TOTAL_TRANSITION_CYCLES 0x00
+
+EVENT_VR_HOT_CYCLES 0x42 WBOX
+UMASK_VR_HOT_CYCLES 0x00
+
+EVENT_UFS_BANDWIDTH_MAX_RANGE 0x7E WBOX
+UMASK_UFS_BANDWIDTH_MAX_RANGE 0x00
+
+EVENT_UFS_TRANSITIONS_DOWN 0x7C WBOX
+UMASK_UFS_TRANSITIONS_DOWN 0x00
+
+EVENT_UFS_TRANSITIONS_IO_P_LIMIT 0x7D WBOX
+UMASK_UFS_TRANSITIONS_IO_P_LIMIT 0x00
+
+EVENT_UFS_TRANSITIONS_NO_CHANGE 0x79 WBOX
+UMASK_UFS_TRANSITIONS_NO_CHANGE 0x00
+
+EVENT_UFS_TRANSITIONS_UP_RING 0x7A WBOX
+UMASK_UFS_TRANSITIONS_UP_RING 0x00
+
+EVENT_UFS_TRANSITIONS_UP_STALL 0x7B WBOX
+UMASK_UFS_TRANSITIONS_UP_STALL 0x00
+
+EVENT_CORES_IN_C3 0x00 WBOX0FIX
+UMASK_CORES_IN_C3 0x00
+
+EVENT_CORES_IN_C6 0x00 WBOX1FIX
+UMASK_CORES_IN_C6 0x00
+
+EVENT_BBOX_CLOCKTICKS 0x00 BBOX
+UMASK_BBOX_CLOCKTICKS 0x00
+
+EVENT_ADDR_OPC_MATCH 0x20 BBOX
+OPTIONS_ADDR_OPC_MATCH_ADDR EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_ADDR 0x01
+OPTIONS_ADDR_OPC_MATCH_OPC EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_OPC 0x02
+OPTIONS_ADDR_OPC_MATCH_FILT EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_FILT 0x03
+OPTIONS_ADDR_OPC_MATCH_AD EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AD 0x04
+OPTIONS_ADDR_OPC_MATCH_BL EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_BL 0x08
+OPTIONS_ADDR_OPC_MATCH_AK EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AK 0x10
+
+EVENT_BT_CYCLES_NE 0x42 BBOX
+UMASK_BT_CYCLES_NE 0x00
+
+EVENT_BT_OCCUPANCY 0x43 BBOX
+UMASK_BT_OCCUPANCY 0x00
+
+EVENT_BYPASS_IMC 0x14 BBOX
+UMASK_BYPASS_IMC_TAKEN 0x01
+UMASK_BYPASS_IMC_NOT_TAKEN 0x02
+
+EVENT_CONFLICT_CYCLES 0x0B BBOX0C1|BBOX1C1
+UMASK_CONFLICT_CYCLES 0x00
+
+EVENT_DIRECT2CORE_COUNT 0x11 BBOX
+UMASK_DIRECT2CORE_COUNT 0x00
+
+EVENT_DIRECT2CORE_CYCLES_DISABLED 0x12 BBOX
+UMASK_DIRECT2CORE_CYCLES_DISABLED 0x00
+
+EVENT_DIRECT2CORE_TXN_OVERRIDE 0x13 BBOX
+UMASK_DIRECT2CORE_TXN_OVERRIDE 0x00
+
+EVENT_DIRECTORY_LAT_OPT 0x41 BBOX
+UMASK_DIRECTORY_LAT_OPT 0x00
+
+EVENT_DIRECTORY_LOOKUP 0x0C BBOX
+UMASK_DIRECTORY_LOOKUP_SNP 0x01
+UMASK_DIRECTORY_LOOKUP_NO_SNP 0x02
+
+EVENT_DIRECTORY_UPDATE 0x0D BBOX
+UMASK_DIRECTORY_UPDATE_SET 0x01
+UMASK_DIRECTORY_UPDATE_CLEAR 0x02
+UMASK_DIRECTORY_UPDATE_ANY 0x03
+
+EVENT_HITME_LOOKUP 0x70 BBOX
+UMASK_HITME_LOOKUP_READ_OR_INVITOE 0x01
+UMASK_HITME_LOOKUP_WBMTOI 0x02
+UMASK_HITME_LOOKUP_ACKCNFLTWBI 0x04
+UMASK_HITME_LOOKUP_WBMTOE_OR_S 0x08
+UMASK_HITME_LOOKUP_HOM 0x0F
+UMASK_HITME_LOOKUP_RSPFWDI_REMOTE 0x10
+UMASK_HITME_LOOKUP_RSPFWDI_LOCAL 0x20
+UMASK_HITME_LOOKUP_INVALS 0x26
+UMASK_HITME_LOOKUP_RSPFWDS 0x40
+UMASK_HITME_LOOKUP_ALLOCS 0x70
+UMASK_HITME_LOOKUP_RSP 0x80
+UMASK_HITME_LOOKUP_ALL 0xFF
+
+EVENT_HITME_HIT 0x71 BBOX
+UMASK_HITME_HIT_READ_OR_INVITOE 0x01
+UMASK_HITME_HIT_WBMTOI 0x02
+UMASK_HITME_HIT_ACKCNFLTWBI 0x04
+UMASK_HITME_HIT_WBMTOE_OR_S 0x08
+UMASK_HITME_HIT_HOM 0x0F
+UMASK_HITME_HIT_RSPFWDI_REMOTE 0x10
+UMASK_HITME_HIT_RSPFWDI_LOCAL 0x20
+UMASK_HITME_HIT_INVALS 0x26
+UMASK_HITME_HIT_RSPFWDS 0x40
+UMASK_HITME_HIT_EVICTS 0x42
+UMASK_HITME_HIT_ALLOCS 0x70
+UMASK_HITME_HIT_RSP 0x80
+UMASK_HITME_HIT_ALL 0xFF
+
+EVENT_HITME_HIT_PV_BITS_SET 0x72 BBOX
+UMASK_HITME_HIT_PV_BITS_SET_READ_OR_INVITOE 0x01
+UMASK_HITME_HIT_PV_BITS_SET_WBMTOI 0x02
+UMASK_HITME_HIT_PV_BITS_SET_ACKCNFLTWBI 0x04
+UMASK_HITME_HIT_PV_BITS_SET_WBMTOE_OR_S 0x08
+UMASK_HITME_HIT_PV_BITS_SET_HOM 0x0F
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDI_REMOTE 0x10
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDI_LOCAL 0x20
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDS 0x40
+UMASK_HITME_HIT_PV_BITS_SET_RSP 0x80
+UMASK_HITME_HIT_PV_BITS_SET_ALL 0xFF
+
+EVENT_IGR_NO_CREDIT_CYCLES 0x22 BBOX
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI0 0x01
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI1 0x02
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI2 0x10
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI0 0x04
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI1 0x08
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI2 0x20
+
+EVENT_IMC_READS 0x17 BBOX
+UMASK_IMC_READS_NORMAL 0x01
+
+EVENT_IMC_RETRY 0x1E BBOX
+UMASK_IMC_RETRY 0x00
+
+EVENT_IMC_WRITES 0x1A BBOX
+UMASK_IMC_WRITES_FULL 0x01
+UMASK_IMC_WRITES_PARTIAL 0x02
+UMASK_IMC_WRITES_FULL_ISOCH 0x04
+UMASK_IMC_WRITES_PARTIAL_ISOCH 0x08
+UMASK_IMC_WRITES_ALL 0x0F
+
+EVENT_OSB 0x53 BBOX
+UMASK_OSB_READS_LOCAL 0x02
+UMASK_OSB_INVITOE_LOCAL 0x04
+UMASK_OSB_REMOTE 0x08
+UMASK_OSB_CANCELLED 0x10
+UMASK_OSB_READS_LOCAL_USEFUL 0x20
+UMASK_OSB_REMOTE_USEFUL 0x40
+
+EVENT_OSB_EDR 0x54 BBOX
+UMASK_OSB_EDR_ALL 0x01
+UMASK_OSB_EDR_READS_LOCAL_I 0x02
+UMASK_OSB_EDR_READS_REMOTE_I 0x04
+UMASK_OSB_EDR_READS_LOCAL_S 0x08
+UMASK_OSB_EDR_READS_REMOTE_S 0x10
+
+EVENT_REQUESTS 0x01 BBOX
+UMASK_REQUESTS_READS_LOCAL 0x01
+UMASK_REQUESTS_READS_REMOTE 0x02
+UMASK_REQUESTS_READS 0x03
+UMASK_REQUESTS_WRITES_LOCAL 0x04
+UMASK_REQUESTS_WRITES_REMOTE 0x08
+UMASK_REQUESTS_WRITES 0x0C
+UMASK_REQUESTS_INVITOE_LOCAL 0x10
+UMASK_REQUESTS_INVITOE_REMOTE 0x20
+
+EVENT_RING_AD_USED 0x3E BBOX
+UMASK_RING_AD_USED_CW_EVEN 0x01
+UMASK_RING_AD_USED_CW_ODD 0x02
+UMASK_RING_AD_USED_CW 0x03
+UMASK_RING_AD_USED_CCW_EVEN 0x04
+UMASK_RING_AD_USED_CCW_ODD 0x08
+UMASK_RING_AD_USED_CCW 0x0C
+
+EVENT_RING_AK_USED 0x3F BBOX
+UMASK_RING_AK_USED_CW_EVEN 0x01
+UMASK_RING_AK_USED_CW_ODD 0x02
+UMASK_RING_AK_USED_CW 0x03
+UMASK_RING_AK_USED_CCW_EVEN 0x04
+UMASK_RING_AK_USED_CCW_ODD 0x08
+UMASK_RING_AK_USED_CCW 0x0C
+
+EVENT_RING_BL_USED 0x40 BBOX
+UMASK_RING_BL_USED_CW_EVEN 0x01
+UMASK_RING_BL_USED_CW_ODD 0x02
+UMASK_RING_BL_USED_CW 0x03
+UMASK_RING_BL_USED_CCW_EVEN 0x04
+UMASK_RING_BL_USED_CCW_ODD 0x08
+UMASK_RING_BL_USED_CCW 0x0C
+
+EVENT_RPQ_CYCLES_NO_REG_CREDITS 0x15 BBOX
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN0 0x01
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1 0x02
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2 0x04
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3 0x08
+
+EVENT_WPQ_CYCLES_NO_REG_CREDITS 0x18 BBOX
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0 0x01
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1 0x02
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2 0x04
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3 0x08
+
+EVENT_SBO0_CREDITS_ACQUIRED 0x68 BBOX
+UMASK_SBO0_CREDITS_ACQUIRED_AD 0x01
+UMASK_SBO0_CREDITS_ACQUIRED_BL 0x02
+
+EVENT_SBO0_CREDIT_OCCUPANCY 0x6A BBOX
+UMASK_SBO0_CREDIT_OCCUPANCY_AD 0x01
+UMASK_SBO0_CREDIT_OCCUPANCY_BL 0x02
+
+EVENT_SBO1_CREDITS_ACQUIRED 0x69 BBOX
+UMASK_SBO1_CREDITS_ACQUIRED_AD 0x01
+UMASK_SBO1_CREDITS_ACQUIRED_BL 0x02
+
+EVENT_SBO1_CREDIT_OCCUPANCY 0x6B BBOX
+UMASK_SBO1_CREDIT_OCCUPANCY_AD 0x01
+UMASK_SBO1_CREDIT_OCCUPANCY_BL 0x02
+
+EVENT_SNOOPS_RSP_AFTER_DATA 0x0A BBOX
+UMASK_SNOOPS_RSP_AFTER_DATA_LOCAL 0x01
+UMASK_SNOOPS_RSP_AFTER_DATA_REMOTE 0x02
+
+EVENT_SNOOP_CYCLES_NE 0x08 BBOX
+UMASK_SNOOP_CYCLES_NE_LOCAL 0x01
+UMASK_SNOOP_CYCLES_NE_REMOTE 0x02
+UMASK_SNOOP_CYCLES_NE_ALL 0x03
+
+EVENT_SNOOP_OCCUPANCY 0x09 BBOX
+UMASK_SNOOP_OCCUPANCY_LOCAL 0x01
+UMASK_SNOOP_OCCUPANCY_REMOTE 0x02
+
+EVENT_SNOOP_RESP 0x21 BBOX
+UMASK_SNOOP_RESP_RSPI 0x01
+UMASK_SNOOP_RESP_RSPS 0x02
+UMASK_SNOOP_RESP_RSPIFWD 0x04
+UMASK_SNOOP_RESP_RSPSFWD 0x08
+UMASK_SNOOP_RESP_RSP_WB 0x10
+UMASK_SNOOP_RESP_RSP_FWD_WB 0x20
+UMASK_SNOOP_RESP_RSPCNFLCT 0x40
+
+EVENT_SNP_RESP_RECV_LOCAL 0x60 BBOX
+UMASK_SNP_RESP_RECV_LOCAL_RSPI 0x01
+UMASK_SNP_RESP_RECV_LOCAL_RSPS 0x02
+UMASK_SNP_RESP_RECV_LOCAL_RSPIFWD 0x04
+UMASK_SNP_RESP_RECV_LOCAL_RSPSFWD 0x08
+UMASK_SNP_RESP_RECV_LOCAL_RSPXWB 0x10
+UMASK_SNP_RESP_RECV_LOCAL_RSPXFWDXWB 0x20
+UMASK_SNP_RESP_RECV_LOCAL_RSPCNFLCT 0x40
+UMASK_SNP_RESP_RECV_LOCAL_OTHER 0x80
+
+EVENT_STALL_NO_SBO_CREDIT 0x6C BBOX
+UMASK_STALL_NO_SBO_CREDIT_SBO0_AD 0x01
+UMASK_STALL_NO_SBO_CREDIT_SBO1_AD 0x02
+UMASK_STALL_NO_SBO_CREDIT_SBO0_BL 0x04
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL 0x08
+
+EVENT_TAD_REQUESTS_G0 0x1B BBOX
+UMASK_TAD_REQUESTS_G0_REGION0 0x01
+UMASK_TAD_REQUESTS_G0_REGION1 0x02
+UMASK_TAD_REQUESTS_G0_REGION2 0x04
+UMASK_TAD_REQUESTS_G0_REGION3 0x08
+UMASK_TAD_REQUESTS_G0_REGION4 0x10
+UMASK_TAD_REQUESTS_G0_REGION5 0x20
+UMASK_TAD_REQUESTS_G0_REGION6 0x40
+UMASK_TAD_REQUESTS_G0_REGION7 0x80
+
+EVENT_TAD_REQUESTS_G1 0x1C BBOX
+UMASK_TAD_REQUESTS_G1_REGION8 0x01
+UMASK_TAD_REQUESTS_G1_REGION9 0x02
+UMASK_TAD_REQUESTS_G1_REGION10 0x04
+UMASK_TAD_REQUESTS_G1_REGION11 0x08
+
+EVENT_TRACKER_CYCLES_FULL 0x02 BBOX
+UMASK_TRACKER_CYCLES_FULL_GP 0x01
+UMASK_TRACKER_CYCLES_FULL_ALL 0x02
+
+EVENT_TRACKER_CYCLES_NE 0x03 BBOX
+UMASK_TRACKER_CYCLES_NE_LOCAL 0x01
+UMASK_TRACKER_CYCLES_NE_REMOTE 0x02
+UMASK_TRACKER_CYCLES_NE_ALL 0x03
+
+EVENT_TRACKER_OCCUPANCY 0x04 BBOX
+UMASK_TRACKER_OCCUPANCY_READS_LOCAL 0x04
+UMASK_TRACKER_OCCUPANCY_READS_REMOTE 0x08
+UMASK_TRACKER_OCCUPANCY_WRITES_LOCAL 0x10
+UMASK_TRACKER_OCCUPANCY_WRITES_REMOTE 0x20
+UMASK_TRACKER_OCCUPANCY_INVITOE_LOCAL 0x40
+UMASK_TRACKER_OCCUPANCY_INVITOE_REMOTE 0x80
+
+EVENT_TRACKER_PENDING_OCCUPANCY 0x05 BBOX
+UMASK_TRACKER_PENDING_OCCUPANCY_LOCAL 0x01
+UMASK_TRACKER_PENDING_OCCUPANCY_REMOTE 0x02
+
+EVENT_TXR_AD_CYCLES_FULL 0x2A BBOX
+UMASK_TXR_AD_CYCLES_FULL_SCHED0 0x01
+UMASK_TXR_AD_CYCLES_FULL_SCHED1 0x02
+UMASK_TXR_AD_CYCLES_FULL_ALL 0x03
+
+EVENT_TXR_AK 0x0E BBOX
+UMASK_TXR_AK 0x00
+
+EVENT_TXR_AK_CYCLES_FULL 0x32 BBOX
+UMASK_TXR_AK_CYCLES_FULL_SCHED0 0x01
+UMASK_TXR_AK_CYCLES_FULL_SCHED1 0x02
+UMASK_TXR_AK_CYCLES_FULL_ALL 0x03
+
+EVENT_TXR_BL 0x10 BBOX
+UMASK_TXR_BL_DRS_CACHE 0x01
+UMASK_TXR_BL_DRS_CORE 0x02
+UMASK_TXR_BL_DRS_QPI 0x04
+
+EVENT_TXR_BL_CYCLES_FULL 0x36 BBOX
+UMASK_TXR_BL_CYCLES_FULL_SCHED0 0x01
+UMASK_TXR_BL_CYCLES_FULL_SCHED1 0x02
+UMASK_TXR_BL_CYCLES_FULL_ALL 0x03
+
+EVENT_TXR_BL_OCCUPANCY 0x34 BBOX
+UMASK_TXR_BL_OCCUPANCY 0x00
+
+EVENT_TXR_STARVED 0x6D BBOX
+UMASK_TXR_STARVED_AK 0x01
+UMASK_TXR_STARVED_BL 0x02
+
+EVENT_DRAM_CLOCKTICKS 0x00 MBOX
+UMASK_DRAM_CLOCKTICKS 0x00
+
+EVENT_ACT_COUNT 0x01 MBOX
+UMASK_ACT_COUNT_RD 0x01
+UMASK_ACT_COUNT_WR 0x02
+UMASK_ACT_COUNT_BYP 0x08
+
+EVENT_BYP_CMDS 0xA1 MBOX
+UMASK_BYP_CMDS_ACT 0x01
+UMASK_BYP_CMDS_CAS 0x02
+UMASK_BYP_CMDS_PRE 0x04
+
+EVENT_CAS_COUNT 0x04 MBOX
+UMASK_CAS_COUNT_RD_REG 0x01
+UMASK_CAS_COUNT_RD_UNDERFILL 0x02
+UMASK_CAS_COUNT_RD 0x03
+UMASK_CAS_COUNT_RD_WMM 0x10
+UMASK_CAS_COUNT_RD_RMM 0x20
+UMASK_CAS_COUNT_WR_WMM 0x04
+UMASK_CAS_COUNT_WR_RMM 0x08
+UMASK_CAS_COUNT_WR 0x0C
+UMASK_CAS_COUNT_ALL 0x0F
+
+EVENT_DRAM_PRE_ALL 0x06 MBOX
+UMASK_DRAM_PRE_ALL 0x00
+
+EVENT_DRAM_REFRESH 0x05 MBOX
+UMASK_DRAM_REFRESH_PANIC 0x02
+UMASK_DRAM_REFRESH_HIGH 0x04
+
+EVENT_ECC_CORRECTABLE_ERRORS 0x09 MBOX
+UMASK_ECC_CORRECTABLE_ERRORS 0x00
+
+EVENT_MAJOR_MODES 0x07 MBOX
+UMASK_MAJOR_MODES_READ 0x01
+UMASK_MAJOR_MODES_WRITE 0x02
+UMASK_MAJOR_MODES_PARTIAL 0x03
+UMASK_MAJOR_MODES_ISOCH 0x04
+
+EVENT_POWER_CHANNEL_DLLOFF 0x84 MBOX
+UMASK_POWER_CHANNEL_DLLOFF 0x00
+
+EVENT_POWER_CHANNEL_PPD 0x85 MBOX
+UMASK_POWER_CHANNEL_PPD 0x00
+
+EVENT_POWER_CKE_CYCLES 0x83 MBOX
+UMASK_POWER_CKE_CYCLES_RANK0 0x01
+UMASK_POWER_CKE_CYCLES_RANK1 0x02
+UMASK_POWER_CKE_CYCLES_RANK2 0x04
+UMASK_POWER_CKE_CYCLES_RANK3 0x08
+UMASK_POWER_CKE_CYCLES_RANK4 0x10
+UMASK_POWER_CKE_CYCLES_RANK5 0x20
+UMASK_POWER_CKE_CYCLES_RANK6 0x40
+UMASK_POWER_CKE_CYCLES_RANK7 0x80
+
+EVENT_POWER_CRITICAL_THROTTLE_CYCLES 0x86 MBOX
+UMASK_POWER_CRITICAL_THROTTLE_CYCLES 0x00
+
+EVENT_POWER_PCU_THROTTLING 0x42 MBOX
+UMASK_POWER_PCU_THROTTLING 0x00
+
+EVENT_POWER_SELF_REFRESH 0x43 MBOX
+UMASK_POWER_SELF_REFRESH 0x00
+
+EVENT_POWER_THROTTLE_CYCLES 0x41 MBOX
+UMASK_POWER_THROTTLE_CYCLES_RANK0 0x01
+UMASK_POWER_THROTTLE_CYCLES_RANK1 0x02
+UMASK_POWER_THROTTLE_CYCLES_RANK2 0x04
+UMASK_POWER_THROTTLE_CYCLES_RANK3 0x08
+UMASK_POWER_THROTTLE_CYCLES_RANK4 0x10
+UMASK_POWER_THROTTLE_CYCLES_RANK5 0x20
+UMASK_POWER_THROTTLE_CYCLES_RANK6 0x40
+UMASK_POWER_THROTTLE_CYCLES_RANK7 0x80
+
+EVENT_PREEMPTION 0x08 MBOX
+UMASK_PREEMPTION_RD_PREEMPT_RD 0x01
+UMASK_PREEMPTION_RD_PREEMPT_WR 0x02
+
+EVENT_PRE_COUNT 0x02 MBOX
+UMASK_PRE_COUNT_PAGE_MISS 0x01
+UMASK_PRE_COUNT_PAGE_CLOSE 0x02
+UMASK_PRE_COUNT_RD 0x04
+UMASK_PRE_COUNT_WR 0x08
+UMASK_PRE_COUNT_BYP 0x10
+
+EVENT_RD_CAS_PRIO 0xA0 MBOX
+UMASK_RD_CAS_PRIO_LOW 0x01
+UMASK_RD_CAS_PRIO_MED 0x02
+UMASK_RD_CAS_PRIO_HIGH 0x04
+UMASK_RD_CAS_PRIO_PANIC 0x08
+
+EVENT_RD_CAS_RANK0 0xB0 MBOX
+UMASK_RD_CAS_RANK0_BANK0 0x00
+UMASK_RD_CAS_RANK0_BANK1 0x01
+UMASK_RD_CAS_RANK0_BANK2 0x02
+UMASK_RD_CAS_RANK0_BANK3 0x03
+UMASK_RD_CAS_RANK0_BANK4 0x04
+UMASK_RD_CAS_RANK0_BANK5 0x05
+UMASK_RD_CAS_RANK0_BANK6 0x06
+UMASK_RD_CAS_RANK0_BANK7 0x07
+UMASK_RD_CAS_RANK0_BANK8 0x08
+UMASK_RD_CAS_RANK0_BANK9 0x09
+UMASK_RD_CAS_RANK0_BANK10 0x0A
+UMASK_RD_CAS_RANK0_BANK11 0x0B
+UMASK_RD_CAS_RANK0_BANK12 0x0C
+UMASK_RD_CAS_RANK0_BANK13 0x0D
+UMASK_RD_CAS_RANK0_BANK14 0x0E
+UMASK_RD_CAS_RANK0_BANK15 0x0F
+UMASK_RD_CAS_RANK0_ALLBANKS 0x10
+UMASK_RD_CAS_RANK0_BANKG0 0x11
+UMASK_RD_CAS_RANK0_BANKG1 0x12
+UMASK_RD_CAS_RANK0_BANKG2 0x13
+UMASK_RD_CAS_RANK0_BANKG3 0x14
+
+EVENT_RD_CAS_RANK1 0xB1 MBOX
+UMASK_RD_CAS_RANK1_BANK0 0x00
+UMASK_RD_CAS_RANK1_BANK1 0x01
+UMASK_RD_CAS_RANK1_BANK2 0x02
+UMASK_RD_CAS_RANK1_BANK3 0x03
+UMASK_RD_CAS_RANK1_BANK4 0x04
+UMASK_RD_CAS_RANK1_BANK5 0x05
+UMASK_RD_CAS_RANK1_BANK6 0x06
+UMASK_RD_CAS_RANK1_BANK7 0x07
+UMASK_RD_CAS_RANK1_BANK8 0x08
+UMASK_RD_CAS_RANK1_BANK9 0x09
+UMASK_RD_CAS_RANK1_BANK10 0x0A
+UMASK_RD_CAS_RANK1_BANK11 0x0B
+UMASK_RD_CAS_RANK1_BANK12 0x0C
+UMASK_RD_CAS_RANK1_BANK13 0x0D
+UMASK_RD_CAS_RANK1_BANK14 0x0E
+UMASK_RD_CAS_RANK1_BANK15 0x0F
+UMASK_RD_CAS_RANK1_ALLBANKS 0x10
+UMASK_RD_CAS_RANK1_BANKG0 0x11
+UMASK_RD_CAS_RANK1_BANKG1 0x12
+UMASK_RD_CAS_RANK1_BANKG2 0x13
+UMASK_RD_CAS_RANK1_BANKG3 0x14
+
+EVENT_RD_CAS_RANK2 0xB2 MBOX
+UMASK_RD_CAS_RANK2_BANK0 0x00
+UMASK_RD_CAS_RANK2_BANK1 0x01
+UMASK_RD_CAS_RANK2_BANK2 0x02
+UMASK_RD_CAS_RANK2_BANK3 0x03
+UMASK_RD_CAS_RANK2_BANK4 0x04
+UMASK_RD_CAS_RANK2_BANK5 0x05
+UMASK_RD_CAS_RANK2_BANK6 0x06
+UMASK_RD_CAS_RANK2_BANK7 0x07
+UMASK_RD_CAS_RANK2_BANK8 0x08
+UMASK_RD_CAS_RANK2_BANK9 0x09
+UMASK_RD_CAS_RANK2_BANK10 0x0A
+UMASK_RD_CAS_RANK2_BANK11 0x0B
+UMASK_RD_CAS_RANK2_BANK12 0x0C
+UMASK_RD_CAS_RANK2_BANK13 0x0D
+UMASK_RD_CAS_RANK2_BANK14 0x0E
+UMASK_RD_CAS_RANK2_BANK15 0x0F
+UMASK_RD_CAS_RANK2_ALLBANKS 0x10
+UMASK_RD_CAS_RANK2_BANKG0 0x11
+UMASK_RD_CAS_RANK2_BANKG1 0x12
+UMASK_RD_CAS_RANK2_BANKG2 0x13
+UMASK_RD_CAS_RANK2_BANKG3 0x14
+
+EVENT_RD_CAS_RANK3 0xB3 MBOX
+UMASK_RD_CAS_RANK3_BANK0 0x00
+UMASK_RD_CAS_RANK3_BANK1 0x01
+UMASK_RD_CAS_RANK3_BANK2 0x02
+UMASK_RD_CAS_RANK3_BANK3 0x03
+UMASK_RD_CAS_RANK3_BANK4 0x04
+UMASK_RD_CAS_RANK3_BANK5 0x05
+UMASK_RD_CAS_RANK3_BANK6 0x06
+UMASK_RD_CAS_RANK3_BANK7 0x07
+UMASK_RD_CAS_RANK3_BANK8 0x08
+UMASK_RD_CAS_RANK3_BANK9 0x09
+UMASK_RD_CAS_RANK3_BANK10 0x0A
+UMASK_RD_CAS_RANK3_BANK11 0x0B
+UMASK_RD_CAS_RANK3_BANK12 0x0C
+UMASK_RD_CAS_RANK3_BANK13 0x0D
+UMASK_RD_CAS_RANK3_BANK14 0x0E
+UMASK_RD_CAS_RANK3_BANK15 0x0F
+UMASK_RD_CAS_RANK3_ALLBANKS 0x10
+UMASK_RD_CAS_RANK3_BANKG0 0x11
+UMASK_RD_CAS_RANK3_BANKG1 0x12
+UMASK_RD_CAS_RANK3_BANKG2 0x13
+UMASK_RD_CAS_RANK3_BANKG3 0x14
+
+EVENT_RD_CAS_RANK4 0xB4 MBOX
+UMASK_RD_CAS_RANK4_BANK0 0x00
+UMASK_RD_CAS_RANK4_BANK1 0x01
+UMASK_RD_CAS_RANK4_BANK2 0x02
+UMASK_RD_CAS_RANK4_BANK3 0x03
+UMASK_RD_CAS_RANK4_BANK4 0x04
+UMASK_RD_CAS_RANK4_BANK5 0x05
+UMASK_RD_CAS_RANK4_BANK6 0x06
+UMASK_RD_CAS_RANK4_BANK7 0x07
+UMASK_RD_CAS_RANK4_BANK8 0x08
+UMASK_RD_CAS_RANK4_BANK9 0x09
+UMASK_RD_CAS_RANK4_BANK10 0x0A
+UMASK_RD_CAS_RANK4_BANK11 0x0B
+UMASK_RD_CAS_RANK4_BANK12 0x0C
+UMASK_RD_CAS_RANK4_BANK13 0x0D
+UMASK_RD_CAS_RANK4_BANK14 0x0E
+UMASK_RD_CAS_RANK4_BANK15 0x0F
+UMASK_RD_CAS_RANK4_ALLBANKS 0x10
+UMASK_RD_CAS_RANK4_BANKG0 0x11
+UMASK_RD_CAS_RANK4_BANKG1 0x12
+UMASK_RD_CAS_RANK4_BANKG2 0x13
+UMASK_RD_CAS_RANK4_BANKG3 0x14
+
+EVENT_RD_CAS_RANK5 0xB5 MBOX
+UMASK_RD_CAS_RANK5_BANK0 0x00
+UMASK_RD_CAS_RANK5_BANK1 0x01
+UMASK_RD_CAS_RANK5_BANK2 0x02
+UMASK_RD_CAS_RANK5_BANK3 0x03
+UMASK_RD_CAS_RANK5_BANK4 0x04
+UMASK_RD_CAS_RANK5_BANK5 0x05
+UMASK_RD_CAS_RANK5_BANK6 0x06
+UMASK_RD_CAS_RANK5_BANK7 0x07
+UMASK_RD_CAS_RANK5_BANK8 0x08
+UMASK_RD_CAS_RANK5_BANK9 0x09
+UMASK_RD_CAS_RANK5_BANK10 0x0A
+UMASK_RD_CAS_RANK5_BANK11 0x0B
+UMASK_RD_CAS_RANK5_BANK12 0x0C
+UMASK_RD_CAS_RANK5_BANK13 0x0D
+UMASK_RD_CAS_RANK5_BANK14 0x0E
+UMASK_RD_CAS_RANK5_BANK15 0x0F
+UMASK_RD_CAS_RANK5_ALLBANKS 0x10
+UMASK_RD_CAS_RANK5_BANKG0 0x11
+UMASK_RD_CAS_RANK5_BANKG1 0x12
+UMASK_RD_CAS_RANK5_BANKG2 0x13
+UMASK_RD_CAS_RANK5_BANKG3 0x14
+
+EVENT_RD_CAS_RANK6 0xB6 MBOX
+UMASK_RD_CAS_RANK6_BANK0 0x00
+UMASK_RD_CAS_RANK6_BANK1 0x01
+UMASK_RD_CAS_RANK6_BANK2 0x02
+UMASK_RD_CAS_RANK6_BANK3 0x03
+UMASK_RD_CAS_RANK6_BANK4 0x04
+UMASK_RD_CAS_RANK6_BANK5 0x05
+UMASK_RD_CAS_RANK6_BANK6 0x06
+UMASK_RD_CAS_RANK6_BANK7 0x07
+UMASK_RD_CAS_RANK6_BANK8 0x08
+UMASK_RD_CAS_RANK6_BANK9 0x09
+UMASK_RD_CAS_RANK6_BANK10 0x0A
+UMASK_RD_CAS_RANK6_BANK11 0x0B
+UMASK_RD_CAS_RANK6_BANK12 0x0C
+UMASK_RD_CAS_RANK6_BANK13 0x0D
+UMASK_RD_CAS_RANK6_BANK14 0x0E
+UMASK_RD_CAS_RANK6_BANK15 0x0F
+UMASK_RD_CAS_RANK6_ALLBANKS 0x10
+UMASK_RD_CAS_RANK6_BANKG0 0x11
+UMASK_RD_CAS_RANK6_BANKG1 0x12
+UMASK_RD_CAS_RANK6_BANKG2 0x13
+UMASK_RD_CAS_RANK6_BANKG3 0x14
+
+EVENT_RD_CAS_RANK7 0xB7 MBOX
+UMASK_RD_CAS_RANK7_BANK0 0x00
+UMASK_RD_CAS_RANK7_BANK1 0x01
+UMASK_RD_CAS_RANK7_BANK2 0x02
+UMASK_RD_CAS_RANK7_BANK3 0x03
+UMASK_RD_CAS_RANK7_BANK4 0x04
+UMASK_RD_CAS_RANK7_BANK5 0x05
+UMASK_RD_CAS_RANK7_BANK6 0x06
+UMASK_RD_CAS_RANK7_BANK7 0x07
+UMASK_RD_CAS_RANK7_BANK8 0x08
+UMASK_RD_CAS_RANK7_BANK9 0x09
+UMASK_RD_CAS_RANK7_BANK10 0x0A
+UMASK_RD_CAS_RANK7_BANK11 0x0B
+UMASK_RD_CAS_RANK7_BANK12 0x0C
+UMASK_RD_CAS_RANK7_BANK13 0x0D
+UMASK_RD_CAS_RANK7_BANK14 0x0E
+UMASK_RD_CAS_RANK7_BANK15 0x0F
+UMASK_RD_CAS_RANK7_ALLBANKS 0x10
+UMASK_RD_CAS_RANK7_BANKG0 0x11
+UMASK_RD_CAS_RANK7_BANKG1 0x12
+UMASK_RD_CAS_RANK7_BANKG2 0x13
+UMASK_RD_CAS_RANK7_BANKG3 0x14
+
+EVENT_RPQ_CYCLES_NE 0x11 MBOX
+UMASK_RPQ_CYCLES_NE 0x00
+
+EVENT_RPQ_INSERTS 0x10 MBOX
+UMASK_RPQ_INSERTS 0x00
+
+EVENT_VMSE_MXB_WR_OCCUPANCY 0x91 MBOX
+UMASK_VMSE_MXB_WR_OCCUPANCY 0x00
+
+EVENT_VMSE_WR_PUSH 0x90 MBOX
+UMASK_VMSE_WR_PUSH_WMM 0x01
+UMASK_VMSE_WR_PUSH_RMM 0x02
+
+EVENT_WMM_TO_RMM 0xC0 MBOX
+UMASK_WMM_TO_RMM_LOW_THRESH 0x01
+UMASK_WMM_TO_RMM_STARVE 0x02
+UMASK_WMM_TO_RMM_VMSE_RETRY 0x04
+
+# Undocumented event, mentioned in the metrics table but not defined
+EVENT_WPQ_INSERTS 0x20 MBOX
+UMASK_WPQ_INSERTS 0x00
+
+EVENT_WPQ_CYCLES_FULL 0x22 MBOX
+UMASK_WPQ_CYCLES_FULL 0x00
+
+EVENT_WPQ_CYCLES_NE 0x21 MBOX
+UMASK_WPQ_CYCLES_NE 0x00
+
+EVENT_WPQ_READ_HIT 0x23 MBOX
+UMASK_WPQ_READ_HIT 0x00
+
+EVENT_WPQ_WRITE_HIT 0x24 MBOX
+UMASK_WPQ_WRITE_HIT 0x00
+
+EVENT_WRONG_MM 0xC1 MBOX
+UMASK_WRONG_MM 0x00
+
+EVENT_WR_CAS_RANK0 0xB8 MBOX
+UMASK_WR_CAS_RANK0_BANK0 0x00
+UMASK_WR_CAS_RANK0_BANK1 0x01
+UMASK_WR_CAS_RANK0_BANK2 0x02
+UMASK_WR_CAS_RANK0_BANK3 0x03
+UMASK_WR_CAS_RANK0_BANK4 0x04
+UMASK_WR_CAS_RANK0_BANK5 0x05
+UMASK_WR_CAS_RANK0_BANK6 0x06
+UMASK_WR_CAS_RANK0_BANK7 0x07
+UMASK_WR_CAS_RANK0_BANK8 0x08
+UMASK_WR_CAS_RANK0_BANK9 0x09
+UMASK_WR_CAS_RANK0_BANK10 0x0A
+UMASK_WR_CAS_RANK0_BANK11 0x0B
+UMASK_WR_CAS_RANK0_BANK12 0x0C
+UMASK_WR_CAS_RANK0_BANK13 0x0D
+UMASK_WR_CAS_RANK0_BANK14 0x0E
+UMASK_WR_CAS_RANK0_BANK15 0x0F
+UMASK_WR_CAS_RANK0_ALLBANKS 0x10
+UMASK_WR_CAS_RANK0_BANKG0 0x11
+UMASK_WR_CAS_RANK0_BANKG1 0x12
+UMASK_WR_CAS_RANK0_BANKG2 0x13
+UMASK_WR_CAS_RANK0_BANKG3 0x14
+
+EVENT_WR_CAS_RANK1 0xB9 MBOX
+UMASK_WR_CAS_RANK1_BANK0 0x00
+UMASK_WR_CAS_RANK1_BANK1 0x01
+UMASK_WR_CAS_RANK1_BANK2 0x02
+UMASK_WR_CAS_RANK1_BANK3 0x03
+UMASK_WR_CAS_RANK1_BANK4 0x04
+UMASK_WR_CAS_RANK1_BANK5 0x05
+UMASK_WR_CAS_RANK1_BANK6 0x06
+UMASK_WR_CAS_RANK1_BANK7 0x07
+UMASK_WR_CAS_RANK1_BANK8 0x08
+UMASK_WR_CAS_RANK1_BANK9 0x09
+UMASK_WR_CAS_RANK1_BANK10 0x0A
+UMASK_WR_CAS_RANK1_BANK11 0x0B
+UMASK_WR_CAS_RANK1_BANK12 0x0C
+UMASK_WR_CAS_RANK1_BANK13 0x0D
+UMASK_WR_CAS_RANK1_BANK14 0x0E
+UMASK_WR_CAS_RANK1_BANK15 0x0F
+UMASK_WR_CAS_RANK1_ALLBANKS 0x10
+UMASK_WR_CAS_RANK1_BANKG0 0x11
+UMASK_WR_CAS_RANK1_BANKG1 0x12
+UMASK_WR_CAS_RANK1_BANKG2 0x13
+UMASK_WR_CAS_RANK1_BANKG3 0x14
+
+EVENT_WR_CAS_RANK2 0xBA MBOX
+UMASK_WR_CAS_RANK2_BANK0 0x00
+UMASK_WR_CAS_RANK2_BANK1 0x01
+UMASK_WR_CAS_RANK2_BANK2 0x02
+UMASK_WR_CAS_RANK2_BANK3 0x03
+UMASK_WR_CAS_RANK2_BANK4 0x04
+UMASK_WR_CAS_RANK2_BANK5 0x05
+UMASK_WR_CAS_RANK2_BANK6 0x06
+UMASK_WR_CAS_RANK2_BANK7 0x07
+UMASK_WR_CAS_RANK2_BANK8 0x08
+UMASK_WR_CAS_RANK2_BANK9 0x09
+UMASK_WR_CAS_RANK2_BANK10 0x0A
+UMASK_WR_CAS_RANK2_BANK11 0x0B
+UMASK_WR_CAS_RANK2_BANK12 0x0C
+UMASK_WR_CAS_RANK2_BANK13 0x0D
+UMASK_WR_CAS_RANK2_BANK14 0x0E
+UMASK_WR_CAS_RANK2_BANK15 0x0F
+UMASK_WR_CAS_RANK2_ALLBANKS 0x10
+UMASK_WR_CAS_RANK2_BANKG0 0x11
+UMASK_WR_CAS_RANK2_BANKG1 0x12
+UMASK_WR_CAS_RANK2_BANKG2 0x13
+UMASK_WR_CAS_RANK2_BANKG3 0x14
+
+EVENT_WR_CAS_RANK3 0xBB MBOX
+UMASK_WR_CAS_RANK3_BANK0 0x00
+UMASK_WR_CAS_RANK3_BANK1 0x01
+UMASK_WR_CAS_RANK3_BANK2 0x02
+UMASK_WR_CAS_RANK3_BANK3 0x03
+UMASK_WR_CAS_RANK3_BANK4 0x04
+UMASK_WR_CAS_RANK3_BANK5 0x05
+UMASK_WR_CAS_RANK3_BANK6 0x06
+UMASK_WR_CAS_RANK3_BANK7 0x07
+UMASK_WR_CAS_RANK3_BANK8 0x08
+UMASK_WR_CAS_RANK3_BANK9 0x09
+UMASK_WR_CAS_RANK3_BANK10 0x0A
+UMASK_WR_CAS_RANK3_BANK11 0x0B
+UMASK_WR_CAS_RANK3_BANK12 0x0C
+UMASK_WR_CAS_RANK3_BANK13 0x0D
+UMASK_WR_CAS_RANK3_BANK14 0x0E
+UMASK_WR_CAS_RANK3_BANK15 0x0F
+UMASK_WR_CAS_RANK3_ALLBANKS 0x10
+UMASK_WR_CAS_RANK3_BANKG0 0x11
+UMASK_WR_CAS_RANK3_BANKG1 0x12
+UMASK_WR_CAS_RANK3_BANKG2 0x13
+UMASK_WR_CAS_RANK3_BANKG3 0x14
+
+EVENT_WR_CAS_RANK4 0xBC MBOX
+UMASK_WR_CAS_RANK4_BANK0 0x00
+UMASK_WR_CAS_RANK4_BANK1 0x01
+UMASK_WR_CAS_RANK4_BANK2 0x02
+UMASK_WR_CAS_RANK4_BANK3 0x03
+UMASK_WR_CAS_RANK4_BANK4 0x04
+UMASK_WR_CAS_RANK4_BANK5 0x05
+UMASK_WR_CAS_RANK4_BANK6 0x06
+UMASK_WR_CAS_RANK4_BANK7 0x07
+UMASK_WR_CAS_RANK4_BANK8 0x08
+UMASK_WR_CAS_RANK4_BANK9 0x09
+UMASK_WR_CAS_RANK4_BANK10 0x0A
+UMASK_WR_CAS_RANK4_BANK11 0x0B
+UMASK_WR_CAS_RANK4_BANK12 0x0C
+UMASK_WR_CAS_RANK4_BANK13 0x0D
+UMASK_WR_CAS_RANK4_BANK14 0x0E
+UMASK_WR_CAS_RANK4_BANK15 0x0F
+UMASK_WR_CAS_RANK4_ALLBANKS 0x10
+UMASK_WR_CAS_RANK4_BANKG0 0x11
+UMASK_WR_CAS_RANK4_BANKG1 0x12
+UMASK_WR_CAS_RANK4_BANKG2 0x13
+UMASK_WR_CAS_RANK4_BANKG3 0x14
+
+EVENT_WR_CAS_RANK5 0xBD MBOX
+UMASK_WR_CAS_RANK5_BANK0 0x00
+UMASK_WR_CAS_RANK5_BANK1 0x01
+UMASK_WR_CAS_RANK5_BANK2 0x02
+UMASK_WR_CAS_RANK5_BANK3 0x03
+UMASK_WR_CAS_RANK5_BANK4 0x04
+UMASK_WR_CAS_RANK5_BANK5 0x05
+UMASK_WR_CAS_RANK5_BANK6 0x06
+UMASK_WR_CAS_RANK5_BANK7 0x07
+UMASK_WR_CAS_RANK5_BANK8 0x08
+UMASK_WR_CAS_RANK5_BANK9 0x09
+UMASK_WR_CAS_RANK5_BANK10 0x0A
+UMASK_WR_CAS_RANK5_BANK11 0x0B
+UMASK_WR_CAS_RANK5_BANK12 0x0C
+UMASK_WR_CAS_RANK5_BANK13 0x0D
+UMASK_WR_CAS_RANK5_BANK14 0x0E
+UMASK_WR_CAS_RANK5_BANK15 0x0F
+UMASK_WR_CAS_RANK5_ALLBANKS 0x10
+UMASK_WR_CAS_RANK5_BANKG0 0x11
+UMASK_WR_CAS_RANK5_BANKG1 0x12
+UMASK_WR_CAS_RANK5_BANKG2 0x13
+UMASK_WR_CAS_RANK5_BANKG3 0x14
+
+EVENT_WR_CAS_RANK6 0xBE MBOX
+UMASK_WR_CAS_RANK6_BANK0 0x00
+UMASK_WR_CAS_RANK6_BANK1 0x01
+UMASK_WR_CAS_RANK6_BANK2 0x02
+UMASK_WR_CAS_RANK6_BANK3 0x03
+UMASK_WR_CAS_RANK6_BANK4 0x04
+UMASK_WR_CAS_RANK6_BANK5 0x05
+UMASK_WR_CAS_RANK6_BANK6 0x06
+UMASK_WR_CAS_RANK6_BANK7 0x07
+UMASK_WR_CAS_RANK6_BANK8 0x08
+UMASK_WR_CAS_RANK6_BANK9 0x09
+UMASK_WR_CAS_RANK6_BANK10 0x0A
+UMASK_WR_CAS_RANK6_BANK11 0x0B
+UMASK_WR_CAS_RANK6_BANK12 0x0C
+UMASK_WR_CAS_RANK6_BANK13 0x0D
+UMASK_WR_CAS_RANK6_BANK14 0x0E
+UMASK_WR_CAS_RANK6_BANK15 0x0F
+UMASK_WR_CAS_RANK6_ALLBANKS 0x10
+UMASK_WR_CAS_RANK6_BANKG0 0x11
+UMASK_WR_CAS_RANK6_BANKG1 0x12
+UMASK_WR_CAS_RANK6_BANKG2 0x13
+UMASK_WR_CAS_RANK6_BANKG3 0x14
+
+EVENT_WR_CAS_RANK7 0xBF MBOX
+UMASK_WR_CAS_RANK7_BANK0 0x00
+UMASK_WR_CAS_RANK7_BANK1 0x01
+UMASK_WR_CAS_RANK7_BANK2 0x02
+UMASK_WR_CAS_RANK7_BANK3 0x03
+UMASK_WR_CAS_RANK7_BANK4 0x04
+UMASK_WR_CAS_RANK7_BANK5 0x05
+UMASK_WR_CAS_RANK7_BANK6 0x06
+UMASK_WR_CAS_RANK7_BANK7 0x07
+UMASK_WR_CAS_RANK7_BANK8 0x08
+UMASK_WR_CAS_RANK7_BANK9 0x09
+UMASK_WR_CAS_RANK7_BANK10 0x0A
+UMASK_WR_CAS_RANK7_BANK11 0x0B
+UMASK_WR_CAS_RANK7_BANK12 0x0C
+UMASK_WR_CAS_RANK7_BANK13 0x0D
+UMASK_WR_CAS_RANK7_BANK14 0x0E
+UMASK_WR_CAS_RANK7_BANK15 0x0F
+UMASK_WR_CAS_RANK7_ALLBANKS 0x10
+UMASK_WR_CAS_RANK7_BANKG0 0x11
+UMASK_WR_CAS_RANK7_BANKG1 0x12
+UMASK_WR_CAS_RANK7_BANKG2 0x13
+UMASK_WR_CAS_RANK7_BANKG3 0x14
+
+EVENT_PBOX_CLOCKTICKS 0x01 PBOX
+UMASK_PBOX_CLOCKTICKS 0x00
+
+EVENT_IIO_CREDIT 0x2D PBOX0|PBOX1
+UMASK_IIO_CREDIT_PRQ_QPI0 0x01
+UMASK_IIO_CREDIT_PRQ_QPI1 0x02
+UMASK_IIO_CREDIT_ISOCH_QPI0 0x04
+UMASK_IIO_CREDIT_ISOCH_QPI1 0x08
+
+EVENT_RING_AD_USED 0x07 PBOX
+UMASK_RING_AD_USED_CW_EVEN 0x01
+UMASK_RING_AD_USED_CW_ODD 0x02
+UMASK_RING_AD_USED_CW 0x03
+UMASK_RING_AD_USED_CCW_EVEN 0x04
+UMASK_RING_AD_USED_CCW_ODD 0x08
+UMASK_RING_AD_USED_CCW 0x0C
+
+EVENT_RING_AK_BOUNCES 0x12 PBOX
+UMASK_RING_AK_BOUNCES_UP 0x01
+UMASK_RING_AK_BOUNCES_DN 0x02
+
+EVENT_RING_AK_USED 0x08 PBOX
+UMASK_RING_AK_USED_CW_EVEN 0x01
+UMASK_RING_AK_USED_CW_ODD 0x02
+UMASK_RING_AK_USED_CW 0x03
+UMASK_RING_AK_USED_CCW_EVEN 0x04
+UMASK_RING_AK_USED_CCW_ODD 0x08
+UMASK_RING_AK_USED_CCW 0x0C
+
+EVENT_RING_BL_USED 0x09 PBOX
+UMASK_RING_BL_USED_CW_EVEN 0x01
+UMASK_RING_BL_USED_CW_ODD 0x02
+UMASK_RING_BL_USED_CW 0x03
+UMASK_RING_BL_USED_CCW_EVEN 0x04
+UMASK_RING_BL_USED_CCW_ODD 0x08
+UMASK_RING_BL_USED_CCW 0x0C
+
+EVENT_RING_IV_USED 0x0A PBOX
+UMASK_RING_IV_USED_CW 0x03
+UMASK_RING_IV_USED_CCW 0x0C
+UMASK_RING_IV_USED_ANY 0x0F
+
+EVENT_RXR_CYCLES_NE 0x10 PBOX0|PBOX1
+UMASK_RXR_CYCLES_NE_NCB 0x10
+UMASK_RXR_CYCLES_NE_NCS 0x20
+
+EVENT_RXR_INSERTS 0x11 PBOX0|PBOX1
+UMASK_RXR_INSERTS_NCB 0x10
+UMASK_RXR_INSERTS_NCS 0x20
+
+EVENT_RXR_OCCUPANCY 0x13 PBOX0
+UMASK_RXR_OCCUPANCY_DRS 0x08
+
+EVENT_TXR_CYCLES_FULL 0x25 PBOX0
+UMASK_TXR_CYCLES_FULL_AD 0x01
+UMASK_TXR_CYCLES_FULL_AK 0x02
+UMASK_TXR_CYCLES_FULL_BL 0x04
+
+EVENT_TXR_CYCLES_NE 0x23 PBOX0
+UMASK_TXR_CYCLES_NE_AD 0x01
+UMASK_TXR_CYCLES_NE_AK 0x02
+UMASK_TXR_CYCLES_NE_BL 0x04
+
+EVENT_TXR_NACK_CW 0x26 PBOX0|PBOX1
+UMASK_TXR_NACK_CW_DN_AD 0x01
+UMASK_TXR_NACK_CW_DN_BL 0x02
+UMASK_TXR_NACK_CW_DN_AK 0x04
+UMASK_TXR_NACK_CW_UP_AD 0x08
+UMASK_TXR_NACK_CW_UP_BL 0x10
+UMASK_TXR_NACK_CW_UP_AK 0x20
+
+EVENT_SBO0_CREDITS_ACQUIRED 0x28 PBOX0|PBOX1
+UMASK_SBO0_CREDITS_ACQUIRED_AD 0x01
+UMASK_SBO0_CREDITS_ACQUIRED_BL 0x02
+
+EVENT_STALL_NO_SBO_CREDIT 0x2C PBOX0|PBOX1
+UMASK_STALL_NO_SBO_CREDIT_SBO0_AD 0x01
+UMASK_STALL_NO_SBO_CREDIT_SBO1_AD 0x02
+UMASK_STALL_NO_SBO_CREDIT_SBO0_BL 0x04
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL 0x08
+
+EVENT_CACHE_TOTAL_OCCUPANCY 0x12 IBOX
+UMASK_CACHE_TOTAL_OCCUPANCY_ANY 0x01
+UMASK_CACHE_TOTAL_OCCUPANCY_SOURCE 0x02
+
+EVENT_COHERENT_OPS 0x13 IBOX
+UMASK_COHERENT_OPS_PCIRDCUR 0x01
+UMASK_COHERENT_OPS_CRD 0x02
+UMASK_COHERENT_OPS_DRD 0x04
+UMASK_COHERENT_OPS_RFO 0x08
+UMASK_COHERENT_OPS_PCITOM 0x10
+UMASK_COHERENT_OPS_PCIDCAHINT 0x20
+UMASK_COHERENT_OPS_WBMTOI 0x40
+UMASK_COHERENT_OPS_CLFLUSH 0x80
+
+EVENT_MISC0 0x14 IBOX
+UMASK_MISC0_FAST_REQ 0x01
+UMASK_MISC0_FAST_REJ 0x02
+UMASK_MISC0_2ND_RD_INSERT 0x04
+UMASK_MISC0_2ND_WR_INSERT 0x08
+UMASK_MISC0_2ND_ATOMIC_INSERT 0x10
+UMASK_MISC0_FAST_XFER 0x20
+UMASK_MISC0_PF_ACK_HINT 0x40
+UMASK_MISC0_PF_TIMEOUT 0x80
+
+EVENT_MISC1 0x15 IBOX
+UMASK_MISC1_SLOW_I 0x01
+UMASK_MISC1_SLOW_S 0x02
+UMASK_MISC1_SLOW_E 0x04
+UMASK_MISC1_SLOW_M 0x08
+UMASK_MISC1_LOST_FWD 0x10
+UMASK_MISC1_SEC_RCVD_INVLD 0x20
+UMASK_MISC1_SEC_RCVD_VLD 0x40
+UMASK_MISC1_DATA_THROTTLE 0x80
+
+EVENT_SNOOP_RESP 0x17 IBOX
+UMASK_SNOOP_RESP_MISS 0x01
+UMASK_SNOOP_RESP_HIT_I 0x02
+UMASK_SNOOP_RESP_HIT_ES 0x04
+UMASK_SNOOP_RESP_HIT_M 0x08
+UMASK_SNOOP_RESP_SNPCODE 0x10
+UMASK_SNOOP_RESP_SNPDATA 0x20
+UMASK_SNOOP_RESP_SNPINV 0x40
+
+EVENT_TRANSACTIONS 0x16 IBOX
+UMASK_TRANSACTIONS_READS 0x01
+UMASK_TRANSACTIONS_WRITES 0x02
+UMASK_TRANSACTIONS_RD_PREF 0x04
+UMASK_TRANSACTIONS_WR_PREF 0x08
+UMASK_TRANSACTIONS_ALL_READS 0x05
+UMASK_TRANSACTIONS_ALL_WRITES 0x0A
+UMASK_TRANSACTIONS_ATOMIC 0x10
+UMASK_TRANSACTIONS_OTHER 0x20
+UMASK_TRANSACTIONS_ORDERINGQ 0x40
+
+EVENT_RXR_AK_INSERTS 0x0A IBOX
+UMASK_RXR_AK_INSERTS 0x00
+
+EVENT_RXR_BL_DRS_CYCLES_FULL 0x04 IBOX
+UMASK_RXR_BL_DRS_CYCLES_FULL 0x00
+
+EVENT_RXR_BL_DRS_INSERTS 0x01 IBOX
+UMASK_RXR_BL_DRS_INSERTS 0x00
+
+EVENT_RXR_BL_DRS_OCCUPANCY 0x07 IBOX
+UMASK_RXR_BL_DRS_OCCUPANCY 0x00
+
+EVENT_RXR_BL_NCB_CYCLES_FULL 0x05 IBOX
+UMASK_RXR_BL_NCB_CYCLES_FULL 0x00
+
+EVENT_RXR_BL_NCB_INSERTS 0x02 IBOX
+UMASK_RXR_BL_NCB_INSERTS 0x00
+
+EVENT_RXR_BL_NCB_OCCUPANCY 0x08 IBOX
+UMASK_RXR_BL_NCB_OCCUPANCY 0x00
+
+EVENT_RXR_BL_NCS_CYCLES_FULL 0x06 IBOX
+UMASK_RXR_BL_NCS_CYCLES_FULL 0x00
+
+EVENT_RXR_BL_NCS_INSERTS 0x03 IBOX
+UMASK_RXR_BL_NCS_INSERTS 0x00
+
+EVENT_RXR_BL_NCS_OCCUPANCY 0x09 IBOX
+UMASK_RXR_BL_NCS_OCCUPANCY 0x00
+
+EVENT_TXR_AD_STALL_CREDIT_CYCLES 0x18 IBOX
+UMASK_TXR_AD_STALL_CREDIT_CYCLES 0x00
+
+EVENT_TXR_BL_STALL_CREDIT_CYCLES 0x19 IBOX
+UMASK_TXR_BL_STALL_CREDIT_CYCLES 0x00
+
+EVENT_TXR_DATA_INSERTS_NCB 0x0E IBOX
+UMASK_TXR_DATA_INSERTS_NCB 0x00
+
+EVENT_TXR_DATA_INSERTS_NCS 0x0F IBOX
+UMASK_TXR_DATA_INSERTS_NCS 0x00
+
+EVENT_TXR_REQUEST_OCCUPANCY 0x0D IBOX
+UMASK_TXR_REQUEST_OCCUPANCY 0x00
+
+EVENT_RBOX_CLOCKTICK 0x01 RBOX
+UMASK_RBOX_CLOCKTICK 0x00
+
+EVENT_C_HI_AD_CREDITS_EMPTY 0x1F RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO8 0x01
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO9 0x02
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO10 0x04
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO11 0x08
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO12 0x10
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO13 0x20
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO14_16 0x40
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO15_17 0x80
+
+EVENT_C_LO_AD_CREDITS_EMPTY 0x22 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO0 0x01
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO1 0x02
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO2 0x04
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO3 0x08
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO4 0x10
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO5 0x20
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO6 0x40
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO7 0x80
+
+EVENT_HA_R2_BL_CREDITS_EMPTY 0x2D RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_HA_R2_BL_CREDITS_EMPTY_HA0 0x01
+UMASK_HA_R2_BL_CREDITS_EMPTY_HA1 0x02
+UMASK_HA_R2_BL_CREDITS_EMPTY_R2_NCB 0x04
+UMASK_HA_R2_BL_CREDITS_EMPTY_R2_NCS 0x08
+
+EVENT_QPI0_AD_CREDITS_EMPTY 0x20 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI0_AD_CREDITS_EMPTY_VNA 0x01
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_QPI1_AD_CREDITS_EMPTY 0x2E RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI1_AD_CREDITS_EMPTY_VNA 0x01
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_QPI0_BL_CREDITS_EMPTY 0x21 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI0_BL_CREDITS_EMPTY_VNA 0x01
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_QPI1_BL_CREDITS_EMPTY 0x2F RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI1_BL_CREDITS_EMPTY_VNA 0x01
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_RING_AD_USED 0x07 RBOX
+UMASK_RING_AD_USED_CW_EVEN 0x01
+UMASK_RING_AD_USED_CW_ODD 0x02
+UMASK_RING_AD_USED_CW 0x03
+UMASK_RING_AD_USED_CCW_EVEN 0x04
+UMASK_RING_AD_USED_CCW_ODD 0x08
+UMASK_RING_AD_USED_CCW 0x0C
+
+EVENT_RING_AK_USED 0x08 RBOX
+UMASK_RING_AK_USED_CW_EVEN 0x01
+UMASK_RING_AK_USED_CW_ODD 0x02
+UMASK_RING_AK_USED_CW 0x03
+UMASK_RING_AK_USED_CCW_EVEN 0x04
+UMASK_RING_AK_USED_CCW_ODD 0x08
+UMASK_RING_AK_USED_CCW 0x0C
+
+EVENT_RING_BL_USED 0x09 RBOX
+UMASK_RING_BL_USED_CW_EVEN 0x01
+UMASK_RING_BL_USED_CW_ODD 0x02
+UMASK_RING_BL_USED_CW 0x03
+UMASK_RING_BL_USED_CCW_EVEN 0x04
+UMASK_RING_BL_USED_CCW_ODD 0x08
+UMASK_RING_BL_USED_CCW 0x0C
+
+EVENT_RING_IV_USED 0x0A RBOX
+UMASK_RING_IV_USED_CW 0x03
+UMASK_RING_IV_USED_CCW 0x0C
+UMASK_RING_IV_USED_ANY 0x0F
+
+EVENT_RING_SINK_STARVED 0x0E RBOX
+UMASK_RING_SINK_STARVED_AK 0x02
+
+EVENT_RXR_CYCLES_NE 0x10 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_CYCLES_NE_HOM 0x01
+UMASK_RXR_CYCLES_NE_SNP 0x02
+UMASK_RXR_CYCLES_NE_NDR 0x04
+
+EVENT_RXR_CYCLES_NE_VN1 0x14 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_CYCLES_NE_VN1_HOM 0x01
+UMASK_RXR_CYCLES_NE_VN1_SNP 0x02
+UMASK_RXR_CYCLES_NE_VN1_NDR 0x04
+UMASK_RXR_CYCLES_NE_VN1_DRS 0x08
+UMASK_RXR_CYCLES_NE_VN1_NCB 0x10
+UMASK_RXR_CYCLES_NE_VN1_NCS 0x20
+
+EVENT_RXR_INSERTS 0x11 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_INSERTS_HOM 0x01
+UMASK_RXR_INSERTS_SNP 0x02
+UMASK_RXR_INSERTS_NDR 0x04
+UMASK_RXR_INSERTS_DRS 0x08
+UMASK_RXR_INSERTS_NCB 0x10
+UMASK_RXR_INSERTS_NCS 0x20
+
+EVENT_RXR_INSERTS_VN1 0x15 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_INSERTS_VN1_HOM 0x01
+UMASK_RXR_INSERTS_VN1_SNP 0x02
+UMASK_RXR_INSERTS_VN1_NDR 0x04
+UMASK_RXR_INSERTS_VN1_DRS 0x08
+UMASK_RXR_INSERTS_VN1_NCB 0x10
+UMASK_RXR_INSERTS_VN1_NCS 0x20
+
+EVENT_RXR_OCCUPANCY_VN1 0x13 RBOX0C0|RBOX1C0
+UMASK_RXR_OCCUPANCY_VN1_HOM 0x01
+UMASK_RXR_OCCUPANCY_VN1_SNP 0x02
+UMASK_RXR_OCCUPANCY_VN1_NDR 0x04
+UMASK_RXR_OCCUPANCY_VN1_DRS 0x08
+UMASK_RXR_OCCUPANCY_VN1_NCB 0x10
+UMASK_RXR_OCCUPANCY_VN1_NCS 0x20
+
+EVENT_TXR_CYCLES_FULL 0x25 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_CYCLES_FULL 0x00
+
+EVENT_TXR_CYCLES_NE 0x23 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_CYCLES_NE 0x00
+
+EVENT_TXR_NACK 0x26 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_NACK_DN_AD 0x01
+UMASK_TXR_NACK_DN_BL 0x02
+UMASK_TXR_NACK_DN_AK 0x04
+UMASK_TXR_NACK_UP_AD 0x08
+UMASK_TXR_NACK_UP_BL 0x10
+UMASK_TXR_NACK_UP_AK 0x20
+
+EVENT_SBO0_CREDITS_ACQUIRED 0x28 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_SBO0_CREDITS_ACQUIRED_AD 0x01
+UMASK_SBO0_CREDITS_ACQUIRED_BL 0x02
+
+EVENT_SBO1_CREDITS_ACQUIRED 0x29 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_SBO1_CREDITS_ACQUIRED_AD 0x01
+UMASK_SBO1_CREDITS_ACQUIRED_BL 0x02
+
+EVENT_STALL_NO_SBO_CREDIT 0x2C RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_STALL_NO_SBO_CREDIT_SBO0_AD 0x01
+UMASK_STALL_NO_SBO_CREDIT_SBO1_AD 0x02
+UMASK_STALL_NO_SBO_CREDIT_SBO0_BL 0x04
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL 0x08
+
+EVENT_VN0_CREDITS_REJECT 0x37 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_REJECT_HOM 0x01
+UMASK_VN0_CREDITS_REJECT_SNP 0x02
+UMASK_VN0_CREDITS_REJECT_NDR 0x04
+UMASK_VN0_CREDITS_REJECT_DRS 0x08
+UMASK_VN0_CREDITS_REJECT_NCB 0x10
+UMASK_VN0_CREDITS_REJECT_NCS 0x20
+
+EVENT_VN1_CREDITS_REJECT 0x39 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN1_CREDITS_REJECT_HOM 0x01
+UMASK_VN1_CREDITS_REJECT_SNP 0x02
+UMASK_VN1_CREDITS_REJECT_NDR 0x04
+UMASK_VN1_CREDITS_REJECT_DRS 0x08
+UMASK_VN1_CREDITS_REJECT_NCB 0x10
+UMASK_VN1_CREDITS_REJECT_NCS 0x20
+
+EVENT_VNA_CREDITS_REJECT 0x34 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_REJECT_HOM 0x01
+UMASK_VNA_CREDITS_REJECT_SNP 0x02
+UMASK_VNA_CREDITS_REJECT_NDR 0x04
+UMASK_VNA_CREDITS_REJECT_DRS 0x08
+UMASK_VNA_CREDITS_REJECT_NCB 0x10
+UMASK_VNA_CREDITS_REJECT_NCS 0x20
+
+EVENT_VN0_CREDITS_USED 0x36 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_USED_HOM 0x01
+UMASK_VN0_CREDITS_USED_SNP 0x02
+UMASK_VN0_CREDITS_USED_NDR 0x04
+UMASK_VN0_CREDITS_USED_DRS 0x08
+UMASK_VN0_CREDITS_USED_NCB 0x10
+UMASK_VN0_CREDITS_USED_NCS 0x20
+
+EVENT_VN1_CREDITS_USED 0x38 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN1_CREDITS_USED_HOM 0x01
+UMASK_VN1_CREDITS_USED_SNP 0x02
+UMASK_VN1_CREDITS_USED_NDR 0x04
+UMASK_VN1_CREDITS_USED_DRS 0x08
+UMASK_VN1_CREDITS_USED_NCB 0x10
+UMASK_VN1_CREDITS_USED_NCS 0x20
+
+EVENT_VNA_CREDITS_ACQUIRED 0x33 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_ACQUIRED_AD 0x01
+UMASK_VNA_CREDITS_ACQUIRED_BL 0x04
+
+EVENT_BOUNCE_CONTROL 0x0A SBOX
+UMASK_BOUNCE_CONTROL 0x00
+
+EVENT_SBOX_CLOCKTICKS 0x00 SBOX
+UMASK_SBOX_CLOCKTICKS 0x00
+
+EVENT_FAST_ASSERTED 0x09 SBOX
+UMASK_FAST_ASSERTED 0x00
+
+EVENT_RING_AD_USED 0x1B SBOX
+UMASK_RING_AD_USED_ANY 0x0F
+UMASK_RING_AD_USED_UP_EVEN 0x01
+UMASK_RING_AD_USED_UP_ODD 0x02
+UMASK_RING_AD_USED_UP 0x03
+UMASK_RING_AD_USED_DOWN_EVEN 0x04
+UMASK_RING_AD_USED_DOWN_ODD 0x08
+UMASK_RING_AD_USED_DOWN 0x0C
+
+EVENT_RING_AK_USED 0x1C SBOX
+UMASK_RING_AK_USED_ANY 0x0F
+UMASK_RING_AK_USED_UP_EVEN 0x01
+UMASK_RING_AK_USED_UP_ODD 0x02
+UMASK_RING_AK_USED_UP 0x03
+UMASK_RING_AK_USED_DOWN_EVEN 0x04
+UMASK_RING_AK_USED_DOWN_ODD 0x08
+UMASK_RING_AK_USED_DOWN 0x0C
+
+EVENT_RING_BL_USED 0x1D SBOX
+UMASK_RING_BL_USED_ANY 0x0F
+UMASK_RING_BL_USED_UP_EVEN 0x01
+UMASK_RING_BL_USED_UP_ODD 0x02
+UMASK_RING_BL_USED_UP 0x03
+UMASK_RING_BL_USED_DOWN_EVEN 0x04
+UMASK_RING_BL_USED_DOWN_ODD 0x08
+UMASK_RING_BL_USED_DOWN 0x0C
+
+EVENT_RING_BOUNCES 0x05 SBOX
+UMASK_RING_BOUNCES_AD_CACHE 0x01
+UMASK_RING_BOUNCES_AK_CORE 0x02
+UMASK_RING_BOUNCES_BL_CORE 0x04
+UMASK_RING_BOUNCES_IV_CORE 0x08
+
+EVENT_RING_IV_USED 0x1E SBOX
+UMASK_RING_IV_USED_ANY 0x0F
+UMASK_RING_IV_USED_UP 0x03
+UMASK_RING_IV_USED_DOWN 0x0C
+
+EVENT_RXR_BYPASS 0x12 SBOX
+UMASK_RXR_BYPASS_AD_CRD 0x01
+UMASK_RXR_BYPASS_AD_BNC 0x02
+UMASK_RXR_BYPASS_BL_CRD 0x04
+UMASK_RXR_BYPASS_BL_BNC 0x08
+UMASK_RXR_BYPASS_AK 0x10
+UMASK_RXR_BYPASS_IV 0x20
+
+EVENT_RXR_INSERTS 0x13 SBOX
+UMASK_RXR_INSERTS_AD_CRD 0x01
+UMASK_RXR_INSERTS_AD_BNC 0x02
+UMASK_RXR_INSERTS_BL_CRD 0x04
+UMASK_RXR_INSERTS_BL_BNC 0x08
+UMASK_RXR_INSERTS_AK 0x10
+UMASK_RXR_INSERTS_IV 0x20
+
+EVENT_RXR_OCCUPANCY 0x11 SBOX
+UMASK_RXR_OCCUPANCY_AD_CRD 0x01
+UMASK_RXR_OCCUPANCY_AD_BNC 0x02
+UMASK_RXR_OCCUPANCY_BL_CRD 0x04
+UMASK_RXR_OCCUPANCY_BL_BNC 0x08
+UMASK_RXR_OCCUPANCY_AK 0x10
+UMASK_RXR_OCCUPANCY_IV 0x20
+
+EVENT_TXR_ADS_USED 0x04 SBOX
+UMASK_TXR_ADS_USED_AD 0x01
+UMASK_TXR_ADS_USED_AK 0x02
+UMASK_TXR_ADS_USED_BL 0x04
+
+EVENT_TXR_INSERTS 0x02 SBOX
+UMASK_TXR_INSERTS_AD_CRD 0x01
+UMASK_TXR_INSERTS_AD_BNC 0x02
+UMASK_TXR_INSERTS_BL_CRD 0x04
+UMASK_TXR_INSERTS_BL_BNC 0x08
+UMASK_TXR_INSERTS_AK 0x10
+UMASK_TXR_INSERTS_IV 0x20
+
+EVENT_TXR_OCCUPANCY 0x01 SBOX
+UMASK_TXR_OCCUPANCY_AD_CRD 0x01
+UMASK_TXR_OCCUPANCY_AD_BNC 0x02
+UMASK_TXR_OCCUPANCY_BL_CRD 0x04
+UMASK_TXR_OCCUPANCY_BL_BNC 0x08
+UMASK_TXR_OCCUPANCY_AK 0x10
+UMASK_TXR_OCCUPANCY_IV 0x20
+
+EVENT_TXR_ORDERING 0x07 SBOX
+UMASK_TXR_ORDERING_IV_SNOOPGO_UP 0x01
+UMASK_TXR_ORDERING_IV_SNOOPGO_DN 0x02
+UMASK_TXR_ORDERING_AK_U2C_UP_EVEN 0x04
+UMASK_TXR_ORDERING_AK_U2C_UP_ODD 0x08
+UMASK_TXR_ORDERING_AK_U2C_DN_EVEN 0x10
+UMASK_TXR_ORDERING_AK_U2C_DN_ODD 0x20
+
+EVENT_QBOX_CLOCKTICKS 0x14 QBOX
+UMASK_QBOX_CLOCKTICKS 0x00
+
+EVENT_CTO_COUNT 0x38 QBOX
+OPTIONS_CTO_COUNT EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MASK2_MASK|EVENT_OPTION_MASK3_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK|EVENT_OPTION_MATCH2_MASK|EVENT_OPTION_MATCH3_MASK
+UMASK_CTO_COUNT 0x00 0x01 0x00
+
+EVENT_DIRECT2CORE 0x13 QBOX
+UMASK_DIRECT2CORE_SUCCESS_RBT_HIT 0x01
+UMASK_DIRECT2CORE_FAILURE_CREDITS 0x02
+UMASK_DIRECT2CORE_FAILURE_RBT_HIT 0x04
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT 0x08
+UMASK_DIRECT2CORE_FAILURE_MISS 0x10
+UMASK_DIRECT2CORE_FAILURE_CREDITS_MISS 0x20
+UMASK_DIRECT2CORE_FAILURE_RBT_MISS 0x40
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT_MISS 0x80
+
+EVENT_L1_POWER_CYCLES 0x12 QBOX
+UMASK_L1_POWER_CYCLES 0x00
+
+EVENT_RXL0P_POWER_CYCLES 0x10 QBOX
+UMASK_RXL0P_POWER_CYCLES 0x00
+
+EVENT_RXL0_POWER_CYCLES 0x0F QBOX
+UMASK_RXL0_POWER_CYCLES 0x00
+
+EVENT_RXL_BYPASSED 0x09 QBOX
+UMASK_RXL_BYPASSED 0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VN0 0x1E QBOX
+UMASK_RXL_CREDITS_CONSUMED_VN0_DRS 0x01 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCB 0x02 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCS 0x04 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_HOM 0x08 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_SNP 0x10 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_NDR 0x20 0x01 0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VN1 0x39 QBOX
+UMASK_RXL_CREDITS_CONSUMED_VN1_DRS 0x01 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_NCB 0x02 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_NCS 0x04 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_HOM 0x08 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_SNP 0x10 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_NDR 0x20 0x01 0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VNA 0x1D QBOX
+UMASK_RXL_CREDITS_CONSUMED_VNA 0x00 0x01 0x00
+
+EVENT_RXL_CYCLES_NE 0x0A QBOX
+UMASK_RXL_CYCLES_NE 0x00
+
+EVENT_RXL_FLITS_G0 0x01 QBOX
+UMASK_RXL_FLITS_G0_IDLE 0x01
+UMASK_RXL_FLITS_G0_DATA 0x02
+UMASK_RXL_FLITS_G0_NON_DATA 0x04
+
+EVENT_RXL_FLITS_G1 0x02 QBOX
+UMASK_RXL_FLITS_G1_SNP 0x01 0x01 0x00
+UMASK_RXL_FLITS_G1_HOM_REQ 0x02 0x01 0x00
+UMASK_RXL_FLITS_G1_HOM_NONREQ 0x04 0x01 0x00
+UMASK_RXL_FLITS_G1_HOM 0x06 0x01 0x00
+UMASK_RXL_FLITS_G1_DRS_DATA 0x08 0x01 0x00
+UMASK_RXL_FLITS_G1_DRS_NONDATA 0x10 0x01 0x00
+UMASK_RXL_FLITS_G1_DRS 0x18 0x01 0x00
+
+EVENT_RXL_FLITS_G2 0x03 QBOX
+UMASK_RXL_FLITS_G2_NDR_AD 0x01 0x01 0x00
+UMASK_RXL_FLITS_G2_NDR_AK 0x02 0x01 0x00
+UMASK_RXL_FLITS_G2_NCB_DATA 0x04 0x01 0x00
+UMASK_RXL_FLITS_G2_NCB_NONDATA 0x08 0x01 0x00
+UMASK_RXL_FLITS_G2_NCB 0x0C 0x01 0x00
+UMASK_RXL_FLITS_G2_NCS 0x10 0x01 0x00
+
+EVENT_RXL_INSERTS 0x08 QBOX
+UMASK_RXL_INSERTS 0x00
+
+EVENT_RXL_INSERTS_DRS 0x09 QBOX
+UMASK_RXL_INSERTS_DRS_VN0 0x01 0x01 0x00
+UMASK_RXL_INSERTS_DRS_VN1 0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_HOM 0x0C QBOX
+UMASK_RXL_INSERTS_HOM_VN0 0x01 0x01 0x00
+UMASK_RXL_INSERTS_HOM_VN1 0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_NCB 0x0A QBOX
+UMASK_RXL_INSERTS_NCB_VN0 0x01 0x01 0x00
+UMASK_RXL_INSERTS_NCB_VN1 0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_NCS 0x0B QBOX
+UMASK_RXL_INSERTS_NCS_VN0 0x01 0x01 0x00
+UMASK_RXL_INSERTS_NCS_VN1 0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_NDR 0x0E QBOX
+UMASK_RXL_INSERTS_NDR_VN0 0x01 0x01 0x00
+UMASK_RXL_INSERTS_NDR_VN1 0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_SNP 0x0D QBOX
+UMASK_RXL_INSERTS_SNP_VN0 0x01 0x01 0x00
+UMASK_RXL_INSERTS_SNP_VN1 0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY 0x0B QBOX
+UMASK_RXL_OCCUPANCY 0x00
+
+EVENT_RXL_OCCUPANCY_DRS 0x15 QBOX
+UMASK_RXL_OCCUPANCY_DRS_VN0 0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_DRS_VN1 0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_HOM 0x18 QBOX
+UMASK_RXL_OCCUPANCY_HOM_VN0 0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_HOM_VN1 0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_NCB 0x16 QBOX
+UMASK_RXL_OCCUPANCY_NCB_VN0 0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_NCB_VN1 0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_NCS 0x17 QBOX
+UMASK_RXL_OCCUPANCY_NCS_VN0 0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_NCS_VN1 0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_NDR 0x1A QBOX
+UMASK_RXL_OCCUPANCY_NDR_VN0 0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_NDR_VN1 0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_SNP 0x19 QBOX
+UMASK_RXL_OCCUPANCY_SNP_VN0 0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_SNP_VN1 0x02 0x01 0x00
+
+EVENT_TXL0P_POWER_CYCLES 0x0D QBOX
+UMASK_TXL0P_POWER_CYCLES 0x00
+
+EVENT_TXL0_POWER_CYCLES 0x0C QBOX
+UMASK_TXL0_POWER_CYCLES 0x00
+
+EVENT_TXL_BYPASSED 0x05 QBOX
+UMASK_TXL_BYPASSED 0x00
+
+EVENT_TXL_CYCLES_NE 0x06 QBOX
+UMASK_TXL_CYCLES_NE 0x00
+
+EVENT_TXL_FLITS_G0 0x00 QBOX
+UMASK_TXL_FLITS_G0_IDLE 0x01
+UMASK_TXL_FLITS_G0_DATA 0x02
+UMASK_TXL_FLITS_G0_NON_DATA 0x04
+
+EVENT_TXL_FLITS_G1 0x00 QBOX
+UMASK_TXL_FLITS_G1_SNP 0x01 0x01 0x00
+UMASK_TXL_FLITS_G1_HOM_REQ 0x02 0x01 0x00
+UMASK_TXL_FLITS_G1_HOM_NONREQ 0x04 0x01 0x00
+UMASK_TXL_FLITS_G1_HOM 0x06 0x01 0x00
+UMASK_TXL_FLITS_G1_DRS_DATA 0x08 0x01 0x00
+UMASK_TXL_FLITS_G1_DRS_NONDATA 0x10 0x01 0x00
+UMASK_TXL_FLITS_G1_DRS 0x18 0x01 0x00
+
+EVENT_TXL_FLITS_G2 0x01 QBOX
+UMASK_TXL_FLITS_G2_NDR_AD 0x01 0x01 0x00
+UMASK_TXL_FLITS_G2_NDR_AK 0x02 0x01 0x00
+UMASK_TXL_FLITS_G2_NCB_DATA 0x04 0x01 0x00
+UMASK_TXL_FLITS_G2_NCB_NONDATA 0x08 0x01 0x00
+UMASK_TXL_FLITS_G2_NCB 0x0C 0x01 0x00
+UMASK_TXL_FLITS_G2_NCS 0x10 0x01 0x00
+
+EVENT_TXL_INSERTS 0x04 QBOX
+UMASK_TXL_INSERTS 0x00
+
+EVENT_TXL_OCCUPANCY 0x07 QBOX
+UMASK_TXL_OCCUPANCY 0x00
+
+EVENT_TXR_AD_HOM_CREDIT_ACQUIRED 0x26 QBOX
+UMASK_TXR_AD_HOM_CREDIT_ACQUIRED_VN0 0x01 0x01 0x00
+UMASK_TXR_AD_HOM_CREDIT_ACQUIRED_VN1 0x02 0x01 0x00
+
+EVENT_TXR_AD_HOM_CREDIT_OCCUPANCY 0x22 QBOX
+UMASK_TXR_AD_HOM_CREDIT_OCCUPANCY_VN0 0x01 0x01 0x00
+UMASK_TXR_AD_HOM_CREDIT_OCCUPANCY_VN1 0x02 0x01 0x00
+
+EVENT_TXR_AD_NDR_CREDIT_ACQUIRED 0x28 QBOX
+UMASK_TXR_AD_NDR_CREDIT_ACQUIRED_VN0 0x01 0x01 0x00
+UMASK_TXR_AD_NDR_CREDIT_ACQUIRED_VN1 0x02 0x01 0x00
+
+EVENT_TXR_AD_NDR_CREDIT_OCCUPANCY 0x24 QBOX
+UMASK_TXR_AD_NDR_CREDIT_OCCUPANCY_VN0 0x01 0x01 0x00
+UMASK_TXR_AD_NDR_CREDIT_OCCUPANCY_VN1 0x02 0x01 0x00
+
+EVENT_TXR_AD_SNP_CREDIT_ACQUIRED 0x27 QBOX
+UMASK_TXR_AD_SNP_CREDIT_ACQUIRED_VN0 0x01 0x01 0x00
+UMASK_TXR_AD_SNP_CREDIT_ACQUIRED_VN1 0x02 0x01 0x00
+
+EVENT_TXR_AD_SNP_CREDIT_OCCUPANCY 0x23 QBOX
+UMASK_TXR_AD_SNP_CREDIT_OCCUPANCY_VN0 0x01 0x01 0x00
+UMASK_TXR_AD_SNP_CREDIT_OCCUPANCY_VN1 0x02 0x01 0x00
+
+EVENT_TXR_AK_NDR_CREDIT_ACQUIRED 0x29 QBOX
+UMASK_TXR_AK_NDR_CREDIT_ACQUIRED 0x00 0x01 0x00
+
+EVENT_TXR_AK_NDR_CREDIT_OCCUPANCY 0x25 QBOX
+UMASK_TXR_AK_NDR_CREDIT_OCCUPANCY 0x00 0x01 0x00
+
+EVENT_TXR_BL_DRS_CREDIT_ACQUIRED 0x2A QBOX
+UMASK_TXR_BL_DRS_CREDIT_ACQUIRED_VN0 0x01 0x01 0x00
+UMASK_TXR_BL_DRS_CREDIT_ACQUIRED_VN1 0x02 0x01 0x00
+UMASK_TXR_BL_DRS_CREDIT_ACQUIRED_VN_SHR 0x04 0x01 0x00
+
+EVENT_TXR_BL_DRS_CREDIT_OCCUPANCY 0x1F QBOX
+UMASK_TXR_BL_DRS_CREDIT_OCCUPANCY_VN0 0x01 0x01 0x00
+UMASK_TXR_BL_DRS_CREDIT_OCCUPANCY_VN1 0x02 0x01 0x00
+UMASK_TXR_BL_DRS_CREDIT_OCCUPANCY_VN_SHR 0x04 0x01 0x00
+
+EVENT_TXR_BL_NCB_CREDIT_ACQUIRED 0x2B QBOX
+UMASK_TXR_BL_NCB_CREDIT_ACQUIRED_VN0 0x01 0x01 0x00
+UMASK_TXR_BL_NCB_CREDIT_ACQUIRED_VN1 0x02 0x01 0x00
+
+EVENT_TXR_BL_NCB_CREDIT_OCCUPANCY 0x20 QBOX
+UMASK_TXR_BL_NCB_CREDIT_OCCUPANCY_VN0 0x01 0x01 0x00
+UMASK_TXR_BL_NCB_CREDIT_OCCUPANCY_VN1 0x02 0x01 0x00
+
+EVENT_TXR_BL_NCS_CREDIT_ACQUIRED 0x2C QBOX
+UMASK_TXR_BL_NCS_CREDIT_ACQUIRED_VN0 0x01 0x01 0x00
+UMASK_TXR_BL_NCS_CREDIT_ACQUIRED_VN1 0x02 0x01 0x00
+
+EVENT_TXR_BL_NCS_CREDIT_OCCUPANCY 0x21 QBOX
+UMASK_TXR_BL_NCS_CREDIT_OCCUPANCY_VN0 0x01 0x01 0x00
+UMASK_TXR_BL_NCS_CREDIT_OCCUPANCY_VN1 0x02 0x01 0x00
+
+EVENT_VNA_CREDIT_RETURNS 0x1C QBOX
+UMASK_VNA_CREDIT_RETURNS 0x00 0x01 0x00
+
+EVENT_VNA_CREDIT_RETURN_OCCUPANCY 0x1B QBOX
+UMASK_VNA_CREDIT_RETURN_OCCUPANCY 0x00 0x01 0x00
+
+EVENT_QPI_RATE 0x00 QBOX0FIX0|QBOX1FIX0
+UMASK_QPI_RATE 0x00
+
+EVENT_QPI_RX_IDLE 0x01 QBOX0FIX1|QBOX1FIX1
+UMASK_QPI_RX_IDLE 0x00
+
+EVENT_QPI_RX_LLR 0x02 QBOX0FIX2|QBOX1FIX2
+UMASK_QPI_RX_LLR 0x00
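The RXL/TXL flit events above are the raw material for QPI link bandwidth metrics: each QPI data flit carries 64 bit (8 byte) of payload, so a data bandwidth estimate follows directly from the DRS data flit counts. A minimal sketch of that arithmetic, with hypothetical variable names (the actual derived metrics live in likwid's performance group files, not in this event list):

/* Hedged sketch: QPI data bandwidth from the DRS data flit events above
 * (RXL_FLITS_G1_DRS_DATA / TXL_FLITS_G1_DRS_DATA), assuming 8 byte of
 * payload per data flit. Inputs are raw counter values and the runtime. */
#include <stdio.h>

static double qpi_data_bandwidth_MBs(double rxl_drs_data_flits,
                                     double txl_drs_data_flits,
                                     double runtime_s)
{
    const double bytes_per_flit = 8.0;   /* 64-bit payload per QPI data flit */
    return (rxl_drs_data_flits + txl_drs_data_flits) * bytes_per_flit
           / runtime_s * 1.0E-6;         /* MB/s */
}

int main(void)
{
    /* example numbers only */
    printf("%.2f MB/s\n", qpi_data_bandwidth_MBs(1.0e9, 1.0e9, 1.0));
    return 0;
}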
diff --git a/src/includes/perfmon_broadwell_counters.h b/src/includes/perfmon_broadwell_counters.h
new file mode 100644
index 0000000..d5608ba
--- /dev/null
+++ b/src/includes/perfmon_broadwell_counters.h
@@ -0,0 +1,83 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: perfmon_broadwell_counters.h
+ *
+ * Description: Counter Header File of perfmon module for Broadwell.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_BROADWELL 23
+#define NUM_COUNTERS_CORE_BROADWELL 8
+#define NUM_COUNTERS_UNCORE_BROADWELL 15
+
+#define BDW_VALID_OPTIONS_FIXED EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK
+#define BDW_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+ EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define BDW_VALID_OPTIONS_CBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
+#define BDW_VALID_OPTIONS_UBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
+
+static RegisterMap broadwell_counter_map[NUM_COUNTERS_BROADWELL] = {
+ /* Fixed Counters: instructions retired, cycles unhalted core */
+ {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, BDW_VALID_OPTIONS_FIXED},
+ {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, BDW_VALID_OPTIONS_FIXED},
+ {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, BDW_VALID_OPTIONS_FIXED},
+ /* PMC Counters: 4 48bit wide */
+ {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, BDW_VALID_OPTIONS_PMC},
+ {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, BDW_VALID_OPTIONS_PMC},
+ {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, BDW_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK},
+ {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, BDW_VALID_OPTIONS_PMC},
+ /* Temperature Sensor*/
+ {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ /* RAPL counters */
+ {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"CBOX0C0", PMC12, CBOX0, MSR_UNC_CBO_0_PERFEVTSEL0, MSR_UNC_CBO_0_CTR0, 0, 0, BDW_VALID_OPTIONS_CBOX},
+ {"CBOX0C1", PMC13, CBOX0, MSR_UNC_CBO_0_PERFEVTSEL1, MSR_UNC_CBO_0_CTR1, 0, 0, BDW_VALID_OPTIONS_CBOX},
+ {"CBOX1C0", PMC14, CBOX1, MSR_UNC_CBO_1_PERFEVTSEL0, MSR_UNC_CBO_1_CTR0, 0, 0, BDW_VALID_OPTIONS_CBOX},
+ {"CBOX1C1", PMC15, CBOX1, MSR_UNC_CBO_1_PERFEVTSEL1, MSR_UNC_CBO_1_CTR1, 0, 0, BDW_VALID_OPTIONS_CBOX},
+ {"CBOX2C0", PMC16, CBOX2, MSR_UNC_CBO_2_PERFEVTSEL0, MSR_UNC_CBO_2_CTR0, 0, 0, BDW_VALID_OPTIONS_CBOX},
+ {"CBOX2C1", PMC17, CBOX2, MSR_UNC_CBO_2_PERFEVTSEL1, MSR_UNC_CBO_2_CTR1, 0, 0, BDW_VALID_OPTIONS_CBOX},
+ {"CBOX3C0", PMC18, CBOX3, MSR_UNC_CBO_3_PERFEVTSEL0, MSR_UNC_CBO_3_CTR0, 0, 0, BDW_VALID_OPTIONS_CBOX},
+ {"CBOX3C1", PMC19, CBOX3, MSR_UNC_CBO_3_PERFEVTSEL1, MSR_UNC_CBO_3_CTR1, 0, 0, BDW_VALID_OPTIONS_CBOX},
+ {"UBOX0", PMC20, UBOX, MSR_UNC_ARB_PERFEVTSEL0, MSR_UNC_ARB_CTR0, 0, 0, BDW_VALID_OPTIONS_UBOX},
+ {"UBOX1", PMC21, UBOX, MSR_UNC_ARB_PERFEVTSEL1, MSR_UNC_ARB_CTR1, 0, 0, BDW_VALID_OPTIONS_UBOX},
+ {"UBOXFIX", PMC22, UBOXFIX, MSR_UNC_PERF_FIXED_CTRL, MSR_UNC_PERF_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+};
+
+
+static BoxMap broadwell_box_map[NUM_UNITS] = {
+ [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+ [THERMAL] = {0, 0, 0, 0, 0, 0, 8},
+ [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+ [POWER] = {0, 0, 0, 0, 0, 0, 32},
+ [CBOX0] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+ [CBOX1] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+ [CBOX2] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+ [CBOX3] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+ [UBOX] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 1, 0, MSR_DEV, 44},
+ [UBOXFIX] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 44},
+};
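Each core-counter entry above pairs an event-select register (e.g. MSR_PERFEVTSEL0) with its counter register (MSR_PMC0); the event and umask codes from the event list that follows are encoded into that select register using the architectural IA32_PERFEVTSELx layout. A minimal sketch of this encoding, as a generic illustration only (the actual likwid setup code additionally handles the EVENT_OPTION_* masks declared above):

/* Generic IA32_PERFEVTSELx encoding used by the PMCx entries above:
 * event select in bits 7:0, unit mask in bits 15:8, USR/OS filters in
 * bits 16/17, counter enable in bit 22. Illustration only, not the
 * actual likwid perfmon code. */
#include <stdint.h>
#include <stdio.h>

static uint64_t perfevtsel_encode(uint8_t event, uint8_t umask,
                                  int count_user, int count_kernel)
{
    uint64_t val = 0;
    val |= (uint64_t)event;                /* bits  7:0  event select   */
    val |= (uint64_t)umask << 8;           /* bits 15:8  unit mask      */
    if (count_user)   val |= 1ULL << 16;   /* USR: count in user mode   */
    if (count_kernel) val |= 1ULL << 17;   /* OS:  count in kernel mode */
    val |= 1ULL << 22;                     /* EN:  enable the counter   */
    return val;                            /* value for MSR_PERFEVTSELx */
}

int main(void)
{
    /* L2_RQSTS_MISS from the event list below: event 0x24, umask 0x3F */
    printf("MSR_PERFEVTSEL0 <- 0x%llx\n",
           (unsigned long long)perfevtsel_encode(0x24, 0x3F, 1, 0));
    return 0;
}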
diff --git a/src/includes/perfmon_broadwell_events.txt b/src/includes/perfmon_broadwell_events.txt
new file mode 100644
index 0000000..023bc01
--- /dev/null
+++ b/src/includes/perfmon_broadwell_events.txt
@@ -0,0 +1,665 @@
+# =======================================================================================
+#
+# Filename: perfmon_broadwell_events.txt
+#
+# Description: Event list for Intel Broadwell
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+# Project: likwid
+#
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, either version 3 of the License, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE 0x00 TMP0
+UMASK_TEMP_CORE 0x00
+
+EVENT_PWR_PKG_ENERGY 0x00 PWR0
+UMASK_PWR_PKG_ENERGY 0x00
+
+EVENT_PWR_PP0_ENERGY 0x00 PWR1
+UMASK_PWR_PP0_ENERGY 0x00
+
+EVENT_PWR_PP1_ENERGY 0x00 PWR2
+UMASK_PWR_PP1_ENERGY 0x00
+
+EVENT_PWR_DRAM_ENERGY 0x00 PWR3
+UMASK_PWR_DRAM_ENERGY 0x00
+
+EVENT_INSTR_RETIRED 0x00 FIXC0
+UMASK_INSTR_RETIRED_ANY 0x00
+
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE 0x00
+
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC2
+UMASK_CPU_CLK_UNHALTED_REF 0x00
+
+EVENT_LD_BLOCKS 0x03 PMC
+UMASK_LD_BLOCKS_STORE_FORWARD 0x02
+UMASK_LD_BLOCKS_NO_SR 0x08
+
+EVENT_MISALIGN_MEM_REF 0x05 PMC
+UMASK_MISALIGN_MEM_REF_LOADS 0x01
+UMASK_MISALIGN_MEM_REF_STORES 0x02
+UMASK_MISALIGN_MEM_REF_ANY 0x03
+
+EVENT_LD_BLOCKS_PARTIAL 0x07 PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS 0x01
+
+EVENT_DTLB_LOAD_MISSES 0x08 PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK 0x01
+UMASK_DTLB_LOAD_MISSES_STLB_HIT 0x60
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED 0x0E
+UMASK_DTLB_LOAD_MISSES_STLB_HIT_4K 0x20
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_4K 0x02
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION 0x10
+
+EVENT_INT_MISC 0x0D PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RECOVERY_CYCLES 0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RECOVERY_COUNT 0x03
+DEFAULT_OPTIONS_INT_MISC_RAT_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RAT_STALL_CYCLES 0x08
+DEFAULT_OPTIONS_INT_MISC_RAT_STALL_COUNT EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RAT_STALL_COUNT 0x08
+
+EVENT_UOPS_ISSUED 0x0E PMC
+UMASK_UOPS_ISSUED_ANY 0x01
+UMASK_UOPS_ISSUED_FLAGS_MERGE 0x10
+UMASK_UOPS_ISSUED_SLOW_LEA 0x20
+UMASK_UOPS_ISSUED_SINGLE_MUL 0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ANY 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_FLAGS_MERGE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_FLAGS_MERGE 0x10
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SLOW_LEA EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SLOW_LEA 0x20
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SINGLE_MUL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SINGLE_MUL 0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_ARITH_FPU_DIV_ACTIVE 0x14 PMC
+UMASK_ARITH_FPU_DIV_ACTIVE 0x01
+
+EVENT_L2_RQSTS 0x24 PMC
+UMASK_L2_RQSTS_DEMAND_DATA_RD_MISS 0x21
+UMASK_L2_RQSTS_DEMAND_DATA_RD_HIT 0x41
+UMASK_L2_RQSTS_RFO_MISS 0x22
+UMASK_L2_RQSTS_RFO_HIT 0x42
+UMASK_L2_RQSTS_CODE_RD_MISS 0x24
+UMASK_L2_RQSTS_CODE_RD_HIT 0x44
+UMASK_L2_RQSTS_L2_PF_HIT 0x50
+UMASK_L2_RQSTS_L2_PF_MISS 0x30
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD 0xE1
+UMASK_L2_RQSTS_ALL_DEMAND_MISS 0x27
+UMASK_L2_RQSTS_ALL_RFO 0xE2
+UMASK_L2_RQSTS_ALL_CODE_RD 0xE4
+UMASK_L2_RQSTS_ALL_DEMAND_REFERENCES 0xE7
+UMASK_L2_RQSTS_ALL_PF 0xF8
+UMASK_L2_RQSTS_MISS 0x3F
+UMASK_L2_RQSTS_REFERENCES 0xFF
+
+EVENT_L2_DEMAND_RQST_WB_HIT 0x27 PMC
+UMASK_L2_DEMAND_RQST_WB_HIT 0x50
+
+EVENT_LONGEST_LAT_CACHE 0x2E PMC
+UMASK_LONGEST_LAT_CACHE_REFERENCE 0x4F
+UMASK_LONGEST_LAT_CACHE_MISS 0x41
+
+EVENT_CPU_CLOCK_UNHALTED 0x3C PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P 0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK 0x01
+UMASK_CPU_CLOCK_UNHALTED_ONE_THREAD_ACTIVE 0x02
+
+EVENT_L1D_PEND_MISS 0x48 PMC2
+UMASK_L1D_PEND_MISS_PENDING 0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_L1D_PEND_MISS_PENDING_CYCLES 0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_OCCURRENCES EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_L1D_PEND_MISS_OCCURRENCES 0x01
+
+EVENT_DTLB_STORE_MISSES 0x49 PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK 0x01
+UMASK_DTLB_STORE_MISSES_STLB_HIT 0x60
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED 0x0E
+UMASK_DTLB_STORE_MISSES_STLB_HIT_4K 0x20
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_4K 0x02
+UMASK_DTLB_STORE_MISSES_WALK_DURATION 0x10
+
+EVENT_LOAD_HIT_PRE 0x4C PMC
+UMASK_LOAD_HIT_PRE_HW_PF 0x02
+
+EVENT_EPT_WALK_CYCLES 0x4F PMC
+UMASK_EPT_WALK_CYCLES 0x10
+
+EVENT_L1D 0x51 PMC
+UMASK_L1D_REPLACEMENT 0x01
+
+EVENT_TX_MEM 0x54 PMC
+UMASK_TX_MEM_ABORT_CONFLICT 0x01
+UMASK_TX_MEM_ABORT_CAPACITY_WRITE 0x02
+UMASK_TX_MEM_ABORT_HLE_STORE_TO_ELIDED_LOCK 0x04
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_NOT_EMPTY 0x08
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_MISMATCH 0x10
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGNMENT 0x20
+UMASK_TX_MEM_HLE_ELISION_BUFFER_FULL 0x40
+
+EVENT_MOVE_ELIMINATION 0x58 PMC
+UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED 0x04
+UMASK_MOVE_ELIMINATION_SIMD_NOT_ELIMINATED 0x08
+UMASK_MOVE_ELIMINATION_INT_ELIMINATED 0x01
+UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED 0x02
+
+EVENT_CPL_CYCLES 0x5C PMC
+UMASK_CPL_CYCLES_RING0 0x01
+UMASK_CPL_CYCLES_RING123 0x02
+DEFAULT_OPTIONS_CPL_CYCLES_RING0_TRANS EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_CPL_CYCLES_RING0_TRANS 0x01
+
+EVENT_TX_EXEC 0x5D PMC
+UMASK_TX_EXEC_MISC1 0x01
+UMASK_TX_EXEC_MISC2 0x02
+UMASK_TX_EXEC_MISC3 0x04
+UMASK_TX_EXEC_MISC4 0x08
+UMASK_TX_EXEC_MISC5 0x10
+
+EVENT_RS_EVENTS 0x5E PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES 0x01
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING 0x60 PMC
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD EVENT_OPTION_THRESHOLD=0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD 0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD 0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO 0x04
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD EVENT_OPTION_THRESHOLD=0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD 0x08
+
+EVENT_LOCK_CYCLES 0x63 PMC
+UMASK_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
+UMASK_LOCK_CYCLES_CACHE_LOCK_DURATION 0x02
+
+EVENT_IDQ 0x79 PMC
+UMASK_IDQ_EMPTY 0x02
+UMASK_IDQ_MITE_UOPS 0x04
+DEFAULT_OPTIONS_IDQ_MITE_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MITE_CYCLES 0x04
+UMASK_IDQ_DSB_UOPS 0x08
+DEFAULT_OPTIONS_IDQ_DSB_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_DSB_CYCLES 0x08
+UMASK_IDQ_MS_DSB_UOPS 0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_DSB_CYCLES 0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_OCCUR EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_DSB_OCCUR 0x10
+UMASK_IDQ_MS_MITE_UOPS 0x20
+DEFAULT_OPTIONS_IDQ_MS_MITE_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_MITE_CYCLES 0x20
+UMASK_IDQ_MS_UOPS 0x30
+DEFAULT_OPTIONS_IDQ_MS_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_CYCLES 0x30
+DEFAULT_OPTIONS_IDQ_MS_SWITCHES EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_SWITCHES 0x30
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS 0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS 0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS 0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS 0x24
+UMASK_IDQ_MITE_ALL_UOPS 0x3C
+
+EVENT_ICACHE 0x80 PMC
+UMASK_ICACHE_HIT 0x01
+UMASK_ICACHE_MISSES 0x02
+UMASK_ICACHE_ACCESSES 0x03
+
+EVENT_ITLB_MISSES 0x85 PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK 0x01
+UMASK_ITLB_MISSES_STLB_HIT 0x60
+UMASK_ITLB_MISSES_WALK_COMPLETED 0x0E
+UMASK_ITLB_MISSES_STLB_HIT_4K 0x20
+UMASK_ITLB_MISSES_WALK_COMPLETED_4K 0x02
+UMASK_ITLB_MISSES_WALK_DURATION 0x10
+
+
+EVENT_ILD_STALL 0x87 PMC
+UMASK_ILD_STALL_LCP 0x01
+
+EVENT_BR_INST_EXEC 0x88 PMC
+UMASK_BR_INST_EXEC_COND_TAKEN 0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN 0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN 0x82
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN 0x88
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_INST_EXEC_ALL_CONDITIONAL 0xC1
+UMASK_BR_INST_EXEC_ALL_DIRECT_JMP 0xC2
+UMASK_BR_INST_EXEC_ALL_DIRECT_NEAR_CALL 0xD0
+UMASK_BR_INST_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET 0xC4
+UMASK_BR_INST_EXEC_ALL_INDIRECT_NEAR_RETURN 0xC8
+UMASK_BR_INST_EXEC_ALL_BRANCHES 0xFF
+
+EVENT_BR_MISP_EXEC 0x89 PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN 0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN 0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN 0x88
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_MISP_EXEC_ALL_CONDITIONAL 0xC1
+UMASK_BR_MISP_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET 0xC4
+UMASK_BR_MISP_EXEC_ALL_BRANCHES 0xFF
+
+EVENT_IDQ_UOPS_NOT_DELIVERED 0x9C PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x03
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x02
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_INVERT=1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK 0x01
+
+EVENT_UOP_DISPATCHES_CANCELLED_SIMD_PRF 0xA0 PMC
+UMASK_UOP_DISPATCHES_CANCELLED_SIMD_PRF 0x03
+
+EVENT_UOPS_EXECUTED_PORT 0xA1 PMC
+UMASK_UOPS_EXECUTED_PORT_PORT_0 0x01
+UMASK_UOPS_EXECUTED_PORT_PORT_1 0x02
+UMASK_UOPS_EXECUTED_PORT_PORT_2 0x04
+UMASK_UOPS_EXECUTED_PORT_PORT_3 0x08
+UMASK_UOPS_EXECUTED_PORT_PORT_4 0x10
+UMASK_UOPS_EXECUTED_PORT_PORT_5 0x20
+UMASK_UOPS_EXECUTED_PORT_PORT_6 0x40
+UMASK_UOPS_EXECUTED_PORT_PORT_7 0x80
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_0_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_0_CORE 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_1_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_1_CORE 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_2_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_2_CORE 0x04
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_3_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_3_CORE 0x08
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_4_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_4_CORE 0x10
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_5_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_5_CORE 0x20
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_6_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_6_CORE 0x40
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_7_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_7_CORE 0x80
+
+EVENT_RESOURCE_STALLS 0xA2 PMC
+UMASK_RESOURCE_STALLS_ANY 0x01
+UMASK_RESOURCE_STALLS_RS 0x04
+UMASK_RESOURCE_STALLS_SB 0x08
+UMASK_RESOURCE_STALLS_ROB 0x10
+
+EVENT_CYCLE_ACTIVITY_CYCLES_L1D_MISS 0xA3 PMC2
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L1D_MISS EVENT_OPTION_THRESHOLD=0x08
+UMASK_CYCLE_ACTIVITY_CYCLES_L1D_MISS 0x08
+
+EVENT_CYCLE_ACTIVITY_CYCLES 0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L2_MISS EVENT_OPTION_THRESHOLD=0x01
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_MISS 0x01
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_MEM_ANY EVENT_OPTION_THRESHOLD=0x02
+UMASK_CYCLE_ACTIVITY_CYCLES_MEM_ANY 0x02
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE EVENT_OPTION_THRESHOLD=0x04
+UMASK_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE 0x04
+
+EVENT_CYCLE_ACTIVITY_STALLS_L1D_MISS 0xA3 PMC2
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L1D_MISS EVENT_OPTION_THRESHOLD=0x0C
+UMASK_CYCLE_ACTIVITY_STALLS_L1D_MISS 0x0C
+
+EVENT_CYCLE_ACTIVITY_STALLS 0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L2_MISS EVENT_OPTION_THRESHOLD=0x05
+UMASK_CYCLE_ACTIVITY_STALLS_L2_MISS 0x05
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_MEM_ANY EVENT_OPTION_THRESHOLD=0x06
+UMASK_CYCLE_ACTIVITY_STALLS_MEM_ANY 0x06
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_TOTAL EVENT_OPTION_THRESHOLD=0x04
+UMASK_CYCLE_ACTIVITY_STALLS_TOTAL 0x04
+
+EVENT_LSD_UOPS 0xA8 PMC
+UMASK_LSD_UOPS 0x01
+DEFAULT_OPTIONS_LSD_CYCLES_ACTIVE EVENT_OPTION_THRESHOLD=0x01
+UMASK_LSD_CYCLES_ACTIVE 0x01
+DEFAULT_OPTIONS_LSD_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_LSD_CYCLES_4_UOPS 0x01
+
+EVENT_DSB2MITE_SWITCHES_PENALTY_CYCLES 0xAB PMC
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
+
+EVENT_ITLB 0xAE PMC
+UMASK_ITLB_ITLB_FLUSH 0x01
+
+EVENT_OFFCORE_REQUESTS 0xB0 PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD 0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD 0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO 0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD 0x08
+
+EVENT_UOPS_EXECUTED 0xB1 PMC
+UMASK_UOPS_EXECUTED_THREAD 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0xB2 PMC
+UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0x01
+
+EVENT_PAGE_WALKER_LOADS 0xBC PMC
+UMASK_PAGE_WALKER_LOADS_DTLB_L1 0x11
+UMASK_PAGE_WALKER_LOADS_ITLB_L1 0x21
+UMASK_PAGE_WALKER_LOADS_DTLB_L2 0x12
+UMASK_PAGE_WALKER_LOADS_ITLB_L2 0x22
+UMASK_PAGE_WALKER_LOADS_DTLB_L3 0x14
+UMASK_PAGE_WALKER_LOADS_ITLB_L3 0x24
+UMASK_PAGE_WALKER_LOADS_DTLB_MEMORY 0x18
+
+EVENT_INST_RETIRED_ANY_P 0xC0 PMC
+UMASK_INST_RETIRED_ANY_P 0x00
+UMASK_INST_RETIRED_X87 0x02
+
+EVENT_INST_RETIRED_PREC 0xC0 PMC1
+UMASK_INST_RETIRED_PREC_DIST 0x01
+
+EVENT_OTHER_ASSISTS 0xC1 PMC
+UMASK_OTHER_ASSISTS_AVX_TO_SSE 0x08
+UMASK_OTHER_ASSISTS_SSE_TO_AVX 0x10
+UMASK_OTHER_ASSISTS_ANY_WB_ASSIST 0x40
+
+EVENT_UOPS_RETIRED 0xC2 PMC
+UMASK_UOPS_RETIRED_ALL 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL 0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_MACHINE_CLEARS 0xC3 PMC
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_MACHINE_CLEARS_COUNT 0x01
+UMASK_MACHINE_CLEARS_CYCLES 0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING 0x02
+UMASK_MACHINE_CLEARS_SMC 0x04
+UMASK_MACHINE_CLEARS_MASKMOV 0x20
+
+EVENT_BR_INST_RETIRED 0xC4 PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL 0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL 0x02
+UMASK_BR_INST_RETIRED_NEAR_RETURN 0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN 0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN 0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH 0x40
+
+EVENT_BR_MISP_RETIRED 0xC5 PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL 0x01
+UMASK_BR_MISP_RETIRED_NEAR_TAKEN 0x20
+
+EVENT_FP_ARITH_INST_RETIRED 0xC7 PMC
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_DOUBLE 0x01
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_SINGLE 0x02
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE 0x04
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE 0x08
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE 0x10
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE 0x20
+UMASK_FP_ARITH_INST_RETIRED_SCALAR 0x03
+UMASK_FP_ARITH_INST_RETIRED_PACKED 0x3C
+UMASK_FP_ARITH_INST_RETIRED_DOUBLE 0x15
+UMASK_FP_ARITH_INST_RETIRED_SINGLE 0x2A
+
+EVENT_HLE_RETIRED 0xC8 PMC
+UMASK_HLE_RETIRED_START 0x01
+UMASK_HLE_RETIRED_COMMIT 0x02
+UMASK_HLE_RETIRED_ABORTED 0x04
+UMASK_HLE_RETIRED_ABORTED_MISC1 0x08
+UMASK_HLE_RETIRED_ABORTED_MISC2 0x10
+UMASK_HLE_RETIRED_ABORTED_MISC3 0x20
+UMASK_HLE_RETIRED_ABORTED_MISC4 0x40
+UMASK_HLE_RETIRED_ABORTED_MISC5 0x80
+
+EVENT_RTM_RETIRED 0xC9 PMC
+UMASK_RTM_RETIRED_START 0x01
+UMASK_RTM_RETIRED_COMMIT 0x02
+UMASK_RTM_RETIRED_ABORTED 0x04
+UMASK_RTM_RETIRED_ABORTED_MISC1 0x08
+UMASK_RTM_RETIRED_ABORTED_MISC2 0x10
+UMASK_RTM_RETIRED_ABORTED_MISC3 0x20
+UMASK_RTM_RETIRED_ABORTED_MISC4 0x40
+UMASK_RTM_RETIRED_ABORTED_MISC5 0x80
+
+EVENT_FP_ASSIST 0xCA PMC
+UMASK_FP_ASSIST_X87_OUTPUT 0x02
+UMASK_FP_ASSIST_X87_INPUT 0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT 0x08
+UMASK_FP_ASSIST_SIMD_INPUT 0x10
+UMASK_FP_ASSIST_ANY 0x1E
+
+EVENT_ROB_MISC_EVENT_LBR_INSERTS 0xCC PMC
+UMASK_ROB_MISC_EVENT_LBR_INSERTS 0x20
+
+EVENT_MEM_UOPS_RETIRED 0xD0 PMC
+UMASK_MEM_UOPS_RETIRED_LOADS_ALL 0x81
+UMASK_MEM_UOPS_RETIRED_STORES_ALL 0x82
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK 0x21
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS 0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT 0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT 0x42
+
+EVENT_MEM_LOAD_UOPS_RETIRED 0xD1 PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT 0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS 0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL 0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT 0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS 0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL 0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT 0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS 0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL 0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB 0x40
+
+EVENT_MEM_LOAD_UOPS_L3_HIT_RETIRED 0xD2 PMC
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_MISS 0x01
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HIT 0x02
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM 0x04
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_NONE 0x08
+
+EVENT_MEM_LOAD_UOPS_L3_MISS_RETIRED 0xD3 PMC
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_LOCAL_DRAM 0x01
+
+EVENT_BACLEARS 0xE6 PMC
+UMASK_BACLEARS_ANY 0x1F
+
+EVENT_L2_TRANS 0xF0 PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO 0x02
+UMASK_L2_TRANS_CODE_RD 0x04
+UMASK_L2_TRANS_ALL_PF 0x08
+UMASK_L2_TRANS_L1D_WB 0x10
+UMASK_L2_TRANS_L2_FILL 0x20
+UMASK_L2_TRANS_L2_WB 0x40
+UMASK_L2_TRANS_ALL_REQUESTS 0x80
+
+EVENT_L2_LINES_IN 0xF1 PMC
+UMASK_L2_LINES_IN_I 0x01
+UMASK_L2_LINES_IN_S 0x02
+UMASK_L2_LINES_IN_E 0x04
+UMASK_L2_LINES_IN_ALL 0x07
+
+EVENT_L2_LINES_OUT 0xF2 PMC
+UMASK_L2_LINES_OUT_DEMAND_CLEAN 0x05
+UMASK_L2_LINES_OUT_DEMAND_DIRTY 0x06
+
+EVENT_OFFCORE_RESPONSE_0 0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1 0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_CACHE_LOOKUP 0x34 CBOX
+UMASK_CACHE_LOOKUP_M 0x01
+UMASK_CACHE_LOOKUP_E 0x02
+UMASK_CACHE_LOOKUP_S 0x04
+UMASK_CACHE_LOOKUP_I 0x08
+UMASK_CACHE_LOOKUP_READ_FILTER 0x10
+UMASK_CACHE_LOOKUP_WRITE_FILTER 0x20
+UMASK_CACHE_LOOKUP_EXTSNP_FILTER 0x40
+UMASK_CACHE_LOOKUP_ANY_REQUEST_FILTER 0x80
+UMASK_CACHE_LOOKUP_READ_M 0x11
+UMASK_CACHE_LOOKUP_WRITE_M 0x21
+UMASK_CACHE_LOOKUP_EXTSNP_M 0x41
+UMASK_CACHE_LOOKUP_ANY_M 0x81
+UMASK_CACHE_LOOKUP_READ_E 0x12
+UMASK_CACHE_LOOKUP_WRITE_E 0x22
+UMASK_CACHE_LOOKUP_EXTSNP_E 0x42
+UMASK_CACHE_LOOKUP_ANY_E 0x82
+UMASK_CACHE_LOOKUP_READ_S 0x14
+UMASK_CACHE_LOOKUP_WRITE_S 0x24
+UMASK_CACHE_LOOKUP_EXTSNP_S 0x44
+UMASK_CACHE_LOOKUP_ANY_S 0x84
+UMASK_CACHE_LOOKUP_READ_ES 0x16
+UMASK_CACHE_LOOKUP_WRITE_ES 0x26
+UMASK_CACHE_LOOKUP_EXTSNP_ES 0x46
+UMASK_CACHE_LOOKUP_ANY_ES 0x86
+UMASK_CACHE_LOOKUP_READ_I 0x18
+UMASK_CACHE_LOOKUP_WRITE_I 0x28
+UMASK_CACHE_LOOKUP_EXTSNP_I 0x48
+UMASK_CACHE_LOOKUP_ANY_I 0x88
+UMASK_CACHE_LOOKUP_READ_MESI 0x1F
+UMASK_CACHE_LOOKUP_WRITE_MESI 0x2F
+UMASK_CACHE_LOOKUP_EXTSNP_MESI 0x4F
+UMASK_CACHE_LOOKUP_ANY_MESI 0x8F
+
+EVENT_XSNP_RESPONSE 0x22 CBOX
+UMASK_XSNP_RESPONSE_MISS_EXTERNAL 0x21
+UMASK_XSNP_RESPONSE_MISS_XCORE 0x41
+UMASK_XSNP_RESPONSE_MISS_EVICTION 0x81
+UMASK_XSNP_RESPONSE_HIT_EXTERNAL 0x24
+UMASK_XSNP_RESPONSE_HIT_XCORE 0x44
+UMASK_XSNP_RESPONSE_HIT_EVICTION 0x84
+UMASK_XSNP_RESPONSE_HITM_EXTERNAL 0x28
+UMASK_XSNP_RESPONSE_HITM_XCORE 0x48
+UMASK_XSNP_RESPONSE_HITM_EVICTION 0x88
+
+EVENT_TRK_OCCUPANCY_ALL 0x80 UBOX0
+UMASK_TRK_OCCUPANCY_ALL 0x01
+
+EVENT_TRK_REQUESTS 0x81 UBOX
+UMASK_TRK_REQUESTS_ALL 0x01
+UMASK_TRK_REQUESTS_WRITES 0x20
+
+EVENT_COH_TRK_OCCUPANCY 0x83 UBOX0
+UMASK_COH_TRK_OCCUPANCY 0x01
+
+EVENT_COH_TRK_REQUESTS 0x84 UBOX
+UMASK_COH_TRK_REQUESTS_ALL 0x01
+
+EVENT_UNCORE_CLOCK 0x00 UBOXFIX
+UMASK_UNCORE_CLOCK 0x01
+
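The EVENT_/UMASK_ names above are exactly what likwid accepts in an event string of the form NAME:COUNTER, e.g. L2_RQSTS_MISS:PMC0. A minimal sketch of counting one of them through the C API, assuming the likwid.h entry points of the 4.x series (topology_init, perfmon_init, perfmon_addEventSet, perfmon_setupCounters, perfmon_startCounters/perfmon_stopCounters, perfmon_getResult); exact signatures should be checked against the installed likwid.h:

/* Sketch only: count L2_RQSTS_MISS on hardware thread 0 via the likwid
 * C API. Build with something like: gcc example.c -llikwid */
#include <stdio.h>
#include <likwid.h>

int main(void)
{
    int cpus[] = {0};                    /* measure hardware thread 0 */
    char estr[] = "L2_RQSTS_MISS:PMC0";  /* event/counter names from the tables above */

    topology_init();                     /* topology must be set up before perfmon */
    perfmon_init(1, cpus);

    int gid = perfmon_addEventSet(estr); /* returns a group id, negative on error */
    if (gid < 0 || perfmon_setupCounters(gid) < 0)
    {
        fprintf(stderr, "could not set up event set %s\n", estr);
        perfmon_finalize();
        topology_finalize();
        return 1;
    }

    perfmon_startCounters();
    volatile double sink = 0.0;          /* dummy work so there is something to count */
    for (int i = 0; i < 10000000; i++)
        sink += (double)i;
    perfmon_stopCounters();

    printf("L2_RQSTS_MISS on CPU %d: %f (sink=%f)\n",
           cpus[0], perfmon_getResult(gid, 0, 0), sink);

    perfmon_finalize();
    topology_finalize();
    return 0;
}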
diff --git a/src/includes/perfmon_broadwelld_counters.h b/src/includes/perfmon_broadwelld_counters.h
new file mode 100644
index 0000000..37f70ad
--- /dev/null
+++ b/src/includes/perfmon_broadwelld_counters.h
@@ -0,0 +1,252 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: perfmon_broadwelld_counters.h
+ *
+ * Description: Counter Header File of perfmon module for Broadwell D.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+
+#define NUM_COUNTERS_BROADWELLD 141
+#define NUM_COUNTERS_CORE_BROADWELLD 8
+#define NUM_COUNTERS_UNCORE_BROADWELLD 85
+
+#define BDW_D_VALID_OPTIONS_FIXED EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK
+#define BDW_D_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+ EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define BDW_D_VALID_OPTIONS_UBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK
+#define BDW_D_VALID_OPTIONS_CBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+ EVENT_OPTION_TID_MASK|EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_STATE_MASK|\
+ EVENT_OPTION_MATCH0_MASK
+#define BDW_D_VALID_OPTIONS_WBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+ EVENT_OPTION_OCCUPANCY_MASK|EVENT_OPTION_OCCUPANCY_FILTER_MASK|EVENT_OPTION_OCCUPANCY_EDGE_MASK|\
+ EVENT_OPTION_OCCUPANCY_INVERT_MASK
+#define BDW_D_VALID_OPTIONS_BBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define BDW_D_VALID_OPTIONS_MBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define BDW_D_VALID_OPTIONS_IBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define BDW_D_VALID_OPTIONS_PBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+
+static RegisterMap broadwelld_counter_map[NUM_COUNTERS_BROADWELLD] = {
+ /* Fixed Counters: instructions retired, cycles unhalted core */
+ {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, BDW_D_VALID_OPTIONS_FIXED},
+ {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, BDW_D_VALID_OPTIONS_FIXED},
+ {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, BDW_D_VALID_OPTIONS_FIXED},
+ /* PMC Counters: 4 48bit wide */
+ {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, BDW_D_VALID_OPTIONS_PMC},
+ {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, BDW_D_VALID_OPTIONS_PMC},
+ {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, BDW_D_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK},
+ {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, BDW_D_VALID_OPTIONS_PMC},
+ /* Temperature Sensor*/
+ {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ /* RAPL counters */
+ {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"UBOX0", PMC12, UBOX, MSR_UNC_V3_U_PMON_CTL0, MSR_UNC_V3_U_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_UBOX},
+ {"UBOX1", PMC13, UBOX, MSR_UNC_V3_U_PMON_CTL1, MSR_UNC_V3_U_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_UBOX},
+ {"UBOXFIX", PMC14, UBOXFIX, MSR_UNC_V3_U_UCLK_FIXED_CTL, MSR_UNC_V3_U_UCLK_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"CBOX0C0", PMC15, CBOX0, MSR_UNC_V3_C0_PMON_CTL0, MSR_UNC_V3_C0_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX0C1", PMC16, CBOX0, MSR_UNC_V3_C0_PMON_CTL1, MSR_UNC_V3_C0_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX0C2", PMC17, CBOX0, MSR_UNC_V3_C0_PMON_CTL2, MSR_UNC_V3_C0_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX0C3", PMC18, CBOX0, MSR_UNC_V3_C0_PMON_CTL3, MSR_UNC_V3_C0_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX1C0", PMC19, CBOX1, MSR_UNC_V3_C1_PMON_CTL0, MSR_UNC_V3_C1_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX1C1", PMC20, CBOX1, MSR_UNC_V3_C1_PMON_CTL1, MSR_UNC_V3_C1_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX1C2", PMC21, CBOX1, MSR_UNC_V3_C1_PMON_CTL2, MSR_UNC_V3_C1_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX1C3", PMC22, CBOX1, MSR_UNC_V3_C1_PMON_CTL3, MSR_UNC_V3_C1_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX2C0", PMC23, CBOX2, MSR_UNC_V3_C2_PMON_CTL0, MSR_UNC_V3_C2_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX2C1", PMC24, CBOX2, MSR_UNC_V3_C2_PMON_CTL1, MSR_UNC_V3_C2_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX2C2", PMC25, CBOX2, MSR_UNC_V3_C2_PMON_CTL2, MSR_UNC_V3_C2_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX2C3", PMC26, CBOX2, MSR_UNC_V3_C2_PMON_CTL3, MSR_UNC_V3_C2_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX3C0", PMC27, CBOX3, MSR_UNC_V3_C3_PMON_CTL0, MSR_UNC_V3_C3_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX3C1", PMC28, CBOX3, MSR_UNC_V3_C3_PMON_CTL1, MSR_UNC_V3_C3_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX3C2", PMC29, CBOX3, MSR_UNC_V3_C3_PMON_CTL2, MSR_UNC_V3_C3_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX3C3", PMC30, CBOX3, MSR_UNC_V3_C3_PMON_CTL3, MSR_UNC_V3_C3_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX4C0", PMC31, CBOX4, MSR_UNC_V3_C4_PMON_CTL0, MSR_UNC_V3_C4_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX4C1", PMC32, CBOX4, MSR_UNC_V3_C4_PMON_CTL1, MSR_UNC_V3_C4_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX4C2", PMC33, CBOX4, MSR_UNC_V3_C4_PMON_CTL2, MSR_UNC_V3_C4_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX4C3", PMC34, CBOX4, MSR_UNC_V3_C4_PMON_CTL3, MSR_UNC_V3_C4_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX5C0", PMC35, CBOX5, MSR_UNC_V3_C5_PMON_CTL0, MSR_UNC_V3_C5_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX5C1", PMC36, CBOX5, MSR_UNC_V3_C5_PMON_CTL1, MSR_UNC_V3_C5_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX5C2", PMC37, CBOX5, MSR_UNC_V3_C5_PMON_CTL2, MSR_UNC_V3_C5_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX5C3", PMC38, CBOX5, MSR_UNC_V3_C5_PMON_CTL3, MSR_UNC_V3_C5_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX6C0", PMC39, CBOX6, MSR_UNC_V3_C6_PMON_CTL0, MSR_UNC_V3_C6_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX6C1", PMC40, CBOX6, MSR_UNC_V3_C6_PMON_CTL1, MSR_UNC_V3_C6_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX6C2", PMC41, CBOX6, MSR_UNC_V3_C6_PMON_CTL2, MSR_UNC_V3_C6_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX6C3", PMC42, CBOX6, MSR_UNC_V3_C6_PMON_CTL3, MSR_UNC_V3_C6_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX7C0", PMC43, CBOX7, MSR_UNC_V3_C7_PMON_CTL0, MSR_UNC_V3_C7_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX7C1", PMC44, CBOX7, MSR_UNC_V3_C7_PMON_CTL1, MSR_UNC_V3_C7_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX7C2", PMC45, CBOX7, MSR_UNC_V3_C7_PMON_CTL2, MSR_UNC_V3_C7_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX7C3", PMC46, CBOX7, MSR_UNC_V3_C7_PMON_CTL3, MSR_UNC_V3_C7_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX8C0", PMC47, CBOX8, MSR_UNC_V3_C8_PMON_CTL0, MSR_UNC_V3_C8_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX8C1", PMC48, CBOX8, MSR_UNC_V3_C8_PMON_CTL1, MSR_UNC_V3_C8_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX8C2", PMC49, CBOX8, MSR_UNC_V3_C8_PMON_CTL2, MSR_UNC_V3_C8_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX8C3", PMC50, CBOX8, MSR_UNC_V3_C8_PMON_CTL3, MSR_UNC_V3_C8_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX9C0", PMC51, CBOX9, MSR_UNC_V3_C9_PMON_CTL0, MSR_UNC_V3_C9_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX9C1", PMC52, CBOX9, MSR_UNC_V3_C9_PMON_CTL1, MSR_UNC_V3_C9_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX9C2", PMC53, CBOX9, MSR_UNC_V3_C9_PMON_CTL2, MSR_UNC_V3_C9_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX9C3", PMC54, CBOX9, MSR_UNC_V3_C9_PMON_CTL3, MSR_UNC_V3_C9_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX10C0", PMC55, CBOX10, MSR_UNC_V3_C10_PMON_CTL0, MSR_UNC_V3_C10_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX10C1", PMC56, CBOX10, MSR_UNC_V3_C10_PMON_CTL1, MSR_UNC_V3_C10_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX10C2", PMC57, CBOX10, MSR_UNC_V3_C10_PMON_CTL2, MSR_UNC_V3_C10_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX10C3", PMC58, CBOX10, MSR_UNC_V3_C10_PMON_CTL3, MSR_UNC_V3_C10_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX11C0", PMC59, CBOX11, MSR_UNC_V3_C11_PMON_CTL0, MSR_UNC_V3_C11_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX11C1", PMC60, CBOX11, MSR_UNC_V3_C11_PMON_CTL1, MSR_UNC_V3_C11_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX11C2", PMC61, CBOX11, MSR_UNC_V3_C11_PMON_CTL2, MSR_UNC_V3_C11_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX11C3", PMC62, CBOX11, MSR_UNC_V3_C11_PMON_CTL3, MSR_UNC_V3_C11_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX12C0", PMC63, CBOX12, MSR_UNC_V3_C12_PMON_CTL0, MSR_UNC_V3_C12_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX12C1", PMC64, CBOX12, MSR_UNC_V3_C12_PMON_CTL1, MSR_UNC_V3_C12_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX12C2", PMC65, CBOX12, MSR_UNC_V3_C12_PMON_CTL2, MSR_UNC_V3_C12_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX12C3", PMC66, CBOX12, MSR_UNC_V3_C12_PMON_CTL3, MSR_UNC_V3_C12_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX13C0", PMC67, CBOX13, MSR_UNC_V3_C13_PMON_CTL0, MSR_UNC_V3_C13_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX13C1", PMC68, CBOX13, MSR_UNC_V3_C13_PMON_CTL1, MSR_UNC_V3_C13_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX13C2", PMC69, CBOX13, MSR_UNC_V3_C13_PMON_CTL2, MSR_UNC_V3_C13_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX13C3", PMC70, CBOX13, MSR_UNC_V3_C13_PMON_CTL3, MSR_UNC_V3_C13_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX14C0", PMC71, CBOX14, MSR_UNC_V3_C14_PMON_CTL0, MSR_UNC_V3_C14_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX14C1", PMC72, CBOX14, MSR_UNC_V3_C14_PMON_CTL1, MSR_UNC_V3_C14_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX14C2", PMC73, CBOX14, MSR_UNC_V3_C14_PMON_CTL2, MSR_UNC_V3_C14_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX14C3", PMC74, CBOX14, MSR_UNC_V3_C14_PMON_CTL3, MSR_UNC_V3_C14_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX15C0", PMC75, CBOX15, MSR_UNC_V3_C15_PMON_CTL0, MSR_UNC_V3_C15_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX15C1", PMC76, CBOX15, MSR_UNC_V3_C15_PMON_CTL1, MSR_UNC_V3_C15_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX15C2", PMC77, CBOX15, MSR_UNC_V3_C15_PMON_CTL2, MSR_UNC_V3_C15_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"CBOX15C3", PMC78, CBOX15, MSR_UNC_V3_C15_PMON_CTL3, MSR_UNC_V3_C15_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+ {"WBOX0", PMC79, WBOX, MSR_UNC_V3_PCU_PMON_CTL0, MSR_UNC_V3_PCU_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_WBOX},
+ {"WBOX1", PMC80, WBOX, MSR_UNC_V3_PCU_PMON_CTL1, MSR_UNC_V3_PCU_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_WBOX},
+ {"WBOX2", PMC81, WBOX, MSR_UNC_V3_PCU_PMON_CTL2, MSR_UNC_V3_PCU_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_WBOX},
+ {"WBOX3", PMC82, WBOX, MSR_UNC_V3_PCU_PMON_CTL3, MSR_UNC_V3_PCU_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_WBOX},
+ {"WBOX0FIX", PMC83, WBOX0FIX, 0, MSR_UNC_V3_PCU_CC3_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"WBOX1FIX", PMC84, WBOX0FIX, 0, MSR_UNC_V3_PCU_CC6_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"BBOX0C0", PMC85, BBOX0, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_0, BDW_D_VALID_OPTIONS_BBOX},
+ {"BBOX0C1", PMC86, BBOX0, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_0, BDW_D_VALID_OPTIONS_BBOX},
+ {"BBOX0C2", PMC87, BBOX0, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_0, BDW_D_VALID_OPTIONS_BBOX},
+ {"BBOX0C3", PMC88, BBOX0, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_0, BDW_D_VALID_OPTIONS_BBOX},
+ {"BBOX1C0", PMC89, BBOX1, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_1, BDW_D_VALID_OPTIONS_BBOX},
+ {"BBOX1C1", PMC90, BBOX1, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_1, BDW_D_VALID_OPTIONS_BBOX},
+ {"BBOX1C2", PMC91, BBOX1, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_1, BDW_D_VALID_OPTIONS_BBOX},
+ {"BBOX1C3", PMC92, BBOX1, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_1, BDW_D_VALID_OPTIONS_BBOX},
+ {"MBOX0C0", PMC93, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_0, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX0C1", PMC94, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_0, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX0C2", PMC95, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_0, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX0FIX", PMC96, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_INVERT_MASK},
+ {"MBOX0C3", PMC97, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_0, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX1C0", PMC98, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_1, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX1C1", PMC99, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_1, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX1C2", PMC100, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_1, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX1C3", PMC101, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_1, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX1FIX", PMC102, MBOX1FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_1, EVENT_OPTION_INVERT_MASK},
+ {"MBOX2C0", PMC103, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_2, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX2C1", PMC104, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_2, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX2C2", PMC105, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_2, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX2C3", PMC106, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_2, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX2FIX", PMC107, MBOX2FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_2, EVENT_OPTION_INVERT_MASK},
+ {"MBOX3C0", PMC108, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_3, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX3C1", PMC109, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_3, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX3C2", PMC110, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_3, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX3C3", PMC111, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_3, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX3FIX", PMC112, MBOX3FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_3, EVENT_OPTION_INVERT_MASK},
+ {"MBOX4C0", PMC113, MBOX4, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_0, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX4C1", PMC114, MBOX4, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_0, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX4C2", PMC115, MBOX4, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_0, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX4C3", PMC116, MBOX4, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_0, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX4FIX", PMC117, MBOX4FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_0, EVENT_OPTION_INVERT_MASK},
+ {"MBOX5C0", PMC118, MBOX5, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_1, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX5C1", PMC119, MBOX5, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_1, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX5C2", PMC120, MBOX5, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_1, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX5C3", PMC121, MBOX5, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_1, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX5FIX", PMC122, MBOX5FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_1, EVENT_OPTION_INVERT_MASK},
+ {"MBOX6C0", PMC123, MBOX6, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_2, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX6C1", PMC124, MBOX6, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_2, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX6C2", PMC125, MBOX6, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_2, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX6C3", PMC126, MBOX6, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_2, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX6FIX", PMC127, MBOX6FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_2, EVENT_OPTION_INVERT_MASK},
+ {"MBOX7C0", PMC128, MBOX7, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_3, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX7C1", PMC129, MBOX7, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_3, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX7C2", PMC130, MBOX7, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_3, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX7C3", PMC131, MBOX7, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_3, BDW_D_VALID_OPTIONS_MBOX},
+ {"MBOX7FIX", PMC132, MBOX7FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_3, EVENT_OPTION_INVERT_MASK},
+ {"IBOX0C0", PMC133, IBOX0, PCI_UNC_IRP0_PMON_CTL_0, PCI_UNC_IRP0_PMON_CTR_0, 0, PCI_IRP_DEVICE, BDW_D_VALID_OPTIONS_IBOX},
+ {"IBOX0C1", PMC134, IBOX0, PCI_UNC_IRP0_PMON_CTL_1, PCI_UNC_IRP0_PMON_CTR_1, 0, PCI_IRP_DEVICE, BDW_D_VALID_OPTIONS_IBOX},
+ {"IBOX1C0", PMC135, IBOX1, PCI_UNC_IRP1_PMON_CTL_0, PCI_UNC_IRP1_PMON_CTR_0, 0, PCI_IRP_DEVICE, BDW_D_VALID_OPTIONS_IBOX},
+ {"IBOX1C1", PMC136, IBOX1, PCI_UNC_IRP1_PMON_CTL_1, PCI_UNC_IRP1_PMON_CTR_1, 0, PCI_IRP_DEVICE, BDW_D_VALID_OPTIONS_IBOX},
+ {"PBOX0", PMC137, PBOX, PCI_UNC_R2PCIE_PMON_CTL_0, PCI_UNC_R2PCIE_PMON_CTR_0_A, PCI_UNC_R2PCIE_PMON_CTR_0_B, PCI_R2PCIE_DEVICE, BDW_D_VALID_OPTIONS_PBOX},
+ {"PBOX1", PMC138, PBOX, PCI_UNC_R2PCIE_PMON_CTL_1, PCI_UNC_R2PCIE_PMON_CTR_1_A, PCI_UNC_R2PCIE_PMON_CTR_1_B, PCI_R2PCIE_DEVICE, BDW_D_VALID_OPTIONS_PBOX},
+ {"PBOX2", PMC139, PBOX, PCI_UNC_R2PCIE_PMON_CTL_2, PCI_UNC_R2PCIE_PMON_CTR_2_A, PCI_UNC_R2PCIE_PMON_CTR_2_B, PCI_R2PCIE_DEVICE, BDW_D_VALID_OPTIONS_PBOX},
+ {"PBOX3", PMC140, PBOX, PCI_UNC_R2PCIE_PMON_CTL_3, PCI_UNC_R2PCIE_PMON_CTR_3_A, PCI_UNC_R2PCIE_PMON_CTR_3_B, PCI_R2PCIE_DEVICE, BDW_D_VALID_OPTIONS_PBOX},
+};
+
+static BoxMap broadwelld_box_map[NUM_UNITS] = {
+ [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+ [THERMAL] = {0, 0, 0, 0, 0, 0, 8},
+ [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+ [POWER] = {0, 0, 0, 0, 0, 0, 32},
+ [UBOX] = {0, MSR_UNC_V3_U_PMON_BOX_STATUS, MSR_UNC_V3_U_PMON_BOX_STATUS, 1, 0, 0, 48},
+ [UBOXFIX] = {0, MSR_UNC_V3_U_PMON_BOX_STATUS, MSR_UNC_V3_U_PMON_BOX_STATUS, 0, 0, 0, 48},
+ [CBOX0] = {MSR_UNC_V3_C0_PMON_BOX_CTL, MSR_UNC_V3_C0_PMON_BOX_STATUS, MSR_UNC_V3_C0_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C0_PMON_BOX_FILTER0, MSR_UNC_V3_C0_PMON_BOX_FILTER1},
+ [CBOX1] = {MSR_UNC_V3_C1_PMON_BOX_CTL, MSR_UNC_V3_C1_PMON_BOX_STATUS, MSR_UNC_V3_C1_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C1_PMON_BOX_FILTER0, MSR_UNC_V3_C1_PMON_BOX_FILTER1},
+ [CBOX2] = {MSR_UNC_V3_C2_PMON_BOX_CTL, MSR_UNC_V3_C2_PMON_BOX_STATUS, MSR_UNC_V3_C2_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C2_PMON_BOX_FILTER0, MSR_UNC_V3_C2_PMON_BOX_FILTER1},
+ [CBOX3] = {MSR_UNC_V3_C3_PMON_BOX_CTL, MSR_UNC_V3_C3_PMON_BOX_STATUS, MSR_UNC_V3_C3_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C3_PMON_BOX_FILTER0, MSR_UNC_V3_C3_PMON_BOX_FILTER1},
+ [CBOX4] = {MSR_UNC_V3_C4_PMON_BOX_CTL, MSR_UNC_V3_C4_PMON_BOX_STATUS, MSR_UNC_V3_C4_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C4_PMON_BOX_FILTER0, MSR_UNC_V3_C4_PMON_BOX_FILTER1},
+ [CBOX5] = {MSR_UNC_V3_C5_PMON_BOX_CTL, MSR_UNC_V3_C5_PMON_BOX_STATUS, MSR_UNC_V3_C5_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C5_PMON_BOX_FILTER0, MSR_UNC_V3_C5_PMON_BOX_FILTER1},
+ [CBOX6] = {MSR_UNC_V3_C6_PMON_BOX_CTL, MSR_UNC_V3_C6_PMON_BOX_STATUS, MSR_UNC_V3_C6_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C6_PMON_BOX_FILTER0, MSR_UNC_V3_C6_PMON_BOX_FILTER1},
+ [CBOX7] = {MSR_UNC_V3_C7_PMON_BOX_CTL, MSR_UNC_V3_C7_PMON_BOX_STATUS, MSR_UNC_V3_C7_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C7_PMON_BOX_FILTER0, MSR_UNC_V3_C7_PMON_BOX_FILTER1},
+ [CBOX8] = {MSR_UNC_V3_C8_PMON_BOX_CTL, MSR_UNC_V3_C8_PMON_BOX_STATUS, MSR_UNC_V3_C8_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C8_PMON_BOX_FILTER0, MSR_UNC_V3_C8_PMON_BOX_FILTER1},
+ [CBOX9] = {MSR_UNC_V3_C9_PMON_BOX_CTL, MSR_UNC_V3_C9_PMON_BOX_STATUS, MSR_UNC_V3_C9_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C9_PMON_BOX_FILTER0, MSR_UNC_V3_C9_PMON_BOX_FILTER1},
+ [CBOX10] = {MSR_UNC_V3_C10_PMON_BOX_CTL, MSR_UNC_V3_C10_PMON_BOX_STATUS, MSR_UNC_V3_C10_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C10_PMON_BOX_FILTER0, MSR_UNC_V3_C10_PMON_BOX_FILTER1},
+ [CBOX11] = {MSR_UNC_V3_C11_PMON_BOX_CTL, MSR_UNC_V3_C11_PMON_BOX_STATUS, MSR_UNC_V3_C11_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C11_PMON_BOX_FILTER0, MSR_UNC_V3_C11_PMON_BOX_FILTER1},
+ [CBOX12] = {MSR_UNC_V3_C12_PMON_BOX_CTL, MSR_UNC_V3_C12_PMON_BOX_STATUS, MSR_UNC_V3_C12_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C12_PMON_BOX_FILTER0, MSR_UNC_V3_C12_PMON_BOX_FILTER1},
+ [CBOX13] = {MSR_UNC_V3_C13_PMON_BOX_CTL, MSR_UNC_V3_C13_PMON_BOX_STATUS, MSR_UNC_V3_C13_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C13_PMON_BOX_FILTER0, MSR_UNC_V3_C13_PMON_BOX_FILTER1},
+ [CBOX14] = {MSR_UNC_V3_C14_PMON_BOX_CTL, MSR_UNC_V3_C14_PMON_BOX_STATUS, MSR_UNC_V3_C14_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C14_PMON_BOX_FILTER0, MSR_UNC_V3_C14_PMON_BOX_FILTER1},
+ [CBOX15] = {MSR_UNC_V3_C15_PMON_BOX_CTL, MSR_UNC_V3_C15_PMON_BOX_STATUS, MSR_UNC_V3_C15_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C15_PMON_BOX_FILTER0, MSR_UNC_V3_C15_PMON_BOX_FILTER1},
+ [WBOX] = {MSR_UNC_V3_PCU_PMON_BOX_CTL, MSR_UNC_V3_PCU_PMON_BOX_STATUS, MSR_UNC_V3_PCU_PMON_BOX_STATUS, 2, 0, 0, 48, MSR_UNC_V3_PCU_PMON_BOX_FILTER},
+ [WBOX0FIX] = {0, 0, 0, -1, 0, 0, 64},
+ [BBOX0] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 21, 1, PCI_HA_DEVICE_0, 48},
+ [BBOX1] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, -1, 1, PCI_HA_DEVICE_1, 48},
+ [MBOX0] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+ [MBOX1] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+ [MBOX2] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+ [MBOX3] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+ [MBOX4] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, -1, 1, PCI_IMC_DEVICE_1_CH_0, 48},
+ [MBOX5] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, -1, 1, PCI_IMC_DEVICE_1_CH_1, 48},
+ [MBOX6] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, -1, 1, PCI_IMC_DEVICE_1_CH_2, 48},
+ [MBOX7] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, -1, 1, PCI_IMC_DEVICE_1_CH_3, 48},
+ [IBOX0] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, 34, 1, PCI_IRP_DEVICE, 48},
+ [IBOX1] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, 34, 1, PCI_IRP_DEVICE, 48},
+ [PBOX] = {PCI_UNC_R2PCIE_PMON_BOX_CTL, PCI_UNC_R2PCIE_PMON_BOX_STATUS, PCI_UNC_R2PCIE_PMON_BOX_STATUS, 29, 1, PCI_R2PCIE_DEVICE, 48},
+};
+
+static PciDevice broadwelld_pci_devices[MAX_NUM_PCI_DEVICES] = {
+ [PCI_HA_DEVICE_0] = {HA, "12.1", "PCI_HA_DEVICE_0", "BBOX0", 0x6F30},
+ [PCI_HA_DEVICE_1] = {HA, "12.5", "PCI_HA_DEVICE_1", "BBOX1", 0x6F38},
+ [PCI_IMC_DEVICE_0_CH_0] = {IMC, "14.0", "PCI_IMC_DEVICE_0_CH_0", "MBOX0", 0x6FB4},
+ [PCI_IMC_DEVICE_0_CH_1] = {IMC, "14.1", "PCI_IMC_DEVICE_0_CH_1", "MBOX1", 0x6FB5},
+ [PCI_IMC_DEVICE_0_CH_2] = {IMC, "15.0", "PCI_IMC_DEVICE_0_CH_2", "MBOX2", 0x6FB0},
+ [PCI_IMC_DEVICE_0_CH_3] = {IMC, "15.1", "PCI_IMC_DEVICE_0_CH_3", "MBOX3", 0x6FB1},
+ [PCI_IMC_DEVICE_1_CH_0] = {IMC, "17.0", "PCI_IMC_DEVICE_1_CH_0", "MBOX4", 0x6FD4},
+ [PCI_IMC_DEVICE_1_CH_1] = {IMC, "17.1", "PCI_IMC_DEVICE_1_CH_1", "MBOX5", 0x6FD5},
+ [PCI_IMC_DEVICE_1_CH_2] = {IMC, "18.0", "PCI_IMC_DEVICE_1_CH_2", "MBOX6", 0x6FD0},
+ [PCI_IMC_DEVICE_1_CH_3] = {IMC, "18.1", "PCI_IMC_DEVICE_1_CH_3", "MBOX7", 0x6FD1},
+ [PCI_IRP_DEVICE] = {IRP, "05.6", "PCI_IRP_DEVICE", "IBOX0", 0x6F39},
+ [PCI_R2PCIE_DEVICE] = {R2PCIE, "10.1", "PCI_R2PCIE_DEVICE", "PBOX0", 0x6F34},
+};
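
For orientation, each row of broadwelld_counter_map above couples a counter name (e.g. "MBOX7C2") with its logical unit, its PMON control register, one or two counter registers, the PCI device it sits behind, and a mask of valid event options; broadwelld_box_map then carries the per-unit control/status registers, the overflow bit position, a PCI flag, the device index and the counter width; and broadwelld_pci_devices maps each device index to its bus location ("device.function") and PCI device ID. The sketch below is illustrative only, with hypothetical struct and field names chosen to mirror the column order, not likwid's actual type definitions:

    /* Illustrative stand-ins for the rows above; field names are hypothetical
     * and likwid's real RegisterMap/BoxMap/PciDevice types may differ. */
    typedef struct {
        const char *key;            /* counter name, e.g. "MBOX7C2"           */
        int         index;          /* global counter index (PMC130, ...)     */
        int         type;           /* owning unit (MBOX7, IBOX0, PBOX, ...)  */
        unsigned    configRegister; /* PMON control register                  */
        unsigned    counterRegA;    /* first counter register                 */
        unsigned    counterRegB;    /* second/high register, 0 if unused      */
        int         device;         /* PCI device index, e.g. PCI_IMC_...     */
        unsigned long long optionMask; /* allowed EVENT_OPTION_* bits         */
    } CounterRowSketch;

    typedef struct {
        unsigned ctrlRegister;   /* unit-wide PMON box control                */
        unsigned statusRegister; /* overflow status                           */
        unsigned ovflRegister;   /* overflow clear (same register here)       */
        int      ovflOffset;     /* bit in the global status word, -1 if none */
        int      isPci;          /* 1 = accessed through PCI config space     */
        int      device;         /* PCI device index when isPci is set        */
        int      regWidth;       /* counter width in bits (48 or 64 above)    */
    } BoxRowSketch;

Read with these stand-ins, the row {"MBOX7C2", PMC130, MBOX7, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_3, BDW_D_VALID_OPTIONS_MBOX} says: counter MBOX7C2 belongs to memory-controller box 7, is programmed through PCI_UNC_MC_PMON_CTL_2 and read from the A/B counter pair, all behind the PCI device at "18.1" (device ID 0x6FD1).
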
diff --git a/src/includes/perfmon_broadwelld_events.txt b/src/includes/perfmon_broadwelld_events.txt
new file mode 100644
index 0000000..88c5add
--- /dev/null
+++ b/src/includes/perfmon_broadwelld_events.txt
@@ -0,0 +1,1984 @@
+# =======================================================================================
+#
+# Filename: perfmon_broadwelld_events.txt
+#
+# Description: Event list for Intel Broadwell D
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+# Project: likwid
+#
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, either version 3 of the License, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
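
The stanzas that follow all share one layout: an EVENT_<NAME> line giving the event code and the counter type it may run on (PMC, FIXCx, PWRx, or an uncore unit such as CBOX or MBOX), one UMASK_<NAME>_<VARIANT> line per unit-mask variant, and optional DEFAULT_OPTIONS_/OPTIONS_ lines naming the event options applied to or permitted for that variant. As a rough illustration of how a core-event stanza ends up in hardware, the sketch below encodes an event/umask pair plus a few of those options into an architectural IA32_PERFEVTSELx value; the bit layout follows the Intel SDM, but this is not likwid's actual setup code, which also covers uncore units, PCI-attached counters and option validation.

    #include <stdint.h>

    /* Minimal sketch (not likwid's implementation): build an IA32_PERFEVTSELx
     * value from an event code, a unit mask and a few common options. */
    static uint64_t encode_perfevtsel(uint8_t event, uint8_t umask,
                                      int edge, int invert, uint8_t cmask)
    {
        uint64_t val = 0;
        val |= (uint64_t)event;        /* bits  0-7:  event select        */
        val |= (uint64_t)umask << 8;   /* bits  8-15: unit mask           */
        val |= 1ULL << 16;             /* USR: count in user mode         */
        val |= 1ULL << 17;             /* OS:  count in kernel mode       */
        if (edge)
            val |= 1ULL << 18;         /* EVENT_OPTION_EDGE               */
        val |= 1ULL << 22;             /* EN: enable the counter          */
        if (invert)
            val |= 1ULL << 23;         /* EVENT_OPTION_INVERT             */
        val |= (uint64_t)cmask << 24;  /* EVENT_OPTION_THRESHOLD (cmask)  */
        return val;
    }

For example, the UOPS_ISSUED_STALL_CYCLES entry below (event 0x0E, umask 0x01, INVERT=1, THRESHOLD=0x1) would correspond to encode_perfevtsel(0x0E, 0x01, 0, 1, 0x01).
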
+EVENT_TEMP_CORE 0x00 TMP0
+UMASK_TEMP_CORE 0x00
+
+EVENT_PWR_PKG_ENERGY 0x00 PWR0
+UMASK_PWR_PKG_ENERGY 0x00
+
+EVENT_PWR_PP0_ENERGY 0x00 PWR1
+UMASK_PWR_PP0_ENERGY 0x00
+
+EVENT_PWR_PP1_ENERGY 0x00 PWR2
+UMASK_PWR_PP1_ENERGY 0x00
+
+EVENT_PWR_DRAM_ENERGY 0x00 PWR3
+UMASK_PWR_DRAM_ENERGY 0x00
+
+EVENT_INSTR_RETIRED 0x00 FIXC0
+UMASK_INSTR_RETIRED_ANY 0x00
+
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE 0x00
+
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC2
+UMASK_CPU_CLK_UNHALTED_REF 0x00
+
+EVENT_LD_BLOCKS 0x03 PMC
+UMASK_LD_BLOCKS_STORE_FORWARD 0x02
+UMASK_LD_BLOCKS_NO_SR 0x08
+
+EVENT_MISALIGN_MEM_REF 0x05 PMC
+UMASK_MISALIGN_MEM_REF_LOADS 0x01
+UMASK_MISALIGN_MEM_REF_STORES 0x02
+UMASK_MISALIGN_MEM_REF_ANY 0x03
+
+EVENT_LD_BLOCKS_PARTIAL 0x07 PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS 0x01
+
+EVENT_DTLB_LOAD_MISSES 0x08 PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK 0x01
+UMASK_DTLB_LOAD_MISSES_STLB_HIT 0x60
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED 0x0E
+UMASK_DTLB_LOAD_MISSES_STLB_HIT_4K 0x20
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_4K 0x02
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION 0x10
+
+EVENT_INT_MISC 0x0D PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RECOVERY_CYCLES 0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RECOVERY_COUNT 0x03
+DEFAULT_OPTIONS_INT_MISC_RAT_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RAT_STALL_CYCLES 0x08
+DEFAULT_OPTIONS_INT_MISC_RAT_STALL_COUNT EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RAT_STALL_COUNT 0x08
+
+EVENT_UOPS_ISSUED 0x0E PMC
+UMASK_UOPS_ISSUED_ANY 0x01
+UMASK_UOPS_ISSUED_FLAGS_MERGE 0x10
+UMASK_UOPS_ISSUED_SLOW_LEA 0x20
+UMASK_UOPS_ISSUED_SINGLE_MUL 0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ANY 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_FLAGS_MERGE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_FLAGS_MERGE 0x10
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SLOW_LEA EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SLOW_LEA 0x20
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SINGLE_MUL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SINGLE_MUL 0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_ARITH_FPU_DIV_ACTIVE 0x14 PMC
+UMASK_ARITH_FPU_DIV_ACTIVE 0x01
+
+EVENT_L2_RQSTS 0x24 PMC
+UMASK_L2_RQSTS_DEMAND_DATA_RD_MISS 0x21
+UMASK_L2_RQSTS_DEMAND_DATA_RD_HIT 0x41
+UMASK_L2_RQSTS_RFO_MISS 0x22
+UMASK_L2_RQSTS_RFO_HIT 0x42
+UMASK_L2_RQSTS_CODE_RD_MISS 0x24
+UMASK_L2_RQSTS_CODE_RD_HIT 0x44
+UMASK_L2_RQSTS_L2_PF_HIT 0x50
+UMASK_L2_RQSTS_L2_PF_MISS 0x30
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD 0xE1
+UMASK_L2_RQSTS_ALL_DEMAND_MISS 0x27
+UMASK_L2_RQSTS_ALL_RFO 0xE2
+UMASK_L2_RQSTS_ALL_CODE_RD 0xE4
+UMASK_L2_RQSTS_ALL_DEMAND_REFERENCES 0xE7
+UMASK_L2_RQSTS_ALL_PF 0xF8
+UMASK_L2_RQSTS_MISS 0x3F
+UMASK_L2_RQSTS_REFERENCES 0xFF
+
+EVENT_L2_DEMAND_RQST_WB_HIT 0x27 PMC
+UMASK_L2_DEMAND_RQST_WB_HIT 0x50
+
+EVENT_LONGEST_LAT_CACHE 0x2E PMC
+UMASK_LONGEST_LAT_CACHE_REFERENCE 0x4F
+UMASK_LONGEST_LAT_CACHE_MISS 0x41
+
+EVENT_CPU_CLOCK_UNHALTED 0x3C PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P 0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK 0x01
+UMASK_CPU_CLOCK_UNHALTED_ONE_THREAD_ACTIVE 0x02
+
+EVENT_L1D_PEND_MISS 0x48 PMC2
+UMASK_L1D_PEND_MISS_PENDING 0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_L1D_PEND_MISS_PENDING_CYCLES 0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_OCCURRENCES EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_L1D_PEND_MISS_OCCURRENCES 0x01
+
+EVENT_DTLB_STORE_MISSES 0x49 PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK 0x01
+UMASK_DTLB_STORE_MISSES_STLB_HIT 0x60
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED 0x0E
+UMASK_DTLB_STORE_MISSES_STLB_HIT_4K 0x20
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_4K 0x02
+UMASK_DTLB_STORE_MISSES_WALK_DURATION 0x10
+
+EVENT_LOAD_HIT_PRE 0x4C PMC
+UMASK_LOAD_HIT_PRE_HW_PF 0x02
+
+EVENT_EPT_WALK_CYCLES 0x4F PMC
+UMASK_EPT_WALK_CYCLES 0x10
+
+EVENT_L1D 0x51 PMC
+UMASK_L1D_REPLACEMENT 0x01
+
+EVENT_TX_MEM 0x54 PMC
+UMASK_TX_MEM_ABORT_CONFLICT 0x01
+UMASK_TX_MEM_ABORT_CAPACITY_WRITE 0x02
+UMASK_TX_MEM_ABORT_HLE_STORE_TO_ELIDED_LOCK 0x04
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_NOT_EMPTY 0x08
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_MISMATCH 0x10
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGNMENT 0x20
+UMASK_TX_MEM_HLE_ELISION_BUFFER_FULL 0x40
+
+EVENT_MOVE_ELIMINATION 0x58 PMC
+UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED 0x04
+UMASK_MOVE_ELIMINATION_SIMD_NOT_ELIMINATED 0x08
+UMASK_MOVE_ELIMINATION_INT_ELIMINATED 0x01
+UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED 0x02
+
+EVENT_CPL_CYCLES 0x5C PMC
+UMASK_CPL_CYCLES_RING0 0x01
+UMASK_CPL_CYCLES_RING123 0x02
+DEFAULT_OPTIONS_CPL_CYCLES_RING0_TRANS EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_CPL_CYCLES_RING0_TRANS 0x01
+
+EVENT_TX_EXEC 0x5D PMC
+UMASK_TX_EXEC_MISC1 0x01
+UMASK_TX_EXEC_MISC2 0x02
+UMASK_TX_EXEC_MISC3 0x04
+UMASK_TX_EXEC_MISC4 0x08
+UMASK_TX_EXEC_MISC5 0x10
+
+EVENT_RS_EVENTS 0x5E PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES 0x01
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING 0x60 PMC
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD EVENT_OPTION_THRESHOLD=0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD 0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD 0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO 0x04
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD EVENT_OPTION_THRESHOLD=0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD 0x08
+
+EVENT_LOCK_CYCLES 0x63 PMC
+UMASK_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
+UMASK_LOCK_CYCLES_CACHE_LOCK_DURATION 0x02
+
+EVENT_IDQ 0x79 PMC
+UMASK_IDQ_EMPTY 0x02
+UMASK_IDQ_MITE_UOPS 0x04
+DEFAULT_OPTIONS_IDQ_MITE_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MITE_CYCLES 0x04
+UMASK_IDQ_DSB_UOPS 0x08
+DEFAULT_OPTIONS_IDQ_DSB_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_DSB_CYCLES 0x08
+UMASK_IDQ_MS_DSB_UOPS 0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_DSB_CYCLES 0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_OCCUR EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_DSB_OCCUR 0x10
+UMASK_IDQ_MS_MITE_UOPS 0x20
+DEFAULT_OPTIONS_IDQ_MS_MITE_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_MITE_CYCLES 0x20
+UMASK_IDQ_MS_UOPS 0x30
+DEFAULT_OPTIONS_IDQ_MS_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_CYCLES 0x30
+DEFAULT_OPTIONS_IDQ_MS_SWITCHES EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_SWITCHES 0x30
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS 0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS 0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS 0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS 0x24
+UMASK_IDQ_MITE_ALL_UOPS 0x3C
+
+EVENT_ICACHE 0x80 PMC
+UMASK_ICACHE_HIT 0x01
+UMASK_ICACHE_MISSES 0x02
+UMASK_ICACHE_ACCESSES 0x03
+
+EVENT_ITLB_MISSES 0x85 PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK 0x01
+UMASK_ITLB_MISSES_STLB_HIT 0x60
+UMASK_ITLB_MISSES_WALK_COMPLETED 0x0E
+UMASK_ITLB_MISSES_STLB_HIT_4K 0x20
+UMASK_ITLB_MISSES_WALK_COMPLETED_4K 0x02
+UMASK_ITLB_MISSES_WALK_DURATION 0x10
+
+EVENT_ILD_STALL 0x87 PMC
+UMASK_ILD_STALL_LCP 0x01
+
+EVENT_BR_INST_EXEC 0x88 PMC
+UMASK_BR_INST_EXEC_COND_TAKEN 0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN 0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN 0x82
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN 0x88
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_INST_EXEC_ALL_CONDITIONAL 0xC1
+UMASK_BR_INST_EXEC_ALL_DIRECT_JMP 0xC2
+UMASK_BR_INST_EXEC_ALL_DIRECT_NEAR_CALL 0xD0
+UMASK_BR_INST_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET 0xC4
+UMASK_BR_INST_EXEC_ALL_INDIRECT_NEAR_RETURN 0xC8
+UMASK_BR_INST_EXEC_ALL_BRANCHES 0xFF
+
+EVENT_BR_MISP_EXEC 0x89 PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN 0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN 0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN 0x88
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_MISP_EXEC_ALL_CONDITIONAL 0xC1
+UMASK_BR_MISP_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET 0xC4
+UMASK_BR_MISP_EXEC_ALL_BRANCHES 0xFF
+
+EVENT_IDQ_UOPS_NOT_DELIVERED 0x9C PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x03
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x02
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_INVERT=1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK 0x01
+
+EVENT_UOP_DISPATCHES_CANCELLED_SIMD_PRF 0xA0 PMC
+UMASK_UOP_DISPATCHES_CANCELLED_SIMD_PRF 0x03
+
+EVENT_UOPS_EXECUTED_PORT 0xA1 PMC
+UMASK_UOPS_EXECUTED_PORT_PORT_0 0x01
+UMASK_UOPS_EXECUTED_PORT_PORT_1 0x02
+UMASK_UOPS_EXECUTED_PORT_PORT_2 0x04
+UMASK_UOPS_EXECUTED_PORT_PORT_3 0x08
+UMASK_UOPS_EXECUTED_PORT_PORT_4 0x10
+UMASK_UOPS_EXECUTED_PORT_PORT_5 0x20
+UMASK_UOPS_EXECUTED_PORT_PORT_6 0x40
+UMASK_UOPS_EXECUTED_PORT_PORT_7 0x80
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_0_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_0_CORE 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_1_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_1_CORE 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_2_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_2_CORE 0x04
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_3_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_3_CORE 0x08
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_4_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_4_CORE 0x10
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_5_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_5_CORE 0x20
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_6_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_6_CORE 0x40
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_7_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_7_CORE 0x80
+
+EVENT_RESOURCE_STALLS 0xA2 PMC
+UMASK_RESOURCE_STALLS_ANY 0x01
+UMASK_RESOURCE_STALLS_RS 0x04
+UMASK_RESOURCE_STALLS_SB 0x08
+UMASK_RESOURCE_STALLS_ROB 0x10
+
+EVENT_CYCLE_ACTIVITY_CYCLES_L1D_MISS 0xA3 PMC2
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L1D_MISS EVENT_OPTION_THRESHOLD=0x08
+UMASK_CYCLE_ACTIVITY_CYCLES_L1D_MISS 0x08
+
+EVENT_CYCLE_ACTIVITY_CYCLES 0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L2_MISS EVENT_OPTION_THRESHOLD=0x01
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_MISS 0x01
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_MEM_ANY EVENT_OPTION_THRESHOLD=0x02
+UMASK_CYCLE_ACTIVITY_CYCLES_MEM_ANY 0x02
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE EVENT_OPTION_THRESHOLD=0x04
+UMASK_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE 0x04
+
+EVENT_CYCLE_ACTIVITY_STALLS_L1D_MISS 0xA3 PMC2
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L1D_MISS EVENT_OPTION_THRESHOLD=0x0C
+UMASK_CYCLE_ACTIVITY_STALLS_L1D_MISS 0x0C
+
+EVENT_CYCLE_ACTIVITY_STALLS 0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L2_MISS EVENT_OPTION_THRESHOLD=0x05
+UMASK_CYCLE_ACTIVITY_STALLS_L2_MISS 0x05
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_MEM_ANY EVENT_OPTION_THRESHOLD=0x06
+UMASK_CYCLE_ACTIVITY_STALLS_MEM_ANY 0x06
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_TOTAL EVENT_OPTION_THRESHOLD=0x04
+UMASK_CYCLE_ACTIVITY_STALLS_TOTAL 0x04
+
+EVENT_LSD_UOPS 0xA8 PMC
+UMASK_LSD_UOPS 0x01
+DEFAULT_OPTIONS_LSD_CYCLES_ACTIVE EVENT_OPTION_THRESHOLD=0x01
+UMASK_LSD_CYCLES_ACTIVE 0x01
+DEFAULT_OPTIONS_LSD_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_LSD_CYCLES_4_UOPS 0x01
+
+EVENT_DSB2MITE_SWITCHES_PENALTY_CYCLES 0xAB PMC
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
+
+EVENT_ITLB 0xAE PMC
+UMASK_ITLB_ITLB_FLUSH 0x01
+
+EVENT_OFFCORE_REQUESTS 0xB0 PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD 0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD 0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO 0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD 0x08
+
+EVENT_UOPS_EXECUTED 0xB1 PMC
+UMASK_UOPS_EXECUTED_THREAD 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0xB2 PMC
+UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0x01
+
+EVENT_PAGE_WALKER_LOADS 0xBC PMC
+UMASK_PAGE_WALKER_LOADS_DTLB_L1 0x11
+UMASK_PAGE_WALKER_LOADS_ITLB_L1 0x21
+UMASK_PAGE_WALKER_LOADS_DTLB_L2 0x12
+UMASK_PAGE_WALKER_LOADS_ITLB_L2 0x22
+UMASK_PAGE_WALKER_LOADS_DTLB_L3 0x14
+UMASK_PAGE_WALKER_LOADS_ITLB_L3 0x24
+UMASK_PAGE_WALKER_LOADS_DTLB_MEMORY 0x18
+
+EVENT_INST_RETIRED_ANY_P 0xC0 PMC
+UMASK_INST_RETIRED_ANY_P 0x00
+UMASK_INST_RETIRED_X87 0x02
+
+EVENT_INST_RETIRED_PREC 0xC0 PMC1
+UMASK_INST_RETIRED_PREC_DIST 0x01
+
+EVENT_OTHER_ASSISTS 0xC1 PMC
+UMASK_OTHER_ASSISTS_AVX_TO_SSE 0x08
+UMASK_OTHER_ASSISTS_SSE_TO_AVX 0x10
+UMASK_OTHER_ASSISTS_ANY_WB_ASSIST 0x40
+
+EVENT_UOPS_RETIRED 0xC2 PMC
+UMASK_UOPS_RETIRED_ALL 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL 0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_MACHINE_CLEARS 0xC3 PMC
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_MACHINE_CLEARS_COUNT 0x01
+UMASK_MACHINE_CLEARS_CYCLES 0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING 0x02
+UMASK_MACHINE_CLEARS_SMC 0x04
+UMASK_MACHINE_CLEARS_MASKMOV 0x20
+
+EVENT_BR_INST_RETIRED 0xC4 PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL 0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL 0x02
+UMASK_BR_INST_RETIRED_NEAR_RETURN 0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN 0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN 0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH 0x40
+
+EVENT_BR_MISP_RETIRED 0xC5 PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL 0x01
+UMASK_BR_MISP_RETIRED_NEAR_TAKEN 0x20
+
+EVENT_FP_ARITH_INST_RETIRED 0xC7 PMC
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_DOUBLE 0x01
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_SINGLE 0x02
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE 0x04
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE 0x08
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE 0x10
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE 0x20
+UMASK_FP_ARITH_INST_RETIRED_SCALAR 0x03
+UMASK_FP_ARITH_INST_RETIRED_PACKED 0x3C
+UMASK_FP_ARITH_INST_RETIRED_DOUBLE 0x15
+UMASK_FP_ARITH_INST_RETIRED_SINGLE 0x2A
+
+EVENT_HLE_RETIRED 0xC8 PMC
+UMASK_HLE_RETIRED_START 0x01
+UMASK_HLE_RETIRED_COMMIT 0x02
+UMASK_HLE_RETIRED_ABORTED 0x04
+UMASK_HLE_RETIRED_ABORTED_MISC1 0x08
+UMASK_HLE_RETIRED_ABORTED_MISC2 0x10
+UMASK_HLE_RETIRED_ABORTED_MISC3 0x20
+UMASK_HLE_RETIRED_ABORTED_MISC4 0x40
+UMASK_HLE_RETIRED_ABORTED_MISC5 0x80
+
+EVENT_RTM_RETIRED 0xC9 PMC
+UMASK_RTM_RETIRED_START 0x01
+UMASK_RTM_RETIRED_COMMIT 0x02
+UMASK_RTM_RETIRED_ABORTED 0x04
+UMASK_RTM_RETIRED_ABORTED_MISC1 0x08
+UMASK_RTM_RETIRED_ABORTED_MISC2 0x10
+UMASK_RTM_RETIRED_ABORTED_MISC3 0x20
+UMASK_RTM_RETIRED_ABORTED_MISC4 0x40
+UMASK_RTM_RETIRED_ABORTED_MISC5 0x80
+
+EVENT_FP_ASSIST 0xCA PMC
+UMASK_FP_ASSIST_X87_OUTPUT 0x02
+UMASK_FP_ASSIST_X87_INPUT 0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT 0x08
+UMASK_FP_ASSIST_SIMD_INPUT 0x10
+UMASK_FP_ASSIST_ANY 0x1E
+
+EVENT_ROB_MISC_EVENT_LBR_INSERTS 0xCC PMC
+UMASK_ROB_MISC_EVENT_LBR_INSERTS 0x20
+
+EVENT_MEM_UOPS_RETIRED 0xD0 PMC
+UMASK_MEM_UOPS_RETIRED_LOADS_ALL 0x81
+UMASK_MEM_UOPS_RETIRED_STORES_ALL 0x82
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK 0x21
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS 0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT 0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT 0x42
+
+EVENT_MEM_LOAD_UOPS_RETIRED 0xD1 PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT 0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS 0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL 0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT 0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS 0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL 0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT 0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS 0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL 0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB 0x40
+
+EVENT_MEM_LOAD_UOPS_L3_HIT_RETIRED 0xD2 PMC
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_MISS 0x01
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HIT 0x02
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM 0x04
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_NONE 0x08
+
+EVENT_MEM_LOAD_UOPS_L3_MISS_RETIRED 0xD3 PMC
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_LOCAL_DRAM 0x01
+
+EVENT_BACLEARS 0xE6 PMC
+UMASK_BACLEARS_ANY 0x1F
+
+EVENT_L2_TRANS 0xF0 PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO 0x02
+UMASK_L2_TRANS_CODE_RD 0x04
+UMASK_L2_TRANS_ALL_PF 0x08
+UMASK_L2_TRANS_L1D_WB 0x10
+UMASK_L2_TRANS_L2_FILL 0x20
+UMASK_L2_TRANS_L2_WB 0x40
+UMASK_L2_TRANS_ALL_REQUESTS 0x80
+
+EVENT_L2_LINES_IN 0xF1 PMC
+UMASK_L2_LINES_IN_I 0x01
+UMASK_L2_LINES_IN_S 0x02
+UMASK_L2_LINES_IN_E 0x04
+UMASK_L2_LINES_IN_ALL 0x07
+
+EVENT_L2_LINES_OUT 0xF2 PMC
+UMASK_L2_LINES_OUT_DEMAND_CLEAN 0x05
+UMASK_L2_LINES_OUT_DEMAND_DIRTY 0x06
+
+EVENT_OFFCORE_RESPONSE_0 0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1 0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_EVENT_MSG 0x42 UBOX
+UMASK_EVENT_MSG_DOORBELL_RCVD 0x08
+
+EVENT_PHOLD_CYCLES 0x45 UBOX
+UMASK_PHOLD_CYCLES_ASSERT_TO_ACK 0x01
+
+EVENT_RACU_REQUESTS 0x46 UBOX
+UMASK_RACU_REQUESTS 0x00
+
+EVENT_UNCORE_CLOCK 0x00 UBOXFIX
+UMASK_UNCORE_CLOCK 0x00
+
+EVENT_CBOX_CLOCKTICKS 0x00 CBOX
+UMASK_CBOX_CLOCKTICKS 0x00
+
+EVENT_TXR_INSERTS 0x02 CBOX
+UMASK_TXR_INSERTS_AD_CACHE 0x01
+UMASK_TXR_INSERTS_AK_CACHE 0x02
+UMASK_TXR_INSERTS_BL_CACHE 0x04
+UMASK_TXR_INSERTS_IV_CACHE 0x08
+UMASK_TXR_INSERTS_AD_CORE 0x10
+UMASK_TXR_INSERTS_AK_CORE 0x20
+UMASK_TXR_INSERTS_BL_CORE 0x40
+
+EVENT_TXR_ADS_USED 0x04 CBOX
+UMASK_TXR_ADS_USED_AD 0x01
+UMASK_TXR_ADS_USED_AK 0x02
+UMASK_TXR_ADS_USED_BL 0x04
+
+EVENT_RING_BOUNCES 0x05 CBOX
+UMASK_RING_BOUNCES_AD 0x01
+UMASK_RING_BOUNCES_AK 0x02
+UMASK_RING_BOUNCES_BL 0x04
+UMASK_RING_BOUNCES_IV 0x10
+
+EVENT_RING_SRC_THRTL 0x07 CBOX
+UMASK_RING_SRC_THRTL 0x00
+
+EVENT_FAST_ASSERTED 0x09 CBOX0C0|CBOX0C1|CBOX1C0|CBOX1C1|CBOX2C0|CBOX2C1|CBOX3C0|CBOX3C1|CBOX4C0|CBOX4C1|CBOX5C0|CBOX5C1|CBOX6C0|CBOX6C1|CBOX7C0|CBOX7C1|CBOX8C0|CBOX8C1|CBOX9C0|CBOX9C1|CBOX10C0|CBOX10C1|CBOX11C0|CBOX11C1|CBOX12C0|CBOX12C1|CBOX13C0|CBOX13C1|CBOX14C0|CBOX14C1|CBOX15C0|CBOX15C1
+UMASK_FAST_ASSERTED 0x00
+
+EVENT_BOUNCE_CONTROL 0x0A CBOX
+UMASK_BOUNCE_CONTROL 0x00
+
+EVENT_RING_AD_USED 0x1B CBOX
+UMASK_RING_AD_USED_UP_EVEN 0x01
+UMASK_RING_AD_USED_UP_ODD 0x02
+UMASK_RING_AD_USED_UP 0x03
+UMASK_RING_AD_USED_DOWN_EVEN 0x04
+UMASK_RING_AD_USED_DOWN_ODD 0x08
+UMASK_RING_AD_USED_DOWN 0x0C
+UMASK_RING_AD_USED_ANY 0x0F
+
+EVENT_RING_AK_USED 0x1C CBOX
+UMASK_RING_AK_USED_UP_EVEN 0x01
+UMASK_RING_AK_USED_UP_ODD 0x02
+UMASK_RING_AK_USED_UP 0x03
+UMASK_RING_AK_USED_DOWN_EVEN 0x04
+UMASK_RING_AK_USED_DOWN_ODD 0x08
+UMASK_RING_AK_USED_DOWN 0x0C
+UMASK_RING_AK_USED_ANY 0x0F
+
+EVENT_RING_BL_USED 0x1D CBOX
+UMASK_RING_BL_USED_UP_EVEN 0x01
+UMASK_RING_BL_USED_UP_ODD 0x02
+UMASK_RING_BL_USED_UP 0x03
+UMASK_RING_BL_USED_DOWN_EVEN 0x04
+UMASK_RING_BL_USED_DOWN_ODD 0x08
+UMASK_RING_BL_USED_DOWN 0x0C
+UMASK_RING_BL_USED_ANY 0x0F
+
+EVENT_RING_IV_USED 0x1E CBOX
+UMASK_RING_IV_USED_UP 0x03
+UMASK_RING_IV_USED_DN 0x0C
+UMASK_RING_IV_USED_ANY 0x0F
+
+EVENT_COUNTER0_OCCUPANCY 0x1F CBOX
+UMASK_COUNTER0_OCCUPANCY 0x00
+
+EVENT_RXR_OCCUPANCY 0x11 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0
+UMASK_RXR_OCCUPANCY_IRQ 0x01
+UMASK_RXR_OCCUPANCY_IRQ_REJ 0x02
+UMASK_RXR_OCCUPANCY_IPQ 0x04
+UMASK_RXR_OCCUPANCY_PRQ_REJ 0x20
+
+EVENT_RXR_EXT_STARVED 0x12 CBOX
+UMASK_RXR_EXT_STARVED_IRQ 0x01
+UMASK_RXR_EXT_STARVED_IPQ 0x02
+UMASK_RXR_EXT_STARVED_PRQ 0x04
+UMASK_RXR_EXT_STARVED_ISMQ_BIDS 0x08
+
+EVENT_RXR_INSERTS 0x13 CBOX
+UMASK_RXR_INSERTS_IRQ 0x01
+UMASK_RXR_INSERTS_IRQ_REJ 0x02
+UMASK_RXR_INSERTS_IPQ 0x04
+UMASK_RXR_INSERTS_PRQ 0x10
+UMASK_RXR_INSERTS_PRQ_REJ 0x20
+
+EVENT_RXR_IPQ_RETRY 0x31 CBOX
+UMASK_RXR_IPQ_RETRY_ANY 0x01
+UMASK_RXR_IPQ_RETRY_FULL 0x02
+UMASK_RXR_IPQ_RETRY_ADDR_CONFLICT 0x04
+UMASK_RXR_IPQ_RETRY_QPI_CREDITS 0x10
+
+EVENT_RXR_IPQ_RETRY2 0x28 CBOX
+UMASK_RXR_IPQ_RETRY2_AD_SBO 0x01
+OPTIONS_RXR_IPQ_RETRY2_TARGET EVENT_OPTION_NID_MASK
+UMASK_RXR_IPQ_RETRY2_TARGET 0x40
+
+EVENT_RXR_IRQ_RETRY 0x32 CBOX
+UMASK_RXR_IRQ_RETRY_ANY 0x01
+UMASK_RXR_IRQ_RETRY_FULL 0x02
+UMASK_RXR_IRQ_RETRY_ADDR_CONFLICT 0x04
+UMASK_RXR_IRQ_RETRY_RTID 0x08
+UMASK_RXR_IRQ_RETRY_QPI_CREDITS 0x10
+UMASK_RXR_IRQ_RETRY_IIO_CREDITS 0x20
+OPTIONS_RXR_IRQ_RETRY_NID EVENT_OPTION_NID_MASK
+UMASK_RXR_IRQ_RETRY_NID 0x40
+
+EVENT_RXR_IRQ_RETRY2 0x29 CBOX
+UMASK_RXR_IRQ_RETRY2_AD_SBO 0x01
+UMASK_RXR_IRQ_RETRY2_BL_SBO 0x02
+OPTIONS_RXR_IRQ_RETRY2_TARGET EVENT_OPTION_NID_MASK
+UMASK_RXR_IRQ_RETRY2_TARGET 0x40
+
+EVENT_RXR_ISMQ_RETRY 0x33 CBOX
+UMASK_RXR_ISMQ_RETRY_ANY 0x01
+UMASK_RXR_ISMQ_RETRY_FULL 0x02
+UMASK_RXR_ISMQ_RETRY_RTID 0x08
+UMASK_RXR_ISMQ_RETRY_QPI_CREDITS 0x10
+UMASK_RXR_ISMQ_RETRY_IIO_CREDITS 0x20
+OPTIONS_RXR_ISMQ_RETRY_NID EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY_NID 0x40
+OPTIONS_RXR_ISMQ_RETRY_WB_CREDITS EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY_WB_CREDITS 0x80
+
+EVENT_RXR_ISMQ_RETRY2 0x2A CBOX
+UMASK_RXR_ISMQ_RETRY2_AD_SBO 0x01
+UMASK_RXR_ISMQ_RETRY2_BL_SBO 0x02
+OPTIONS_RXR_ISMQ_RETRY2_TARGET EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY2_TARGET 0x40
+
+EVENT_LLC_LOOKUP 0x34 CBOX
+OPTIONS_LLC_LOOKUP_DATA_READ EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_DATA_READ 0x03
+OPTIONS_LLC_LOOKUP_WRITE EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_WRITE 0x05
+OPTIONS_LLC_LOOKUP_REMOTE_SNOOP EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_REMOTE_SNOOP 0x09
+OPTIONS_LLC_LOOKUP_ANY EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_ANY 0x11
+OPTIONS_LLC_LOOKUP_READ EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_READ 0x21
+OPTIONS_LLC_LOOKUP_NID EVENT_OPTION_STATE_MASK|EVENT_OPTION_NID_MASK
+UMASK_LLC_LOOKUP_NID 0x41
+
+EVENT_LLC_VICTIMS 0x37 CBOX
+UMASK_LLC_VICTIMS_M 0x01
+UMASK_LLC_VICTIMS_E 0x02
+UMASK_LLC_VICTIMS_S 0x04
+UMASK_LLC_VICTIMS_F 0x08
+UMASK_LLC_VICTIMS_MISS 0x10
+OPTIONS_LLC_VICTIMS_NID EVENT_OPTION_STATE_MASK|EVENT_OPTION_NID_MASK
+UMASK_LLC_VICTIMS_NID 0x40
+
+EVENT_TOR_INSERTS 0x35 CBOX
+OPTIONS_TOR_INSERTS_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_OPCODE 0x01
+OPTIONS_TOR_INSERTS_MISS_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_OPCODE 0x03
+UMASK_TOR_INSERTS_EVICTION 0x04
+UMASK_TOR_INSERTS_ALL 0x08
+UMASK_TOR_INSERTS_WB 0x10
+OPTIONS_TOR_INSERTS_LOCAL_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_LOCAL_OPCODE 0x21
+OPTIONS_TOR_INSERTS_MISS_LOCAL_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_LOCAL_OPCODE 0x23
+UMASK_TOR_INSERTS_LOCAL 0x28
+UMASK_TOR_INSERTS_MISS_LOCAL 0x2A
+OPTIONS_TOR_INSERTS_NID_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_OPCODE 0x41
+OPTIONS_TOR_INSERTS_NID_MISS_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_MISS_OPCODE 0x43
+OPTIONS_TOR_INSERTS_NID_EVICTION EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_EVICTION 0x44
+OPTIONS_TOR_INSERTS_NID_ALL EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_ALL 0x48
+OPTIONS_TOR_INSERTS_NID_MISS_ALL EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_MISS_ALL 0x4A
+OPTIONS_TOR_INSERTS_NID_WB EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_WB 0x50
+OPTIONS_TOR_INSERTS_REMOTE_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_REMOTE_OPCODE 0x81
+OPTIONS_TOR_INSERTS_MISS_REMOTE_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_REMOTE_OPCODE 0x83
+UMASK_TOR_INSERTS_REMOTE 0x88
+UMASK_TOR_INSERTS_MISS_REMOTE 0x8A
+
+EVENT_TOR_OCCUPANCY 0x36 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0
+OPTIONS_TOR_OCCUPANCY_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_OPCODE 0x01
+OPTIONS_TOR_OCCUPANCY_MISS_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_OPCODE 0x03
+UMASK_TOR_OCCUPANCY_EVICTION 0x04
+UMASK_TOR_OCCUPANCY_ALL 0x08
+UMASK_TOR_OCCUPANCY_MISS_ALL 0x0A
+UMASK_TOR_OCCUPANCY_WB 0x10
+UMASK_TOR_OCCUPANCY_LOCAL_OPCODE 0x21
+UMASK_TOR_OCCUPANCY_MISS_LOCAL_OPCODE 0x23
+UMASK_TOR_OCCUPANCY_LOCAL 0x28
+UMASK_TOR_OCCUPANCY_MISS_LOCAL 0x2A
+OPTIONS_TOR_OCCUPANCY_NID_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_OPCODE 0x41
+OPTIONS_TOR_OCCUPANCY_NID_MISS_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_OPCODE 0x43
+OPTIONS_TOR_OCCUPANCY_NID_EVICTION EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_EVICTION 0x44
+OPTIONS_TOR_OCCUPANCY_NID_ALL EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_ALL 0x48
+OPTIONS_TOR_OCCUPANCY_NID_MISS_ALL EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_ALL 0x4A
+OPTIONS_TOR_OCCUPANCY_NID_WB EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_WB 0x50
+OPTIONS_TOR_OCCUPANCY_REMOTE_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_REMOTE_OPCODE 0x81
+OPTIONS_TOR_OCCUPANCY_MISS_REMOTE_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_REMOTE_OPCODE 0x83
+UMASK_TOR_OCCUPANCY_REMOTE 0x88
+UMASK_TOR_OCCUPANCY_MISS_REMOTE 0x8A
+
+EVENT_MISC 0x39 CBOX
+UMASK_MISC_RSPI_WAS_FSE 0x01
+UMASK_MISC_WC_ALIASING 0x02
+UMASK_MISC_STARTED 0x04
+UMASK_MISC_RFO_HIT_S 0x08
+UMASK_MISC_CVZERO_PREFETCH_VICTIM 0x10
+UMASK_MISC_CVZERO_PREFETCH_MISS 0x20
+
+EVENT_WBOX_CLOCKTICKS 0x00 WBOX
+UMASK_WBOX_CLOCKTICKS 0x00
+
+EVENT_CORE0_TRANSITION_CYCLES 0x60 WBOX
+UMASK_CORE0_TRANSITION_CYCLES 0x00
+
+EVENT_CORE1_TRANSITION_CYCLES 0x61 WBOX
+UMASK_CORE1_TRANSITION_CYCLES 0x00
+
+EVENT_CORE2_TRANSITION_CYCLES 0x62 WBOX
+UMASK_CORE2_TRANSITION_CYCLES 0x00
+
+EVENT_CORE3_TRANSITION_CYCLES 0x63 WBOX
+UMASK_CORE3_TRANSITION_CYCLES 0x00
+
+EVENT_CORE4_TRANSITION_CYCLES 0x64 WBOX
+UMASK_CORE4_TRANSITION_CYCLES 0x00
+
+EVENT_CORE5_TRANSITION_CYCLES 0x65 WBOX
+UMASK_CORE5_TRANSITION_CYCLES 0x00
+
+EVENT_CORE6_TRANSITION_CYCLES 0x66 WBOX
+UMASK_CORE6_TRANSITION_CYCLES 0x00
+
+EVENT_CORE7_TRANSITION_CYCLES 0x67 WBOX
+UMASK_CORE7_TRANSITION_CYCLES 0x00
+
+EVENT_CORE8_TRANSITION_CYCLES 0x68 WBOX
+UMASK_CORE8_TRANSITION_CYCLES 0x00
+
+EVENT_CORE9_TRANSITION_CYCLES 0x69 WBOX
+UMASK_CORE9_TRANSITION_CYCLES 0x00
+
+EVENT_CORE10_TRANSITION_CYCLES 0x6A WBOX
+UMASK_CORE10_TRANSITION_CYCLES 0x00
+
+EVENT_CORE11_TRANSITION_CYCLES 0x6B WBOX
+UMASK_CORE11_TRANSITION_CYCLES 0x00
+
+EVENT_CORE12_TRANSITION_CYCLES 0x6C WBOX
+UMASK_CORE12_TRANSITION_CYCLES 0x00
+
+EVENT_CORE13_TRANSITION_CYCLES 0x6D WBOX
+UMASK_CORE13_TRANSITION_CYCLES 0x00
+
+EVENT_CORE14_TRANSITION_CYCLES 0x6E WBOX
+UMASK_CORE14_TRANSITION_CYCLES 0x00
+
+EVENT_CORE15_TRANSITION_CYCLES 0x6F WBOX
+UMASK_CORE15_TRANSITION_CYCLES 0x00
+
+EVENT_CORE16_TRANSITION_CYCLES 0x70 WBOX
+UMASK_CORE16_TRANSITION_CYCLES 0x00
+
+EVENT_CORE17_TRANSITION_CYCLES 0x71 WBOX
+UMASK_CORE17_TRANSITION_CYCLES 0x00
+
+EVENT_FIVR_PS_PS0_CYCLES 0x75 WBOX
+UMASK_FIVR_PS_PS0_CYCLES 0x00
+
+EVENT_FIVR_PS_PS1_CYCLES 0x75 WBOX
+UMASK_FIVR_PS_PS1_CYCLES 0x00
+
+EVENT_FIVR_PS_PS2_CYCLES 0x75 WBOX
+UMASK_FIVR_PS_PS2_CYCLES 0x00
+
+EVENT_FIVR_PS_PS3_CYCLES 0x75 WBOX
+UMASK_FIVR_PS_PS3_CYCLES 0x00
+
+EVENT_DEMOTIONS_CORE0 0x30 WBOX
+UMASK_DEMOTIONS_CORE0 0x00
+
+EVENT_DEMOTIONS_CORE1 0x31 WBOX
+UMASK_DEMOTIONS_CORE1 0x00
+
+EVENT_DEMOTIONS_CORE2 0x32 WBOX
+UMASK_DEMOTIONS_CORE2 0x00
+
+EVENT_DEMOTIONS_CORE3 0x33 WBOX
+UMASK_DEMOTIONS_CORE3 0x00
+
+EVENT_DEMOTIONS_CORE4 0x34 WBOX
+UMASK_DEMOTIONS_CORE4 0x00
+
+EVENT_DEMOTIONS_CORE5 0x35 WBOX
+UMASK_DEMOTIONS_CORE5 0x00
+
+EVENT_DEMOTIONS_CORE6 0x36 WBOX
+UMASK_DEMOTIONS_CORE6 0x00
+
+EVENT_DEMOTIONS_CORE7 0x37 WBOX
+UMASK_DEMOTIONS_CORE7 0x00
+
+EVENT_DEMOTIONS_CORE8 0x38 WBOX
+UMASK_DEMOTIONS_CORE8 0x00
+
+EVENT_DEMOTIONS_CORE9 0x39 WBOX
+UMASK_DEMOTIONS_CORE9 0x00
+
+EVENT_DEMOTIONS_CORE10 0x3A WBOX
+UMASK_DEMOTIONS_CORE10 0x00
+
+EVENT_DEMOTIONS_CORE11 0x3B WBOX
+UMASK_DEMOTIONS_CORE11 0x00
+
+EVENT_DEMOTIONS_CORE12 0x3C WBOX
+UMASK_DEMOTIONS_CORE12 0x00
+
+EVENT_DEMOTIONS_CORE13 0x3D WBOX
+UMASK_DEMOTIONS_CORE13 0x00
+
+EVENT_DEMOTIONS_CORE14 0x3E WBOX
+UMASK_DEMOTIONS_CORE14 0x00
+
+EVENT_DEMOTIONS_CORE15 0x3F WBOX
+UMASK_DEMOTIONS_CORE15 0x00
+
+EVENT_DEMOTIONS_CORE16 0x40 WBOX
+UMASK_DEMOTIONS_CORE16 0x00
+
+EVENT_DEMOTIONS_CORE17 0x41 WBOX
+UMASK_DEMOTIONS_CORE17 0x00
+
+EVENT_FREQ_BAND0_CYCLES 0x0B WBOX
+OPTIONS_FREQ_BAND0_CYCLES EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND0_CYCLES 0x00
+
+EVENT_FREQ_BAND1_CYCLES 0x0C WBOX
+OPTIONS_FREQ_BAND1_CYCLES EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND1_CYCLES 0x00
+
+EVENT_FREQ_BAND2_CYCLES 0x0D WBOX
+OPTIONS_FREQ_BAND2_CYCLES EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND2_CYCLES 0x00
+
+EVENT_FREQ_BAND3_CYCLES 0x0E WBOX
+OPTIONS_FREQ_BAND3_CYCLES EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND3_CYCLES 0x00
+
+EVENT_FREQ_MAX_LIMIT_THERMAL_CYCLES 0x04 WBOX
+UMASK_FREQ_MAX_LIMIT_THERMAL_CYCLES 0x00
+
+EVENT_FREQ_MAX_OS_CYCLES 0x06 WBOX
+UMASK_FREQ_MAX_OS_CYCLES 0x00
+
+EVENT_FREQ_MAX_POWER_CYCLES 0x05 WBOX
+UMASK_FREQ_MAX_POWER_CYCLES 0x00
+
+EVENT_FREQ_MIN_IO_P_CYCLES 0x73 WBOX
+UMASK_FREQ_MIN_IO_P_CYCLES 0x00
+
+EVENT_FREQ_TRANS_CYCLES 0x74 WBOX
+UMASK_FREQ_TRANS_CYCLES 0x00
+
+EVENT_MEMORY_PHASE_SHEDDING_CYCLES 0x2F WBOX
+UMASK_MEMORY_PHASE_SHEDDING_CYCLES 0x00
+
+EVENT_POWER_STATE_OCCUPANCY 0x80 WBOX
+UMASK_POWER_STATE_OCCUPANCY_CORES_C0 0x40
+UMASK_POWER_STATE_OCCUPANCY_CORES_C3 0x80
+UMASK_POWER_STATE_OCCUPANCY_CORES_C6 0xC0
+
+EVENT_PROCHOT_EXTERNAL_CYCLES 0x0A WBOX
+UMASK_PROCHOT_EXTERNAL_CYCLES 0x00
+
+EVENT_PROCHOT_INTERNAL_CYCLES 0x09 WBOX
+UMASK_PROCHOT_INTERNAL_CYCLES 0x00
+
+EVENT_TOTAL_TRANSITION_CYCLES 0x72 WBOX
+UMASK_TOTAL_TRANSITION_CYCLES 0x00
+
+EVENT_VR_HOT_CYCLES 0x42 WBOX
+UMASK_VR_HOT_CYCLES 0x00
+
+EVENT_UFS_BANDWIDTH_MAX_RANGE 0x7E WBOX
+UMASK_UFS_BANDWIDTH_MAX_RANGE 0x00
+
+EVENT_UFS_TRANSITIONS_DOWN 0x7C WBOX
+UMASK_UFS_TRANSITIONS_DOWN 0x00
+
+EVENT_UFS_TRANSITIONS_IO_P_LIMIT 0x7D WBOX
+UMASK_UFS_TRANSITIONS_IO_P_LIMIT 0x00
+
+EVENT_UFS_TRANSITIONS_NO_CHANGE 0x79 WBOX
+UMASK_UFS_TRANSITIONS_NO_CHANGE 0x00
+
+EVENT_UFS_TRANSITIONS_UP_RING 0x7A WBOX
+UMASK_UFS_TRANSITIONS_UP_RING 0x00
+
+EVENT_UFS_TRANSITIONS_UP_STALL 0x7B WBOX
+UMASK_UFS_TRANSITIONS_UP_STALL 0x00
+
+EVENT_CORES_IN_C3 0x00 WBOX0FIX
+UMASK_CORES_IN_C3 0x00
+
+EVENT_CORES_IN_C6 0x00 WBOX1FIX
+UMASK_CORES_IN_C6 0x00
+
+EVENT_BBOX_CLOCKTICKS 0x00 BBOX
+UMASK_BBOX_CLOCKTICKS 0x00
+
+EVENT_ADDR_OPC_MATCH 0x20 BBOX
+OPTIONS_ADDR_OPC_MATCH_ADDR EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_ADDR 0x01
+OPTIONS_ADDR_OPC_MATCH_OPC EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_OPC 0x02
+OPTIONS_ADDR_OPC_MATCH_FILT EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_FILT 0x03
+OPTIONS_ADDR_OPC_MATCH_AD EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AD 0x04
+OPTIONS_ADDR_OPC_MATCH_BL EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_BL 0x08
+OPTIONS_ADDR_OPC_MATCH_AK EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AK 0x10
+
+EVENT_BT_CYCLES_NE 0x42 BBOX
+UMASK_BT_CYCLES_NE 0x00
+
+EVENT_BT_OCCUPANCY 0x43 BBOX
+UMASK_BT_OCCUPANCY 0x00
+
+EVENT_BYPASS_IMC 0x14 BBOX
+UMASK_BYPASS_IMC_TAKEN 0x01
+UMASK_BYPASS_IMC_NOT_TAKEN 0x02
+
+EVENT_CONFLICT_CYCLES 0x0B BBOX0C1|BBOX1C1
+UMASK_CONFLICT_CYCLES 0x00
+
+EVENT_DIRECT2CORE_COUNT 0x11 BBOX
+UMASK_DIRECT2CORE_COUNT 0x00
+
+EVENT_DIRECT2CORE_CYCLES_DISABLED 0x12 BBOX
+UMASK_DIRECT2CORE_CYCLES_DISABLED 0x00
+
+EVENT_DIRECT2CORE_TXN_OVERRIDE 0x13 BBOX
+UMASK_DIRECT2CORE_TXN_OVERRIDE 0x00
+
+EVENT_DIRECTORY_LAT_OPT 0x41 BBOX
+UMASK_DIRECTORY_LAT_OPT 0x00
+
+EVENT_DIRECTORY_LOOKUP 0x0C BBOX
+UMASK_DIRECTORY_LOOKUP_SNP 0x01
+UMASK_DIRECTORY_LOOKUP_NO_SNP 0x02
+
+EVENT_DIRECTORY_UPDATE 0x0D BBOX
+UMASK_DIRECTORY_UPDATE_SET 0x01
+UMASK_DIRECTORY_UPDATE_CLEAR 0x02
+UMASK_DIRECTORY_UPDATE_ANY 0x03
+
+EVENT_HITME_LOOKUP 0x70 BBOX
+UMASK_HITME_LOOKUP_READ_OR_INVITOE 0x01
+UMASK_HITME_LOOKUP_WBMTOI 0x02
+UMASK_HITME_LOOKUP_ACKCNFLTWBI 0x04
+UMASK_HITME_LOOKUP_WBMTOE_OR_S 0x08
+UMASK_HITME_LOOKUP_HOM 0x0F
+UMASK_HITME_LOOKUP_RSPFWDI_REMOTE 0x10
+UMASK_HITME_LOOKUP_RSPFWDI_LOCAL 0x20
+UMASK_HITME_LOOKUP_INVALS 0x26
+UMASK_HITME_LOOKUP_RSPFWDS 0x40
+UMASK_HITME_LOOKUP_ALLOCS 0x70
+UMASK_HITME_LOOKUP_RSP 0x80
+UMASK_HITME_LOOKUP_ALL 0xFF
+
+EVENT_HITME_HIT 0x71 BBOX
+UMASK_HITME_HIT_READ_OR_INVITOE 0x01
+UMASK_HITME_HIT_WBMTOI 0x02
+UMASK_HITME_HIT_ACKCNFLTWBI 0x04
+UMASK_HITME_HIT_WBMTOE_OR_S 0x08
+UMASK_HITME_HIT_HOM 0x0F
+UMASK_HITME_HIT_RSPFWDI_REMOTE 0x10
+UMASK_HITME_HIT_RSPFWDI_LOCAL 0x20
+UMASK_HITME_HIT_INVALS 0x26
+UMASK_HITME_HIT_RSPFWDS 0x40
+UMASK_HITME_HIT_EVICTS 0x42
+UMASK_HITME_HIT_ALLOCS 0x70
+UMASK_HITME_HIT_RSP 0x80
+UMASK_HITME_HIT_ALL 0xFF
+
+EVENT_HITME_HIT_PV_BITS_SET 0x72 BBOX
+UMASK_HITME_HIT_PV_BITS_SET_READ_OR_INVITOE 0x01
+UMASK_HITME_HIT_PV_BITS_SET_WBMTOI 0x02
+UMASK_HITME_HIT_PV_BITS_SET_ACKCNFLTWBI 0x04
+UMASK_HITME_HIT_PV_BITS_SET_WBMTOE_OR_S 0x08
+UMASK_HITME_HIT_PV_BITS_SET_HOM 0x0F
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDI_REMOTE 0x10
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDI_LOCAL 0x20
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDS 0x40
+UMASK_HITME_HIT_PV_BITS_SET_RSP 0x80
+UMASK_HITME_HIT_PV_BITS_SET_ALL 0xFF
+
+EVENT_IGR_NO_CREDIT_CYCLES 0x22 BBOX
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI0 0x01
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI1 0x02
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI0 0x04
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI1 0x08
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI2 0x10
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI2 0x20
+
+EVENT_IMC_READS 0x17 BBOX
+UMASK_IMC_READS_NORMAL 0x01
+
+EVENT_IMC_RETRY 0x1E BBOX
+UMASK_IMC_RETRY 0x00
+
+EVENT_IMC_WRITES 0x1A BBOX
+UMASK_IMC_WRITES_FULL 0x01
+UMASK_IMC_WRITES_PARTIAL 0x02
+UMASK_IMC_WRITES_FULL_ISOCH 0x04
+UMASK_IMC_WRITES_PARTIAL_ISOCH 0x08
+UMASK_IMC_WRITES_ALL 0x0F
+
+EVENT_OSB 0x53 BBOX
+UMASK_OSB_READS_LOCAL 0x02
+UMASK_OSB_INVITOE_LOCAL 0x04
+UMASK_OSB_REMOTE 0x08
+UMASK_OSB_CANCELLED 0x10
+UMASK_OSB_READS_LOCAL_USEFUL 0x20
+UMASK_OSB_REMOTE_USEFUL 0x40
+
+EVENT_OSB_EDR 0x54 BBOX
+UMASK_OSB_EDR_ALL 0x01
+UMASK_OSB_EDR_READS_LOCAL_I 0x02
+UMASK_OSB_EDR_READS_REMOTE_I 0x04
+UMASK_OSB_EDR_READS_LOCAL_S 0x08
+UMASK_OSB_EDR_READS_REMOTE_S 0x10
+
+EVENT_REQUESTS 0x01 BBOX
+UMASK_REQUESTS_READS_LOCAL 0x01
+UMASK_REQUESTS_READS_REMOTE 0x02
+UMASK_REQUESTS_READS 0x03
+UMASK_REQUESTS_WRITES_LOCAL 0x04
+UMASK_REQUESTS_WRITES_REMOTE 0x08
+UMASK_REQUESTS_WRITES 0x0C
+UMASK_REQUESTS_INVITOE_LOCAL 0x10
+UMASK_REQUESTS_INVITOE_REMOTE 0x20
+
+EVENT_RING_AD_USED 0x3E BBOX
+UMASK_RING_AD_USED_CW_EVEN 0x01
+UMASK_RING_AD_USED_CW_ODD 0x02
+UMASK_RING_AD_USED_CW 0x03
+UMASK_RING_AD_USED_CCW_EVEN 0x04
+UMASK_RING_AD_USED_CCW_ODD 0x08
+UMASK_RING_AD_USED_CCW 0x0C
+
+EVENT_RING_AK_USED 0x3F BBOX
+UMASK_RING_AK_USED_CW_EVEN 0x01
+UMASK_RING_AK_USED_CW_ODD 0x02
+UMASK_RING_AK_USED_CW 0x03
+UMASK_RING_AK_USED_CCW_EVEN 0x04
+UMASK_RING_AK_USED_CCW_ODD 0x08
+UMASK_RING_AK_USED_CCW 0x0C
+
+EVENT_RING_BL_USED 0x40 BBOX
+UMASK_RING_BL_USED_CW_EVEN 0x01
+UMASK_RING_BL_USED_CW_ODD 0x02
+UMASK_RING_BL_USED_CW 0x03
+UMASK_RING_BL_USED_CCW_EVEN 0x04
+UMASK_RING_BL_USED_CCW_ODD 0x08
+UMASK_RING_BL_USED_CCW 0x0C
+
+EVENT_RPQ_CYCLES_NO_REG_CREDITS 0x15 BBOX
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN0 0x01
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1 0x02
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2 0x04
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3 0x08
+
+EVENT_WPQ_CYCLES_NO_REG_CREDITS 0x18 BBOX
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0 0x01
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1 0x02
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2 0x04
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3 0x08
+
+EVENT_SNOOPS_RSP_AFTER_DATA 0x0A BBOX
+UMASK_SNOOPS_RSP_AFTER_DATA_LOCAL 0x01
+UMASK_SNOOPS_RSP_AFTER_DATA_REMOTE 0x02
+
+EVENT_SNOOP_CYCLES_NE 0x08 BBOX
+UMASK_SNOOP_CYCLES_NE_LOCAL 0x01
+UMASK_SNOOP_CYCLES_NE_REMOTE 0x02
+UMASK_SNOOP_CYCLES_NE_ALL 0x03
+
+EVENT_SNOOP_OCCUPANCY 0x09 BBOX
+UMASK_SNOOP_OCCUPANCY_LOCAL 0x01
+UMASK_SNOOP_OCCUPANCY_REMOTE 0x02
+
+EVENT_SNOOP_RESP 0x21 BBOX
+UMASK_SNOOP_RESP_RSPI 0x01
+UMASK_SNOOP_RESP_RSPS 0x02
+UMASK_SNOOP_RESP_RSPIFWD 0x04
+UMASK_SNOOP_RESP_RSPSFWD 0x08
+UMASK_SNOOP_RESP_RSP_WB 0x10
+UMASK_SNOOP_RESP_RSP_FWD_WB 0x20
+UMASK_SNOOP_RESP_RSPCNFLCT 0x40
+
+EVENT_SNP_RESP_RECV_LOCAL 0x60 BBOX
+UMASK_SNP_RESP_RECV_LOCAL_RSPI 0x01
+UMASK_SNP_RESP_RECV_LOCAL_RSPS 0x02
+UMASK_SNP_RESP_RECV_LOCAL_RSPIFWD 0x04
+UMASK_SNP_RESP_RECV_LOCAL_RSPSFWD 0x08
+UMASK_SNP_RESP_RECV_LOCAL_RSPXWB 0x10
+UMASK_SNP_RESP_RECV_LOCAL_RSPXFWDXWB 0x20
+UMASK_SNP_RESP_RECV_LOCAL_RSPCNFLCT 0x40
+UMASK_SNP_RESP_RECV_LOCAL_OTHER 0x80
+
+EVENT_TAD_REQUESTS_G0 0x1B BBOX
+UMASK_TAD_REQUESTS_G0_REGION0 0x01
+UMASK_TAD_REQUESTS_G0_REGION1 0x02
+UMASK_TAD_REQUESTS_G0_REGION2 0x04
+UMASK_TAD_REQUESTS_G0_REGION3 0x08
+UMASK_TAD_REQUESTS_G0_REGION4 0x10
+UMASK_TAD_REQUESTS_G0_REGION5 0x20
+UMASK_TAD_REQUESTS_G0_REGION6 0x40
+UMASK_TAD_REQUESTS_G0_REGION7 0x80
+
+EVENT_TAD_REQUESTS_G1 0x1C BBOX
+UMASK_TAD_REQUESTS_G1_REGION8 0x01
+UMASK_TAD_REQUESTS_G1_REGION9 0x02
+UMASK_TAD_REQUESTS_G1_REGION10 0x04
+UMASK_TAD_REQUESTS_G1_REGION11 0x08
+
+EVENT_TRACKER_CYCLES_FULL 0x02 BBOX
+UMASK_TRACKER_CYCLES_FULL_GP 0x01
+UMASK_TRACKER_CYCLES_FULL_ALL 0x02
+
+EVENT_TRACKER_CYCLES_NE 0x03 BBOX
+UMASK_TRACKER_CYCLES_NE_LOCAL 0x01
+UMASK_TRACKER_CYCLES_NE_REMOTE 0x02
+UMASK_TRACKER_CYCLES_NE_ALL 0x03
+
+EVENT_TRACKER_OCCUPANCY 0x04 BBOX
+UMASK_TRACKER_OCCUPANCY_READS_LOCAL 0x04
+UMASK_TRACKER_OCCUPANCY_READS_REMOTE 0x08
+UMASK_TRACKER_OCCUPANCY_WRITES_LOCAL 0x10
+UMASK_TRACKER_OCCUPANCY_WRITES_REMOTE 0x20
+UMASK_TRACKER_OCCUPANCY_INVITOE_LOCAL 0x40
+UMASK_TRACKER_OCCUPANCY_INVITOE_REMOTE 0x80
+
+EVENT_TRACKER_PENDING_OCCUPANCY 0x05 BBOX
+UMASK_TRACKER_PENDING_OCCUPANCY_LOCAL 0x01
+UMASK_TRACKER_PENDING_OCCUPANCY_REMOTE 0x02
+
+EVENT_TXR_AD_CYCLES_FULL 0x2A BBOX
+UMASK_TXR_AD_CYCLES_FULL_SCHED0 0x01
+UMASK_TXR_AD_CYCLES_FULL_SCHED1 0x02
+UMASK_TXR_AD_CYCLES_FULL_ALL 0x03
+
+EVENT_TXR_AK 0x0E BBOX
+UMASK_TXR_AK 0x00
+
+EVENT_TXR_AK_CYCLES_FULL 0x32 BBOX
+UMASK_TXR_AK_CYCLES_FULL_SCHED0 0x01
+UMASK_TXR_AK_CYCLES_FULL_SCHED1 0x02
+UMASK_TXR_AK_CYCLES_FULL_ALL 0x03
+
+EVENT_TXR_BL 0x10 BBOX
+UMASK_TXR_BL_DRS_CACHE 0x01
+UMASK_TXR_BL_DRS_CORE 0x02
+UMASK_TXR_BL_DRS_QPI 0x04
+
+EVENT_TXR_BL_CYCLES_FULL 0x36 BBOX
+UMASK_TXR_BL_CYCLES_FULL_SCHED0 0x01
+UMASK_TXR_BL_CYCLES_FULL_SCHED1 0x02
+UMASK_TXR_BL_CYCLES_FULL_ALL 0x03
+
+EVENT_TXR_BL_OCCUPANCY 0x34 BBOX
+UMASK_TXR_BL_OCCUPANCY 0x00
+
+EVENT_TXR_STARVED 0x6D BBOX
+UMASK_TXR_STARVED_AK 0x01
+UMASK_TXR_STARVED_BL 0x02
+
+EVENT_DRAM_CLOCKTICKS 0x00 MBOX
+UMASK_DRAM_CLOCKTICKS 0x00
+
+EVENT_ACT_COUNT 0x01 MBOX
+UMASK_ACT_COUNT_RD 0x01
+UMASK_ACT_COUNT_WR 0x02
+UMASK_ACT_COUNT_BYP 0x08
+
+EVENT_BYP_CMDS 0xA1 MBOX
+UMASK_BYP_CMDS_ACT 0x01
+UMASK_BYP_CMDS_CAS 0x02
+UMASK_BYP_CMDS_PRE 0x04
+
+EVENT_CAS_COUNT 0x04 MBOX
+UMASK_CAS_COUNT_RD_REG 0x01
+UMASK_CAS_COUNT_RD_UNDERFILL 0x02
+UMASK_CAS_COUNT_RD 0x03
+UMASK_CAS_COUNT_RD_WMM 0x10
+UMASK_CAS_COUNT_RD_RMM 0x20
+UMASK_CAS_COUNT_WR_WMM 0x04
+UMASK_CAS_COUNT_WR_RMM 0x08
+UMASK_CAS_COUNT_WR 0x0C
+UMASK_CAS_COUNT_ALL 0x0F
+
+EVENT_DRAM_PRE_ALL 0x06 MBOX
+UMASK_DRAM_PRE_ALL 0x00
+
+EVENT_DRAM_REFRESH 0x05 MBOX
+UMASK_DRAM_REFRESH_PANIC 0x02
+UMASK_DRAM_REFRESH_HIGH 0x04
+
+EVENT_ECC_CORRECTABLE_ERRORS 0x09 MBOX
+UMASK_ECC_CORRECTABLE_ERRORS 0x00
+
+EVENT_MAJOR_MODES 0x07 MBOX
+UMASK_MAJOR_MODES_READ 0x01
+UMASK_MAJOR_MODES_WRITE 0x02
+UMASK_MAJOR_MODES_PARTIAL 0x03
+UMASK_MAJOR_MODES_ISOCH 0x04
+
+EVENT_POWER_CHANNEL_DLLOFF 0x84 MBOX
+UMASK_POWER_CHANNEL_DLLOFF 0x00
+
+EVENT_POWER_CHANNEL_PPD 0x85 MBOX
+UMASK_POWER_CHANNEL_PPD 0x00
+
+EVENT_POWER_CKE_CYCLES 0x83 MBOX
+UMASK_POWER_CKE_CYCLES_RANK0 0x01
+UMASK_POWER_CKE_CYCLES_RANK1 0x02
+UMASK_POWER_CKE_CYCLES_RANK2 0x04
+UMASK_POWER_CKE_CYCLES_RANK3 0x08
+UMASK_POWER_CKE_CYCLES_RANK4 0x10
+UMASK_POWER_CKE_CYCLES_RANK5 0x20
+UMASK_POWER_CKE_CYCLES_RANK6 0x40
+UMASK_POWER_CKE_CYCLES_RANK7 0x80
+
+EVENT_POWER_CRITICAL_THROTTLE_CYCLES 0x86 MBOX
+UMASK_POWER_CRITICAL_THROTTLE_CYCLES 0x00
+
+EVENT_POWER_PCU_THROTTLING 0x42 MBOX
+UMASK_POWER_PCU_THROTTLING 0x00
+
+EVENT_POWER_SELF_REFRESH 0x43 MBOX
+UMASK_POWER_SELF_REFRESH 0x00
+
+EVENT_POWER_THROTTLE_CYCLES 0x41 MBOX
+UMASK_POWER_THROTTLE_CYCLES_RANK0 0x01
+UMASK_POWER_THROTTLE_CYCLES_RANK1 0x02
+UMASK_POWER_THROTTLE_CYCLES_RANK2 0x04
+UMASK_POWER_THROTTLE_CYCLES_RANK3 0x08
+UMASK_POWER_THROTTLE_CYCLES_RANK4 0x10
+UMASK_POWER_THROTTLE_CYCLES_RANK5 0x20
+UMASK_POWER_THROTTLE_CYCLES_RANK6 0x40
+UMASK_POWER_THROTTLE_CYCLES_RANK7 0x80
+
+EVENT_PREEMPTION 0x08 MBOX
+UMASK_PREEMPTION_RD_PREEMPT_RD 0x01
+UMASK_PREEMPTION_RD_PREEMPT_WR 0x02
+
+EVENT_PRE_COUNT 0x02 MBOX
+UMASK_PRE_COUNT_PAGE_MISS 0x01
+UMASK_PRE_COUNT_PAGE_CLOSE 0x02
+UMASK_PRE_COUNT_RD 0x04
+UMASK_PRE_COUNT_WR 0x08
+UMASK_PRE_COUNT_BYP 0x10
+
+EVENT_RD_CAS_PRIO 0xA0 MBOX
+UMASK_RD_CAS_PRIO_LOW 0x01
+UMASK_RD_CAS_PRIO_MED 0x02
+UMASK_RD_CAS_PRIO_HIGH 0x04
+UMASK_RD_CAS_PRIO_PANIC 0x08
+
+EVENT_RD_CAS_RANK0 0xB0 MBOX
+UMASK_RD_CAS_RANK0_BANK0 0x00
+UMASK_RD_CAS_RANK0_BANK1 0x01
+UMASK_RD_CAS_RANK0_BANK2 0x02
+UMASK_RD_CAS_RANK0_BANK3 0x03
+UMASK_RD_CAS_RANK0_BANK4 0x04
+UMASK_RD_CAS_RANK0_BANK5 0x05
+UMASK_RD_CAS_RANK0_BANK6 0x06
+UMASK_RD_CAS_RANK0_BANK7 0x07
+UMASK_RD_CAS_RANK0_BANK8 0x08
+UMASK_RD_CAS_RANK0_BANK9 0x09
+UMASK_RD_CAS_RANK0_BANK10 0x0A
+UMASK_RD_CAS_RANK0_BANK11 0x0B
+UMASK_RD_CAS_RANK0_BANK12 0x0C
+UMASK_RD_CAS_RANK0_BANK13 0x0D
+UMASK_RD_CAS_RANK0_BANK14 0x0E
+UMASK_RD_CAS_RANK0_BANK15 0x0F
+UMASK_RD_CAS_RANK0_ALLBANKS 0x10
+UMASK_RD_CAS_RANK0_BANKG0 0x11
+UMASK_RD_CAS_RANK0_BANKG1 0x12
+UMASK_RD_CAS_RANK0_BANKG2 0x13
+UMASK_RD_CAS_RANK0_BANKG3 0x14
+
+EVENT_RD_CAS_RANK1 0xB1 MBOX
+UMASK_RD_CAS_RANK1_BANK0 0x00
+UMASK_RD_CAS_RANK1_BANK1 0x01
+UMASK_RD_CAS_RANK1_BANK2 0x02
+UMASK_RD_CAS_RANK1_BANK3 0x03
+UMASK_RD_CAS_RANK1_BANK4 0x04
+UMASK_RD_CAS_RANK1_BANK5 0x05
+UMASK_RD_CAS_RANK1_BANK6 0x06
+UMASK_RD_CAS_RANK1_BANK7 0x07
+UMASK_RD_CAS_RANK1_BANK8 0x08
+UMASK_RD_CAS_RANK1_BANK9 0x09
+UMASK_RD_CAS_RANK1_BANK10 0x0A
+UMASK_RD_CAS_RANK1_BANK11 0x0B
+UMASK_RD_CAS_RANK1_BANK12 0x0C
+UMASK_RD_CAS_RANK1_BANK13 0x0D
+UMASK_RD_CAS_RANK1_BANK14 0x0E
+UMASK_RD_CAS_RANK1_BANK15 0x0F
+UMASK_RD_CAS_RANK1_ALLBANKS 0x10
+UMASK_RD_CAS_RANK1_BANKG0 0x11
+UMASK_RD_CAS_RANK1_BANKG1 0x12
+UMASK_RD_CAS_RANK1_BANKG2 0x13
+UMASK_RD_CAS_RANK1_BANKG3 0x14
+
+EVENT_RD_CAS_RANK2 0xB2 MBOX
+UMASK_RD_CAS_RANK2_BANK0 0x00
+UMASK_RD_CAS_RANK2_BANK1 0x01
+UMASK_RD_CAS_RANK2_BANK2 0x02
+UMASK_RD_CAS_RANK2_BANK3 0x03
+UMASK_RD_CAS_RANK2_BANK4 0x04
+UMASK_RD_CAS_RANK2_BANK5 0x05
+UMASK_RD_CAS_RANK2_BANK6 0x06
+UMASK_RD_CAS_RANK2_BANK7 0x07
+UMASK_RD_CAS_RANK2_BANK8 0x08
+UMASK_RD_CAS_RANK2_BANK9 0x09
+UMASK_RD_CAS_RANK2_BANK10 0x0A
+UMASK_RD_CAS_RANK2_BANK11 0x0B
+UMASK_RD_CAS_RANK2_BANK12 0x0C
+UMASK_RD_CAS_RANK2_BANK13 0x0D
+UMASK_RD_CAS_RANK2_BANK14 0x0E
+UMASK_RD_CAS_RANK2_BANK15 0x0F
+UMASK_RD_CAS_RANK2_ALLBANKS 0x10
+UMASK_RD_CAS_RANK2_BANKG0 0x11
+UMASK_RD_CAS_RANK2_BANKG1 0x12
+UMASK_RD_CAS_RANK2_BANKG2 0x13
+UMASK_RD_CAS_RANK2_BANKG3 0x14
+
+EVENT_RD_CAS_RANK3 0xB3 MBOX
+UMASK_RD_CAS_RANK3_BANK0 0x00
+UMASK_RD_CAS_RANK3_BANK1 0x01
+UMASK_RD_CAS_RANK3_BANK2 0x02
+UMASK_RD_CAS_RANK3_BANK3 0x03
+UMASK_RD_CAS_RANK3_BANK4 0x04
+UMASK_RD_CAS_RANK3_BANK5 0x05
+UMASK_RD_CAS_RANK3_BANK6 0x06
+UMASK_RD_CAS_RANK3_BANK7 0x07
+UMASK_RD_CAS_RANK3_BANK8 0x08
+UMASK_RD_CAS_RANK3_BANK9 0x09
+UMASK_RD_CAS_RANK3_BANK10 0x0A
+UMASK_RD_CAS_RANK3_BANK11 0x0B
+UMASK_RD_CAS_RANK3_BANK12 0x0C
+UMASK_RD_CAS_RANK3_BANK13 0x0D
+UMASK_RD_CAS_RANK3_BANK14 0x0E
+UMASK_RD_CAS_RANK3_BANK15 0x0F
+UMASK_RD_CAS_RANK3_ALLBANKS 0x10
+UMASK_RD_CAS_RANK3_BANKG0 0x11
+UMASK_RD_CAS_RANK3_BANKG1 0x12
+UMASK_RD_CAS_RANK3_BANKG2 0x13
+UMASK_RD_CAS_RANK3_BANKG3 0x14
+
+EVENT_RD_CAS_RANK4 0xB4 MBOX
+UMASK_RD_CAS_RANK4_BANK0 0x00
+UMASK_RD_CAS_RANK4_BANK1 0x01
+UMASK_RD_CAS_RANK4_BANK2 0x02
+UMASK_RD_CAS_RANK4_BANK3 0x03
+UMASK_RD_CAS_RANK4_BANK4 0x04
+UMASK_RD_CAS_RANK4_BANK5 0x05
+UMASK_RD_CAS_RANK4_BANK6 0x06
+UMASK_RD_CAS_RANK4_BANK7 0x07
+UMASK_RD_CAS_RANK4_BANK8 0x08
+UMASK_RD_CAS_RANK4_BANK9 0x09
+UMASK_RD_CAS_RANK4_BANK10 0x0A
+UMASK_RD_CAS_RANK4_BANK11 0x0B
+UMASK_RD_CAS_RANK4_BANK12 0x0C
+UMASK_RD_CAS_RANK4_BANK13 0x0D
+UMASK_RD_CAS_RANK4_BANK14 0x0E
+UMASK_RD_CAS_RANK4_BANK15 0x0F
+UMASK_RD_CAS_RANK4_ALLBANKS 0x10
+UMASK_RD_CAS_RANK4_BANKG0 0x11
+UMASK_RD_CAS_RANK4_BANKG1 0x12
+UMASK_RD_CAS_RANK4_BANKG2 0x13
+UMASK_RD_CAS_RANK4_BANKG3 0x14
+
+EVENT_RD_CAS_RANK5 0xB5 MBOX
+UMASK_RD_CAS_RANK5_BANK0 0x00
+UMASK_RD_CAS_RANK5_BANK1 0x01
+UMASK_RD_CAS_RANK5_BANK2 0x02
+UMASK_RD_CAS_RANK5_BANK3 0x03
+UMASK_RD_CAS_RANK5_BANK4 0x04
+UMASK_RD_CAS_RANK5_BANK5 0x05
+UMASK_RD_CAS_RANK5_BANK6 0x06
+UMASK_RD_CAS_RANK5_BANK7 0x07
+UMASK_RD_CAS_RANK5_BANK8 0x08
+UMASK_RD_CAS_RANK5_BANK9 0x09
+UMASK_RD_CAS_RANK5_BANK10 0x0A
+UMASK_RD_CAS_RANK5_BANK11 0x0B
+UMASK_RD_CAS_RANK5_BANK12 0x0C
+UMASK_RD_CAS_RANK5_BANK13 0x0D
+UMASK_RD_CAS_RANK5_BANK14 0x0E
+UMASK_RD_CAS_RANK5_BANK15 0x0F
+UMASK_RD_CAS_RANK5_ALLBANKS 0x10
+UMASK_RD_CAS_RANK5_BANKG0 0x11
+UMASK_RD_CAS_RANK5_BANKG1 0x12
+UMASK_RD_CAS_RANK5_BANKG2 0x13
+UMASK_RD_CAS_RANK5_BANKG3 0x14
+
+EVENT_RD_CAS_RANK6 0xB6 MBOX
+UMASK_RD_CAS_RANK6_BANK0 0x00
+UMASK_RD_CAS_RANK6_BANK1 0x01
+UMASK_RD_CAS_RANK6_BANK2 0x02
+UMASK_RD_CAS_RANK6_BANK3 0x03
+UMASK_RD_CAS_RANK6_BANK4 0x04
+UMASK_RD_CAS_RANK6_BANK5 0x05
+UMASK_RD_CAS_RANK6_BANK6 0x06
+UMASK_RD_CAS_RANK6_BANK7 0x07
+UMASK_RD_CAS_RANK6_BANK8 0x08
+UMASK_RD_CAS_RANK6_BANK9 0x09
+UMASK_RD_CAS_RANK6_BANK10 0x0A
+UMASK_RD_CAS_RANK6_BANK11 0x0B
+UMASK_RD_CAS_RANK6_BANK12 0x0C
+UMASK_RD_CAS_RANK6_BANK13 0x0D
+UMASK_RD_CAS_RANK6_BANK14 0x0E
+UMASK_RD_CAS_RANK6_BANK15 0x0F
+UMASK_RD_CAS_RANK6_ALLBANKS 0x10
+UMASK_RD_CAS_RANK6_BANKG0 0x11
+UMASK_RD_CAS_RANK6_BANKG1 0x12
+UMASK_RD_CAS_RANK6_BANKG2 0x13
+UMASK_RD_CAS_RANK6_BANKG3 0x14
+
+EVENT_RD_CAS_RANK7 0xB7 MBOX
+UMASK_RD_CAS_RANK7_BANK0 0x00
+UMASK_RD_CAS_RANK7_BANK1 0x01
+UMASK_RD_CAS_RANK7_BANK2 0x02
+UMASK_RD_CAS_RANK7_BANK3 0x03
+UMASK_RD_CAS_RANK7_BANK4 0x04
+UMASK_RD_CAS_RANK7_BANK5 0x05
+UMASK_RD_CAS_RANK7_BANK6 0x06
+UMASK_RD_CAS_RANK7_BANK7 0x07
+UMASK_RD_CAS_RANK7_BANK8 0x08
+UMASK_RD_CAS_RANK7_BANK9 0x09
+UMASK_RD_CAS_RANK7_BANK10 0x0A
+UMASK_RD_CAS_RANK7_BANK11 0x0B
+UMASK_RD_CAS_RANK7_BANK12 0x0C
+UMASK_RD_CAS_RANK7_BANK13 0x0D
+UMASK_RD_CAS_RANK7_BANK14 0x0E
+UMASK_RD_CAS_RANK7_BANK15 0x0F
+UMASK_RD_CAS_RANK7_ALLBANKS 0x10
+UMASK_RD_CAS_RANK7_BANKG0 0x11
+UMASK_RD_CAS_RANK7_BANKG1 0x12
+UMASK_RD_CAS_RANK7_BANKG2 0x13
+UMASK_RD_CAS_RANK7_BANKG3 0x14
+
+EVENT_RPQ_CYCLES_NE 0x11 MBOX
+UMASK_RPQ_CYCLES_NE 0x00
+
+EVENT_RPQ_INSERTS 0x10 MBOX
+UMASK_RPQ_INSERTS 0x00
+
+EVENT_VMSE_MXB_WR_OCCUPANCY 0x91 MBOX
+UMASK_VMSE_MXB_WR_OCCUPANCY 0x00
+
+EVENT_VMSE_WR_PUSH 0x90 MBOX
+UMASK_VMSE_WR_PUSH_WMM 0x01
+UMASK_VMSE_WR_PUSH_RMM 0x02
+
+EVENT_WMM_TO_RMM 0xC0 MBOX
+UMASK_WMM_TO_RMM_LOW_THRESH 0x01
+UMASK_WMM_TO_RMM_STARVE 0x02
+UMASK_WMM_TO_RMM_VMSE_RETRY 0x04
+
+# Undocumented event: mentioned in the metrics table but not defined in the event documentation
+EVENT_WPQ_INSERTS 0x20 MBOX
+UMASK_WPQ_INSERTS 0x00
+
+EVENT_WPQ_CYCLES_FULL 0x22 MBOX
+UMASK_WPQ_CYCLES_FULL 0x00
+
+EVENT_WPQ_CYCLES_NE 0x21 MBOX
+UMASK_WPQ_CYCLES_NE 0x00
+
+EVENT_WPQ_READ_HIT 0x23 MBOX
+UMASK_WPQ_READ_HIT 0x00
+
+EVENT_WPQ_WRITE_HIT 0x24 MBOX
+UMASK_WPQ_WRITE_HIT 0x00
+
+EVENT_WR_CAS_RANK0 0xB8 MBOX
+UMASK_WR_CAS_RANK0_BANK0 0x00
+UMASK_WR_CAS_RANK0_BANK1 0x01
+UMASK_WR_CAS_RANK0_BANK2 0x02
+UMASK_WR_CAS_RANK0_BANK3 0x03
+UMASK_WR_CAS_RANK0_BANK4 0x04
+UMASK_WR_CAS_RANK0_BANK5 0x05
+UMASK_WR_CAS_RANK0_BANK6 0x06
+UMASK_WR_CAS_RANK0_BANK7 0x07
+UMASK_WR_CAS_RANK0_BANK8 0x08
+UMASK_WR_CAS_RANK0_BANK9 0x09
+UMASK_WR_CAS_RANK0_BANK10 0x0A
+UMASK_WR_CAS_RANK0_BANK11 0x0B
+UMASK_WR_CAS_RANK0_BANK12 0x0C
+UMASK_WR_CAS_RANK0_BANK13 0x0D
+UMASK_WR_CAS_RANK0_BANK14 0x0E
+UMASK_WR_CAS_RANK0_BANK15 0x0F
+UMASK_WR_CAS_RANK0_ALLBANKS 0x10
+UMASK_WR_CAS_RANK0_BANKG0 0x11
+UMASK_WR_CAS_RANK0_BANKG1 0x12
+UMASK_WR_CAS_RANK0_BANKG2 0x13
+UMASK_WR_CAS_RANK0_BANKG3 0x14
+
+EVENT_WR_CAS_RANK1 0xB9 MBOX
+UMASK_WR_CAS_RANK1_BANK0 0x00
+UMASK_WR_CAS_RANK1_BANK1 0x01
+UMASK_WR_CAS_RANK1_BANK2 0x02
+UMASK_WR_CAS_RANK1_BANK3 0x03
+UMASK_WR_CAS_RANK1_BANK4 0x04
+UMASK_WR_CAS_RANK1_BANK5 0x05
+UMASK_WR_CAS_RANK1_BANK6 0x06
+UMASK_WR_CAS_RANK1_BANK7 0x07
+UMASK_WR_CAS_RANK1_BANK8 0x08
+UMASK_WR_CAS_RANK1_BANK9 0x09
+UMASK_WR_CAS_RANK1_BANK10 0x0A
+UMASK_WR_CAS_RANK1_BANK11 0x0B
+UMASK_WR_CAS_RANK1_BANK12 0x0C
+UMASK_WR_CAS_RANK1_BANK13 0x0D
+UMASK_WR_CAS_RANK1_BANK14 0x0E
+UMASK_WR_CAS_RANK1_BANK15 0x0F
+UMASK_WR_CAS_RANK1_ALLBANKS 0x10
+UMASK_WR_CAS_RANK1_BANKG0 0x11
+UMASK_WR_CAS_RANK1_BANKG1 0x12
+UMASK_WR_CAS_RANK1_BANKG2 0x13
+UMASK_WR_CAS_RANK1_BANKG3 0x14
+
+EVENT_WR_CAS_RANK2 0xBA MBOX
+UMASK_WR_CAS_RANK2_BANK0 0x00
+UMASK_WR_CAS_RANK2_BANK1 0x01
+UMASK_WR_CAS_RANK2_BANK2 0x02
+UMASK_WR_CAS_RANK2_BANK3 0x03
+UMASK_WR_CAS_RANK2_BANK4 0x04
+UMASK_WR_CAS_RANK2_BANK5 0x05
+UMASK_WR_CAS_RANK2_BANK6 0x06
+UMASK_WR_CAS_RANK2_BANK7 0x07
+UMASK_WR_CAS_RANK2_BANK8 0x08
+UMASK_WR_CAS_RANK2_BANK9 0x09
+UMASK_WR_CAS_RANK2_BANK10 0x0A
+UMASK_WR_CAS_RANK2_BANK11 0x0B
+UMASK_WR_CAS_RANK2_BANK12 0x0C
+UMASK_WR_CAS_RANK2_BANK13 0x0D
+UMASK_WR_CAS_RANK2_BANK14 0x0E
+UMASK_WR_CAS_RANK2_BANK15 0x0F
+UMASK_WR_CAS_RANK2_ALLBANKS 0x10
+UMASK_WR_CAS_RANK2_BANKG0 0x11
+UMASK_WR_CAS_RANK2_BANKG1 0x12
+UMASK_WR_CAS_RANK2_BANKG2 0x13
+UMASK_WR_CAS_RANK2_BANKG3 0x14
+
+EVENT_WR_CAS_RANK3 0xBB MBOX
+UMASK_WR_CAS_RANK3_BANK0 0x00
+UMASK_WR_CAS_RANK3_BANK1 0x01
+UMASK_WR_CAS_RANK3_BANK2 0x02
+UMASK_WR_CAS_RANK3_BANK3 0x03
+UMASK_WR_CAS_RANK3_BANK4 0x04
+UMASK_WR_CAS_RANK3_BANK5 0x05
+UMASK_WR_CAS_RANK3_BANK6 0x06
+UMASK_WR_CAS_RANK3_BANK7 0x07
+UMASK_WR_CAS_RANK3_BANK8 0x08
+UMASK_WR_CAS_RANK3_BANK9 0x09
+UMASK_WR_CAS_RANK3_BANK10 0x0A
+UMASK_WR_CAS_RANK3_BANK11 0x0B
+UMASK_WR_CAS_RANK3_BANK12 0x0C
+UMASK_WR_CAS_RANK3_BANK13 0x0D
+UMASK_WR_CAS_RANK3_BANK14 0x0E
+UMASK_WR_CAS_RANK3_BANK15 0x0F
+UMASK_WR_CAS_RANK3_ALLBANKS 0x10
+UMASK_WR_CAS_RANK3_BANKG0 0x11
+UMASK_WR_CAS_RANK3_BANKG1 0x12
+UMASK_WR_CAS_RANK3_BANKG2 0x13
+UMASK_WR_CAS_RANK3_BANKG3 0x14
+
+EVENT_WR_CAS_RANK4 0xBC MBOX
+UMASK_WR_CAS_RANK4_BANK0 0x00
+UMASK_WR_CAS_RANK4_BANK1 0x01
+UMASK_WR_CAS_RANK4_BANK2 0x02
+UMASK_WR_CAS_RANK4_BANK3 0x03
+UMASK_WR_CAS_RANK4_BANK4 0x04
+UMASK_WR_CAS_RANK4_BANK5 0x05
+UMASK_WR_CAS_RANK4_BANK6 0x06
+UMASK_WR_CAS_RANK4_BANK7 0x07
+UMASK_WR_CAS_RANK4_BANK8 0x08
+UMASK_WR_CAS_RANK4_BANK9 0x09
+UMASK_WR_CAS_RANK4_BANK10 0x0A
+UMASK_WR_CAS_RANK4_BANK11 0x0B
+UMASK_WR_CAS_RANK4_BANK12 0x0C
+UMASK_WR_CAS_RANK4_BANK13 0x0D
+UMASK_WR_CAS_RANK4_BANK14 0x0E
+UMASK_WR_CAS_RANK4_BANK15 0x0F
+UMASK_WR_CAS_RANK4_ALLBANKS 0x10
+UMASK_WR_CAS_RANK4_BANKG0 0x11
+UMASK_WR_CAS_RANK4_BANKG1 0x12
+UMASK_WR_CAS_RANK4_BANKG2 0x13
+UMASK_WR_CAS_RANK4_BANKG3 0x14
+
+EVENT_WR_CAS_RANK5 0xBD MBOX
+UMASK_WR_CAS_RANK5_BANK0 0x00
+UMASK_WR_CAS_RANK5_BANK1 0x01
+UMASK_WR_CAS_RANK5_BANK2 0x02
+UMASK_WR_CAS_RANK5_BANK3 0x03
+UMASK_WR_CAS_RANK5_BANK4 0x04
+UMASK_WR_CAS_RANK5_BANK5 0x05
+UMASK_WR_CAS_RANK5_BANK6 0x06
+UMASK_WR_CAS_RANK5_BANK7 0x07
+UMASK_WR_CAS_RANK5_BANK8 0x08
+UMASK_WR_CAS_RANK5_BANK9 0x09
+UMASK_WR_CAS_RANK5_BANK10 0x0A
+UMASK_WR_CAS_RANK5_BANK11 0x0B
+UMASK_WR_CAS_RANK5_BANK12 0x0C
+UMASK_WR_CAS_RANK5_BANK13 0x0D
+UMASK_WR_CAS_RANK5_BANK14 0x0E
+UMASK_WR_CAS_RANK5_BANK15 0x0F
+UMASK_WR_CAS_RANK5_ALLBANKS 0x10
+UMASK_WR_CAS_RANK5_BANKG0 0x11
+UMASK_WR_CAS_RANK5_BANKG1 0x12
+UMASK_WR_CAS_RANK5_BANKG2 0x13
+UMASK_WR_CAS_RANK5_BANKG3 0x14
+
+EVENT_WR_CAS_RANK6 0xBE MBOX
+UMASK_WR_CAS_RANK6_BANK0 0x00
+UMASK_WR_CAS_RANK6_BANK1 0x01
+UMASK_WR_CAS_RANK6_BANK2 0x02
+UMASK_WR_CAS_RANK6_BANK3 0x03
+UMASK_WR_CAS_RANK6_BANK4 0x04
+UMASK_WR_CAS_RANK6_BANK5 0x05
+UMASK_WR_CAS_RANK6_BANK6 0x06
+UMASK_WR_CAS_RANK6_BANK7 0x07
+UMASK_WR_CAS_RANK6_BANK8 0x08
+UMASK_WR_CAS_RANK6_BANK9 0x09
+UMASK_WR_CAS_RANK6_BANK10 0x0A
+UMASK_WR_CAS_RANK6_BANK11 0x0B
+UMASK_WR_CAS_RANK6_BANK12 0x0C
+UMASK_WR_CAS_RANK6_BANK13 0x0D
+UMASK_WR_CAS_RANK6_BANK14 0x0E
+UMASK_WR_CAS_RANK6_BANK15 0x0F
+UMASK_WR_CAS_RANK6_ALLBANKS 0x10
+UMASK_WR_CAS_RANK6_BANKG0 0x11
+UMASK_WR_CAS_RANK6_BANKG1 0x12
+UMASK_WR_CAS_RANK6_BANKG2 0x13
+UMASK_WR_CAS_RANK6_BANKG3 0x14
+
+EVENT_WR_CAS_RANK7 0xBF MBOX
+UMASK_WR_CAS_RANK7_BANK0 0x00
+UMASK_WR_CAS_RANK7_BANK1 0x01
+UMASK_WR_CAS_RANK7_BANK2 0x02
+UMASK_WR_CAS_RANK7_BANK3 0x03
+UMASK_WR_CAS_RANK7_BANK4 0x04
+UMASK_WR_CAS_RANK7_BANK5 0x05
+UMASK_WR_CAS_RANK7_BANK6 0x06
+UMASK_WR_CAS_RANK7_BANK7 0x07
+UMASK_WR_CAS_RANK7_BANK8 0x08
+UMASK_WR_CAS_RANK7_BANK9 0x09
+UMASK_WR_CAS_RANK7_BANK10 0x0A
+UMASK_WR_CAS_RANK7_BANK11 0x0B
+UMASK_WR_CAS_RANK7_BANK12 0x0C
+UMASK_WR_CAS_RANK7_BANK13 0x0D
+UMASK_WR_CAS_RANK7_BANK14 0x0E
+UMASK_WR_CAS_RANK7_BANK15 0x0F
+UMASK_WR_CAS_RANK7_ALLBANKS 0x10
+UMASK_WR_CAS_RANK7_BANKG0 0x11
+UMASK_WR_CAS_RANK7_BANKG1 0x12
+UMASK_WR_CAS_RANK7_BANKG2 0x13
+UMASK_WR_CAS_RANK7_BANKG3 0x14
+
+EVENT_PBOX_CLOCKTICKS 0x01 PBOX
+UMASK_PBOX_CLOCKTICKS 0x00
+
+EVENT_IIO_CREDIT 0x2D PBOX0|PBOX1
+UMASK_IIO_CREDIT_PRQ_QPI0 0x01
+UMASK_IIO_CREDIT_PRQ_QPI1 0x02
+UMASK_IIO_CREDIT_ISOCH_QPI0 0x04
+UMASK_IIO_CREDIT_ISOCH_QPI1 0x08
+
+EVENT_RING_AD_USED 0x07 PBOX
+UMASK_RING_AD_USED_CW_EVEN 0x01
+UMASK_RING_AD_USED_CW_ODD 0x02
+UMASK_RING_AD_USED_CW 0x03
+UMASK_RING_AD_USED_CCW_EVEN 0x04
+UMASK_RING_AD_USED_CCW_ODD 0x08
+UMASK_RING_AD_USED_CCW 0x0C
+
+EVENT_RING_AK_BOUNCES 0x12 PBOX
+UMASK_RING_AK_BOUNCES_UP 0x01
+UMASK_RING_AK_BOUNCES_DN 0x02
+
+EVENT_RING_AK_USED 0x08 PBOX
+UMASK_RING_AK_USED_CW_EVEN 0x01
+UMASK_RING_AK_USED_CW_ODD 0x02
+UMASK_RING_AK_USED_CW 0x03
+UMASK_RING_AK_USED_CCW_EVEN 0x04
+UMASK_RING_AK_USED_CCW_ODD 0x08
+UMASK_RING_AK_USED_CCW 0x0C
+
+EVENT_RING_BL_USED 0x09 PBOX
+UMASK_RING_BL_USED_CW_EVEN 0x01
+UMASK_RING_BL_USED_CW_ODD 0x02
+UMASK_RING_BL_USED_CW 0x03
+UMASK_RING_BL_USED_CCW_EVEN 0x04
+UMASK_RING_BL_USED_CCW_ODD 0x08
+UMASK_RING_BL_USED_CCW 0x0C
+
+EVENT_RING_IV_USED 0x0A PBOX
+UMASK_RING_IV_USED_CW 0x03
+UMASK_RING_IV_USED_CCW 0x0C
+UMASK_RING_IV_USED_ANY 0x0F
+
+EVENT_RXR_CYCLES_NE 0x10 PBOX0|PBOX1
+UMASK_RXR_CYCLES_NE_NCB 0x10
+UMASK_RXR_CYCLES_NE_NCS 0x20
+
+EVENT_RXR_INSERTS 0x11 PBOX0|PBOX1
+UMASK_RXR_INSERTS_NCB 0x10
+UMASK_RXR_INSERTS_NCS 0x20
+
+EVENT_RXR_OCCUPANCY 0x13 PBOX0
+UMASK_RXR_OCCUPANCY_DRS 0x08
+
+EVENT_TXR_CYCLES_FULL 0x25 PBOX0
+UMASK_TXR_CYCLES_FULL_AD 0x01
+UMASK_TXR_CYCLES_FULL_AK 0x02
+UMASK_TXR_CYCLES_FULL_BL 0x04
+
+EVENT_TXR_CYCLES_NE 0x23 PBOX0
+UMASK_TXR_CYCLES_NE_AD 0x01
+UMASK_TXR_CYCLES_NE_AK 0x02
+UMASK_TXR_CYCLES_NE_BL 0x04
+
+EVENT_TXR_NACK_CW 0x26 PBOX0|PBOX1
+UMASK_TXR_NACK_CW_DN_AD 0x01
+UMASK_TXR_NACK_CW_DN_BL 0x02
+UMASK_TXR_NACK_CW_DN_AK 0x04
+UMASK_TXR_NACK_CW_UP_AD 0x08
+UMASK_TXR_NACK_CW_UP_BL 0x10
+UMASK_TXR_NACK_CW_UP_AK 0x20
+
+EVENT_IBOX_CLOCKTICKS 0x00 IBOX
+UMASK_IBOX_CLOCKTICKS 0x00
+
+EVENT_CACHE_TOTAL_OCCUPANCY 0x12 IBOX
+UMASK_CACHE_TOTAL_OCCUPANCY_ANY 0x01
+UMASK_CACHE_TOTAL_OCCUPANCY_SOURCE 0x02
+
+EVENT_COHERENT_OPS 0x13 IBOX
+UMASK_COHERENT_OPS_PCIRDCUR 0x01
+UMASK_COHERENT_OPS_CRD 0x02
+UMASK_COHERENT_OPS_DRD 0x04
+UMASK_COHERENT_OPS_RFO 0x08
+UMASK_COHERENT_OPS_PCITOM 0x10
+UMASK_COHERENT_OPS_PCIDCAHINT 0x20
+UMASK_COHERENT_OPS_WBMTOI 0x40
+UMASK_COHERENT_OPS_CLFLUSH 0x80
+
+EVENT_MISC0 0x14 IBOX
+UMASK_MISC0_FAST_REQ 0x01
+UMASK_MISC0_FAST_REJ 0x02
+UMASK_MISC0_2ND_RD_INSERT 0x04
+UMASK_MISC0_2ND_WR_INSERT 0x08
+UMASK_MISC0_2ND_ATOMIC_INSERT 0x10
+UMASK_MISC0_INSERTS 0x1C
+UMASK_MISC0_FAST_XFER 0x20
+UMASK_MISC0_PF_ACK_HINT 0x40
+UMASK_MISC0_PF_TIMEOUT 0x80
+
+EVENT_MISC1 0x15 IBOX
+UMASK_MISC1_SLOW_I 0x01
+UMASK_MISC1_SLOW_S 0x02
+UMASK_MISC1_SLOW_E 0x04
+UMASK_MISC1_SLOW_M 0x08
+UMASK_MISC1_SLOW 0x0F
+UMASK_MISC1_LOST_FWD 0x10
+UMASK_MISC1_SEC_RCVD_INVLD 0x20
+UMASK_MISC1_SEC_RCVD_VLD 0x40
+UMASK_MISC1_DATA_THROTTLE 0x80
+
+EVENT_SNOOP_RESP 0x17 IBOX
+UMASK_SNOOP_RESP_MISS 0x01
+UMASK_SNOOP_RESP_HIT_I 0x02
+UMASK_SNOOP_RESP_HIT_ES 0x04
+UMASK_SNOOP_RESP_HIT_M 0x08
+UMASK_SNOOP_RESP_HIT 0x0E
+UMASK_SNOOP_RESP_SNPCODE 0x10
+UMASK_SNOOP_RESP_SNPDATA 0x20
+UMASK_SNOOP_RESP_SNPINV 0x40
+
+EVENT_TRANSACTIONS 0x16 IBOX
+UMASK_TRANSACTIONS_READS 0x01
+UMASK_TRANSACTIONS_WRITES 0x02
+UMASK_TRANSACTIONS_RD_PREF 0x04
+UMASK_TRANSACTIONS_WR_PREF 0x08
+UMASK_TRANSACTIONS_ALL_READS 0x05
+UMASK_TRANSACTIONS_ALL_WRITES 0x0A
+UMASK_TRANSACTIONS_ATOMIC 0x10
+UMASK_TRANSACTIONS_OTHER 0x20
+UMASK_TRANSACTIONS_ORDERINGQ 0x40
+
+EVENT_RXR_AK_INSERTS 0x0A IBOX
+UMASK_RXR_AK_INSERTS 0x00
+
+EVENT_RXR_BL_DRS_CYCLES_FULL 0x04 IBOX
+UMASK_RXR_BL_DRS_CYCLES_FULL 0x00
+
+EVENT_RXR_BL_DRS_INSERTS 0x01 IBOX
+UMASK_RXR_BL_DRS_INSERTS 0x00
+
+EVENT_RXR_BL_DRS_OCCUPANCY 0x07 IBOX
+UMASK_RXR_BL_DRS_OCCUPANCY 0x00
+
+EVENT_RXR_BL_NCB_CYCLES_FULL 0x05 IBOX
+UMASK_RXR_BL_NCB_CYCLES_FULL 0x00
+
+EVENT_RXR_BL_NCB_INSERTS 0x02 IBOX
+UMASK_RXR_BL_NCB_INSERTS 0x00
+
+EVENT_RXR_BL_NCB_OCCUPANCY 0x08 IBOX
+UMASK_RXR_BL_NCB_OCCUPANCY 0x00
+
+EVENT_RXR_BL_NCS_CYCLES_FULL 0x06 IBOX
+UMASK_RXR_BL_NCS_CYCLES_FULL 0x00
+
+EVENT_RXR_BL_NCS_INSERTS 0x03 IBOX
+UMASK_RXR_BL_NCS_INSERTS 0x00
+
+EVENT_RXR_BL_NCS_OCCUPANCY 0x09 IBOX
+UMASK_RXR_BL_NCS_OCCUPANCY 0x00
+
+EVENT_TXR_AD_STALL_CREDIT_CYCLES 0x18 IBOX
+UMASK_TXR_AD_STALL_CREDIT_CYCLES 0x00
+
+EVENT_TXR_BL_STALL_CREDIT_CYCLES 0x19 IBOX
+UMASK_TXR_BL_STALL_CREDIT_CYCLES 0x00
+
+EVENT_TXR_DATA_INSERTS_NCB 0x0E IBOX
+UMASK_TXR_DATA_INSERTS_NCB 0x00
+
+EVENT_TXR_DATA_INSERTS_NCS 0x0F IBOX
+UMASK_TXR_DATA_INSERTS_NCS 0x00
+
+EVENT_TXR_REQUEST_OCCUPANCY 0x0D IBOX
+UMASK_TXR_REQUEST_OCCUPANCY 0x00
+
+
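For reference, each EVENT line in the list above carries the raw event code and the unit(s) it may be programmed on (MBOX, PBOX, IBOX, ...), and each UMASK line the umask byte that selects a sub-event. The setup routines later in this patch simply combine the two, event code in bits [7:0] and umask in bits [15:8] of the unit's control register, before OR-ing in enable/edge/invert bits as requested. A minimal sketch of that encoding; the helper below is illustrative and not part of the likwid API:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative: combine an event code and umask the same way the perfmon
     * setup functions in this patch do (eventId in bits [7:0], umask in [15:8]). */
    static uint64_t encode_event(uint8_t eventId, uint8_t umask)
    {
        return ((uint64_t)umask << 8) | eventId;
    }

    int main(void)
    {
        /* RD_CAS_RANK2_ALLBANKS from the list above: event 0xB2, umask 0x10 */
        printf("0x%llx\n", (unsigned long long)encode_event(0xB2, 0x10));
        return 0;
    }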
diff --git a/src/includes/perfmon_core2.h b/src/includes/perfmon_core2.h
index f737dda..9c4ba1d 100644
--- a/src/includes/perfmon_core2.h
+++ b/src/includes/perfmon_core2.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_core2.h
*
- * Description: Header file of perfmon module for Core 2
+ * Description: Header file of perfmon module for Intel Core 2
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -29,155 +30,307 @@
*/
#include <perfmon_core2_events.h>
-#include <perfmon_core2_groups.h>
#include <perfmon_core2_counters.h>
+#include <error.h>
+
static int perfmon_numCountersCore2 = NUM_COUNTERS_CORE2;
-static int perfmon_numGroupsCore2 = NUM_GROUPS_CORE2;
static int perfmon_numArchEventsCore2 = NUM_ARCH_EVENTS_CORE2;
-void perfmon_init_core2(PerfmonThread *thread)
+int perfmon_init_core2(int cpu_id)
{
- uint64_t flags = 0x0ULL;
- int cpu_id = thread->processorId;
-
- /* Initialize registers */
- msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+ return 0;
+}
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
- /* always initialize fixed counters
- * FIXED 0: Instructions retired
- * FIXED 1: Clocks unhalted */
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x22ULL);
+uint32_t core2_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint32_t flags = (1ULL<<(1+(index*4)));
+ for(j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<(index*4));
+ break;
+ default:
+ break;
+ }
+ }
+ return flags;
+}
- /* Preinit of PMC counters */
- flags |= (1<<16); /* user mode flag */
- flags |= (1<<19); /* pin control flag */
- flags |= (1<<22); /* enable flag */
+int core2_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
- msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
+ flags = (1ULL<<22)|(1ULL<<16)|(1ULL<<19);
+ flags |= (event->umask<<8) + event->eventId;
+ if ( event->cfgBits != 0 ) /* set custom cfg and cmask */
+ {
+ flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+ }
+ if (event->numberOfOptions > 0)
+ {
+ for(j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<17);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0xFFULL)<<24);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
}
-
-void perfmon_setupCounterThread_core2(
- int thread_id,
- PerfmonEvent* event,
- PerfmonCounterIndex index)
+int perfmon_setupCounterThread_core2( int thread_id, PerfmonEventSet* eventSet)
{
- uint64_t flags;
- uint64_t reg = core2_counter_map[index].configRegister;
- int cpu_id = perfmon_threadData[thread_id].processorId;
- uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
- perfmon_threadData[thread_id].counters[index].init = TRUE;
+ uint64_t fixed_flags = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if ( core2_counter_map[index].type == PMC )
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
{
- flags = (1<<16)|(1<<19)|(1<<22);
-
- /* Intel with standard 8 bit event mask: [7:0] */
- flags |= (event->umask<<8) + event->eventId;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
- if ( event->cfgBits != 0 ) /* set custom cfg and cmask */
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- flags &= ~(0xFFFFU<<16); /* clear upper 16bits */
- flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+ continue;
}
-
- msr_write(cpu_id, reg , flags);
-
- if (perfmon_verbose)
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ eventSet->events[i].threadCounter[thread_id].init = TRUE;
+ switch (type)
{
- printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
- cpu_id,
- LLU_CAST reg,
- LLU_CAST flags);
+ case PMC:
+ core2_pmc_setup(cpu_id, index, event);
+ break;
+ case FIXED:
+ fixed_flags |= core2_fixed_setup(cpu_id, index, event);
+ break;
+ default:
+ break;
}
}
- else if (core2_counter_map[index].type == FIXED)
+ if (fixed_flags > 0x0ULL)
{
- fixed_flags |= (0x2 << (index*4));
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
}
+ return 0;
}
-void perfmon_startCountersThread_core2(int thread_id)
+int perfmon_startCountersThread_core2(int thread_id, PerfmonEventSet* eventSet)
{
uint64_t flags = 0ULL;
- int cpu_id = perfmon_threadData[thread_id].processorId;
-
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+ int cpu_id = groupSet->threads[thread_id].processorId;
- for ( int i=0; i<NUM_COUNTERS_CORE2; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- msr_write(cpu_id, core2_counter_map[i].counterRegister , 0x0ULL);
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter = counter_map[index].counterRegister;
+ eventSet->events[i].threadCounter[thread_id].startData = 0;
+ eventSet->events[i].threadCounter[thread_id].counterData = 0;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
- if (core2_counter_map[i].type == PMC)
+ if (type == PMC)
{
- flags |= (1<<(i-2)); /* enable counter */
+ flags |= (1ULL<<(index - cpuid_info.perf_num_fixed_ctr)); /* enable counter */
}
- else if (core2_counter_map[i].type == FIXED)
+ else if (type == FIXED)
{
- flags |= (1ULL<<(i+32)); /* enable fixed counter */
+ flags |= (1ULL<<(index + 32)); /* enable fixed counter */
}
}
}
- if (perfmon_verbose)
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
{
- printf("perfmon_start_counters: Write Register 0x%X , Flags: 0x%llX \n",
- MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_AND_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|flags));
}
-
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
- msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x300000003ULL);
+ return 0;
}
-void perfmon_stopCountersThread_core2(int thread_id)
+#define CORE2_CHECK_OVERFLOW(offset) \
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+ { \
+ uint64_t ovf_values = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values)); \
+ if (ovf_values & (1ULL<<offset)) \
+ { \
+ eventSet->events[i].threadCounter[thread_id].overflows++; \
+ } \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<offset))); \
+ }
+
+int perfmon_stopCountersThread_core2(int thread_id, PerfmonEventSet* eventSet)
{
- uint64_t flags;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ uint64_t counter_result;
+ int cpu_id = groupSet->threads[thread_id].processorId;
/* stop counters */
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
/* read out counter results */
- for ( int i=0; i<NUM_COUNTERS_CORE2; i++)
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter = counter_map[index].counterRegister;
+ switch (type)
+ {
+ case PMC:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+ CORE2_CHECK_OVERFLOW(index - cpuid_info.perf_num_fixed_ctr);
+ VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_PMC)
+ break;
+ case FIXED:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+ CORE2_CHECK_OVERFLOW(index + 32);
+ VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_FIXED)
+ break;
+ default:
+ break;
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+ }
+ }
+
+ return 0;
+}
+
+int perfmon_readCountersThread_core2(int thread_id, PerfmonEventSet* eventSet)
+{
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ uint64_t counter_result;
+ uint64_t flags;
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
{
- if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &flags));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, SAFE_PMC_FLAGS)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, RESET_PMC_FLAGS)
+ }
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ counter_result = 0x0ULL;
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, core2_counter_map[i].counterRegister);
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter = counter_map[index].counterRegister;
+ switch (type)
+ {
+ case PMC:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+ CORE2_CHECK_OVERFLOW(index - cpuid_info.perf_num_fixed_ctr);
+ VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_PMC)
+ break;
+ case FIXED:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+ CORE2_CHECK_OVERFLOW(index + 32);
+ VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_FIXED)
+ break;
+ default:
+ break;
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
}
}
- /* check overflow status */
- flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
- if ( (flags & 0x3) || (flags & (0x3ULL<<32)) )
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
{
- printf ("Overflow occured \n");
- printf ("Status: 0x%llX \n", LLU_CAST flags);
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, RESTORE_PMC_FLAGS)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
}
+
+ return 0;
}
-void perfmon_readCountersThread_core2(int thread_id)
+
+int perfmon_finalizeCountersThread_core2(int thread_id, PerfmonEventSet* eventSet)
{
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62);
- for ( int i=0; i<NUM_COUNTERS_CORE2; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t reg = counter_map[index].configRegister;
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
+ if (type == PMC)
+ {
+ ovf_values_core |= (1ULL<<(index - cpuid_info.perf_num_fixed_ctr));
+ }
+ else if (type == FIXED)
+ {
+ ovf_values_core |= (1ULL<<(index + 32));
+ }
+ if ((reg) && ((type == PMC)||(type == FIXED)))
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, core2_counter_map[i].counterRegister);
+ VERBOSEPRINTREG(cpu_id, reg, LLU_CAST 0x0ULL, CLEAR_CTRL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
}
}
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ return 0;
}
-
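The CORE2_CHECK_OVERFLOW macro introduced above detects a counter wrap by comparing the fresh reading with the previously stored value and, if the counter went backwards, checking the corresponding bit in MSR_PERF_GLOBAL_STATUS and clearing it via MSR_PERF_GLOBAL_OVF_CTRL. A standalone sketch of the same accumulation idea, assuming a free-running counter of the width given in the counter map below (40 bits for the Core 2 PMCs); the function name is illustrative, not likwid API:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative: a counter of 'width' bits wraps to zero, so a new reading
     * smaller than the previous one signals an overflow that must be added back. */
    static uint64_t accumulate(uint64_t prev, uint64_t now, int width, int *overflows)
    {
        if (now < prev)
            (*overflows)++;                            /* counter wrapped around */
        return ((uint64_t)*overflows << width) + now;  /* overflows * 2^width + now */
    }

    int main(void)
    {
        int ovf = 0;
        uint64_t prev = (1ULL << 40) - 10;             /* near the top of a 40-bit PMC */
        uint64_t now  = 5;                             /* reading taken after the wrap */
        printf("accumulated count: %llu\n",
               (unsigned long long)accumulate(prev, now, 40, &ovf));
        return 0;
    }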
diff --git a/src/includes/perfmon_core2_counters.h b/src/includes/perfmon_core2_counters.h
index d6c33fb..2dada93 100644
--- a/src/includes/perfmon_core2_counters.h
+++ b/src/includes/perfmon_core2_counters.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_core2_counters.h
*
- * Description: Counter header file of perfmon module for Core 2
+ * Description: Counter header file of perfmon module for Intel Core 2
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -31,13 +32,21 @@
#define NUM_COUNTERS_CORE2 5
#define NUM_COUNTERS_CORE_CORE2 5
-static PerfmonCounterMap core2_counter_map[NUM_COUNTERS_CORE2] = {
+#define CORE2_VALID_OPTIONS_FIXED EVENT_OPTION_COUNT_KERNEL_MASK
+#define CORE2_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+
+static RegisterMap core2_counter_map[NUM_COUNTERS_CORE2] = {
/* Fixed Counters: instructions retired, cycles unhalted core */
- {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
- {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
- {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+ {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, CORE2_VALID_OPTIONS_FIXED},
+ {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, CORE2_VALID_OPTIONS_FIXED},
+ {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, CORE2_VALID_OPTIONS_FIXED},
/* PMC Counters: 2 40bit wide */
- {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
- {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0}
+ {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, CORE2_VALID_OPTIONS_PMC},
+ {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, CORE2_VALID_OPTIONS_PMC},
};
+
+static BoxMap core2_box_map[NUM_UNITS] = {
+ [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 40},
+ [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+};
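All three FIXC counters in the map above share MSR_PERF_FIXED_CTR_CTRL as their config register, and core2_fixed_setup()/hasep_fixed_setup() build its value from a 4-bit field per counter: bit 0 of the field enables counting in kernel mode, bit 1 in user mode, and on Haswell bit 2 selects AnyThread counting. A small illustrative helper (not likwid API) that reproduces the per-counter field:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative: build the 4-bit control field for fixed counter 'index' in
     * MSR_PERF_FIXED_CTR_CTRL, mirroring core2_fixed_setup(): user-mode counting
     * is always enabled, kernel-mode counting only on request. */
    static uint64_t fixed_ctrl_field(int index, int count_kernel)
    {
        uint64_t flags = 1ULL << (1 + index * 4);      /* user-mode enable */
        if (count_kernel)
            flags |= 1ULL << (index * 4);              /* kernel-mode enable */
        return flags;
    }

    int main(void)
    {
        /* FIXC0..FIXC2 enabled for user space only gives 0x222; the removed
         * perfmon_init_core2() wrote 0x22, which covered only FIXC0 and FIXC1. */
        uint64_t ctrl = fixed_ctrl_field(0, 0) | fixed_ctrl_field(1, 0)
                      | fixed_ctrl_field(2, 0);
        printf("MSR_PERF_FIXED_CTR_CTRL = 0x%llx\n", (unsigned long long)ctrl);
        return 0;
    }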
diff --git a/src/includes/perfmon_core2_events.txt b/src/includes/perfmon_core2_events.txt
index 60c6211..ebb2dc5 100644
--- a/src/includes/perfmon_core2_events.txt
+++ b/src/includes/perfmon_core2_events.txt
@@ -1,16 +1,17 @@
# =======================================================================================
-#
+#
# Filename: perfmon_core2_events.txt
-#
+#
# Description: Event list for Intel Core 2
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -29,10 +30,10 @@
EVENT_INSTR_RETIRED 0x00 FIXC0
UMASK_INSTR_RETIRED_ANY 0x00
-EVENT_CPU_CLK_UNHALTED_CORE 0x00 FIXC1
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC1
UMASK_CPU_CLK_UNHALTED_CORE 0x00
-EVENT_CPU_CLK_UNHALTED_REF 0x00 FIXC2
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC2
UMASK_CPU_CLK_UNHALTED_REF 0x00
EVENT_LOAD_BLOCK 0x03 PMC
@@ -42,11 +43,16 @@ UMASK_LOAD_BLOCK_OVERLAP_STORE 0x08
UMASK_LOAD_BLOCK_UNTIL_RETIRE 0x10
UMASK_LOAD_BLOCK_L1D 0x20
-EVENT_STORE_BLOCK 0x04 PMC
+EVENT_SB_DRAIN_CYCLES 0x04 PMC
UMASK_SB_DRAIN_CYCLES 0x01
+
+EVENT_STORE_BLOCK 0x04 PMC
UMASK_STORE_BLOCK_ORDER 0x02
UMASK_STORE_BLOCK_SNOOP 0x08
+EVENT_MISALIGN_MEM_REF 0x05 PMC
+UMASK_MISALIGN_MEM_REF 0x00
+
EVENT_SEGMENT_REG_LOADS 0x06 PMC
UMASK_SEGMENT_REG_LOADS 0x00
@@ -97,6 +103,10 @@ EVENT_L2_ADS 0x21 PMC
UMASK_L2_ADS_ALL_CORES 0xC0
UMASK_L2_ADS_THIS_CORE 0x40
+EVENT_L2_DBUS_BUSY 0x22 PMC
+UMASK_L2_DBUS_BUSY_ALL_CORES 0xC0
+UMASK_L2_DBUS_BUSY_THIS_CORE 0x40
+
EVENT_L2_DBUS_BUSY_RD 0x23 PMC
UMASK_L2_DBUS_BUSY_RD_ALL_CORES 0xC0
UMASK_L2_DBUS_BUSY_RD_THIS_CORE 0x40
@@ -266,7 +276,8 @@ UMASK_L2_NO_REQ_ALL_CORES 0xC0
UMASK_L2_NO_REQ_THIS_CORE 0x40
EVENT_EIST_TRANS 0x3A PMC
-UMASK_EIST_TRANS 0x00
+UMASK_EIST_TRANS_ANY 0x00
+UMASK_EIST_TRANS_FREQ 0x01
EVENT_THERMAL_TRIP 0x3B PMC
UMASK_THERMAL_TRIP 0xC0
@@ -275,6 +286,8 @@ EVENT_CPU_CLK_UNHALTED 0x3C PMC
UMASK_CPU_CLK_UNHALTED_CORE_P 0x00
UMASK_CPU_CLK_UNHALTED_BUS 0x01
UMASK_CPU_CLK_UNHALTED_NO_OTHER 0x02
+DEFAULT_OPTIONS_CPU_CLK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLK_UNHALTED_TOTAL_CYCLES 0x00
EVENT_L1D_CACHE_LD 0x40 PMC
UMASK_L1D_CACHE_LD_MODIFIED 0x08
@@ -298,9 +311,11 @@ UMASK_L1D_CACHE_LOCK_INVALID 0x01
UMASK_L1D_CACHE_LOCK_MESI 0x0F
UMASK_L1D_CACHE_LOCK_DURATION 0x10
-EVENT_L1D 0x43 PMC
+EVENT_L1D_ALL 0x43 PMC
UMASK_L1D_ALL_REF 0x01
-UMASK_L1D_ALL_CACHE_REF 0x02
+
+EVENT_L1D_CACHE 0x44 PMC
+UMASK_L1D_CACHE_REF 0x02
EVENT_L1D_REPL 0x45 PMC
UMASK_L1D_REPL 0x0F
@@ -322,6 +337,7 @@ EVENT_SSE_PRE_MISS 0x4B PMC
UMASK_SSE_PRE_MISS_NTA 0x00
UMASK_SSE_PRE_MISS_L1 0x01
UMASK_SSE_PRE_MISS_L2 0x02
+UMASK_SSE_PRE_MISS_ALL_CACHES 0x03
EVENT_LOAD_HIT_PRE 0x4C PMC
UMASK_LOAD_HIT_PRE 0x00
@@ -329,6 +345,9 @@ UMASK_LOAD_HIT_PRE 0x00
EVENT_L1D_PREFETCH_REQUESTS 0x4E PMC
UMASK_L1D_PREFETCH_REQUESTS 0x10
+EVENT_L1D_PREFETCH_DCU_MISSES 0x4F PMC
+UMASK_L1D_PREFETCH_DCU_MISSES 0x00
+
EVENT_BUS_REQUEST_OUTSTANDING 0x60 PMC
UMASK_BUS_REQUEST_OUTSTANDING_ALL_CORES_THIS_A 0xC0
UMASK_BUS_REQUEST_OUTSTANDING_ALL_CORES_ALL_A 0xE0
@@ -425,19 +444,56 @@ UMASK_BUS_TRANS_ANY_ALL_CORES_ALL_A 0xE0
UMASK_BUS_TRANS_ANY_THIS_CORE_THIS_A 0x40
UMASK_BUS_TRANS_ANY_THIS_CORE_ALL_A 0x60
+EVENT_EXT_SNOOP 0x77 PMC
+UMASK_EXT_SNOOP_ALL_CORES_CLEAN 0xC1
+UMASK_EXT_SNOOP_ALL_CORES_HIT 0xC2
+UMASK_EXT_SNOOP_ALL_CORES_HITM 0xC8
+UMASK_EXT_SNOOP_THIS_CORE_CLEAN 0x41
+UMASK_EXT_SNOOP_THIS_CORE_HIT 0x42
+UMASK_EXT_SNOOP_THIS_CORE_HITM 0x48
+
+EVENT_CMP_SNOOP 0x78 PMC
+UMASK_CMP_SNOOP_ALL_CORES_CMP2I 0xC2
+UMASK_CMP_SNOOP_ALL_CORES_CMP2S 0xC1
+UMASK_CMP_SNOOP_THIS_CORE_CMP2I 0x42
+UMASK_CMP_SNOOP_THIS_CORE_CMP2S 0x41
+
+EVENT_BUS_HIT_DRV 0x7A PMC
+UMASK_BUS_HIT_DRV_THIS_AGENT 0x00
+UMASK_BUS_HIT_DRV_ALL_AGENTS 0x20
+
+EVENT_BUS_HITM_DRV 0x7B PMC
+UMASK_BUS_HITM_DRV_THIS_AGENT 0x00
+UMASK_BUS_HITM_DRV_ALL_AGENTS 0x20
+
+EVENT_BUSQ_EMPTY 0x7D PMC
+UMASK_BUSQ_EMPTY_ALL_CORES 0xC0
+UMASK_BUSQ_EMPTY_THIS_CORE 0x40
+
+EVENT_BUS_SNOOP_STALLED 0x7E PMC
+UMASK_BUS_SNOOP_STALLED_ALL_CORES_THIS_AGENT 0xC0
+UMASK_BUS_SNOOP_STALLED_ALL_CORES_ALL_AGENTS 0xE0
+UMASK_BUS_SNOOP_STALLED_THIS_CORE_THIS_AGENT 0x40
+UMASK_BUS_SNOOP_STALLED_THIS_CORE_ALL_AGENTS 0x60
+
+EVENT_BUS_IO_WAIT 0x7F PMC
+UMASK_BUS_IO_WAIT_ALL_CORES 0xC0
+UMASK_BUS_IO_WAIT_THIS_CORE 0x40
+
EVENT_L1I_READS 0x80 PMC
UMASK_L1I_READS 0x00
EVENT_L1I_MISSES 0x81 PMC
UMASK_L1I_MISSES 0x00
-EVENT_ITLB 0x82 PMC
+EVENT_ITLB 0x82 PMC
UMASK_ITLB_SMALL_MISS 0x02
UMASK_ITLB_LARGE_MISS 0x10
-UMASK_ITLB_FLUSH 0x40
+UMASK_ITLB_FLUSH 0x40
UMASK_ITLB_MISSES 0x12
-EVENT_INST_QUEUE 0x83 PMC
+EVENT_INST_QUEUE 0x83 PMC
UMASK_INST_QUEUE_FULL 0x02
EVENT_CYCLES_L1I_MEM_STALLED 0x86 PMC
@@ -491,27 +547,30 @@ UMASK_BR_TKN_BUBBLE_2 0x00
EVENT_RS_UOPS_DISPATCHED_ALL 0xA0 PMC
UMASK_RS_UOPS_DISPATCHED_ALL 0x00
-EVENT_RS_UOPS_DISPATCHED 0xA1 PMC0
+EVENT_RS_UOPS_DISPATCHED 0xA1 PMC0
UMASK_RS_UOPS_DISPATCHED_PORT0 0x01
UMASK_RS_UOPS_DISPATCHED_PORT1 0x02
UMASK_RS_UOPS_DISPATCHED_PORT2 0x04
UMASK_RS_UOPS_DISPATCHED_PORT3 0x08
UMASK_RS_UOPS_DISPATCHED_PORT4 0x10
UMASK_RS_UOPS_DISPATCHED_PORT5 0x20
+DEFAULT_OPTIONS_RS_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE EVENT_OPTION_EDGE=0x1
+UMASK_RS_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE 0x23
+UMASK_RS_UOPS_DISPATCHED_PORT_DATA_PORTS 0x1C
-EVENT_MACRO_INSTS 0xAA PMC
+EVENT_MACRO_INSTS 0xAA PMC
UMASK_MACRO_INSTS_DECODED 0x01
UMASK_MACRO_INSTS_CISC_DECODED 0x08
-EVENT_ESP 0xAB PMC
-UMASK_ESP_SYNCH 0x01
+EVENT_ESP 0xAB PMC
+UMASK_ESP_SYNCH 0x01
UMASK_ESP_ADDITIONS 0x02
EVENT_SIMD_UOPS_EXEC 0xB0 PMC
-UMASK_SIMD_UOPS_EXEC 0x00
+UMASK_SIMD_UOPS_EXEC 0x00
EVENT_SIMD_SAT_UOPS_EXEC 0xB1 PMC
-UMASK_SIMD_SAT_UOPS_EXEC 0x00
+UMASK_SIMD_SAT_UOPS_EXEC 0x00
EVENT_SIMD_UOP_TYPE_EXEC 0xB3 PMC
UMASK_SIMD_UOP_TYPE_EXEC_MUL 0x01
@@ -531,20 +590,28 @@ EVENT_X87_OPS_RETIRED 0xC1 PMC
UMASK_X87_OPS_RETIRED_FXCH 0x01
UMASK_X87_OPS_RETIRED_ANY 0xFE
-EVENT_UOPS_RETIRED_ANY 0xC2 PMC
+EVENT_UOPS_RETIRED 0xC2 PMC
UMASK_UOPS_RETIRED_LD_IND_BR 0x01
UMASK_UOPS_RETIRED_STD_STA 0x02
UMASK_UOPS_RETIRED_MACRO_FUSION 0x04
UMASK_UOPS_RETIRED_FUSED 0x07
UMASK_UOPS_RETIRED_NON_FUSED 0x08
UMASK_UOPS_RETIRED_ANY 0x0F
-
-EVENT_MACHINE_NUKES 0xC3 PMC
-UMASK_MACHINE_NUKES_SMC 0x01
-UMASK_MACHINE_NUKES_MEM_ORDER 0x04
-
-EVENT_BR_INST_RETIRED 0xC4 PMC
-UMASK_BR_INST_RETIRED_ANY 0x00
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES 0x0F
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES 0x0F
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x9,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES 0x0F
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_EDGE=1
+UMASK_UOPS_RETIRED_STALL_COUNT 0x0F
+
+EVENT_MACHINE_NUKES 0xC3 PMC
+UMASK_MACHINE_NUKES_SMC 0x01
+UMASK_MACHINE_NUKES_MEM_ORDER 0x04
+
+EVENT_BR_INST_RETIRED 0xC4 PMC
+UMASK_BR_INST_RETIRED_ANY 0x00
UMASK_BR_INST_RETIRED_PRED_NOT_TAKEN 0x01
UMASK_BR_INST_RETIRED_MISPRED_NOT_TAKEN 0x02
UMASK_BR_INST_RETIRED_PRED_TAKEN 0x04
@@ -554,11 +621,11 @@ UMASK_BR_INST_RETIRED_TAKEN 0x0C
EVENT_BR_INST_RETIRED_MISPRED 0xC5 PMC
UMASK_BR_INST_RETIRED_MISPRED 0x00
-EVENT_CYCLES_INT 0xC6 PMC
-UMASK_CYCLES_INT_MASKED 0x01
+EVENT_CYCLES_INT 0xC6 PMC
+UMASK_CYCLES_INT_MASKED 0x01
UMASK_CYCLES_INT_PENDING_MASKED 0x02
-EVENT_SIMD_INST_RETIRED 0xC7 PMC
+EVENT_SIMD_INST_RETIRED 0xC7 PMC
UMASK_SIMD_INST_RETIRED_PACKED_SINGLE 0x01
UMASK_SIMD_INST_RETIRED_SCALAR_SINGLE 0x02
UMASK_SIMD_INST_RETIRED_PACKED_DOUBLE 0x04
@@ -567,10 +634,10 @@ UMASK_SIMD_INST_RETIRED_VECTOR 0x10
UMASK_SIMD_INST_RETIRED_ANY 0x1F
EVENT_HW_INT_RCV 0xC8 PMC
-UMASK_HW_INT_RCV 0x00
+UMASK_HW_INT_RCV 0x00
EVENT_ITLB_MISS_RETIRED 0xC9 PMC
-UMASK_ITLB_MISS_RETIRED 0x00
+UMASK_ITLB_MISS_RETIRED 0x00
EVENT_SIMD_COMP_INST_RETIRED 0xCA PMC
UMASK_SIMD_COMP_INST_RETIRED_PACKED_SINGLE 0x01
@@ -578,69 +645,69 @@ UMASK_SIMD_COMP_INST_RETIRED_SCALAR_SINGLE 0x02
UMASK_SIMD_COMP_INST_RETIRED_PACKED_DOUBLE 0x04
UMASK_SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE 0x08
-EVENT_MEM_LOAD_RETIRED 0xCB PMC0
-UMASK_MEM_LOAD_RETIRED_L1D_MISS 0x01
-UMASK_MEM_LOAD_RETIRED_L1D_LINE_MISS 0x02
-UMASK_MEM_LOAD_RETIRED_L2_MISS 0x04
-UMASK_MEM_LOAD_RETIRED_L2_LINE_MISS 0x08
-UMASK_MEM_LOAD_RETIRED_DTLB_MISS 0x10
+EVENT_MEM_LOAD_RETIRED 0xCB PMC0
+UMASK_MEM_LOAD_RETIRED_L1D_MISS 0x01
+UMASK_MEM_LOAD_RETIRED_L1D_LINE_MISS 0x02
+UMASK_MEM_LOAD_RETIRED_L2_MISS 0x04
+UMASK_MEM_LOAD_RETIRED_L2_LINE_MISS 0x08
+UMASK_MEM_LOAD_RETIRED_DTLB_MISS 0x10
EVENT_FP_MMX_TRANS_TO 0xCC PMC
-UMASK_FP_MMX_TRANS_TO_MMX 0x01
-UMASK_FP_MMX_TRANS_TO_FP 0x02
+UMASK_FP_MMX_TRANS_TO_MMX 0x01
+UMASK_FP_MMX_TRANS_TO_FP 0x02
EVENT_SIMD_ASSIST 0xCD PMC
-UMASK_SIMD_ASSIST 0x00
+UMASK_SIMD_ASSIST 0x00
EVENT_SIMD_INSTR_RETIRED 0xCE PMC
-UMASK_SIMD_INSTR_RETIRED 0x00
+UMASK_SIMD_INSTR_RETIRED 0x00
-EVENT_SIMD_SAT_INSTR_RETIRED 0xCF PMC
+EVENT_SIMD_SAT_INSTR_RETIRED 0xCF PMC
UMASK_SIMD_SAT_INSTR_RETIRED 0x00
-EVENT_RAT_STALLS 0xD2 PMC
+EVENT_RAT_STALLS 0xD2 PMC
UMASK_RAT_STALLS_ROB_READ_PORT 0x01
-UMASK_RAT_STALLS_PARTIAL_CYCLES 0x02
-UMASK_RAT_STALLS_FLAGS 0x04
-UMASK_RAT_STALLS_FPSW 0x08
-UMASK_RAT_STALLS_ANY 0x0F
+UMASK_RAT_STALLS_PARTIAL_CYCLES 0x02
+UMASK_RAT_STALLS_FLAGS 0x04
+UMASK_RAT_STALLS_FPSW 0x08
+UMASK_RAT_STALLS_ANY 0x0F
EVENT_SEG_RENAME_STALLS 0xD4 PMC
-UMASK_SEG_RENAME_STALLS_ES 0x01
-UMASK_SEG_RENAME_STALLS_DS 0x02
-UMASK_SEG_RENAME_STALLS_FS 0x04
-UMASK_SEG_RENAME_STALLS_GS 0x08
-UMASK_SEG_RENAME_STALLS_ANY 0x0F
-
-EVENT_SEG_REG_RENAMES 0xD5 PMC
-UMASK_SEG_RENAME_STALLS_ES 0x01
-UMASK_SEG_RENAME_STALLS_DS 0x02
-UMASK_SEG_RENAME_STALLS_FS 0x04
-UMASK_SEG_RENAME_STALLS_GS 0x08
+UMASK_SEG_RENAME_STALLS_ES 0x01
+UMASK_SEG_RENAME_STALLS_DS 0x02
+UMASK_SEG_RENAME_STALLS_FS 0x04
+UMASK_SEG_RENAME_STALLS_GS 0x08
UMASK_SEG_RENAME_STALLS_ANY 0x0F
-EVENT_RESOURCE_STALLS 0xDC PMC
-UMASK_RESOURCE_STALLS_ROB_FULL 0x01
-UMASK_RESOURCE_STALLS_RS_FULL 0x02
-UMASK_RESOURCE_STALLS_LD_ST 0x04
-UMASK_RESOURCE_STALLS_FPCW 0x08
-UMASK_RESOURCE_STALLS_BR_MISS_CLEAR 0x10
-UMASK_RESOURCE_STALLS_ANY 0x1F
+EVENT_SEG_REG_RENAMES 0xD5 PMC
+UMASK_SEG_REG_RENAMES_STALLS_ES 0x01
+UMASK_SEG_REG_RENAMES_STALLS_DS 0x02
+UMASK_SEG_REG_RENAMES_STALLS_FS 0x04
+UMASK_SEG_REG_RENAMES_STALLS_GS 0x08
+UMASK_SEG_REG_RENAMES_STALLS_ANY 0x0F
+
+EVENT_RESOURCE_STALLS 0xDC PMC
+UMASK_RESOURCE_STALLS_ROB_FULL 0x01
+UMASK_RESOURCE_STALLS_RS_FULL 0x02
+UMASK_RESOURCE_STALLS_LD_ST 0x04
+UMASK_RESOURCE_STALLS_FPCW 0x08
+UMASK_RESOURCE_STALLS_BR_MISS_CLEAR 0x10
+UMASK_RESOURCE_STALLS_ANY 0x1F
EVENT_BR_INST_DECODED 0xE0 PMC
-UMASK_BR_INST_DECODED 0x00
+UMASK_BR_INST_DECODED 0x00
EVENT_BOGUS_BR 0xE4 PMC
-UMASK_BOGUS_BR 0x00
+UMASK_BOGUS_BR 0x00
EVENT_BACLEARS 0xE6 PMC
-UMASK_BACLEARS 0x00
+UMASK_BACLEARS 0x00
EVENT_PREF_RQSTS_UP 0xF0 PMC
-UMASK_PREF_RQSTS_UP 0x00
+UMASK_PREF_RQSTS_UP 0x00
EVENT_PREF_RQSTS_DN 0xF8 PMC
-UMASK_PREF_RQSTS_DN 0x00
+UMASK_PREF_RQSTS_DN 0x00
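The new DEFAULT_OPTIONS_ lines above derive extra views from plain events by preloading counter-mask tricks: UOPS_RETIRED_STALL_CYCLES combines EVENT_OPTION_THRESHOLD=0x1 with EVENT_OPTION_INVERT=1 to count cycles in which no uop retired, and the STALL_COUNT variant adds EVENT_OPTION_EDGE=1 to count stall periods instead of stall cycles. In core2_pmc_setup() these options end up as the counter mask (bits 31-24), invert (bit 23) and edge (bit 18) fields of the PERFEVTSEL register. A sketch of the resulting encoding; the helper name is illustrative:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative: assemble a PERFEVTSEL value the way core2_pmc_setup() does
     * for a PMC event with threshold/invert/edge options applied. */
    static uint64_t perfevtsel(uint8_t eventId, uint8_t umask,
                               uint8_t threshold, int invert, int edge)
    {
        uint64_t flags = (1ULL << 22) | (1ULL << 16) | (1ULL << 19); /* EN, USR, PC */
        flags |= ((uint64_t)umask << 8) | eventId;
        flags |= (uint64_t)threshold << 24;            /* counter mask (cmask) */
        if (invert) flags |= 1ULL << 23;
        if (edge)   flags |= 1ULL << 18;
        return flags;
    }

    int main(void)
    {
        /* UOPS_RETIRED_STALL_CYCLES: event 0xC2, umask 0x0F, cmask 1, inverted */
        printf("0x%llx\n", (unsigned long long)perfevtsel(0xC2, 0x0F, 0x1, 1, 0));
        return 0;
    }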
diff --git a/src/includes/perfmon_haswell.h b/src/includes/perfmon_haswell.h
index 57f12af..23d1b64 100644
--- a/src/includes/perfmon_haswell.h
+++ b/src/includes/perfmon_haswell.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_haswell.h
*
- * Description: Header File of perfmon module for Haswell.
+ * Description: Header File of perfmon module for Intel Haswell.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -28,382 +29,1880 @@
* =======================================================================================
*/
+#include <perfmon_haswellEP_events.h>
#include <perfmon_haswell_events.h>
-#include <perfmon_haswell_groups.h>
+#include <perfmon_haswellEP_counters.h>
#include <perfmon_haswell_counters.h>
+#include <error.h>
+#include <affinity.h>
+#include <limits.h>
+#include <topology.h>
+#include <access.h>
+
+static int perfmon_numCountersHaswellEP = NUM_COUNTERS_HASWELL_EP;
+static int perfmon_numCoreCountersHaswellEP = NUM_COUNTERS_CORE_HASWELL_EP;
+static int perfmon_numArchEventsHaswellEP = NUM_ARCH_EVENTS_HASWELLEP;
static int perfmon_numCountersHaswell = NUM_COUNTERS_HASWELL;
-static int perfmon_numGroupsHaswell = NUM_GROUPS_HASWELL;
+static int perfmon_numCoreCountersHaswell = NUM_COUNTERS_CORE_HASWELL;
static int perfmon_numArchEventsHaswell = NUM_ARCH_EVENTS_HASWELL;
+int has_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
+int hasep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
+int (*haswell_cbox_setup)(int, RegisterIndex, PerfmonEvent *);
+
+int perfmon_init_haswell(int cpu_id)
+{
+ int ret;
+ uint64_t data;
+ lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+ lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+ ret = HPMwrite(cpu_id, MSR_DEV, MSR_UNC_CBO_0_PERFEVTSEL0, 0x0ULL);
+ ret += HPMread(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, &data);
+ ret += HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL);
+ ret += HPMread(cpu_id, MSR_DEV, MSR_UNC_CBO_0_PERFEVTSEL0, &data);
+ if (cpuid_info.model == HASWELL_EP)
+ {
+ haswell_cbox_setup = hasep_cbox_setup;
+ }
+ else if ((ret == 0) && (data == 0x0ULL))
+ {
+ haswell_cbox_setup = has_cbox_setup;
+ }
+ return 0;
+}
+
+
+uint32_t hasep_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint32_t flags = (1ULL<<(1+(index*4)));
+ for(j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<(index*4));
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+ flags |= (1ULL<<(2+(index*4)));
+ break;
+ default:
+ break;
+ }
+ }
+ return flags;
+}
-#define OFFSET_PMC 3
-void perfmon_init_haswell(PerfmonThread *thread)
+int hasep_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
{
+ int j;
uint64_t flags = 0x0ULL;
- int cpu_id = thread->processorId;
-
- /* Initialize registers */
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
- msr_write(cpu_id, MSR_PMC0, 0x0ULL);
- msr_write(cpu_id, MSR_PMC1, 0x0ULL);
- msr_write(cpu_id, MSR_PMC2, 0x0ULL);
- msr_write(cpu_id, MSR_PMC3, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
+ uint64_t offcore_flags = 0x0ULL;
+ uint64_t latency_flags = 0x0ULL;
- lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
- if (cpuid_info.model != HASWELL_EX && cpuid_info.supportUncore)
+ flags = (1ULL<<22)|(1ULL<<16);
+ /* Intel with standard 8 bit event mask: [7:0] */
+ flags |= (event->umask<<8) + event->eventId;
+
+ /* set custom cfg and cmask */
+ if ((event->cfgBits != 0) &&
+ (event->eventId != 0xB7) &&
+ (event->eventId != 0xBB))
+ {
+ flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+ }
+
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<17);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+ flags |= (1ULL<<21);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ case EVENT_OPTION_IN_TRANS:
+ flags |= (1ULL<<32);
+ break;
+ case EVENT_OPTION_IN_TRANS_ABORT:
+ flags |= (1ULL<<33);
+ break;
+ case EVENT_OPTION_MATCH0:
+ offcore_flags |= (event->options[j].value & 0x8FFFULL);
+ break;
+ case EVENT_OPTION_MATCH1:
+ offcore_flags |= (event->options[j].value << 16);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ if (event->eventId == 0xB7)
+ {
+ if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+ {
+ offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+ }
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+ }
+ else if (event->eventId == 0xBB)
+ {
+ if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+ {
+ offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+ }
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister , flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int has_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ flags = (1ULL<<22)|(1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0x1FULL) << 24;
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_CBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int hasep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ uint64_t filter_flags;
+ uint32_t filter0 = box_map[counter_map[index].type].filterRegister1;
+ uint32_t filter1 = box_map[counter_map[index].type].filterRegister2;
+ int set_state_all = 0;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+
+ flags = (1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->eventId == 0x34)
+ {
+ set_state_all = 1;
+ }
+
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ filter_flags = 0x0ULL;
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ case EVENT_OPTION_OPCODE:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter1, &filter_flags));
+ filter_flags |= (0x3<<27);
+ filter_flags |= (extractBitField(event->options[j].value,5,0) << 20);
+ VERBOSEPRINTREG(cpu_id, filter1, filter_flags, SETUP_CBOX_FILTER_OPCODE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter1, filter_flags));
+ break;
+ case EVENT_OPTION_NID:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter1, &filter_flags));
+ filter_flags |= (extractBitField(event->options[j].value,16,0));
+ VERBOSEPRINTREG(cpu_id, filter1, filter_flags, SETUP_CBOX_FILTER_NID);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter1, filter_flags));
+ break;
+ case EVENT_OPTION_STATE:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter0, &filter_flags));
+ filter_flags |= (extractBitField(event->options[j].value,6,0) << 17);
+ VERBOSEPRINTREG(cpu_id, filter0, filter_flags, SETUP_CBOX_FILTER_STATE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, filter_flags));
+ set_state_all = 0;
+ break;
+ case EVENT_OPTION_TID:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter0, &filter_flags));
+ filter_flags |= (extractBitField(event->options[j].value,6,0));
+ VERBOSEPRINTREG(cpu_id, filter0, filter_flags, SETUP_CBOX_FILTER_TID);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, filter_flags));
+ flags |= (1ULL<<19);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ if (set_state_all)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter0, &filter_flags));
+ filter_flags |= (0x1F << 17);
+ VERBOSEPRINTREG(cpu_id, filter0, filter_flags, SETUP_CBOX_DEF_FILTER_STATE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, filter_flags));
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_CBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int hasep_ubox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+
+ flags = (1ULL<<22)|(1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0x1FULL) << 24;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_UBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int hasep_wbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ int clean_filter_reg = 1;
+ uint64_t filter = box_map[counter_map[index].type].filterRegister1;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+
+ flags = (1ULL<<22)|(1ULL<<20);
+ flags |= event->eventId;
+ if ((event->umask > 0x00) && (event->umask <= 0x3))
+ {
+ flags |= (event->umask << 14);
+ }
+ else if (event->umask == 0xFF)
+ {
+ flags = (1ULL<<21);
+ }
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0x1FULL) << 24;
+ break;
+ case EVENT_OPTION_OCCUPANCY:
+ flags |= ((event->options[j].value & 0x3ULL)<<14);
+ break;
+ case EVENT_OPTION_OCCUPANCY_FILTER:
+ clean_filter_reg = 0;
+ VERBOSEPRINTREG(cpu_id, filter, (event->options[j].value & 0xFFFFFFFFULL), SETUP_WBOX_FILTER);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter, (event->options[j].value & 0xFFFFFFFFULL)));
+ break;
+ case EVENT_OPTION_OCCUPANCY_EDGE:
+ flags |= (1ULL<<31);
+ break;
+ case EVENT_OPTION_OCCUPANCY_INVERT:
+ flags |= (1ULL<<30);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ if (clean_filter_reg)
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter, 0x0ULL));
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_WBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+
+int hasep_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ uint64_t filter = 0x0ULL;
+ int opcode_flag = 0;
+ int match_flag = 0;
+ PciDeviceIndex dev = counter_map[index].device;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(dev, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags |= (1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ case EVENT_OPTION_OPCODE:
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH,
+ (event->options[j].value & 0x3FULL), SETUP_BBOX_OPCODE);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH,
+ (event->options[j].value & 0x3FULL)));
+ opcode_flag = 1;
+ break;
+ case EVENT_OPTION_MATCH0:
+ filter = ((event->options[j].value & 0xFFFFFFC0ULL));
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, filter, SETUP_ADDR0_FILTER);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, filter));
+ filter = (((event->options[j].value>>32) & 0x3FFFULL));
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, filter, SETUP_ADDR1_FILTER);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, filter));
+ match_flag = 1;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ if (!opcode_flag)
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH, 0x0ULL, CLEAR_BBOX_OPCODE);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH, 0x0ULL));
+ }
+ if (!match_flag)
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, 0x0ULL, CLEAR_BBOX_MATCH0);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, 0x0ULL));
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, 0x0ULL, CLEAR_BBOX_MATCH1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, 0x0ULL));
+ }
+ if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_BBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite( cpu_id, dev, counter_map[index].configRegister, flags));
+ /* Intel notes the registers must be written twice to hold, once without enable and again with enable.
+ * Not mentioned for the BBOX but we do it to be sure.
+ */
+ flags |= (1ULL<<22);
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_BBOX_TWICE);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
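Several of the uncore setup routines in this file repeat the quirk described in the BBOX comment above: the PCI control register is written once with the enable bit (bit 22) clear and a second time with it set, so that the configuration is latched reliably. A generic sketch of that pattern under the assumption of a write routine shaped like HPMwrite; the helper and dummy names are illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    typedef int (*write_fn)(int cpu_id, int device, uint32_t reg, uint64_t value);

    /* Illustrative "write twice" pattern: first program the register with the
     * enable bit clear, then rewrite it with the enable bit set. */
    static int program_twice(write_fn write_reg, int cpu_id, int device,
                             uint32_t reg, uint64_t config)
    {
        config &= ~(1ULL << 22);                       /* first write: enable clear */
        if (write_reg(cpu_id, device, reg, config) != 0)
            return -1;
        config |= (1ULL << 22);                        /* second write: enable set */
        return write_reg(cpu_id, device, reg, config);
    }

    static int dummy_write(int cpu_id, int device, uint32_t reg, uint64_t value)
    {
        printf("cpu %d dev %d reg 0x%x <- 0x%llx\n", cpu_id, device,
               (unsigned)reg, (unsigned long long)value);
        return 0;
    }

    int main(void)
    {
        /* dummy device/register/config values, for demonstration only */
        return program_twice(dummy_write, 0, 0, 0x0, (0x04ULL << 8) | 0x01);
    }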
+int hasep_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(counter_map[index].device, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_TID:
+ flags |= (1ULL<<19);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0xFFULL)<<24);
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+ if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_SBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device, counter_map[index].configRegister, flags));
+ flags |= (1ULL<<22);
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_SBOX_TWICE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+
+int hasep_mbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ PciDeviceIndex dev = counter_map[index].device;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(dev, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags = (1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_MBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+ flags |= (1ULL<<22);
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_MBOX_TWICE);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int hasep_ibox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ PciDeviceIndex dev = counter_map[index].device;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(counter_map[index].device, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags = (1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_IBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+ flags |= (1ULL<<22);
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_IBOX_TWICE);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+
+int hasep_pbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ PciDeviceIndex dev = counter_map[index].device;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(counter_map[index].device, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags = (1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_PBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+ flags |= (1ULL<<22);
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_PBOX_TWICE);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int hasep_rbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ PciDeviceIndex dev = counter_map[index].device;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(counter_map[index].device, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags = (1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_RBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+ flags |= (1ULL<<22);
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_RBOX_TWICE);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int hasep_qbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event, PciDeviceIndex filterdev)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ uint64_t filterreg;
+ uint64_t filterval = 0x0ULL;
+ PciDeviceIndex dev = counter_map[index].device;
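+ /* The QPI MATCH/MASK filter registers are expected to live on the separate
+ * mask PCI device passed in as filterdev, hence the HPMcheck() before each
+ * filter write below. */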
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(counter_map[index].device, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags = (1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->cfgBits == 0x01)
{
- msr_write(cpu_id, MSR_UNC_CBO_0_PERFEVTSEL0, 0xAA);
- flags = msr_read(cpu_id, MSR_UNC_CBO_0_PERFEVTSEL0);
- if (flags != 0xAA)
+ flags |= (1ULL<<21);
+ }
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
{
- fprintf(stdout, "The current system does not support Uncore MSRs, deactivating Uncore support\n");
- cpuid_info.supportUncore = 0;
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ case EVENT_OPTION_MATCH0:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_V3_QPI_PMON_RX_MATCH_0;
+ filterval = event->options[j].value & 0x8003FFF8ULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_RX_MATCH0);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MATCH1:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_V3_QPI_PMON_RX_MATCH_1;
+ filterval = event->options[j].value & 0x000F000FULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_RX_MATCH1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MATCH2:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_V3_QPI_PMON_TX_MATCH_0;
+ filterval = event->options[j].value & 0x8003FFF8ULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_TX_MATCH0);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MATCH3:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_V3_QPI_PMON_TX_MATCH_1;
+ filterval = event->options[j].value & 0x000F000FULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_TX_MATCH1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MASK0:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_V3_QPI_PMON_RX_MASK_0;
+ filterval = event->options[j].value & 0x8003FFF8ULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_RX_MASK0);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MASK1:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_V3_QPI_PMON_RX_MASK_1;
+ filterval = event->options[j].value & 0x000F000FULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_RX_MASK1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MASK2:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_V3_QPI_PMON_TX_MASK_0;
+ filterval = event->options[j].value & 0x8003FFF8ULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_TX_MASK0);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MASK3:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_V3_QPI_PMON_TX_MASK_1;
+ filterval = event->options[j].value & 0x000F000FULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_TX_MASK1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ default:
+ break;
+ }
}
}
-
-
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) && (cpuid_info.supportUncore))
+ if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])
{
- flags = 0x0ULL;
- flags = (1ULL<<22)|(1ULL<<20);
- msr_write(cpu_id, MSR_UNC_CBO_0_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_UNC_CBO_0_PERFEVTSEL1, flags);
- msr_write(cpu_id, MSR_UNC_CBO_1_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_UNC_CBO_1_PERFEVTSEL1, flags);
- msr_write(cpu_id, MSR_UNC_CBO_2_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_UNC_CBO_2_PERFEVTSEL1, flags);
- msr_write(cpu_id, MSR_UNC_CBO_3_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_UNC_CBO_3_PERFEVTSEL1, flags);
-
- msr_write(cpu_id, MSR_UNC_ARB_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_UNC_ARB_PERFEVTSEL1, flags);
-
- msr_write(cpu_id, MSR_UNC_PERF_FIXED_CTRL, flags);
-
- msr_write(cpu_id, MSR_UNC_CBO_0_CTR0, 0x0ULL);
- msr_write(cpu_id, MSR_UNC_CBO_0_CTR1, 0x0ULL);
- msr_write(cpu_id, MSR_UNC_CBO_1_CTR0, 0x0ULL);
- msr_write(cpu_id, MSR_UNC_CBO_1_CTR1, 0x0ULL);
- msr_write(cpu_id, MSR_UNC_CBO_2_CTR0, 0x0ULL);
- msr_write(cpu_id, MSR_UNC_CBO_2_CTR1, 0x0ULL);
- msr_write(cpu_id, MSR_UNC_CBO_3_CTR0, 0x0ULL);
- msr_write(cpu_id, MSR_UNC_CBO_3_CTR1, 0x0ULL);
-
- msr_write(cpu_id, MSR_UNC_ARB_CTR0, 0x0ULL);
- msr_write(cpu_id, MSR_UNC_ARB_CTR1, 0x0ULL);
-
- msr_write(cpu_id, MSR_UNC_PERF_FIXED_CTR, 0x0ULL);
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_QBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+ flags |= (1ULL<<22);
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_QBOX_TWICE);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
}
+ return 0;
}
-#define HAS_SETUP_BOX \
- if (haveLock) \
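+/* Freeze/unfreeze the uncore: on Haswell EP this uses the global freeze (bit 31)
+ * and unfreeze (bit 29) bits of MSR_UNC_V3_U_PMON_GLOBAL_CTL; on other models the
+ * enable bit 29 of MSR_UNCORE_PERF_GLOBAL_CTRL is cleared or set instead
+ * (interpretation based on the writes in the macros below). */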
+#define HASEP_FREEZE_UNCORE \
+ if (haveLock && eventSet->regTypeMask & ~(0xFULL) && cpuid_info.model == HASWELL_EP) \
+ { \
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<31), FREEZE_UNCORE); \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, (1ULL<<31))); \
+ } \
+ else if (haveLock && eventSet->regTypeMask & ~(0xFULL)) \
+ { \
+ uint64_t data = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, &data)); \
+ if (data & (1ULL<<29)) \
+ { \
+ data &= ~(1ULL<<29); \
+ VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, data, FREEZE_UNCORE); \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, data)); \
+ } \
+ }
+
+#define HASEP_UNFREEZE_UNCORE \
+ if (haveLock && eventSet->regTypeMask & ~(0xFULL) && cpuid_info.model == HASWELL_EP) \
+ { \
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<29), UNFREEZE_UNCORE); \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, (1ULL<<29))); \
+ } \
+ else if (haveLock && eventSet->regTypeMask & ~(0xFULL)) \
+ { \
+ uint64_t data = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, &data)); \
+ data |= (1ULL<<29); \
+ VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, data, UNFREEZE_UNCORE); \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, data)); \
+ }
+
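+/* Zero the uncore counter registers before unfreezing; counters whose registers
+ * cannot be written (e.g. missing PCI device) are marked NOTYPE and skipped later. */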
+#define HASEP_UNFREEZE_UNCORE_AND_RESET_CTR \
+ if (haveLock && (eventSet->regTypeMask & ~(0xFULL))) \
+ { \
+ for (int i=0;i < eventSet->numberOfEvents;i++) \
+ { \
+ RegisterIndex index = eventSet->events[i].index; \
+ RegisterType type = counter_map[index].type; \
+ if ((type < UNCORE) || (type == WBOX0FIX)) \
+ { \
+ continue; \
+ } \
+ PciDeviceIndex dev = counter_map[index].device; \
+ if (HPMcheck(dev, cpu_id)) { \
+ int err = 0; \
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR_MANUAL); \
+ err = HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL); \
+ if (err != 0) \
+ { \
+ eventSet->events[i].type = NOTYPE; \
+ } \
+ else if (counter_map[index].counterRegister2 != 0x0) \
+ { \
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL, CLEAR_CTR_MANUAL); \
+ err = HPMwrite(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL); \
+ if (err != 0) \
+ { \
+ eventSet->events[i].type = NOTYPE; \
+ } \
+ } \
+ } \
+ } \
+ HASEP_UNFREEZE_UNCORE; \
+ }
+
+#define HASEP_FREEZE_UNCORE_AND_RESET_CTL \
+ if (haveLock && (eventSet->regTypeMask & ~(REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)|REG_TYPE_MASK(THERMAL)|REG_TYPE_MASK(POWER)))) \
{ \
- flags = (1ULL<<22)|(1ULL<<20); \
- flags |= (event->umask<<8) + event->eventId; \
- if (event->cfgBits != 0) /* set custom cfg and cmask */ \
+ HASEP_FREEZE_UNCORE; \
+ for (int i=0;i < eventSet->numberOfEvents;i++) \
{ \
- flags &= ~(0xFFFFU<<16); /* clear upper 16bits */ \
- flags |= ((event->cmask<<8) + event->cfgBits)<<16; \
+ RegisterIndex index = eventSet->events[i].index; \
+ RegisterType type = counter_map[index].type; \
+ if ((type < UNCORE) || (type == WBOX0FIX)) \
+ { \
+ continue; \
+ } \
+ PciDeviceIndex dev = counter_map[index].device; \
+ if (HPMcheck(dev, cpu_id)) { \
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, 0x0ULL, CLEAR_CTL_MANUAL); \
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, 0x0ULL)); \
+ if ((type >= SBOX0) && (type <= SBOX3)) { \
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, 0x0ULL)); \
+ } \
+ if (box_map[type].filterRegister1 != 0x0) \
+ { \
+ VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].filterRegister1, 0x0ULL, CLEAR_FILTER); \
+ HPMwrite(cpu_id, dev, box_map[type].filterRegister1, 0x0ULL); \
+ } \
+ if (box_map[type].filterRegister2 != 0x0) \
+ { \
+ VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].filterRegister2, 0x0ULL, CLEAR_FILTER); \
+ HPMwrite(cpu_id, dev, box_map[type].filterRegister2, 0x0ULL); \
+ } \
+ } \
} \
- msr_write(cpu_id, reg , flags); \
}
-void perfmon_setupCounterThread_haswell(
+
+
+
+int perfmon_setupCounterThread_haswell(
int thread_id,
- PerfmonEvent* event,
- PerfmonCounterIndex index)
+ PerfmonEventSet* eventSet)
{
int haveLock = 0;
- uint64_t flags = 0x0ULL;
- uint32_t uflags;
- uint64_t reg = haswell_counter_map[index].configRegister;
- int cpu_id = perfmon_threadData[thread_id].processorId;
- uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
- uint64_t orig_fixed_flags = fixed_flags;
- perfmon_threadData[thread_id].counters[index].init = TRUE;
+ uint64_t flags;
+ uint64_t fixed_flags = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
- switch (haswell_counter_map[index].type)
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0xC00000070000000FULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+ }
+
+ HASEP_FREEZE_UNCORE;
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- case PMC:
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ uint64_t reg = counter_map[index].configRegister;
+ eventSet->events[i].threadCounter[thread_id].init = TRUE;
+ flags = 0x0ULL;
+ switch (type)
+ {
+ case PMC:
+ hasep_pmc_setup(cpu_id, index, event);
+ break;
- flags = (1<<22)|(1<<16);
+ case FIXED:
+ fixed_flags |= hasep_fixed_setup(cpu_id, index, event);
+ break;
- /* Intel with standard 8 bit event mask: [7:0] */
- flags |= (event->umask<<8) + event->eventId;
+ case POWER:
+ break;
- if (event->cfgBits != 0) /* set custom cfg and cmask */
- {
- flags &= ~(0xFFFFU<<16); /* clear upper 16bits */
- flags |= ((event->cmask<<8) + event->cfgBits)<<16;
- }
+ case CBOX0:
+ case CBOX1:
+ case CBOX2:
+ case CBOX3:
+ case CBOX4:
+ case CBOX5:
+ case CBOX6:
+ case CBOX7:
+ case CBOX8:
+ case CBOX9:
+ case CBOX10:
+ case CBOX11:
+ case CBOX12:
+ case CBOX13:
+ case CBOX14:
+ case CBOX15:
+ case CBOX16:
+ case CBOX17:
+ haswell_cbox_setup(cpu_id, index, event);
+ break;
+
+ case UBOX:
+ hasep_ubox_setup(cpu_id, index, event);
+ break;
+ case UBOXFIX:
+ if (haveLock)
+ {
+ flags = (1ULL<<22)|(1ULL<<20);
+ VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_UBOXFIX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+ }
+ break;
+
+ case SBOX0:
+ case SBOX1:
+ case SBOX2:
+ case SBOX3:
+ hasep_sbox_setup(cpu_id, index, event);
+ break;
+
+ case BBOX0:
+ case BBOX1:
+ hasep_bbox_setup(cpu_id, index, event);
+ break;
+
+ case WBOX:
+ hasep_wbox_setup(cpu_id, index, event);
+ break;
+ case WBOX0FIX:
+ break;
+
+ case MBOX0:
+ case MBOX1:
+ case MBOX2:
+ case MBOX3:
+ case MBOX4:
+ case MBOX5:
+ case MBOX6:
+ case MBOX7:
+ hasep_mbox_setup(cpu_id, index, event);
+ break;
+ case MBOX0FIX:
+ case MBOX1FIX:
+ case MBOX2FIX:
+ case MBOX3FIX:
+ case MBOX4FIX:
+ case MBOX5FIX:
+ case MBOX6FIX:
+ case MBOX7FIX:
+ if (haveLock && HPMcheck(counter_map[index].device, cpu_id))
+ {
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device, reg, ((1ULL<<20)|(1ULL<<22))));
+ }
+ break;
+
+ case PBOX:
+ hasep_pbox_setup(cpu_id, index, event);
+ break;
+
+ case RBOX0:
+ case RBOX1:
+ hasep_rbox_setup(cpu_id, index, event);
+ break;
+
+ case QBOX0:
+ hasep_qbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_0);
+ break;
+ case QBOX1:
+ hasep_qbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_1);
+ break;
+
+ case IBOX0:
+ case IBOX1:
+ hasep_ibox_setup(cpu_id, index, event);
+ break;
+
+ default:
+ break;
+ }
+ }
+ if (fixed_flags > 0x0ULL)
+ {
+ // Erratum HSW143
+ //VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED_WORKAROUND)
+ //CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, (1ULL<<32)));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
+ }
+ return 0;
+}
+
+int perfmon_startCountersThread_haswell(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ uint64_t flags = 0x0ULL;
+ uint64_t tmp = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
- if (perfmon_verbose)
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- printf("[%d] perfmon_setup_counter PMC: Write Register 0x%llX , Flags: 0x%llX \n",
- cpu_id,
- LLU_CAST reg,
- LLU_CAST flags);
+ continue;
}
- msr_write(cpu_id, reg , flags);
- break;
-
- case FIXED:
- fixed_flags |= (0x2 << (index*4));
- break;
-
- case POWER:
- break;
-
- case CBOX0:
- case CBOX1:
- case CBOX2:
- case CBOX3:
- case UBOX:
- if (cpuid_info.supportUncore)
+ tmp = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ PciDeviceIndex dev = counter_map[index].device;
+ eventSet->events[i].threadCounter[thread_id].startData = 0;
+ eventSet->events[i].threadCounter[thread_id].counterData = 0;
+ switch (type)
{
- HAS_SETUP_BOX;
+ case PMC:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+ flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr)); /* enable counter */
+ VERBOSEPRINTREG(cpu_id, counter1, 0x0ULL, START_PMC);
+ break;
+
+ case FIXED:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+ flags |= (1ULL<<(index+32)); /* enable fixed counter */
+ VERBOSEPRINTREG(cpu_id, counter1, 0x0ULL, START_FIXED);
+ break;
+
+ case POWER:
+ if (haveLock)
+ {
+ tmp = 0x0ULL;
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1,(uint32_t*)&tmp));
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST tmp, START_POWER)
+ eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
+ }
+ break;
+ case WBOX0FIX:
+ if (haveLock)
+ {
+ tmp = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, counter1, &tmp));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST tmp, START_WBOXFIX);
+ eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
+ }
+ break;
+ case QBOX0FIX:
+ case QBOX1FIX:
+ if (haveLock && HPMcheck(dev, cpu_id))
+ {
+ if (eventSet->events[i].event.eventId != 0x00)
+ {
+ CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &tmp));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST tmp, START_QBOXFIX);
+ eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
+ }
+ }
+ break;
+
+ default:
+ break;
}
- break;
+ eventSet->events[i].threadCounter[thread_id].counterData = eventSet->events[i].threadCounter[thread_id].startData;
+ }
+ }
- default:
- /* should never be reached */
- break;
+ HASEP_UNFREEZE_UNCORE_AND_RESET_CTR;
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST (1ULL<<63)|(1ULL<<62)|flags, CLEAR_PMC_AND_FIXED_OVERFLOW)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|flags));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_AND_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+ }
+
+ return 0;
+}
+
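+/* Helper for the stop/read paths: read one uncore counter (optionally clearing
+ * it), reassemble counters split over two registers and check the global and
+ * per-box overflow status registers when the value went backwards. */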
+int has_uncore_read(int cpu_id, RegisterIndex index, PerfmonEvent *event,
+ uint64_t* cur_result, int* overflows, int flags,
+ int global_offset, int box_offset)
+{
+ uint64_t result = 0x0ULL;
+ uint64_t tmp = 0x0ULL;
+ RegisterType type = counter_map[index].type;
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ uint64_t counter2 = counter_map[index].counterRegister2;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+
+ CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &result));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST result, READ_REG_1);
+ if (flags & FREEZE_FLAG_CLEAR_CTR)
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST 0x0U, CLEAR_PCI_REG_1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0U));
+ }
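+ /* Counters split over two 32-bit registers: counter1 holds the upper half,
+ * counter2 the lower half. */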
+ if (counter2 != 0x0)
+ {
+ result <<= 32;
+ CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter2, &tmp));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter2, LLU_CAST tmp, READ_REG_2);
+ result += tmp;
+ if (flags & FREEZE_FLAG_CLEAR_CTR)
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter2, LLU_CAST 0x0U, CLEAR_PCI_REG_2);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter2, 0x0U));
+ }
}
- if (fixed_flags != orig_fixed_flags)
+ result = field64(result, 0, box_map[type].regWidth);
+
+ if (result < *cur_result)
{
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+ uint64_t ovf_values = 0x0ULL;
+ int global_offset = box_map[type].ovflOffset;
+ int test_local = 0;
+ if (global_offset != -1)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV,
+ MSR_UNC_V3_U_PMON_GLOBAL_STATUS,
+ &ovf_values));
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST ovf_values, READ_GLOBAL_OVFL);
+ if (ovf_values & (1<<global_offset))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST (1<<global_offset), CLEAR_GLOBAL_OVFL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+ MSR_UNC_V3_U_PMON_GLOBAL_STATUS,
+ (1<<global_offset)));
+ test_local = 1;
+ }
+ }
+ else
+ {
+ test_local = 1;
+ }
+
+ if (test_local)
+ {
+ ovf_values = 0x0ULL;
+ CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev,
+ box_map[type].statusRegister,
+ &ovf_values));
+ VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].statusRegister, LLU_CAST ovf_values, READ_BOX_OVFL);
+ if (ovf_values & (1<<box_offset))
+ {
+ (*overflows)++;
+ VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].statusRegister, LLU_CAST (1<<box_offset), RESET_BOX_OVFL);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev,
+ box_map[type].statusRegister,
+ (1<<box_offset)));
+ }
+ }
}
+ *cur_result = result;
+ return 0;
}
-void perfmon_startCountersThread_haswell(int thread_id)
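+/* A counter reading lower than the previous value indicates a wrap-around;
+ * check the matching overflow status bit and account for the overflow. */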
+#define HASEP_CHECK_CORE_OVERFLOW(offset) \
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+ { \
+ uint64_t ovf_values = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values)); \
+ if (ovf_values & (1ULL<<offset)) \
+ { \
+ eventSet->events[i].threadCounter[thread_id].overflows++; \
+ } \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<offset))); \
+ }
+
+
+#define HASEP_CHECK_LOCAL_OVERFLOW \
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+ { \
+ uint64_t ovf_values = 0x0ULL; \
+ uint64_t offset = getCounterTypeOffset(eventSet->events[i].index); \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[eventSet->events[i].type].statusRegister, &ovf_values)); \
+ if (ovf_values & (1ULL<<offset)) \
+ { \
+ eventSet->events[i].threadCounter[thread_id].overflows++; \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[eventSet->events[i].type].statusRegister, (1ULL<<offset))); \
+ } \
+ }
+
+int perfmon_stopCountersThread_haswell(int thread_id, PerfmonEventSet* eventSet)
{
int haveLock = 0;
- uint64_t flags = 0x0ULL;
- uint32_t uflags = 0x10000UL; /* Clear freeze bit */
- int cpu_id = perfmon_threadData[thread_id].processorId;
- int start_uncore = 0;
+ uint64_t counter_result = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+
+ HASEP_FREEZE_UNCORE;
- for ( int i=0; i<perfmon_numCountersHaswell; i++ )
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- switch (haswell_counter_map[i].type)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+ int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+ int ovf_offset = box_map[type].ovflOffset;
+ switch (type)
{
case PMC:
- msr_write(cpu_id, haswell_counter_map[i].counterRegister, 0x0ULL);
- flags |= (1<<(i-OFFSET_PMC)); /* enable counter */
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ HASEP_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_PMC)
+ *current = field64(counter_result, 0, box_map[type].regWidth);
break;
case FIXED:
- msr_write(cpu_id, haswell_counter_map[i].counterRegister, 0x0ULL);
- flags |= (1ULL<<(i+32)); /* enable fixed counter */
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ HASEP_CHECK_CORE_OVERFLOW(index+32);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_FIXED)
+ *current = field64(counter_result, 0, box_map[type].regWidth);
break;
case POWER:
- if(haveLock)
+ if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
{
- perfmon_threadData[thread_id].counters[i].counterData =
- power_read(cpu_id, haswell_counter_map[i].counterRegister);
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, OVERFLOW_POWER)
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ }
+ *current = field64(counter_result, 0, box_map[type].regWidth);
}
break;
+ case THERMAL:
+ CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
+ *current = field64(counter_result, 0, box_map[type].regWidth);
+ break;
+
+ case PBOX:
+ case IBOX0:
+ case RBOX0:
+ case RBOX1:
+ case QBOX0:
+ case QBOX1:
+ case WBOX:
+ case SBOX0:
+ case SBOX1:
+ case SBOX2:
+ case SBOX3:
+ case UBOX:
+ case UBOXFIX:
case CBOX0:
case CBOX1:
case CBOX2:
case CBOX3:
- case UBOX:
- start_uncore = 1;
+ case CBOX4:
+ case CBOX5:
+ case CBOX6:
+ case CBOX7:
+ case CBOX8:
+ case CBOX9:
+ case CBOX10:
+ case CBOX11:
+ case CBOX12:
+ case CBOX13:
+ case CBOX14:
+ case CBOX15:
+ case CBOX16:
+ case CBOX17:
+ has_uncore_read(cpu_id, index, event, current, overflows,
+ FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index));
+ break;
+
+ case WBOX0FIX:
+ if (haveLock)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ if (counter_result < *current)
+ {
+ (*overflows)++;
+ }
+ *current = counter_result;
+ }
+ break;
+
+ case BBOX0:
+ has_uncore_read(cpu_id, index, event, current, overflows,
+ FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index));
+ break;
+ case BBOX1:
+ has_uncore_read(cpu_id, index, event, current, overflows,
+ FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index));
+ break;
+
+ case MBOX0:
+ case MBOX1:
+ case MBOX2:
+ case MBOX3:
+ has_uncore_read(cpu_id, index, event, current, overflows,
+ FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index)+1);
+ break;
+
+ case MBOX4:
+ case MBOX5:
+ case MBOX6:
+ case MBOX7:
+ has_uncore_read(cpu_id, index, event, current, overflows,
+ FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index)+1);
+ break;
+
+ case MBOX0FIX:
+ case MBOX1FIX:
+ case MBOX2FIX:
+ case MBOX3FIX:
+ case MBOX4FIX:
+ case MBOX5FIX:
+ case MBOX6FIX:
+ case MBOX7FIX:
+ has_uncore_read(cpu_id, index, event, current, overflows,
+ FREEZE_FLAG_CLEAR_CTR, ovf_offset, 0);
+ break;
+
+ case IBOX1:
+ has_uncore_read(cpu_id, index, event, current, overflows,
+ FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index)+2);
+ break;
+
+ case QBOX0FIX:
+ case QBOX1FIX:
+ if (eventSet->events[i].event.eventId == 0x00)
+ {
+ HPMread(cpu_id, dev, counter1, &counter_result);
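+ /* Event 0x00 appears to read the QPI rate register: the low three bits encode
+ * the link speed, translated here into transfers per second. */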
+ switch(extractBitField(counter_result, 3, 0))
+ {
+ case 0x2:
+ counter_result = 5.6E9;
+ break;
+ case 0x3:
+ counter_result = 6.4E9;
+ break;
+ case 0x4:
+ counter_result = 7.2E9;
+ break;
+ case 0x5:
+ counter_result = 8.0E9;
+ break;
+ case 0x6:
+ counter_result = 8.8E9;
+ break;
+ case 0x7:
+ counter_result = 9.6E9;
+ break;
+ default:
+ counter_result = 0;
+ break;
+ }
+
+ }
+ else if ((eventSet->events[i].event.eventId == 0x01) ||
+ (eventSet->events[i].event.eventId == 0x02))
+ {
+ HPMread(cpu_id, dev, counter1, &counter_result);
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, STOP_QBOXFIX);
+ counter_result = field64(counter_result, 0, box_map[type].regWidth);
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = counter_result;
break;
default:
- /* should never be reached */
break;
}
}
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
}
- if (haveLock && start_uncore && cpuid_info.supportUncore)
- {
- msr_write(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, (1ULL<<29));
- }
- if (perfmon_verbose)
- {
- printf("perfmon_start_counters: Write Register 0x%X , \
- Flags: 0x%llX \n",MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
- printf("perfmon_start_counters: Write Register 0x%X , \
- Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
- }
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
- msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+ return 0;
}
-void perfmon_stopCountersThread_haswell(int thread_id)
+
+int perfmon_readCountersThread_haswell(int thread_id, PerfmonEventSet* eventSet)
{
- uint64_t flags;
- uint64_t tmp;
- uint32_t uflags = 0x10100UL; /* Set freeze bit */
- uint64_t counter_result = 0x0ULL;
+ uint64_t flags = 0x0ULL;
int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ uint64_t counter_result = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
- if (haveLock && cpuid_info.supportUncore)
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
{
- msr_write(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, 0x0ULL);
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &flags));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, SAFE_PMC_FLAGS)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, RESET_PMC_FLAGS)
}
- for ( int i=0; i < perfmon_numCountersHaswell; i++ )
+ HASEP_FREEZE_UNCORE;
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- switch (haswell_counter_map[i].type)
+ counter_result = 0x0ULL;
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+ int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+ int ovf_offset = box_map[type].ovflOffset;
+ switch (type)
{
case PMC:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ HASEP_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+ *current = field64(counter_result, 0, box_map[type].regWidth);
+ break;
+
case FIXED:
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, haswell_counter_map[i].counterRegister);
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ HASEP_CHECK_CORE_OVERFLOW(index+32);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
+ *current = field64(counter_result, 0, box_map[type].regWidth);
break;
case POWER:
- if(haveLock)
+ if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
{
- perfmon_threadData[thread_id].counters[i].counterData =
- power_info.energyUnit *
- ( power_read(cpu_id, haswell_counter_map[i].counterRegister) -
- perfmon_threadData[thread_id].counters[i].counterData);
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_POWER)
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST eventSet->events[i].threadCounter[thread_id].startData, OVERFLOW_POWER_START)
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, OVERFLOW_POWER_STOP)
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ }
+ *current = field64(counter_result, 0, box_map[type].regWidth);
}
break;
case THERMAL:
- perfmon_threadData[thread_id].counters[i].counterData =
- thermal_read(cpu_id);
+ CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_TEMP)
+ *current = field64(counter_result, 0, box_map[type].regWidth);
+ break;
+
+ case WBOX0FIX:
+ if (haveLock)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_WBOXFIX)
+ if (counter_result < *current)
+ {
+ (*overflows)++;
+ }
+ *current = counter_result;
+ }
+ break;
+
+ case BBOX0:
+ case BBOX1:
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_BBOX)
+ has_uncore_read(cpu_id, index, event, current, overflows,
+ 0, ovf_offset, getCounterTypeOffset(index));
+ break;
+
+ case MBOX0:
+ case MBOX1:
+ case MBOX2:
+ case MBOX3:
+ case MBOX4:
+ case MBOX5:
+ case MBOX6:
+ case MBOX7:
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_MBOX)
+ has_uncore_read(cpu_id, index, event, current, overflows,
+ 0, ovf_offset, getCounterTypeOffset(index)+1);
+ break;
+
+ case MBOX0FIX:
+ case MBOX1FIX:
+ case MBOX2FIX:
+ case MBOX3FIX:
+ case MBOX4FIX:
+ case MBOX5FIX:
+ case MBOX6FIX:
+ case MBOX7FIX:
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_MBOXFIX)
+ has_uncore_read(cpu_id, index, event, current, overflows,
+ 0, ovf_offset, 0);
+ break;
+
+ case IBOX1:
+ has_uncore_read(cpu_id, index, event, current, overflows,
+ 0, ovf_offset, getCounterTypeOffset(index)+2);
break;
+ case PBOX:
+ case IBOX0:
+ case RBOX0:
+ case RBOX1:
+ case QBOX0:
+ case QBOX1:
+ case WBOX:
+ case SBOX0:
+ case SBOX1:
+ case SBOX2:
+ case SBOX3:
+ case UBOX:
+ case UBOXFIX:
case CBOX0:
case CBOX1:
case CBOX2:
case CBOX3:
- case UBOX:
- if(haveLock && cpuid_info.supportUncore)
+ case CBOX4:
+ case CBOX5:
+ case CBOX6:
+ case CBOX7:
+ case CBOX8:
+ case CBOX9:
+ case CBOX10:
+ case CBOX11:
+ case CBOX12:
+ case CBOX13:
+ case CBOX14:
+ case CBOX15:
+ case CBOX16:
+ case CBOX17:
+ has_uncore_read(cpu_id, index, event, current, overflows,
+ 0, ovf_offset, getCounterTypeOffset(index));
+ break;
+
+ case QBOX0FIX:
+ case QBOX1FIX:
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_QBOXFIX)
+ if (eventSet->events[i].event.eventId == 0x00)
+ {
+ HPMread(cpu_id, dev, counter1, &counter_result);
+ switch(extractBitField(counter_result, 3, 0))
+ {
+ case 0x2:
+ counter_result = 5.6E9;
+ break;
+ case 0x3:
+ counter_result = 6.4E9;
+ break;
+ case 0x4:
+ counter_result = 7.2E9;
+ break;
+ case 0x5:
+ counter_result = 8.0E9;
+ break;
+ case 0x6:
+ counter_result = 8.8E9;
+ break;
+ case 0x7:
+ counter_result = 9.6E9;
+ break;
+ default:
+ counter_result = 0;
+ break;
+ }
+
+ }
+ else if ((eventSet->events[i].event.eventId == 0x01) ||
+ (eventSet->events[i].event.eventId == 0x02))
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, haswell_counter_map[i].counterRegister);
+ HPMread(cpu_id, dev, counter1, &counter_result);
+ counter_result = field64(counter_result, 0, box_map[type].regWidth);
}
+ eventSet->events[i].threadCounter[thread_id].counterData = counter_result;
break;
default:
- /* should never be reached */
break;
}
}
}
- flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
- // printf ("Status: 0x%llX \n", LLU_CAST flags);
- if ( (flags & 0x3) || (flags & (0x3ULL<<32)) )
+ HASEP_UNFREEZE_UNCORE;
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
{
- printf ("Overflow occured \n");
+ // Erratum HSW143
+ //VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, RESTORE_PMC_FLAGS_WORKAROUND)
+ //CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, (1ULL<<32)));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, RESTORE_PMC_FLAGS)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
}
+
+ return 0;
}
-void perfmon_readCountersThread_haswell(int thread_id)
+int perfmon_finalizeCountersThread_haswell(int thread_id, PerfmonEventSet* eventSet)
{
- uint64_t counter_result = 0x0ULL;
int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
- uint64_t core_flags = 0x0ULL;
- uint64_t uncore_flags = 0x0ULL;
+ int haveTileLock = 0;
+ int clearPBS = 0;
+ uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62);
+ uint64_t ovf_values_uncore = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
-
- core_flags = msr_read(cpu_id, MSR_PERF_GLOBAL_CTRL);
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
- if (cpuid_info.supportUncore)
+ if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
{
- uncore_flags = msr_read(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL);
- msr_write(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, 0x0ULL);
+ haveTileLock = 1;
}
-
- for ( int i=0; i<perfmon_numCountersHaswell; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- if ((haswell_counter_map[i].type == PMC) ||
- (haswell_counter_map[i].type == FIXED))
- {
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, haswell_counter_map[i].counterRegister);
- }
- else
- {
- if(haveLock)
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t reg = counter_map[index].configRegister;
+ switch (type)
+ {
+ case PMC:
+ ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+ if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))
{
- switch (haswell_counter_map[i].type)
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+ }
+ else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+ }
+ break;
+ case FIXED:
+ ovf_values_core |= (1ULL<<(index+32));
+ break;
+ default:
+ /*if (counter_map[index].type > UNCORE)
+ {
+ if (box_map[counter_map[index].type].ovflOffset >= 0)
{
- case POWER:
- perfmon_threadData[thread_id].counters[i].counterData =
- power_info.energyUnit *
- power_read(cpu_id, haswell_counter_map[i].counterRegister);
- break;
-
- case CBOX0:
- case CBOX1:
- case CBOX2:
- case CBOX3:
- case UBOX:
- if(haveLock)
- {
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, haswell_counter_map[i].counterRegister);
- }
- break;
- default:
- /* should never be reached */
- break;
+ ovf_values_uncore |= (1ULL<<box_map[counter_map[index].type].ovflOffset);
}
- }
+ }*/
+ break;
+ }
+ if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock))))
+ {
+ ovf_values_uncore = 0x0ULL;
+ VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+ if ((type >= SBOX0) && (type <= SBOX3))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+ }
+ if (box_map[type].filterRegister1)
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].filterRegister1, 0x0ULL, CLEAR_FILTER);
+ HPMwrite(cpu_id, dev, box_map[type].filterRegister1, 0x0ULL);
+ }
+ if (box_map[type].filterRegister2)
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].filterRegister2, 0x0ULL, CLEAR_FILTER);
+ HPMwrite(cpu_id, dev, box_map[type].filterRegister2, 0x0ULL);
}
}
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
}
- if (cpuid_info.supportUncore && uncore_flags > 0x0ULL)
+ if (haveLock && eventSet->regTypeMask & ~(0xFULL))
{
- msr_write(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, uncore_flags);
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST ovf_values_uncore, CLEAR_UNCORE_OVF)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, ovf_values_uncore));
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST 0x0ULL, CLEAR_UNCORE_CTRL)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, 0x0ULL));
}
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, core_flags);
-}
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+ return 0;
+}
diff --git a/src/includes/perfmon_haswellEP_counters.h b/src/includes/perfmon_haswellEP_counters.h
new file mode 100644
index 0000000..0c93c91
--- /dev/null
+++ b/src/includes/perfmon_haswellEP_counters.h
@@ -0,0 +1,330 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: perfmon_haswellEP_counters.h
+ *
+ * Description: Counter Header File of perfmon module for Intel Haswell EP/EN/EX.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_HASWELL_EP 187
+#define NUM_COUNTERS_CORE_HASWELL_EP 8
+#define NUM_COUNTERS_UNCORE_HASWELL_EP 111
+
+#define HAS_EP_VALID_OPTIONS_FIXED EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK
+#define HAS_EP_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+ EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define HAS_EP_VALID_OPTIONS_CBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_TID_MASK
+#define HAS_EP_VALID_OPTIONS_UBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define HAS_EP_VALID_OPTIONS_SBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_TID_MASK
+#define HAS_EP_VALID_OPTIONS_BBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define HAS_EP_VALID_OPTIONS_WBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+ EVENT_OPTION_OCCUPANCY_MASK|EVENT_OPTION_OCCUPANCY_FILTER_MASK|EVENT_OPTION_OCCUPANCY_EDGE_MASK|\
+ EVENT_OPTION_OCCUPANCY_INVERT_MASK
+#define HAS_EP_VALID_OPTIONS_MBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define HAS_EP_VALID_OPTIONS_IBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define HAS_EP_VALID_OPTIONS_PBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define HAS_EP_VALID_OPTIONS_RBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define HAS_EP_VALID_OPTIONS_QBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+
+static RegisterMap haswellEP_counter_map[NUM_COUNTERS_HASWELL_EP] = {
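+ /* Assumed field order per entry: key, index, type, config register, counter
+ * register, second counter register, PCI device, valid option mask. */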
+ /* Fixed Counters: instructions retired, cycles unhalted core */
+ {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_FIXED},
+ {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_FIXED},
+ {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_FIXED},
+ /* PMC Counters: 4 48bit wide */
+ {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, HAS_EP_VALID_OPTIONS_PMC},
+ {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, HAS_EP_VALID_OPTIONS_PMC},
+ {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, HAS_EP_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK},
+ {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, HAS_EP_VALID_OPTIONS_PMC},
+ /* Temperature Sensor*/
+ {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ /* RAPL counters */
+ {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"CBOX0C0", PMC12, CBOX0, MSR_UNC_V3_C0_PMON_CTL0, MSR_UNC_V3_C0_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX0C1", PMC13, CBOX0, MSR_UNC_V3_C0_PMON_CTL1, MSR_UNC_V3_C0_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX0C2", PMC14, CBOX0, MSR_UNC_V3_C0_PMON_CTL2, MSR_UNC_V3_C0_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX0C3", PMC15, CBOX0, MSR_UNC_V3_C0_PMON_CTL3, MSR_UNC_V3_C0_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX1C0", PMC16, CBOX1, MSR_UNC_V3_C1_PMON_CTL0, MSR_UNC_V3_C1_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX1C1", PMC17, CBOX1, MSR_UNC_V3_C1_PMON_CTL1, MSR_UNC_V3_C1_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX1C2", PMC18, CBOX1, MSR_UNC_V3_C1_PMON_CTL2, MSR_UNC_V3_C1_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX1C3", PMC19, CBOX1, MSR_UNC_V3_C1_PMON_CTL3, MSR_UNC_V3_C1_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX2C0", PMC20, CBOX2, MSR_UNC_V3_C2_PMON_CTL0, MSR_UNC_V3_C2_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX2C1", PMC21, CBOX2, MSR_UNC_V3_C2_PMON_CTL1, MSR_UNC_V3_C2_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX2C2", PMC22, CBOX2, MSR_UNC_V3_C2_PMON_CTL2, MSR_UNC_V3_C2_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX2C3", PMC23, CBOX2, MSR_UNC_V3_C2_PMON_CTL3, MSR_UNC_V3_C2_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX3C0", PMC24, CBOX3, MSR_UNC_V3_C3_PMON_CTL0, MSR_UNC_V3_C3_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX3C1", PMC25, CBOX3, MSR_UNC_V3_C3_PMON_CTL1, MSR_UNC_V3_C3_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX3C2", PMC26, CBOX3, MSR_UNC_V3_C3_PMON_CTL2, MSR_UNC_V3_C3_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX3C3", PMC27, CBOX3, MSR_UNC_V3_C3_PMON_CTL3, MSR_UNC_V3_C3_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX4C0", PMC28, CBOX4, MSR_UNC_V3_C4_PMON_CTL0, MSR_UNC_V3_C4_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX4C1", PMC29, CBOX4, MSR_UNC_V3_C4_PMON_CTL1, MSR_UNC_V3_C4_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX4C2", PMC30, CBOX4, MSR_UNC_V3_C4_PMON_CTL2, MSR_UNC_V3_C4_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX4C3", PMC31, CBOX4, MSR_UNC_V3_C4_PMON_CTL3, MSR_UNC_V3_C4_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX5C0", PMC32, CBOX5, MSR_UNC_V3_C5_PMON_CTL0, MSR_UNC_V3_C5_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX5C1", PMC33, CBOX5, MSR_UNC_V3_C5_PMON_CTL1, MSR_UNC_V3_C5_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX5C2", PMC34, CBOX5, MSR_UNC_V3_C5_PMON_CTL2, MSR_UNC_V3_C5_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX5C3", PMC35, CBOX5, MSR_UNC_V3_C5_PMON_CTL3, MSR_UNC_V3_C5_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX6C0", PMC36, CBOX6, MSR_UNC_V3_C6_PMON_CTL0, MSR_UNC_V3_C6_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX6C1", PMC37, CBOX6, MSR_UNC_V3_C6_PMON_CTL1, MSR_UNC_V3_C6_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX6C2", PMC38, CBOX6, MSR_UNC_V3_C6_PMON_CTL2, MSR_UNC_V3_C6_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX6C3", PMC39, CBOX6, MSR_UNC_V3_C6_PMON_CTL3, MSR_UNC_V3_C6_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX7C0", PMC40, CBOX7, MSR_UNC_V3_C7_PMON_CTL0, MSR_UNC_V3_C7_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX7C1", PMC41, CBOX7, MSR_UNC_V3_C7_PMON_CTL1, MSR_UNC_V3_C7_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX7C2", PMC42, CBOX7, MSR_UNC_V3_C7_PMON_CTL2, MSR_UNC_V3_C7_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX7C3", PMC43, CBOX7, MSR_UNC_V3_C7_PMON_CTL3, MSR_UNC_V3_C7_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX8C0", PMC44, CBOX8, MSR_UNC_V3_C8_PMON_CTL0, MSR_UNC_V3_C8_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX8C1", PMC45, CBOX8, MSR_UNC_V3_C8_PMON_CTL1, MSR_UNC_V3_C8_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX8C2", PMC46, CBOX8, MSR_UNC_V3_C8_PMON_CTL2, MSR_UNC_V3_C8_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX8C3", PMC47, CBOX8, MSR_UNC_V3_C8_PMON_CTL3, MSR_UNC_V3_C8_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX9C0", PMC48, CBOX9, MSR_UNC_V3_C9_PMON_CTL0, MSR_UNC_V3_C9_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX9C1", PMC49, CBOX9, MSR_UNC_V3_C9_PMON_CTL1, MSR_UNC_V3_C9_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX9C2", PMC50, CBOX9, MSR_UNC_V3_C9_PMON_CTL2, MSR_UNC_V3_C9_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX9C3", PMC51, CBOX9, MSR_UNC_V3_C9_PMON_CTL3, MSR_UNC_V3_C9_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX10C0", PMC52, CBOX10, MSR_UNC_V3_C10_PMON_CTL0, MSR_UNC_V3_C10_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX10C1", PMC53, CBOX10, MSR_UNC_V3_C10_PMON_CTL1, MSR_UNC_V3_C10_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX10C2", PMC54, CBOX10, MSR_UNC_V3_C10_PMON_CTL2, MSR_UNC_V3_C10_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX10C3", PMC55, CBOX10, MSR_UNC_V3_C10_PMON_CTL3, MSR_UNC_V3_C10_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX11C0", PMC56, CBOX11, MSR_UNC_V3_C11_PMON_CTL0, MSR_UNC_V3_C11_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX11C1", PMC57, CBOX11, MSR_UNC_V3_C11_PMON_CTL1, MSR_UNC_V3_C11_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX11C2", PMC58, CBOX11, MSR_UNC_V3_C11_PMON_CTL2, MSR_UNC_V3_C11_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX11C3", PMC59, CBOX11, MSR_UNC_V3_C11_PMON_CTL3, MSR_UNC_V3_C11_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX12C0", PMC60, CBOX12, MSR_UNC_V3_C12_PMON_CTL0, MSR_UNC_V3_C12_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX12C1", PMC61, CBOX12, MSR_UNC_V3_C12_PMON_CTL1, MSR_UNC_V3_C12_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX12C2", PMC62, CBOX12, MSR_UNC_V3_C12_PMON_CTL2, MSR_UNC_V3_C12_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX12C3", PMC63, CBOX12, MSR_UNC_V3_C12_PMON_CTL3, MSR_UNC_V3_C12_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX13C0", PMC64, CBOX13, MSR_UNC_V3_C13_PMON_CTL0, MSR_UNC_V3_C13_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX13C1", PMC65, CBOX13, MSR_UNC_V3_C13_PMON_CTL1, MSR_UNC_V3_C13_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX13C2", PMC66, CBOX13, MSR_UNC_V3_C13_PMON_CTL2, MSR_UNC_V3_C13_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX13C3", PMC67, CBOX13, MSR_UNC_V3_C13_PMON_CTL3, MSR_UNC_V3_C13_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX14C0", PMC68, CBOX14, MSR_UNC_V3_C14_PMON_CTL0, MSR_UNC_V3_C14_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX14C1", PMC69, CBOX14, MSR_UNC_V3_C14_PMON_CTL1, MSR_UNC_V3_C14_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX14C2", PMC70, CBOX14, MSR_UNC_V3_C14_PMON_CTL2, MSR_UNC_V3_C14_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX14C3", PMC71, CBOX14, MSR_UNC_V3_C14_PMON_CTL3, MSR_UNC_V3_C14_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX15C0", PMC72, CBOX15, MSR_UNC_V3_C15_PMON_CTL0, MSR_UNC_V3_C15_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX15C1", PMC73, CBOX15, MSR_UNC_V3_C15_PMON_CTL1, MSR_UNC_V3_C15_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX15C2", PMC74, CBOX15, MSR_UNC_V3_C15_PMON_CTL2, MSR_UNC_V3_C15_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX15C3", PMC75, CBOX15, MSR_UNC_V3_C15_PMON_CTL3, MSR_UNC_V3_C15_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX16C0", PMC76, CBOX16, MSR_UNC_V3_C16_PMON_CTL0, MSR_UNC_V3_C16_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX16C1", PMC77, CBOX16, MSR_UNC_V3_C16_PMON_CTL1, MSR_UNC_V3_C16_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX16C2", PMC78, CBOX16, MSR_UNC_V3_C16_PMON_CTL2, MSR_UNC_V3_C16_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX16C3", PMC79, CBOX16, MSR_UNC_V3_C16_PMON_CTL3, MSR_UNC_V3_C16_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX17C0", PMC80, CBOX17, MSR_UNC_V3_C17_PMON_CTL0, MSR_UNC_V3_C17_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX17C1", PMC81, CBOX17, MSR_UNC_V3_C17_PMON_CTL1, MSR_UNC_V3_C17_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX17C2", PMC82, CBOX17, MSR_UNC_V3_C17_PMON_CTL2, MSR_UNC_V3_C17_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"CBOX17C3", PMC83, CBOX17, MSR_UNC_V3_C17_PMON_CTL3, MSR_UNC_V3_C17_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+ {"UBOX0", PMC84, UBOX, MSR_UNC_V3_U_PMON_CTL0, MSR_UNC_V3_U_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_UBOX},
+ {"UBOX1", PMC85, UBOX, MSR_UNC_V3_U_PMON_CTL1, MSR_UNC_V3_U_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_UBOX},
+ {"UBOXFIX", PMC86, UBOXFIX, MSR_UNC_V3_U_UCLK_FIXED_CTL, MSR_UNC_V3_U_UCLK_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"SBOX0C0", PMC87, SBOX0, MSR_UNC_V3_S0_PMON_CTL_0, MSR_UNC_V3_S0_PMON_CTR_0, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+ {"SBOX0C1", PMC88, SBOX0, MSR_UNC_V3_S0_PMON_CTL_1, MSR_UNC_V3_S0_PMON_CTR_1, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+ {"SBOX0C2", PMC89, SBOX0, MSR_UNC_V3_S0_PMON_CTL_2, MSR_UNC_V3_S0_PMON_CTR_2, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+ {"SBOX0C3", PMC90, SBOX0, MSR_UNC_V3_S0_PMON_CTL_3, MSR_UNC_V3_S0_PMON_CTR_3, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+ {"SBOX1C0", PMC91, SBOX1, MSR_UNC_V3_S1_PMON_CTL_0, MSR_UNC_V3_S1_PMON_CTR_0, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+ {"SBOX1C1", PMC92, SBOX1, MSR_UNC_V3_S1_PMON_CTL_1, MSR_UNC_V3_S1_PMON_CTR_1, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+ {"SBOX1C2", PMC93, SBOX1, MSR_UNC_V3_S1_PMON_CTL_2, MSR_UNC_V3_S1_PMON_CTR_2, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+ {"SBOX1C3", PMC94, SBOX1, MSR_UNC_V3_S1_PMON_CTL_3, MSR_UNC_V3_S1_PMON_CTR_3, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+ {"SBOX2C0", PMC95, SBOX2, MSR_UNC_V3_S2_PMON_CTL_0, MSR_UNC_V3_S2_PMON_CTR_0, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+ {"SBOX2C1", PMC96, SBOX2, MSR_UNC_V3_S2_PMON_CTL_1, MSR_UNC_V3_S2_PMON_CTR_1, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+ {"SBOX2C2", PMC97, SBOX2, MSR_UNC_V3_S2_PMON_CTL_2, MSR_UNC_V3_S2_PMON_CTR_2, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+ {"SBOX2C3", PMC98, SBOX2, MSR_UNC_V3_S2_PMON_CTL_3, MSR_UNC_V3_S2_PMON_CTR_3, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+ {"SBOX3C0", PMC99, SBOX3, MSR_UNC_V3_S3_PMON_CTL_0, MSR_UNC_V3_S3_PMON_CTR_0, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+ {"SBOX3C1", PMC100, SBOX3, MSR_UNC_V3_S3_PMON_CTL_1, MSR_UNC_V3_S3_PMON_CTR_1, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+ {"SBOX3C2", PMC101, SBOX3, MSR_UNC_V3_S3_PMON_CTL_2, MSR_UNC_V3_S3_PMON_CTR_2, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+ {"SBOX3C3", PMC102, SBOX3, MSR_UNC_V3_S3_PMON_CTL_3, MSR_UNC_V3_S3_PMON_CTR_3, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+ {"WBOX0", PMC103, WBOX, MSR_UNC_V3_PCU_PMON_CTL0, MSR_UNC_V3_PCU_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_WBOX},
+ {"WBOX1", PMC104, WBOX, MSR_UNC_V3_PCU_PMON_CTL1, MSR_UNC_V3_PCU_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_WBOX},
+ {"WBOX2", PMC105, WBOX, MSR_UNC_V3_PCU_PMON_CTL2, MSR_UNC_V3_PCU_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_WBOX},
+ {"WBOX3", PMC106, WBOX, MSR_UNC_V3_PCU_PMON_CTL3, MSR_UNC_V3_PCU_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_WBOX},
+ {"WBOX0FIX", PMC107, WBOX0FIX, 0, MSR_UNC_V3_PCU_CC6_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"WBOX1FIX", PMC108, WBOX0FIX, 0, MSR_UNC_V3_PCU_CC3_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"WBOX2FIX", PMC109, WBOX0FIX, 0, MSR_UNC_V3_PCU_PC2_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"WBOX3FIX", PMC110, WBOX0FIX, 0, MSR_UNC_V3_PCU_PC3_CTR , 0, 0, EVENT_OPTION_NONE_MASK},
+ {"BBOX0C0", PMC111, BBOX0, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_0, HAS_EP_VALID_OPTIONS_BBOX},
+ {"BBOX0C1", PMC112, BBOX0, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_0, HAS_EP_VALID_OPTIONS_BBOX},
+ {"BBOX0C2", PMC113, BBOX0, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_0, HAS_EP_VALID_OPTIONS_BBOX},
+ {"BBOX0C3", PMC114, BBOX0, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_0, HAS_EP_VALID_OPTIONS_BBOX},
+ {"BBOX1C0", PMC115, BBOX1, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_1, HAS_EP_VALID_OPTIONS_BBOX},
+ {"BBOX1C1", PMC116, BBOX1, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_1, HAS_EP_VALID_OPTIONS_BBOX},
+ {"BBOX1C2", PMC117, BBOX1, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_1, HAS_EP_VALID_OPTIONS_BBOX},
+ {"BBOX1C3", PMC118, BBOX1, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_1, HAS_EP_VALID_OPTIONS_BBOX},
+ {"MBOX0C0", PMC119, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX0C1", PMC120, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX0C2", PMC121, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX0C3", PMC122, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX0FIX", PMC123, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+ {"MBOX1C0", PMC124, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX1C1", PMC125, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX1C2", PMC126, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX1C3", PMC127, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX0FIX", PMC128, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+ {"MBOX2C0", PMC129, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX2C1", PMC130, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX2C2", PMC131, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX2C3", PMC132, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX0FIX", PMC133, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+ {"MBOX3C0", PMC134, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX3C1", PMC135, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX3C2", PMC136, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX3C3", PMC137, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX0FIX", PMC138, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+ {"MBOX4C0", PMC139, MBOX4, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX4C1", PMC140, MBOX4, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX4C2", PMC141, MBOX4, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX4C3", PMC142, MBOX4, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX0FIX", PMC43, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+ {"MBOX5C0", PMC144, MBOX5, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX5C1", PMC145, MBOX5, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX5C2", PMC146, MBOX5, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX5C3", PMC147, MBOX5, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX0FIX", PMC148, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+ {"MBOX6C0", PMC149, MBOX6, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX6C1", PMC150, MBOX6, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX6C2", PMC151, MBOX6, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX6C3", PMC152, MBOX6, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX0FIX", PMC153, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+ {"MBOX7C0", PMC154, MBOX7, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX7C1", PMC155, MBOX7, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX7C2", PMC156, MBOX7, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX7C3", PMC157, MBOX7, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+ {"MBOX0FIX", PMC158, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+ {"IBOX0C0", PMC159, IBOX0, PCI_UNC_IRP0_PMON_CTL_0, PCI_UNC_IRP0_PMON_CTR_0, 0, PCI_IRP_DEVICE, HAS_EP_VALID_OPTIONS_IBOX},
+ {"IBOX0C1", PMC160, IBOX0, PCI_UNC_IRP0_PMON_CTL_1, PCI_UNC_IRP0_PMON_CTR_1, 0, PCI_IRP_DEVICE, HAS_EP_VALID_OPTIONS_IBOX},
+ {"IBOX1C0", PMC161, IBOX1, PCI_UNC_IRP1_PMON_CTL_0, PCI_UNC_IRP1_PMON_CTR_0, 0, PCI_IRP_DEVICE, HAS_EP_VALID_OPTIONS_IBOX},
+ {"IBOX1C1", PMC162, IBOX1, PCI_UNC_IRP1_PMON_CTL_1, PCI_UNC_IRP1_PMON_CTR_1, 0, PCI_IRP_DEVICE, HAS_EP_VALID_OPTIONS_IBOX},
+ {"PBOX0", PMC163, PBOX, PCI_UNC_R2PCIE_PMON_CTL_0, PCI_UNC_R2PCIE_PMON_CTR_0_A, PCI_UNC_R2PCIE_PMON_CTR_0_B, PCI_R2PCIE_DEVICE, HAS_EP_VALID_OPTIONS_PBOX},
+ {"PBOX1", PMC164, PBOX, PCI_UNC_R2PCIE_PMON_CTL_1, PCI_UNC_R2PCIE_PMON_CTR_1_A, PCI_UNC_R2PCIE_PMON_CTR_1_B, PCI_R2PCIE_DEVICE, HAS_EP_VALID_OPTIONS_PBOX},
+ {"PBOX2", PMC165, PBOX, PCI_UNC_R2PCIE_PMON_CTL_2, PCI_UNC_R2PCIE_PMON_CTR_2_A, PCI_UNC_R2PCIE_PMON_CTR_2_B, PCI_R2PCIE_DEVICE, HAS_EP_VALID_OPTIONS_PBOX},
+ {"PBOX3", PMC166, PBOX, PCI_UNC_R2PCIE_PMON_CTL_3, PCI_UNC_R2PCIE_PMON_CTR_3_A, PCI_UNC_R2PCIE_PMON_CTR_3_B, PCI_R2PCIE_DEVICE, HAS_EP_VALID_OPTIONS_PBOX},
+ {"RBOX0C0", PMC167, RBOX0, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_0, HAS_EP_VALID_OPTIONS_RBOX},
+ {"RBOX0C1", PMC168, RBOX0, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_0, HAS_EP_VALID_OPTIONS_RBOX},
+ {"RBOX0C2", PMC169, RBOX0, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_0, HAS_EP_VALID_OPTIONS_RBOX},
+ {"RBOX1C0", PMC170, RBOX1, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_1, HAS_EP_VALID_OPTIONS_RBOX},
+ {"RBOX1C1", PMC171, RBOX1, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_1, HAS_EP_VALID_OPTIONS_RBOX},
+ {"RBOX1C2", PMC172, RBOX1, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_1, HAS_EP_VALID_OPTIONS_RBOX},
+ {"QBOX0C0", PMC173, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_0, PCI_UNC_V3_QPI_PMON_CTR_0_A, PCI_UNC_V3_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_0, HAS_EP_VALID_OPTIONS_QBOX},
+ {"QBOX0C1", PMC174, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_1, PCI_UNC_V3_QPI_PMON_CTR_1_A, PCI_UNC_V3_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_0, HAS_EP_VALID_OPTIONS_QBOX},
+ {"QBOX0C2", PMC175, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_2, PCI_UNC_V3_QPI_PMON_CTR_2_A, PCI_UNC_V3_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_0, HAS_EP_VALID_OPTIONS_QBOX},
+ {"QBOX0C3", PMC176, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_3, PCI_UNC_V3_QPI_PMON_CTR_3_A, PCI_UNC_V3_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_0, HAS_EP_VALID_OPTIONS_QBOX},
+ {"QBOX1C0", PMC177, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_0, PCI_UNC_V3_QPI_PMON_CTR_0_A, PCI_UNC_V3_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_1, HAS_EP_VALID_OPTIONS_QBOX},
+ {"QBOX1C1", PMC178, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_1, PCI_UNC_V3_QPI_PMON_CTR_1_A, PCI_UNC_V3_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_1, HAS_EP_VALID_OPTIONS_QBOX},
+ {"QBOX1C2", PMC179, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_2, PCI_UNC_V3_QPI_PMON_CTR_2_A, PCI_UNC_V3_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_1, HAS_EP_VALID_OPTIONS_QBOX},
+ {"QBOX1C3", PMC180, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_3, PCI_UNC_V3_QPI_PMON_CTR_3_A, PCI_UNC_V3_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_1, HAS_EP_VALID_OPTIONS_QBOX},
+ {"QBOX0FIX0", PMC181, QBOX0FIX, 0x0, PCI_UNC_V3_QPI_RATE_STATUS, 0x0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+ {"QBOX0FIX1", PMC182, QBOX0FIX, 0x0, PCI_UNC_V3_QPI_LINK_IDLE, 0x0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+ {"QBOX0FIX2", PMC183, QBOX0FIX, 0x0, PCI_UNC_V3_QPI_LINK_LLR, 0x0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+ {"QBOX1FIX0", PMC184, QBOX1FIX, 0x0, PCI_UNC_V3_QPI_RATE_STATUS, 0x0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+ {"QBOX1FIX1", PMC185, QBOX1FIX, 0x0, PCI_UNC_V3_QPI_LINK_IDLE, 0x0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+ {"QBOX1FIX2", PMC186, QBOX1FIX, 0x0, PCI_UNC_V3_QPI_LINK_LLR, 0x0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+};
+
+
+static BoxMap haswellEP_box_map[NUM_UNITS] = {
+ [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+ [THERMAL] = {0,0,0,-1,0,0,8},
+ [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+ [POWER] = {0,0,0,-1,0,0,32},
+ [CBOX0] = {MSR_UNC_V3_C0_PMON_BOX_CTL, MSR_UNC_V3_C0_PMON_BOX_STATUS, MSR_UNC_V3_C0_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C0_PMON_BOX_FILTER0, MSR_UNC_V3_C0_PMON_BOX_FILTER1},
+ [CBOX1] = {MSR_UNC_V3_C1_PMON_BOX_CTL, MSR_UNC_V3_C1_PMON_BOX_STATUS, MSR_UNC_V3_C1_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C1_PMON_BOX_FILTER0, MSR_UNC_V3_C1_PMON_BOX_FILTER1},
+ [CBOX2] = {MSR_UNC_V3_C2_PMON_BOX_CTL, MSR_UNC_V3_C2_PMON_BOX_STATUS, MSR_UNC_V3_C2_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C2_PMON_BOX_FILTER0, MSR_UNC_V3_C2_PMON_BOX_FILTER1},
+ [CBOX3] = {MSR_UNC_V3_C3_PMON_BOX_CTL, MSR_UNC_V3_C3_PMON_BOX_STATUS, MSR_UNC_V3_C3_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C3_PMON_BOX_FILTER0, MSR_UNC_V3_C3_PMON_BOX_FILTER1},
+ [CBOX4] = {MSR_UNC_V3_C4_PMON_BOX_CTL, MSR_UNC_V3_C4_PMON_BOX_STATUS, MSR_UNC_V3_C4_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C4_PMON_BOX_FILTER0, MSR_UNC_V3_C4_PMON_BOX_FILTER1},
+ [CBOX5] = {MSR_UNC_V3_C5_PMON_BOX_CTL, MSR_UNC_V3_C5_PMON_BOX_STATUS, MSR_UNC_V3_C5_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C5_PMON_BOX_FILTER0, MSR_UNC_V3_C5_PMON_BOX_FILTER1},
+ [CBOX6] = {MSR_UNC_V3_C6_PMON_BOX_CTL, MSR_UNC_V3_C6_PMON_BOX_STATUS, MSR_UNC_V3_C6_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C6_PMON_BOX_FILTER0, MSR_UNC_V3_C6_PMON_BOX_FILTER1},
+ [CBOX7] = {MSR_UNC_V3_C7_PMON_BOX_CTL, MSR_UNC_V3_C7_PMON_BOX_STATUS, MSR_UNC_V3_C7_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C7_PMON_BOX_FILTER0, MSR_UNC_V3_C7_PMON_BOX_FILTER1},
+ [CBOX8] = {MSR_UNC_V3_C8_PMON_BOX_CTL, MSR_UNC_V3_C8_PMON_BOX_STATUS, MSR_UNC_V3_C8_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C8_PMON_BOX_FILTER0, MSR_UNC_V3_C8_PMON_BOX_FILTER1},
+ [CBOX9] = {MSR_UNC_V3_C9_PMON_BOX_CTL, MSR_UNC_V3_C9_PMON_BOX_STATUS, MSR_UNC_V3_C9_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C9_PMON_BOX_FILTER0, MSR_UNC_V3_C9_PMON_BOX_FILTER1},
+ [CBOX10] = {MSR_UNC_V3_C10_PMON_BOX_CTL, MSR_UNC_V3_C10_PMON_BOX_STATUS, MSR_UNC_V3_C10_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C10_PMON_BOX_FILTER0, MSR_UNC_V3_C10_PMON_BOX_FILTER1},
+ [CBOX11] = {MSR_UNC_V3_C11_PMON_BOX_CTL, MSR_UNC_V3_C11_PMON_BOX_STATUS, MSR_UNC_V3_C11_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C11_PMON_BOX_FILTER0, MSR_UNC_V3_C11_PMON_BOX_FILTER1},
+ [CBOX12] = {MSR_UNC_V3_C12_PMON_BOX_CTL, MSR_UNC_V3_C12_PMON_BOX_STATUS, MSR_UNC_V3_C12_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C12_PMON_BOX_FILTER0, MSR_UNC_V3_C12_PMON_BOX_FILTER1},
+ [CBOX13] = {MSR_UNC_V3_C13_PMON_BOX_CTL, MSR_UNC_V3_C13_PMON_BOX_STATUS, MSR_UNC_V3_C13_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C13_PMON_BOX_FILTER0, MSR_UNC_V3_C13_PMON_BOX_FILTER1},
+ [CBOX14] = {MSR_UNC_V3_C14_PMON_BOX_CTL, MSR_UNC_V3_C14_PMON_BOX_STATUS, MSR_UNC_V3_C14_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C14_PMON_BOX_FILTER0, MSR_UNC_V3_C14_PMON_BOX_FILTER1},
+ [CBOX15] = {MSR_UNC_V3_C15_PMON_BOX_CTL, MSR_UNC_V3_C15_PMON_BOX_STATUS, MSR_UNC_V3_C15_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C15_PMON_BOX_FILTER0, MSR_UNC_V3_C15_PMON_BOX_FILTER1},
+ [CBOX16] = {MSR_UNC_V3_C16_PMON_BOX_CTL, MSR_UNC_V3_C16_PMON_BOX_STATUS, MSR_UNC_V3_C16_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C16_PMON_BOX_FILTER0, MSR_UNC_V3_C16_PMON_BOX_FILTER1},
+ [CBOX17] = {MSR_UNC_V3_C17_PMON_BOX_CTL, MSR_UNC_V3_C17_PMON_BOX_STATUS, MSR_UNC_V3_C17_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C17_PMON_BOX_FILTER0, MSR_UNC_V3_C17_PMON_BOX_FILTER1},
+ [UBOX] = {0, MSR_UNC_V3_U_PMON_BOX_STATUS, MSR_UNC_V3_U_PMON_BOX_STATUS, 1, 0, 0, 48},
+ [UBOXFIX] = {0, MSR_UNC_V3_U_PMON_BOX_STATUS, MSR_UNC_V3_U_PMON_BOX_STATUS, 0, 0, 0, 48},
+ [SBOX0] = {MSR_UNC_V3_S0_PMON_BOX_CTL, MSR_UNC_V3_S0_PMON_BOX_STATUS, MSR_UNC_V3_S0_PMON_BOX_STATUS, -1, 0, 0, 48},
+ [SBOX1] = {MSR_UNC_V3_S1_PMON_BOX_CTL, MSR_UNC_V3_S1_PMON_BOX_STATUS, MSR_UNC_V3_S1_PMON_BOX_STATUS, -1, 0, 0, 48},
+ [SBOX2] = {MSR_UNC_V3_S2_PMON_BOX_CTL, MSR_UNC_V3_S2_PMON_BOX_STATUS, MSR_UNC_V3_S2_PMON_BOX_STATUS, -1, 0, 0, 48},
+ [SBOX3] = {MSR_UNC_V3_S3_PMON_BOX_CTL, MSR_UNC_V3_S3_PMON_BOX_STATUS, MSR_UNC_V3_S3_PMON_BOX_STATUS, -1, 0, 0, 48},
+ [WBOX] = {MSR_UNC_V3_PCU_PMON_BOX_CTL, MSR_UNC_V3_PCU_PMON_BOX_STATUS, MSR_UNC_V3_PCU_PMON_BOX_STATUS, 2, 0, 0, 48, MSR_UNC_V3_PCU_PMON_BOX_FILTER},
+ [WBOX0FIX] = {0,0,0,-1,0,0,64},
+ [BBOX0] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 21, 1, PCI_HA_DEVICE_0, 48},
+ [BBOX1] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 22, 1, PCI_HA_DEVICE_1, 48},
+ [MBOX0] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+ [MBOX0FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 23, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+ [MBOX1] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+ [MBOX1FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 23, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+ [MBOX2] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+ [MBOX2FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 23, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+ [MBOX3] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+ [MBOX3FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 23, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+ [MBOX4] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_0, 48},
+ [MBOX4FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 24, 1, PCI_IMC_DEVICE_1_CH_0, 48},
+ [MBOX5] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_1, 48},
+ [MBOX5FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 24, 1, PCI_IMC_DEVICE_1_CH_1, 48},
+ [MBOX6] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_2, 48},
+ [MBOX6FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 24, 1, PCI_IMC_DEVICE_1_CH_2, 48},
+ [MBOX7] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_3, 48},
+ [MBOX7FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 24, 1, PCI_IMC_DEVICE_1_CH_3, 48},
+ [IBOX0] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, 34, 1, PCI_IRP_DEVICE, 48},
+ [IBOX1] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, 34, 1, PCI_IRP_DEVICE, 48},
+ [PBOX] = {PCI_UNC_R2PCIE_PMON_BOX_CTL, PCI_UNC_R2PCIE_PMON_BOX_STATUS, PCI_UNC_R2PCIE_PMON_BOX_STATUS, 29, 1, PCI_R2PCIE_DEVICE, 48},
+ [RBOX0] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 27, 1, PCI_R3QPI_DEVICE_LINK_0, 44},
+ [RBOX1] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 28, 1, PCI_R3QPI_DEVICE_LINK_1, 44},
+ [QBOX0] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, 25, 1, PCI_QPI_DEVICE_PORT_0, 48},
+ [QBOX1] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, 26, 1, PCI_QPI_DEVICE_PORT_1, 48},
+ [QBOX0FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_0, 32},
+ [QBOX1FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_1, 32},
+};
+
+static PciDevice haswellEP_pci_devices[MAX_NUM_PCI_DEVICES] = {
+ [MSR_DEV] = {NODEVTYPE, "", "MSR", ""},
+ [PCI_R3QPI_DEVICE_LINK_0] = {R3QPI, "0b.1", "PCI_R3QPI_DEVICE_LINK_0", "RBOX0", 0x2F36},
+ [PCI_R3QPI_DEVICE_LINK_1] = {R3QPI, "0b.2", "PCI_R3QPI_DEVICE_LINK_1", "RBOX1", 0x2F37},
+ [PCI_R2PCIE_DEVICE] = {R2PCIE, "10.1", "PCI_R2PCIE_DEVICE", "PBOX0", 0x2F34},
+ [PCI_IMC_DEVICE_0_CH_0] = {IMC, "14.0", "PCI_IMC_DEVICE_0_CH_0", "MBOX0", 0x2FB4},
+ [PCI_IMC_DEVICE_0_CH_1] = {IMC, "14.1", "PCI_IMC_DEVICE_0_CH_1", "MBOX1", 0x2FB5},
+ [PCI_IMC_DEVICE_0_CH_2] = {IMC, "15.0", "PCI_IMC_DEVICE_0_CH_2", "MBOX2", 0x2FB0},
+ [PCI_IMC_DEVICE_0_CH_3] = {IMC, "15.1", "PCI_IMC_DEVICE_0_CH_3", "MBOX3", 0x2FB1},
+ [PCI_HA_DEVICE_0] = {HA, "12.1", "PCI_HA_DEVICE_0", "BBOX0", 0x2F30},
+ [PCI_HA_DEVICE_1] = {HA, "12.5", "PCI_HA_DEVICE_1", "BBOX1", 0x2F38},
+ [PCI_IMC_DEVICE_1_CH_0] = {IMC, "17.0", "PCI_IMC_DEVICE_1_CH_0", "MBOX4", 0x2FD4},
+ [PCI_IMC_DEVICE_1_CH_1] = {IMC, "17.1", "PCI_IMC_DEVICE_1_CH_1", "MBOX5", 0x2FD5},
+ [PCI_IMC_DEVICE_1_CH_2] = {IMC, "18.0", "PCI_IMC_DEVICE_1_CH_2", "MBOX6", 0x2FD0},
+ [PCI_IMC_DEVICE_1_CH_3] = {IMC, "18.1", "PCI_IMC_DEVICE_1_CH_3", "MBOX7", 0x2FD1},
+ [PCI_IRP_DEVICE] = {IRP, "05.6", "PCI_IRP_DEVICE", "IBOX", 0x2F39},
+ [PCI_QPI_DEVICE_PORT_0] = {QPI, "08.2", "PCI_QPI_DEVICE_PORT_0", "QBOX0", 0x2F32},
+ [PCI_QPI_DEVICE_PORT_1] = {QPI, "09.2", "PCI_QPI_DEVICE_PORT_1", "QBOX1", 0x2F33},
+ [PCI_QPI_MASK_DEVICE_PORT_0] = {QPI, "08.6", "PCI_QPI_MASK_DEVICE_PORT_0", NULL, 0x2F86},
+ [PCI_QPI_MASK_DEVICE_PORT_1] = {QPI, "09.6", "PCI_QPI_MASK_DEVICE_PORT_1", NULL, 0x2F96},
+ [PCI_QPI_MISC_DEVICE_PORT_0] = {QPI, "08.0", "PCI_QPI_MISC_DEVICE_PORT_0", "QBOX0FIX", 0x2F80},
+ [PCI_QPI_MISC_DEVICE_PORT_1] = {QPI, "09.0", "PCI_QPI_MISC_DEVICE_PORT_1", "QBOX1FIX", 0x2F80},
+};
+
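The three tables above (counter map, box map, PCI device map) are keyed by counter name and unit index. The following is a minimal sketch, with illustrative struct and function names that are not likwid's actual definitions, of how a counter name such as "MBOX0C0" could be resolved against such a table to its config register and, for PCI-attached units, the backing device index:

    /* Sketch only: simplified mirror of the counter-map layout above.
     * Field names and register values are illustrative. */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef struct {
        const char *key;        /* counter name, e.g. "MBOX0C0" */
        int         unit;       /* unit index, e.g. an MBOX0-like id */
        uint32_t    configReg;  /* *_PMON_CTL register */
        uint32_t    counterReg; /* *_PMON_CTR register (low half for PCI units) */
        int         pciDev;     /* PCI device index, -1 for MSR-based units */
    } CounterEntry;

    static const CounterEntry *find_counter(const CounterEntry *map, int n,
                                             const char *name)
    {
        for (int i = 0; i < n; i++)
            if (strcmp(map[i].key, name) == 0)
                return &map[i];
        return NULL;
    }

    int main(void)
    {
        /* Two example rows with dummy register values. */
        const CounterEntry map[] = {
            { "CBOX0C0", 0, 0x0E01, 0x0E08, -1 },  /* MSR-based unit */
            { "MBOX0C0", 1, 0x00D8, 0x00A0,  5 },  /* PCI-based unit */
        };
        const CounterEntry *c = find_counter(map, 2, "MBOX0C0");
        if (c)
            printf("%s: ctl=0x%X ctr=0x%X pciDev=%d\n",
                   c->key, c->configReg, c->counterReg, c->pciDev);
        return 0;
    }

Judging from the BoxMap initializers above, the unit index additionally selects a row supplying the box control/status registers, the overflow bit offset, whether the unit is MSR- or PCI-attached, the counter width, and optional filter registers.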
diff --git a/src/includes/perfmon_haswellEP_events.txt b/src/includes/perfmon_haswellEP_events.txt
new file mode 100644
index 0000000..fb078a1
--- /dev/null
+++ b/src/includes/perfmon_haswellEP_events.txt
@@ -0,0 +1,2616 @@
+# =======================================================================================
+#
+# Filename: perfmon_haswellEP_events.txt
+#
+# Description: Event list for Intel Haswell EP/EN/EX
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
+# Project: likwid
+#
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, either version 3 of the License, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
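As a rough illustration of how the entries below read (a sketch, not likwid's parser): an EVENT_ line names the event, its event-select code and the counter types it may run on; the UMASK_ lines that follow give the unit-mask value for each derived variant; and the DEFAULT_OPTIONS_/OPTIONS_ lines appear to attach pre-set or permitted counter options (threshold, invert, edge, anythread, match/state/nid/opcode masks). For example, EVENT_L2_RQSTS 0x24 PMC together with UMASK_L2_RQSTS_MISS 0x3F corresponds to event select 0x24 with umask 0x3F on a general-purpose core counter:

    /* Sketch only (not likwid's parser): pull the event code and unit mask
     * out of one EVENT_/UMASK_ line pair in the format used below. */
    #include <stdio.h>

    int main(void)
    {
        const char *lines[] = {
            "EVENT_L2_RQSTS 0x24 PMC",
            "UMASK_L2_RQSTS_MISS 0x3F",
        };
        unsigned event = 0, umask = 0, val;
        char name[64], limit[16] = "";

        for (int i = 0; i < 2; i++) {
            if (sscanf(lines[i], "EVENT_%63s 0x%x %15s", name, &val, limit) == 3)
                event = val;                     /* event-select code */
            else if (sscanf(lines[i], "UMASK_%63s 0x%x", name, &val) == 2)
                umask = val;                     /* unit mask */
        }
        printf("counter type %s: event=0x%02X umask=0x%02X\n", limit, event, umask);
        return 0;
    }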
+EVENT_TEMP_CORE 0x00 TMP0
+UMASK_TEMP_CORE 0x00
+
+EVENT_PWR_PKG_ENERGY 0x00 PWR0
+UMASK_PWR_PKG_ENERGY 0x00
+
+EVENT_PWR_PP0_ENERGY 0x00 PWR1
+UMASK_PWR_PP0_ENERGY 0x00
+
+EVENT_PWR_PP1_ENERGY 0x00 PWR2
+UMASK_PWR_PP1_ENERGY 0x00
+
+EVENT_PWR_DRAM_ENERGY 0x00 PWR3
+UMASK_PWR_DRAM_ENERGY 0x00
+
+EVENT_INSTR_RETIRED 0x00 FIXC0
+UMASK_INSTR_RETIRED_ANY 0x00
+
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE 0x00
+DEFAULT_OPTIONS_CPU_CLK_UNHALTED_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLK_UNHALTED_ANY 0x00
+
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC2
+UMASK_CPU_CLK_UNHALTED_REF 0x00
+
+EVENT_LD_BLOCKS 0x03 PMC
+UMASK_LD_BLOCKS_STORE_FORWARD 0x02
+UMASK_LD_BLOCKS_NO_SR 0x08
+
+EVENT_MISALIGN_MEM_REF 0x05 PMC
+UMASK_MISALIGN_MEM_REF_LOADS 0x01
+UMASK_MISALIGN_MEM_REF_STORES 0x02
+UMASK_MISALIGN_MEM_REF_ANY 0x03
+
+EVENT_LD_BLOCKS_PARTIAL 0x07 PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS 0x01
+
+EVENT_DTLB_LOAD_MISSES 0x08 PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK 0x01
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_4K 0x02
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_LARGE 0x04
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED 0x0E
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION 0x10
+UMASK_DTLB_LOAD_MISSES_STLB_HIT_4K 0x20
+UMASK_DTLB_LOAD_MISSES_STLB_HIT_2M 0x40
+UMASK_DTLB_LOAD_MISSES_STLB_HIT 0x60
+UMASK_DTLB_LOAD_MISSES_PDE_CACHE_MISS 0x80
+
+EVENT_INT_MISC 0x0D PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_INT_MISC_RECOVERY_CYCLES 0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=0x1
+UMASK_INT_MISC_RECOVERY_COUNT 0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES_ANY EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_INT_MISC_RECOVERY_CYCLES_ANY 0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT_ANY EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_EDGE=0x1
+UMASK_INT_MISC_RECOVERY_COUNT_ANY 0x03
+
+EVENT_UOPS_ISSUED 0x0E PMC
+UMASK_UOPS_ISSUED_ANY 0x01
+UMASK_UOPS_ISSUED_FLAGS_MERGE 0x10
+UMASK_UOPS_ISSUED_SLOW_LEA 0x20
+UMASK_UOPS_ISSUED_SINGLE_MUL 0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_ARITH_DIVIDER_UOPS 0x14 PMC
+UMASK_ARITH_DIVIDER_CYCLES 0x01
+UMASK_ARITH_DIVIDER_UOPS 0x02
+
+EVENT_L2_RQSTS 0x24 PMC
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD_MISS 0x21
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD_HIT 0x41
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD 0xE1
+UMASK_L2_RQSTS_RFO_HIT 0x42
+UMASK_L2_RQSTS_RFO_MISS 0x22
+UMASK_L2_RQSTS_ALL_RFO 0xE2
+UMASK_L2_RQSTS_CODE_RD_HIT 0x44
+UMASK_L2_RQSTS_CODE_RD_MISS 0x24
+UMASK_L2_RQSTS_ALL_DEMAND_MISS 0x27
+UMASK_L2_RQSTS_ALL_DEMAND_REFERENCES 0xE7
+UMASK_L2_RQSTS_ALL_CODE_RD 0xE4
+UMASK_L2_RQSTS_L2_PF_HIT 0x50
+UMASK_L2_RQSTS_L2_PF_MISS 0x30
+UMASK_L2_RQSTS_ALL_PF 0xF8
+UMASK_L2_RQSTS_MISS 0x3F
+UMASK_L2_RQSTS_REFERENCES 0xFF
+
+EVENT_L2_DEMAND_RQST_WB_HIT 0x27 PMC
+UMASK_L2_DEMAND_RQST_WB_HIT 0x50
+
+EVENT_LONGEST_LAT_CACHE 0x2E PMC
+UMASK_LONGEST_LAT_CACHE_REFERENCE 0x4F
+UMASK_LONGEST_LAT_CACHE_MISS 0x41
+
+EVENT_CPU_CLOCK_UNHALTED 0x3C PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P 0x00
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_THREAD_P_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P_ANY 0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK 0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_REF_XCLK_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK_ANY 0x01
+UMASK_CPU_CLOCK_THREAD_UNHALTED_ONE_THREAD_ACTIVE 0x02
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES 0x00
+
+EVENT_L1D_PEND_MISS 0x48 PMC2
+UMASK_L1D_PEND_MISS_PENDING 0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES_ANY EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_PEND_MISS_PENDING_CYCLES_ANY 0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_PEND_MISS_PENDING_CYCLES 0x01
+
+EVENT_L1D_PEND_MISS_REQUEST_FB_FULL 0x48 PMC
+UMASK_L1D_PEND_MISS_REQUEST_FB_FULL 0x02
+DEFAULT_OPTIONS_L1D_PEND_MISS_FB_FULL EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_PEND_MISS_FB_FULL 0x02
+
+
+EVENT_DTLB_STORE_MISSES 0x49 PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK 0x01
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_4K 0x02
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_LARGE 0x04
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED 0x0E
+UMASK_DTLB_STORE_MISSES_WALK_DURATION 0x10
+UMASK_DTLB_STORE_MISSES_STLB_HIT_4K 0x20
+UMASK_DTLB_STORE_MISSES_STLB_HIT_2M 0x40
+UMASK_DTLB_STORE_MISSES_STLB_HIT 0x60
+UMASK_DTLB_STORE_MISSES_PDE_CACHE_MISS 0x80
+
+EVENT_LOAD_HIT_PRE 0x4C PMC
+UMASK_LOAD_HIT_PRE_SW_PF 0x01
+UMASK_LOAD_HIT_PRE_HW_PF 0x02
+
+EVENT_EPT_WALK_CYCLES 0x4F PMC
+UMASK_EPT_WALK_CYCLES 0x10
+
+EVENT_L1D 0x51 PMC
+UMASK_L1D_REPLACEMENT 0x01
+UMASK_L1D_ALLOCATED_IN_M 0x02
+UMASK_L1D_M_EVICT 0x04
+UMASK_L1D_ALL_M_REPLACEMENT 0x08
+
+EVENT_TX_MEM 0x54 PMC
+UMASK_TX_MEM_ABORT_CONFLICT 0x01
+UMASK_TX_MEM_ABORT_CAPACITY_WRITE 0x02
+UMASK_TX_MEM_ABORT_HLE_STORE_TO_ELIDED_LOCK 0x04
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_NOT_EMPTY 0x08
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_MISMATCH 0x10
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGNMENT 0x20
+UMASK_TX_MEM_HLE_ELISION_BUFFER_FULL 0x40
+
+EVENT_MOVE_ELIMINATION 0x58 PMC
+UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED 0x04
+UMASK_MOVE_ELIMINATION_SIMD_NOT_ELIMINATED 0x08
+UMASK_MOVE_ELIMINATION_INT_ELIMINATED 0x01
+UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED 0x02
+
+EVENT_CPL_CYCLES 0x5C PMC
+UMASK_CPL_CYCLES_RING0 0x01
+UMASK_CPL_CYCLES_RING123 0x02
+DEFAULT_OPTIONS_CPL_CYCLES_RING0_TRANS EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_CPL_CYCLES_RING0_TRANS 0x01
+
+EVENT_TX_EXEC 0x5D PMC
+UMASK_TX_EXEC_MISC1 0x01
+UMASK_TX_EXEC_MISC2 0x02
+UMASK_TX_EXEC_MISC3 0x04
+UMASK_TX_EXEC_MISC4 0x08
+UMASK_TX_EXEC_MISC5 0x10
+
+EVENT_RS_EVENTS 0x5E PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES 0x01
+DEFAULT_OPTIONS_RS_EVENTS_EMPTY_END EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_EDGE=1
+UMASK_RS_EVENTS_EMPTY_END 0x01
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING 0x60 PMC
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD 0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD 0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO 0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD 0x08
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_DATA_RD EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_DATA_RD 0x01
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DATA_RD EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DATA_RD 0x08
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD_C6 EVENT_OPTION_THRESHOLD=0x6
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD_C6 0x01
+
+EVENT_LOCK_CYCLES 0x63 PMC
+UMASK_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
+DEFAULT_OPTIONS_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_COUNT EVENT_OPTION_EDGE=0x1
+UMASK_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_COUNT 0x01
+UMASK_LOCK_CYCLES_CACHE_LOCK_DURATION 0x02
+DEFAULT_OPTIONS_LOCK_CYCLES_CACHE_LOCK_COUNT EVENT_OPTION_EDGE=0x1
+UMASK_LOCK_CYCLES_CACHE_LOCK_COUNT 0x02
+
+EVENT_IDQ 0x79 PMC
+UMASK_IDQ_EMPTY 0x02
+UMASK_IDQ_MITE_UOPS 0x04
+DEFAULT_OPTIONS_IDQ_MITE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MITE_CYCLES 0x04
+UMASK_IDQ_DSB_UOPS 0x08
+DEFAULT_OPTIONS_IDQ_DSB_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_DSB_CYCLES 0x08
+UMASK_IDQ_MS_DSB_UOPS 0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MS_DSB_CYCLES 0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_OCCUR EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_DSB_OCCUR 0x10
+UMASK_IDQ_MS_MITE_UOPS 0x20
+DEFAULT_OPTIONS_IDQ_MS_MITE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MS_MITE_CYCLES 0x20
+UMASK_IDQ_MS_UOPS 0x30
+DEFAULT_OPTIONS_IDQ_MS_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MS_CYCLES 0x30
+DEFAULT_OPTIONS_IDQ_MS_SWITCHES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_SWITCHES 0x30
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS 0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS 0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS 0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS 0x24
+UMASK_IDQ_MITE_ALL_UOPS 0x3C
+
+EVENT_ICACHE 0x80 PMC
+UMASK_ICACHE_HIT 0x01
+UMASK_ICACHE_MISSES 0x02
+UMASK_ICACHE_ACCESSES 0x03
+UMASK_ICACHE_IFETCH_STALL 0x04
+
+EVENT_ITLB_MISSES 0x85 PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK 0x01
+UMASK_ITLB_MISSES_WALK_COMPLETED_4K 0x02
+UMASK_ITLB_MISSES_WALK_COMPLETED_LARGE 0x04
+UMASK_ITLB_MISSES_WALK_COMPLETED 0x0E
+UMASK_ITLB_MISSES_WALK_DURATION 0x10
+UMASK_ITLB_MISSES_STLB_HIT_4K 0x20
+UMASK_ITLB_MISSES_STLB_HIT_2M 0x40
+UMASK_ITLB_MISSES_STLB_HIT 0x60
+
+EVENT_ILD_STALL 0x87 PMC
+UMASK_ILD_STALL_LCP 0x01
+UMASK_ILD_STALL_IQ_FULL 0x04
+
+EVENT_BR_INST_EXEC 0x88 PMC
+UMASK_BR_INST_EXEC_COND_TAKEN 0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN 0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN 0x82
+UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN 0x42
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN 0x44
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN 0x88
+UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN 0x48
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN 0x50
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
+UMASK_BR_INST_EXEC_ALL_CONDITIONAL 0xC1
+UMASK_BR_INST_EXEC_ALL_DIRECT_JMP 0xC2
+UMASK_BR_INST_EXEC_ALL_DIRECT_NEAR_CALL 0xD0
+UMASK_BR_INST_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET 0xC4
+UMASK_BR_INST_EXEC_ALL_INDIRECT_NEAR_RETURN 0xC8
+UMASK_BR_INST_EXEC_ALL_BRANCHES 0xFF
+
+EVENT_BR_MISP_EXEC 0x89 PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN 0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN 0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN 0x44
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN 0x88
+UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN 0x48
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN 0x50
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
+UMASK_BR_MISP_EXEC_ALL_CONDITIONAL 0xC1
+UMASK_BR_MISP_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET 0xC4
+UMASK_BR_MISP_EXEC_ALL_BRANCHES 0xFF
+
+EVENT_IDQ_UOPS_NOT_DELIVERED 0x9C PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x3
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x2
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK 0x01
+
+EVENT_UOPS_EXECUTED_PORT 0xA1 PMC
+UMASK_UOPS_EXECUTED_PORT_PORT_0 0x01
+UMASK_UOPS_EXECUTED_PORT_PORT_1 0x02
+UMASK_UOPS_EXECUTED_PORT_PORT_2 0x04
+UMASK_UOPS_EXECUTED_PORT_PORT_3 0x08
+UMASK_UOPS_EXECUTED_PORT_PORT_4 0x10
+UMASK_UOPS_EXECUTED_PORT_PORT_5 0x20
+UMASK_UOPS_EXECUTED_PORT_PORT_6 0x40
+UMASK_UOPS_EXECUTED_PORT_PORT_7 0x80
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_0_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_0_CORE 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_1_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_1_CORE 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_2_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_2_CORE 0x04
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_3_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_3_CORE 0x08
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_4_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_4_CORE 0x10
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_5_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_5_CORE 0x20
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_6_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_6_CORE 0x40
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_7_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_7_CORE 0x80
+UMASK_UOPS_EXECUTED_PORT_ARITH_PORTS 0x63
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_ARITH_PORTS_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_ARITH_PORTS_CORE 0x63
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_DATA_PORTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_DATA_PORTS 0x9C
+
+EVENT_RESOURCE_STALLS 0xA2 PMC
+UMASK_RESOURCE_STALLS_ANY 0x01
+UMASK_RESOURCE_STALLS_RS 0x04
+UMASK_RESOURCE_STALLS_SB 0x08
+UMASK_RESOURCE_STALLS_ROB 0x10
+
+EVENT_CYCLE_ACTIVITY 0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L2_PENDING EVENT_OPTION_THRESHOLD=0x1
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_PENDING 0x01
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_MEM_ANY EVENT_OPTION_THRESHOLD=0x1
+UMASK_CYCLE_ACTIVITY_CYCLES_MEM_ANY 0x02
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE EVENT_OPTION_THRESHOLD=0x4
+UMASK_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE 0x04
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L2_PENDING EVENT_OPTION_THRESHOLD=0x5
+UMASK_CYCLE_ACTIVITY_STALLS_L2_PENDING 0x05
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_MEM_ANY EVENT_OPTION_THRESHOLD=0x6
+UMASK_CYCLE_ACTIVITY_STALLS_MEM_ANY 0x06
+
+EVENT_CYCLE_ACTIVITY_CYCLES 0xA3 PMC2
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L1D_PENDING EVENT_OPTION_THRESHOLD=0x8
+UMASK_CYCLE_ACTIVITY_CYCLES_L1D_PENDING 0x08
+
+EVENT_CYCLE_ACTIVITY_STALLS 0xA3 PMC2
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L1D_PENDING EVENT_OPTION_THRESHOLD=0xC
+UMASK_CYCLE_ACTIVITY_STALLS_L1D_PENDING 0x0C
+
+EVENT_LSD_UOPS 0xA8 PMC
+UMASK_LSD_UOPS 0x01
+DEFAULT_OPTIONS_LSD_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_LSD_CYCLES_4_UOPS 0x01
+DEFAULT_OPTIONS_LSD_CYCLES_ACTIVE EVENT_OPTION_THRESHOLD=0x1
+UMASK_LSD_CYCLES_ACTIVE 0x01
+
+EVENT_DSB2MITE_SWITCHES 0xAB PMC
+UMASK_DSB2MITE_SWITCHES_COUNT 0x01
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
+
+EVENT_ITLB 0xAE PMC
+UMASK_ITLB_ITLB_FLUSH 0x01
+
+EVENT_OFFCORE_REQUESTS 0xB0 PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD 0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD 0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO 0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD 0x08
+
+EVENT_UOPS_EXECUTED 0xB1 PMC
+UMASK_UOPS_EXECUTED_THREAD 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0xB2 PMC
+UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0x01
+
+EVENT_PAGE_WALKER_LOADS 0xBC PMC
+UMASK_PAGE_WALKER_LOADS_DTLB_L1 0x11
+UMASK_PAGE_WALKER_LOADS_ITLB_L1 0x21
+UMASK_PAGE_WALKER_LOADS_DTLB_L2 0x12
+UMASK_PAGE_WALKER_LOADS_ITLB_L2 0x22
+UMASK_PAGE_WALKER_LOADS_DTLB_L3 0x14
+UMASK_PAGE_WALKER_LOADS_ITLB_L3 0x24
+UMASK_PAGE_WALKER_LOADS_DTLB_MEMORY 0x18
+UMASK_PAGE_WALKER_LOADS_ITLB_MEMORY 0x28
+UMASK_PAGE_WALKER_LOADS_EPT_DTLB_L1 0x41
+UMASK_PAGE_WALKER_LOADS_EPT_ITLB_L1 0x81
+UMASK_PAGE_WALKER_LOADS_EPT_DTLB_L2 0x42
+UMASK_PAGE_WALKER_LOADS_EPT_ITLB_L2 0x82
+UMASK_PAGE_WALKER_LOADS_EPT_DTLB_L3 0x44
+UMASK_PAGE_WALKER_LOADS_EPT_ITLB_L3 0x84
+UMASK_PAGE_WALKER_LOADS_EPT_DTLB_MEMORY 0x48
+UMASK_PAGE_WALKER_LOADS_EPT_ITLB_MEMORY 0x88
+
+EVENT_TLB_FLUSH 0xBD PMC
+UMASK_TLB_FLUSH_DTLB_THREAD 0x01
+UMASK_TLB_FLUSH_STLB_ANY 0x20
+
+EVENT_INST_RETIRED_PREC 0xC0 PMC1
+UMASK_INST_RETIRED_PREC_DIST 0x01
+
+EVENT_INST_RETIRED_ANY 0xC0 PMC
+UMASK_INST_RETIRED_ANY_P 0x00
+
+EVENT_OTHER_ASSISTS 0xC1 PMC
+UMASK_OTHER_ASSISTS_AVX_TO_SSE 0x08
+UMASK_OTHER_ASSISTS_SSE_TO_AVX 0x10
+UMASK_OTHER_ASSISTS_ANY_WB_ASSIST 0x40
+
+EVENT_UOPS_RETIRED 0xC2 PMC
+UMASK_UOPS_RETIRED_ALL 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL 0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_MACHINE_CLEARS 0xC3 PMC
+UMASK_MACHINE_CLEARS_CYCLES 0x01
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_MACHINE_CLEARS_COUNT 0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING 0x02
+UMASK_MACHINE_CLEARS_SMC 0x04
+UMASK_MACHINE_CLEARS_MASKMOV 0x20
+
+EVENT_BR_INST_RETIRED 0xC4 PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL 0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL 0x02
+UMASK_BR_INST_RETIRED_ALL_BRANCHES 0x04
+UMASK_BR_INST_RETIRED_NEAR_RETURN 0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN 0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN 0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH 0x40
+
+EVENT_BR_MISP_RETIRED 0xC5 PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL 0x01
+UMASK_BR_MISP_RETIRED_NEAR_NOT_TAKEN 0x10
+UMASK_BR_MISP_RETIRED_NEAR_TAKEN 0x20
+
+EVENT_AVX_INSTS 0xC6 PMC
+UMASK_AVX_INSTS_LOADS 0x01
+UMASK_AVX_INSTS_STORES 0x02
+UMASK_AVX_INSTS_CALC 0x04
+UMASK_AVX_INSTS_ALL 0x07
+
+EVENT_HLE_RETIRED 0xC8 PMC
+UMASK_HLE_RETIRED_START 0x01
+UMASK_HLE_RETIRED_COMMIT 0x02
+UMASK_HLE_RETIRED_ABORTED 0x04
+UMASK_HLE_RETIRED_ABORTED_MISC1 0x08
+UMASK_HLE_RETIRED_ABORTED_MISC2 0x10
+UMASK_HLE_RETIRED_ABORTED_MISC3 0x20
+UMASK_HLE_RETIRED_ABORTED_MISC4 0x40
+UMASK_HLE_RETIRED_ABORTED_MISC5 0x80
+
+EVENT_RTM_RETIRED 0xC9 PMC
+UMASK_RTM_RETIRED_START 0x01
+UMASK_RTM_RETIRED_COMMIT 0x02
+UMASK_RTM_RETIRED_ABORTED 0x04
+UMASK_RTM_RETIRED_ABORTED_MISC1 0x08
+UMASK_RTM_RETIRED_ABORTED_MISC2 0x10
+UMASK_RTM_RETIRED_ABORTED_MISC3 0x20
+UMASK_RTM_RETIRED_ABORTED_MISC4 0x40
+UMASK_RTM_RETIRED_ABORTED_MISC5 0x80
+
+
+EVENT_FP_ASSIST 0xCA PMC
+UMASK_FP_ASSIST_X87_OUTPUT 0x02
+UMASK_FP_ASSIST_X87_INPUT 0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT 0x08
+UMASK_FP_ASSIST_SIMD_INPUT 0x10
+DEFAULT_OPTIONS_FP_ASSIST_ANY EVENT_OPTION_THRESHOLD=0x1
+UMASK_FP_ASSIST_ANY 0x1E
+
+EVENT_ROB_MISC_EVENT_LBR_INSERTS 0xCC PMC
+UMASK_ROB_MISC_EVENT_LBR_INSERTS 0x20
+
+EVENT_MEM_UOPS_RETIRED 0xD0 PMC
+UMASK_MEM_UOPS_RETIRED_LOADS 0x81
+UMASK_MEM_UOPS_RETIRED_STORES 0x82
+UMASK_MEM_UOPS_RETIRED_ALL 0x83
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS 0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK 0x21
+UMASK_MEM_UOPS_RETIRED_STORES_LOCK 0x22
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT 0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT 0x42
+
+EVENT_MEM_LOAD_UOPS_RETIRED 0xD1 PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT 0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS 0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL 0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT 0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS 0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL 0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT 0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS 0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL 0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB 0x40
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL_MISS 0x38
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL_HIT 0x07
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL_ALL 0x3F
+
+EVENT_MEM_LOAD_UOPS_L3_HIT_RETIRED 0xD2 PMC
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_MISS 0x01
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HIT 0x02
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM 0x04
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_NONE 0x08
+
+EVENT_MEM_LOAD_UOPS_L3_MISS_RETIRED 0xD3 PMC
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_LOCAL_DRAM 0x01
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_DRAM 0x04
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM 0x10
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_FWD 0x20
+
+
+EVENT_BACLEARS 0xE6 PMC
+UMASK_BACLEARS_ANY 0x1F
+
+EVENT_L2_TRANS 0xF0 PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO 0x02
+UMASK_L2_TRANS_CODE_RD 0x04
+UMASK_L2_TRANS_ALL_PF 0x08
+UMASK_L2_TRANS_L1D_WB 0x10
+UMASK_L2_TRANS_L2_FILL 0x20
+UMASK_L2_TRANS_L2_WB 0x40
+UMASK_L2_TRANS_ALL_REQUESTS 0x80
+
+EVENT_L2_LINES_IN 0xF1 PMC
+UMASK_L2_LINES_IN_I 0x01
+UMASK_L2_LINES_IN_S 0x02
+UMASK_L2_LINES_IN_E 0x04
+UMASK_L2_LINES_IN_ALL 0x07
+
+EVENT_L2_LINES_OUT 0xF2 PMC
+UMASK_L2_LINES_OUT_DEMAND_CLEAN 0x05
+UMASK_L2_LINES_OUT_DEMAND_DIRTY 0x06
+
+EVENT_OFFCORE_RESPONSE_0 0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM EVENT_OPTION_MATCH0=0x8FFF,EVENT_OPTION_MATCH1=0x60040
+UMASK_OFFCORE_RESPONSE_0_LOCAL_DRAM 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM EVENT_OPTION_MATCH0=0x8FFF,EVENT_OPTION_MATCH1=0x67F80
+UMASK_OFFCORE_RESPONSE_0_REMOTE_DRAM 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1 0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x60040
+UMASK_OFFCORE_RESPONSE_1_LOCAL_DRAM 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x67F80
+UMASK_OFFCORE_RESPONSE_1_REMOTE_DRAM 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_CBOX_CLOCKTICKS 0x00 CBOX
+UMASK_CBOX_CLOCKTICKS 0x00
+
+EVENT_TXR_INSERTS 0x02 CBOX
+UMASK_TXR_INSERTS_AD_CACHE 0x01
+UMASK_TXR_INSERTS_AK_CACHE 0x02
+UMASK_TXR_INSERTS_BL_CACHE 0x04
+UMASK_TXR_INSERTS_IV_CACHE 0x08
+UMASK_TXR_INSERTS_AD_CORE 0x10
+UMASK_TXR_INSERTS_AK_CORE 0x20
+UMASK_TXR_INSERTS_BL_CORE 0x40
+
+EVENT_TXR_ADS_USED 0x04 CBOX
+UMASK_TXR_ADS_USED_AD 0x01
+UMASK_TXR_ADS_USED_AK 0x02
+UMASK_TXR_ADS_USED_BL 0x04
+
+EVENT_RING_BOUNCES 0x05 CBOX
+UMASK_RING_BOUNCES_AD 0x01
+UMASK_RING_BOUNCES_AK 0x02
+UMASK_RING_BOUNCES_BL 0x04
+UMASK_RING_BOUNCES_IV 0x08
+
+EVENT_RING_SRC_THRTL 0x07 CBOX
+UMASK_RING_SRC_THRTL 0x00
+
+EVENT_FAST_ASSERTED 0x09 CBOX0C0|CBOX0C1|CBOX1C0|CBOX1C1|CBOX2C0|CBOX2C1|CBOX3C0|CBOX3C1|CBOX4C0|CBOX4C1|CBOX5C0|CBOX5C1|CBOX6C0|CBOX6C1|CBOX7C0|CBOX7C1|CBOX8C0|CBOX8C1|CBOX9C0|CBOX9C1|CBOX10C0|CBOX10C1|CBOX11C0|CBOX11C1|CBOX12C0|CBOX12C1|CBOX13C0|CBOX13C1|CBOX14C0|CBOX14C1|CBOX15C0|CBOX15C1|CBOX16C0|CBOX16C1|CBOX17C0|CBOX17C1
+UMASK_FAST_ASSERTED 0x00
+
+EVENT_BOUNCE_CONTROL 0xA CBOX
+UMASK_BOUNCE_CONTROL 0x00
+
+EVENT_RXR_OCCUPANCY 0x11 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0|CBOX16C0|CBOX17C0
+UMASK_RXR_OCCUPANCY_IRQ 0x01
+UMASK_RXR_OCCUPANCY_IRQ_REJ 0x02
+UMASK_RXR_OCCUPANCY_IPQ 0x04
+UMASK_RXR_OCCUPANCY_PRQ_REJ 0x20
+UMASK_RXR_OCCUPANCY_IRQ_IPQ 0x05
+UMASK_RXR_OCCUPANCY_IRQ_PRQ_REJ 0x21
+UMASK_RXR_OCCUPANCY_IPQ_PRQ_REJ 0x24
+
+EVENT_RXR_EXT_STARVED 0x12 CBOX
+UMASK_RXR_EXT_STARVED_IRQ 0x01
+UMASK_RXR_EXT_STARVED_IPQ 0x02
+UMASK_RXR_EXT_STARVED_PRQ 0x04
+UMASK_RXR_EXT_STARVED_ISMQ_BIDS 0x08
+
+EVENT_RXR_INSERTS 0x13 CBOX
+UMASK_RXR_INSERTS_IRQ 0x01
+UMASK_RXR_INSERTS_IRQ_REJ 0x02
+UMASK_RXR_INSERTS_IPQ 0x04
+UMASK_RXR_INSERTS_PRQ 0x10
+UMASK_RXR_INSERTS_PRQ_REJ 0x20
+
+EVENT_RING_AD_USED 0x1B CBOX
+UMASK_RING_AD_USED_UP_EVEN 0x01
+UMASK_RING_AD_USED_UP_ODD 0x02
+UMASK_RING_AD_USED_UP 0x03
+UMASK_RING_AD_USED_DOWN_EVEN 0x04
+UMASK_RING_AD_USED_DOWN_ODD 0x08
+UMASK_RING_AD_USED_DOWN 0x0C
+UMASK_RING_AD_USED_ANY 0x0F
+
+EVENT_RING_AK_USED 0x1C CBOX
+UMASK_RING_AK_USED_UP_EVEN 0x01
+UMASK_RING_AK_USED_UP_ODD 0x02
+UMASK_RING_AK_USED_UP 0x03
+UMASK_RING_AK_USED_DOWN_EVEN 0x04
+UMASK_RING_AK_USED_DOWN_ODD 0x08
+UMASK_RING_AK_USED_DOWN 0x0C
+UMASK_RING_AK_USED_ANY 0x0F
+
+EVENT_RING_BL_USED 0x1D CBOX
+UMASK_RING_BL_USED_UP_EVEN 0x01
+UMASK_RING_BL_USED_UP_ODD 0x02
+UMASK_RING_BL_USED_UP 0x03
+UMASK_RING_BL_USED_DOWN_EVEN 0x04
+UMASK_RING_BL_USED_DOWN_ODD 0x08
+UMASK_RING_BL_USED_DOWN 0x0C
+UMASK_RING_BL_USED_ANY 0x0F
+
+EVENT_RING_IV_USED 0x1E CBOX
+UMASK_RING_IV_USED_UP 0x03
+UMASK_RING_IV_USED_DN 0x0C
+UMASK_RING_IV_USED_ANY 0x0F
+
+EVENT_COUNTER0_OCCUPANCY 0x1F CBOX
+UMASK_COUNTER0_OCCUPANCY 0x00
+
+EVENT_RXR_IPQ_RETRY2 0x28 CBOX
+UMASK_RXR_IPQ_RETRY2_AD_SBO 0x01
+OPTIONS_RXR_IPQ_RETRY2_TARGET EVENT_OPTION_NID_MASK
+UMASK_RXR_IPQ_RETRY2_TARGET 0x40
+
+EVENT_RXR_IRQ_RETRY2 0x29 CBOX
+UMASK_RXR_IRQ_RETRY2_AD_SBO 0x01
+UMASK_RXR_IRQ_RETRY2_BL_SBO 0x02
+OPTIONS_RXR_IRQ_RETRY2_TARGET EVENT_OPTION_NID_MASK
+UMASK_RXR_IRQ_RETRY2_TARGET 0x40
+
+EVENT_RXR_ISMQ_RETRY2 0x2A CBOX
+UMASK_RXR_ISMQ_RETRY2_AD_SBO 0x01
+UMASK_RXR_ISMQ_RETRY2_BL_SBO 0x02
+OPTIONS_RXR_ISMQ_RETRY2_TARGET EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY2_TARGET 0x40
+
+EVENT_RXR_IPQ_RETRY 0x31 CBOX
+UMASK_RXR_IPQ_RETRY_ANY 0x01
+UMASK_RXR_IPQ_RETRY_FULL 0x02
+UMASK_RXR_IPQ_RETRY_ADDR_CONFLICT 0x04
+UMASK_RXR_IPQ_RETRY_QPI_CREDITS 0x10
+
+EVENT_RXR_IRQ_RETRY 0x32 CBOX
+UMASK_RXR_IRQ_RETRY_ANY 0x01
+UMASK_RXR_IRQ_RETRY_FULL 0x02
+UMASK_RXR_IRQ_RETRY_ADDR_CONFLICT 0x04
+UMASK_RXR_IRQ_RETRY_RTID 0x08
+UMASK_RXR_IRQ_RETRY_QPI_CREDITS 0x10
+UMASK_RXR_IRQ_RETRY_IIO_CREDITS 0x20
+OPTIONS_RXR_IRQ_RETRY_NID EVENT_OPTION_NID_MASK
+UMASK_RXR_IRQ_RETRY_NID 0x40
+
+EVENT_RXR_ISMQ_RETRY 0x33 CBOX
+UMASK_RXR_ISMQ_RETRY_ANY 0x01
+UMASK_RXR_ISMQ_RETRY_FULL 0x02
+UMASK_RXR_ISMQ_RETRY_RTID 0x08
+UMASK_RXR_ISMQ_RETRY_QPI_CREDITS 0x10
+UMASK_RXR_ISMQ_RETRY_IIO_CREDITS 0x20
+OPTIONS_RXR_ISMQ_RETRY_NID EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY_NID 0x40
+OPTIONS_RXR_ISMQ_RETRY_WB_CREDITS EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY_WB_CREDITS 0x80
+
+EVENT_LLC_LOOKUP 0x34 CBOX
+OPTIONS_LLC_LOOKUP_DATA_READ EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_DATA_READ 0x03
+OPTIONS_LLC_LOOKUP_WRITE EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_WRITE 0x05
+OPTIONS_LLC_LOOKUP_REMOTE_SNOOP EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_REMOTE_SNOOP 0x09
+OPTIONS_LLC_LOOKUP_ANY EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_ANY 0x11
+OPTIONS_LLC_LOOKUP_READ EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_READ 0x21
+OPTIONS_LLC_LOOKUP_NID EVENT_OPTION_STATE_MASK|EVENT_OPTION_NID_MASK
+UMASK_LLC_LOOKUP_NID 0x41
+
+EVENT_TOR_INSERTS 0x35 CBOX
+UMASK_TOR_INSERTS_ALL 0x08
+UMASK_TOR_INSERTS_WB 0x10
+OPTIONS_TOR_INSERTS_LOCAL_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_LOCAL_OPCODE 0x21
+OPTIONS_TOR_INSERTS_MISS_LOCAL_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_LOCAL_OPCODE 0x23
+UMASK_TOR_INSERTS_LOCAL 0x28
+UMASK_TOR_INSERTS_MISS_LOCAL 0x2A
+OPTIONS_TOR_INSERTS_NID_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_OPCODE 0x41
+OPTIONS_TOR_INSERTS_NID_MISS_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_MISS_OPCODE 0x43
+OPTIONS_TOR_INSERTS_NID_EVICTION EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_EVICTION 0x44
+OPTIONS_TOR_INSERTS_NID_ALL EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_ALL 0x48
+OPTIONS_TOR_INSERTS_NID_MISS_ALL EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_MISS_ALL 0x4A
+OPTIONS_TOR_INSERTS_NID_WB EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_WB 0x50
+OPTIONS_TOR_INSERTS_REMOTE_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_REMOTE_OPCODE 0x81
+OPTIONS_TOR_INSERTS_MISS_REMOTE_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_REMOTE_OPCODE 0x83
+UMASK_TOR_INSERTS_REMOTE 0x88
+UMASK_TOR_INSERTS_MISS_REMOTE 0x8A
+
+EVENT_TOR_OCCUPANCY 0x36 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0|CBOX16C0|CBOX17C0
+OPTIONS_TOR_OCCUPANCY_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_OPCODE 0x01
+OPTIONS_TOR_OCCUPANCY_MISS_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_OPCODE 0x03
+UMASK_TOR_OCCUPANCY_EVICTION 0x04
+UMASK_TOR_OCCUPANCY_ALL 0x08
+UMASK_TOR_OCCUPANCY_MISS_ALL 0x0A
+UMASK_TOR_OCCUPANCY_WB 0x10
+UMASK_TOR_OCCUPANCY_LOCAL_OPCODE 0x21
+UMASK_TOR_OCCUPANCY_MISS_LOCAL_OPCODE 0x23
+UMASK_TOR_OCCUPANCY_LOCAL 0x28
+UMASK_TOR_OCCUPANCY_MISS_LOCAL 0x2A
+OPTIONS_TOR_OCCUPANCY_NID_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_OPCODE 0x41
+OPTIONS_TOR_OCCUPANCY_NID_MISS_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_OPCODE 0x43
+OPTIONS_TOR_OCCUPANCY_NID_EVICTION EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_EVICTION 0x44
+OPTIONS_TOR_OCCUPANCY_NID_ALL EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_ALL 0x48
+OPTIONS_TOR_OCCUPANCY_NID_MISS_ALL EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_ALL 0x4A
+OPTIONS_TOR_OCCUPANCY_NID_WB EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_WB 0x50
+OPTIONS_TOR_OCCUPANCY_REMOTE_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_REMOTE_OPCODE 0x81
+OPTIONS_TOR_OCCUPANCY_MISS_REMOTE_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_REMOTE_OPCODE 0x83
+UMASK_TOR_OCCUPANCY_REMOTE 0x88
+UMASK_TOR_OCCUPANCY_MISS_REMOTE 0x8A
+
+EVENT_LLC_VICTIMS 0x37 CBOX
+UMASK_LLC_VICTIMS_M 0x01
+UMASK_LLC_VICTIMS_E 0x02
+UMASK_LLC_VICTIMS_I 0x04
+UMASK_LLC_VICTIMS_F 0x08
+UMASK_LLC_VICTIMS_MEIF 0x0F
+UMASK_LLC_VICTIMS_MISS 0x10
+OPTIONS_LLC_VICTIMS_NID EVENT_OPTION_NID_MASK
+UMASK_LLC_VICTIMS_NID 0x40
+
+EVENT_MISC 0x39 CBOX
+UMASK_MISC_RSPI_WAS_FSE 0x01
+UMASK_MISC_WC_ALIASING 0x02
+UMASK_MISC_STARTED 0x04
+UMASK_MISC_RFO_HIT_S 0x08
+UMASK_MISC_CVZERO_PREFETCH_VICTIM 0x10
+UMASK_MISC_CVZERO_PREFETCH_MISS 0x20
+
+EVENT_SBO_CREDITS_ACQUIRED 0x3D CBOX
+UMASK_SBO_CREDITS_ACQUIRED_AD 0x01
+UMASK_SBO_CREDITS_ACQUIRED_BL 0x02
+
+EVENT_SBO_CREDIT_OCCUPANCY 0x3E CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0|CBOX16C0|CBOX17C0
+UMASK_SBO_CREDIT_OCCUPANCY_AD 0x01
+UMASK_SBO_CREDIT_OCCUPANCY_BL 0x02
+
+EVENT_EVENT_MSG 0x42 UBOX
+UMASK_EVENT_MSG_DOORBELL_RCVD 0x08
+
+EVENT_PHOLD_CYCLES 0x45 UBOX
+UMASK_PHOLD_CYCLES_ASSERT_TO_ACK 0x01
+
+EVENT_RACU_REQUESTS 0x46 UBOX
+UMASK_RACU_REQUESTS 0x00
+
+EVENT_UNCORE_CLOCK 0x00 UBOXFIX
+UMASK_UNCORE_CLOCK 0x00
+
+EVENT_SBOX_CLOCKTICKS 0x00 SBOX
+UMASK_SBOX_CLOCKTICKS 0x00
+
+EVENT_TXR_OCCUPANCY 0x01 SBOX
+UMASK_TXR_OCCUPANCY_AD_CRD 0x01
+UMASK_TXR_OCCUPANCY_AD_BNC 0x02
+UMASK_TXR_OCCUPANCY_BL_CRD 0x04
+UMASK_TXR_OCCUPANCY_BL_BNC 0x08
+UMASK_TXR_OCCUPANCY_AK 0x10
+UMASK_TXR_OCCUPANCY_IV 0x20
+
+EVENT_TXR_INSERTS 0x02 SBOX
+UMASK_TXR_INSERTS_AD_CRD 0x01
+UMASK_TXR_INSERTS_AD_BNC 0x02
+UMASK_TXR_INSERTS_BL_CRD 0x04
+UMASK_TXR_INSERTS_BL_BNC 0x08
+UMASK_TXR_INSERTS_AK 0x10
+UMASK_TXR_INSERTS_IV 0x20
+
+EVENT_TXR_ADS_USED 0x04 SBOX
+UMASK_TXR_ADS_USED_AD 0x01
+UMASK_TXR_ADS_USED_AK 0x02
+UMASK_TXR_ADS_USED_BL 0x04
+
+EVENT_RING_BOUNCES 0x05 SBOX
+UMASK_RING_BOUNCES_AD_CACHE 0x01
+UMASK_RING_BOUNCES_AK_CORE 0x02
+UMASK_RING_BOUNCES_BL_CORE 0x04
+UMASK_RING_BOUNCES_IV_CORE 0x08
+
+EVENT_FAST_ASSERTED 0x09 SBOX
+UMASK_FAST_ASSERTED 0x00
+
+EVENT_BOUNCE_CONTROL 0x0A SBOX
+UMASK_BOUNCE_CONTROL 0x00
+
+EVENT_RXR_OCCUPANCY 0x11 SBOX
+UMASK_RXR_OCCUPANCY_AD_CRD 0x01
+UMASK_RXR_OCCUPANCY_AD_BNC 0x02
+UMASK_RXR_OCCUPANCY_BL_CRD 0x04
+UMASK_RXR_OCCUPANCY_BL_BNC 0x08
+UMASK_RXR_OCCUPANCY_AK 0x10
+UMASK_RXR_OCCUPANCY_IV 0x20
+
+EVENT_RXR_BYPASS 0x12 SBOX
+UMASK_RXR_BYPASS_AD_CRD 0x01
+UMASK_RXR_BYPASS_AD_BNC 0x02
+UMASK_RXR_BYPASS_BL_CRD 0x04
+UMASK_RXR_BYPASS_BL_BNC 0x08
+UMASK_RXR_BYPASS_AK 0x10
+UMASK_RXR_BYPASS_IV 0x20
+
+EVENT_RXR_INSERTS 0x13 SBOX
+UMASK_RXR_INSERTS_AD_CRD 0x01
+UMASK_RXR_INSERTS_AD_BNC 0x02
+UMASK_RXR_INSERTS_BL_CRD 0x04
+UMASK_RXR_INSERTS_BL_BNC 0x08
+UMASK_RXR_INSERTS_AK 0x10
+UMASK_RXR_INSERTS_IV 0x20
+
+EVENT_RING_AD_USED 0x1B SBOX
+UMASK_RING_AD_USED_ANY 0x0F
+UMASK_RING_AD_USED_UP_EVEN 0x01
+UMASK_RING_AD_USED_UP_ODD 0x02
+UMASK_RING_AD_USED_UP 0x03
+UMASK_RING_AD_USED_DOWN_EVEN 0x04
+UMASK_RING_AD_USED_DOWN_ODD 0x08
+UMASK_RING_AD_USED_DOWN 0x0C
+
+EVENT_RING_AK_USED 0x1C SBOX
+UMASK_RING_AK_USED_ANY 0x0F
+UMASK_RING_AK_USED_UP_EVEN 0x01
+UMASK_RING_AK_USED_UP_ODD 0x02
+UMASK_RING_AK_USED_UP 0x03
+UMASK_RING_AK_USED_DOWN_EVEN 0x04
+UMASK_RING_AK_USED_DOWN_ODD 0x08
+UMASK_RING_AK_USED_DOWN 0x0C
+
+EVENT_RING_BL_USED 0x1D SBOX
+UMASK_RING_BL_USED_ANY 0x0F
+UMASK_RING_BL_USED_UP_EVEN 0x01
+UMASK_RING_BL_USED_UP_ODD 0x02
+UMASK_RING_BL_USED_UP 0x03
+UMASK_RING_BL_USED_DOWN_EVEN 0x04
+UMASK_RING_BL_USED_DOWN_ODD 0x08
+UMASK_RING_BL_USED_DOWN 0x0C
+
+EVENT_RING_IV_USED 0x1E SBOX
+UMASK_RING_IV_USED_ANY 0x0F
+UMASK_RING_IV_USED_UP 0x03
+UMASK_RING_IV_USED_DOWN 0x0C
+
+EVENT_WBOX_CLOCKTICKS 0x00 WBOX
+UMASK_WBOX_CLOCKTICKS 0x00
+
+EVENT_CORE0_TRANSITION_CYCLES 0x60 WBOX
+UMASK_CORE0_TRANSITION_CYCLES 0x00
+
+EVENT_CORE1_TRANSITION_CYCLES 0x61 WBOX
+UMASK_CORE1_TRANSITION_CYCLES 0x00
+
+EVENT_CORE2_TRANSITION_CYCLES 0x62 WBOX
+UMASK_CORE2_TRANSITION_CYCLES 0x00
+
+EVENT_CORE3_TRANSITION_CYCLES 0x63 WBOX
+UMASK_CORE3_TRANSITION_CYCLES 0x00
+
+EVENT_CORE4_TRANSITION_CYCLES 0x64 WBOX
+UMASK_CORE4_TRANSITION_CYCLES 0x00
+
+EVENT_CORE5_TRANSITION_CYCLES 0x65 WBOX
+UMASK_CORE5_TRANSITION_CYCLES 0x00
+
+EVENT_CORE6_TRANSITION_CYCLES 0x66 WBOX
+UMASK_CORE6_TRANSITION_CYCLES 0x00
+
+EVENT_CORE7_TRANSITION_CYCLES 0x67 WBOX
+UMASK_CORE7_TRANSITION_CYCLES 0x00
+
+EVENT_CORE8_TRANSITION_CYCLES 0x68 WBOX
+UMASK_CORE8_TRANSITION_CYCLES 0x00
+
+EVENT_CORE9_TRANSITION_CYCLES 0x69 WBOX
+UMASK_CORE9_TRANSITION_CYCLES 0x00
+
+EVENT_CORE10_TRANSITION_CYCLES 0x6A WBOX
+UMASK_CORE10_TRANSITION_CYCLES 0x00
+
+EVENT_CORE11_TRANSITION_CYCLES 0x6B WBOX
+UMASK_CORE11_TRANSITION_CYCLES 0x00
+
+EVENT_CORE12_TRANSITION_CYCLES 0x6C WBOX
+UMASK_CORE12_TRANSITION_CYCLES 0x00
+
+EVENT_CORE13_TRANSITION_CYCLES 0x6D WBOX
+UMASK_CORE13_TRANSITION_CYCLES 0x00
+
+EVENT_CORE14_TRANSITION_CYCLES 0x6E WBOX
+UMASK_CORE14_TRANSITION_CYCLES 0x00
+
+EVENT_CORE15_TRANSITION_CYCLES 0x6F WBOX
+UMASK_CORE15_TRANSITION_CYCLES 0x00
+
+EVENT_CORE16_TRANSITION_CYCLES 0x70 WBOX
+UMASK_CORE16_TRANSITION_CYCLES 0x00
+
+EVENT_CORE17_TRANSITION_CYCLES 0x71 WBOX
+UMASK_CORE17_TRANSITION_CYCLES 0x00
+
+EVENT_DEMOTIONS_CORE0 0x30 WBOX
+UMASK_DEMOTIONS_CORE0 0x00
+
+EVENT_DEMOTIONS_CORE1 0x31 WBOX
+UMASK_DEMOTIONS_CORE1 0x00
+
+EVENT_DEMOTIONS_CORE2 0x32 WBOX
+UMASK_DEMOTIONS_CORE2 0x00
+
+EVENT_DEMOTIONS_CORE3 0x33 WBOX
+UMASK_DEMOTIONS_CORE3 0x00
+
+EVENT_DEMOTIONS_CORE4 0x34 WBOX
+UMASK_DEMOTIONS_CORE4 0x00
+
+EVENT_DEMOTIONS_CORE5 0x35 WBOX
+UMASK_DEMOTIONS_CORE5 0x00
+
+EVENT_DEMOTIONS_CORE6 0x36 WBOX
+UMASK_DEMOTIONS_CORE6 0x00
+
+EVENT_DEMOTIONS_CORE7 0x37 WBOX
+UMASK_DEMOTIONS_CORE7 0x00
+
+EVENT_DEMOTIONS_CORE8 0x38 WBOX
+UMASK_DEMOTIONS_CORE8 0x00
+
+EVENT_DEMOTIONS_CORE9 0x39 WBOX
+UMASK_DEMOTIONS_CORE9 0x00
+
+EVENT_DEMOTIONS_CORE10 0x3A WBOX
+UMASK_DEMOTIONS_CORE10 0x00
+
+EVENT_DEMOTIONS_CORE11 0x3B WBOX
+UMASK_DEMOTIONS_CORE11 0x00
+
+EVENT_DEMOTIONS_CORE12 0x3C WBOX
+UMASK_DEMOTIONS_CORE12 0x00
+
+EVENT_DEMOTIONS_CORE13 0x3D WBOX
+UMASK_DEMOTIONS_CORE13 0x00
+
+EVENT_DEMOTIONS_CORE14 0x3E WBOX
+UMASK_DEMOTIONS_CORE14 0x00
+
+EVENT_DEMOTIONS_CORE15 0x3F WBOX
+UMASK_DEMOTIONS_CORE15 0x00
+
+EVENT_DEMOTIONS_CORE16 0x40 WBOX
+UMASK_DEMOTIONS_CORE16 0x00
+
+EVENT_DEMOTIONS_CORE17 0x41 WBOX
+UMASK_DEMOTIONS_CORE17 0x00
+
+EVENT_FREQ_BAND0_CYCLES 0x0B WBOX
+OPTIONS_FREQ_BAND0_CYCLES EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND0_CYCLES 0x00
+
+EVENT_FREQ_BAND1_CYCLES 0x0C WBOX
+OPTIONS_FREQ_BAND1_CYCLES EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND1_CYCLES 0x00
+
+EVENT_FREQ_BAND2_CYCLES 0x0D WBOX
+OPTIONS_FREQ_BAND2_CYCLES EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND2_CYCLES 0x00
+
+EVENT_FREQ_BAND3_CYCLES 0x0E WBOX
+OPTIONS_FREQ_BAND3_CYCLES EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND3_CYCLES 0x00
+
+EVENT_FREQ_MAX_LIMIT_THERMAL_CYCLES 0x04 WBOX
+UMASK_FREQ_MAX_LIMIT_THERMAL_CYCLES 0x00
+
+EVENT_FREQ_MAX_OS_CYCLES 0x06 WBOX
+UMASK_FREQ_MAX_OS_CYCLES 0x00
+
+EVENT_FREQ_MAX_POWER_CYCLES 0x05 WBOX
+UMASK_FREQ_MAX_POWER_CYCLES 0x00
+
+EVENT_FREQ_MIN_IO_P_CYCLES 0x73 WBOX
+UMASK_FREQ_MIN_IO_P_CYCLES 0x00
+
+EVENT_FREQ_TRANS_CYCLES 0x74 WBOX
+UMASK_FREQ_TRANS_CYCLES 0x00
+
+EVENT_MEMORY_PHASE_SHEDDING_CYCLES 0x2F WBOX
+UMASK_MEMORY_PHASE_SHEDDING_CYCLES 0x00
+
+EVENT_PKG_RESIDENCY_C0_CYCLES 0x2A WBOX
+UMASK_PKG_RESIDENCY_C0_CYCLES 0x00
+
+EVENT_PKG_RESIDENCY_C1E_CYCLES 0x4E WBOX
+UMASK_PKG_RESIDENCY_C1E_CYCLES 0x00
+
+EVENT_PKG_RESIDENCY_C2E_CYCLES 0x2B WBOX
+UMASK_PKG_RESIDENCY_C2E_CYCLES 0x00
+
+EVENT_PKG_RESIDENCY_C3_CYCLES 0x2C WBOX
+UMASK_PKG_RESIDENCY_C3_CYCLES 0x00
+
+EVENT_PKG_RESIDENCY_C6_CYCLES 0x2D WBOX
+UMASK_PKG_RESIDENCY_C6_CYCLES 0x00
+
+EVENT_PKG_RESIDENCY_C7_CYCLES 0x2E WBOX
+UMASK_PKG_RESIDENCY_C7_CYCLES 0x00
+
+EVENT_POWER_STATE_OCCUPANCY 0x80 WBOX
+UMASK_POWER_STATE_OCCUPANCY_CORES_C0 0x40
+UMASK_POWER_STATE_OCCUPANCY_CORES_C3 0x80
+UMASK_POWER_STATE_OCCUPANCY_CORES_C6 0xC0
+
+EVENT_PROCHOT_EXTERNAL_CYCLES 0x0A WBOX
+UMASK_PROCHOT_EXTERNAL_CYCLES 0x00
+
+EVENT_PROCHOT_INTERNAL_CYCLES 0x09 WBOX
+UMASK_PROCHOT_INTERNAL_CYCLES 0x00
+
+EVENT_TOTAL_TRANSITION_CYCLES 0x72 WBOX
+UMASK_TOTAL_TRANSITION_CYCLES 0x00
+
+EVENT_UFS_TRANSITIONS_RING_GV 0x79 WBOX
+UMASK_UFS_TRANSITIONS_RING_GV 0x00
+
+EVENT_VR_HOT_CYCLES 0x42 WBOX
+UMASK_VR_HOT_CYCLES 0x00
+
+EVENT_CORE_CORE_C6_RESIDENCY 0x00 WBOX0FIX
+UMASK_CORE_CORE_C6_RESIDENCY 0x00
+
+EVENT_CORE_CORE_C3_RESIDENCY 0x00 WBOX1FIX
+UMASK_CORE_CORE_C3_RESIDENCY 0x00
+
+EVENT_CORE_PKG_C2_RESIDENCY 0x00 WBOX2FIX
+UMASK_CORE_PKG_C2_RESIDENCY 0x00
+
+EVENT_CORE_PKG_C3_RESIDENCY 0x00 WBOX3FIX
+UMASK_CORE_PKG_C3_RESIDENCY 0x00
+
+EVENT_BBOX_CLOCKTICKS 0x00 BBOX
+UMASK_BBOX_CLOCKTICKS 0x00
+
+EVENT_ADDR_OPC_MATCH 0x20 BBOX
+OPTIONS_ADDR_OPC_MATCH_ADDR EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_ADDR 0x01
+OPTIONS_ADDR_OPC_MATCH_OPC EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_OPC 0x02
+OPTIONS_ADDR_OPC_MATCH_FILT EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_FILT 0x03
+OPTIONS_ADDR_OPC_MATCH_AD EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AD 0x04
+OPTIONS_ADDR_OPC_MATCH_BL EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_BL 0x08
+OPTIONS_ADDR_OPC_MATCH_AK EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AK 0x10
+
+EVENT_BT_CYCLES_NE 0x42 BBOX
+UMASK_BT_CYCLES_NE 0x00
+
+EVENT_BT_OCCUPANCY 0x43 BBOX
+UMASK_BT_OCCUPANCY 0x00
+
+EVENT_BYPASS_IMC 0x14 BBOX
+UMASK_BYPASS_IMC_TAKEN 0x01
+UMASK_BYPASS_IMC_NOT_TAKEN 0x02
+
+EVENT_CONFLICT_CYCLES 0x0B BBOX0C1|BBOX1C1
+UMASK_CONFLICT_CYCLES 0x00
+
+EVENT_DIRECT2CORE_COUNT 0x11 BBOX
+UMASK_DIRECT2CORE_COUNT 0x00
+
+EVENT_DIRECT2CORE_CYCLES_DISABLED 0x12 BBOX
+UMASK_DIRECT2CORE_CYCLES_DISABLED 0x00
+
+EVENT_DIRECT2CORE_TXN_OVERRIDE 0x13 BBOX
+UMASK_DIRECT2CORE_TXN_OVERRIDE 0x00
+
+EVENT_DIRECTORY_LAT_OPT 0x41 BBOX
+UMASK_DIRECTORY_LAT_OPT 0x00
+
+EVENT_DIRECTORY_LOOKUP 0x0C BBOX
+UMASK_DIRECTORY_LOOKUP_SNP 0x01
+UMASK_DIRECTORY_LOOKUP_NO_SNP 0x02
+
+EVENT_DIRECTORY_UPDATE 0x0D BBOX
+UMASK_DIRECTORY_UPDATE_SET 0x01
+UMASK_DIRECTORY_UPDATE_CLEAR 0x02
+UMASK_DIRECTORY_UPDATE_ANY 0x03
+
+EVENT_HITME_LOOKUP 0x70 BBOX
+UMASK_HITME_LOOKUP_READ_OR_INVITOE 0x01
+UMASK_HITME_LOOKUP_WBMTOI 0x02
+UMASK_HITME_LOOKUP_ACKCNFLTWBI 0x04
+UMASK_HITME_LOOKUP_WBMTOE_OR_S 0x08
+UMASK_HITME_LOOKUP_HOM 0x0F
+UMASK_HITME_LOOKUP_RSPFWDI_REMOTE 0x10
+UMASK_HITME_LOOKUP_RSPFWDI_LOCAL 0x20
+UMASK_HITME_LOOKUP_INVALS 0x26
+UMASK_HITME_LOOKUP_RSPFWDS 0x40
+UMASK_HITME_LOOKUP_EVICTS 0x42
+UMASK_HITME_LOOKUP_ALLOCS 0x70
+UMASK_HITME_LOOKUP_RSP 0x80
+UMASK_HITME_LOOKUP_ALL 0xFF
+
+EVENT_HITME_HIT 0x71 BBOX
+UMASK_HITME_HIT_READ_OR_INVITOE 0x01
+UMASK_HITME_HIT_WBMTOI 0x02
+UMASK_HITME_HIT_ACKCNFLTWBI 0x04
+UMASK_HITME_HIT_WBMTOE_OR_S 0x08
+UMASK_HITME_HIT_HOM 0x0F
+UMASK_HITME_HIT_RSPFWDI_REMOTE 0x10
+UMASK_HITME_HIT_RSPFWDI_LOCAL 0x20
+UMASK_HITME_HIT_INVALS 0x26
+UMASK_HITME_HIT_RSPFWDS 0x40
+UMASK_HITME_HIT_EVICTS 0x42
+UMASK_HITME_HIT_ALLOCS 0x70
+UMASK_HITME_HIT_RSP 0x80
+UMASK_HITME_HIT_ALL 0xFF
+
+EVENT_HITME_HIT_PV_BITS_SET 0x72 BBOX
+UMASK_HITME_HIT_PV_BITS_SET_READ_OR_INVITOE 0x01
+UMASK_HITME_HIT_PV_BITS_SET_WBMTOI 0x02
+UMASK_HITME_HIT_PV_BITS_SET_ACKCNFLTWBI 0x04
+UMASK_HITME_HIT_PV_BITS_SET_WBMTOE_OR_S 0x08
+UMASK_HITME_HIT_PV_BITS_SET_HOM 0x0F
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDI_REMOTE 0x10
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDI_LOCAL 0x20
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDS 0x40
+UMASK_HITME_HIT_PV_BITS_SET_RSP 0x80
+UMASK_HITME_HIT_PV_BITS_SET_ALL 0xFF
+
+EVENT_IGR_NO_CREDIT_CYCLES 0x22 BBOX
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI0 0x01
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI1 0x02
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI0 0x04
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI1 0x08
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI2 0x10
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI2 0x20
+
+EVENT_IMC_READS 0x17 BBOX
+UMASK_IMC_READS_NORMAL 0x01
+
+EVENT_IMC_RETRY 0x1E BBOX
+UMASK_IMC_RETRY 0x00
+
+EVENT_IMC_WRITES 0x1A BBOX
+UMASK_IMC_WRITES_FULL 0x01
+UMASK_IMC_WRITES_PARTIAL 0x02
+UMASK_IMC_WRITES_FULL_ISOCH 0x04
+UMASK_IMC_WRITES_PARTIAL_ISOCH 0x08
+UMASK_IMC_WRITES_ALL 0x0F
+
+EVENT_OSB 0x53 BBOX
+UMASK_OSB_READS_LOCAL 0x02
+UMASK_OSB_INVITOE_LOCAL 0x04
+UMASK_OSB_REMOTE 0x08
+UMASK_OSB_CANCELLED 0x10
+UMASK_OSB_READS_LOCAL_USEFUL 0x20
+UMASK_OSB_REMOTE_USEFUL 0x40
+
+EVENT_OSB_EDR 0x54 BBOX
+UMASK_OSB_EDR_ALL 0x01
+UMASK_OSB_EDR_READS_LOCAL_I 0x02
+UMASK_OSB_EDR_READS_REMOTE_I 0x04
+UMASK_OSB_EDR_READS_LOCAL_S 0x08
+UMASK_OSB_EDR_READS_REMOTE_S 0x10
+
+EVENT_REQUESTS 0x01 BBOX
+UMASK_REQUESTS_READS_LOCAL 0x01
+UMASK_REQUESTS_READS_REMOTE 0x02
+UMASK_REQUESTS_READS 0x03
+UMASK_REQUESTS_WRITES_LOCAL 0x04
+UMASK_REQUESTS_WRITES_REMOTE 0x08
+UMASK_REQUESTS_WRITES 0x0C
+UMASK_REQUESTS_INVITOE_LOCAL 0x10
+UMASK_REQUESTS_INVITOE_REMOTE 0x20
+
+EVENT_RING_AD_USED 0x3E BBOX
+UMASK_RING_AD_USED_CW_EVEN 0x01
+UMASK_RING_AD_USED_CW_ODD 0x02
+UMASK_RING_AD_USED_CW 0x03
+UMASK_RING_AD_USED_CCW_EVEN 0x04
+UMASK_RING_AD_USED_CCW_ODD 0x08
+UMASK_RING_AD_USED_CCW 0x0C
+
+EVENT_RING_AK_USED 0x3F BBOX
+UMASK_RING_AK_USED_CW_EVEN 0x01
+UMASK_RING_AK_USED_CW_ODD 0x02
+UMASK_RING_AK_USED_CW 0x03
+UMASK_RING_AK_USED_CCW_EVEN 0x04
+UMASK_RING_AK_USED_CCW_ODD 0x08
+UMASK_RING_AK_USED_CCW 0x0C
+
+EVENT_RING_BL_USED 0x40 BBOX
+UMASK_RING_BL_USED_CW_EVEN 0x01
+UMASK_RING_BL_USED_CW_ODD 0x02
+UMASK_RING_BL_USED_CW 0x03
+UMASK_RING_BL_USED_CCW_EVEN 0x04
+UMASK_RING_BL_USED_CCW_ODD 0x08
+UMASK_RING_BL_USED_CCW 0x0C
+
+EVENT_RPQ_CYCLES_NO_REG_CREDITS 0x15 BBOX
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN0 0x01
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1 0x02
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2 0x04
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3 0x08
+
+EVENT_WPQ_CYCLES_NO_REG_CREDITS 0x18 BBOX
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0 0x01
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1 0x02
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2 0x04
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3 0x08
+
+EVENT_SBO0_CREDITS_ACQUIRED 0x68 BBOX
+UMASK_SBO0_CREDITS_ACQUIRED_AD 0x01
+UMASK_SBO0_CREDITS_ACQUIRED_BL 0x02
+
+EVENT_SBO1_CREDITS_ACQUIRED 0x69 BBOX
+UMASK_SBO1_CREDITS_ACQUIRED_AD 0x01
+UMASK_SBO1_CREDITS_ACQUIRED_BL 0x02
+
+EVENT_SBO0_CREDITS_OCCUPANCY 0x6A BBOX
+UMASK_SBO0_CREDITS_OCCUPANCY_AD 0x01
+UMASK_SBO0_CREDITS_OCCUPANCY_BL 0x02
+
+EVENT_SBO1_CREDITS_OCCUPANCY 0x6B BBOX
+UMASK_SBO1_CREDITS_OCCUPANCY_AD 0x01
+UMASK_SBO1_CREDITS_OCCUPANCY_BL 0x02
+
+EVENT_SNOOPS_RSP_AFTER_DATA 0x0A BBOX
+UMASK_SNOOPS_RSP_AFTER_DATA_LOCAL 0x01
+UMASK_SNOOPS_RSP_AFTER_DATA_REMOTE 0x02
+
+EVENT_SNOOP_CYCLES_NE 0x08 BBOX
+UMASK_SNOOP_CYCLES_NE_LOCAL 0x01
+UMASK_SNOOP_CYCLES_NE_REMOTE 0x02
+UMASK_SNOOP_CYCLES_NE_ALL 0x03
+
+EVENT_SNOOP_OCCUPANCY 0x09 BBOX
+UMASK_SNOOP_OCCUPANCY_LOCAL 0x01
+UMASK_SNOOP_OCCUPANCY_REMOTE 0x02
+
+EVENT_SNOOP_RESP 0x21 BBOX
+UMASK_SNOOP_RESP_RSPI 0x01
+UMASK_SNOOP_RESP_RSPS 0x02
+UMASK_SNOOP_RESP_RSPIFWD 0x04
+UMASK_SNOOP_RESP_RSPSFWD 0x08
+UMASK_SNOOP_RESP_RSP_WB 0x10
+UMASK_SNOOP_RESP_RSP_FWD_WB 0x20
+UMASK_SNOOP_RESP_RSPCNFLCT 0x40
+
+EVENT_SNP_RESP_RECV_LOCAL 0x60 BBOX
+UMASK_SNP_RESP_RECV_LOCAL_RSPI 0x01
+UMASK_SNP_RESP_RECV_LOCAL_RSPS 0x02
+UMASK_SNP_RESP_RECV_LOCAL_RSPIFWD 0x04
+UMASK_SNP_RESP_RECV_LOCAL_RSPSFWD 0x08
+UMASK_SNP_RESP_RECV_LOCAL_RSPXWB 0x10
+UMASK_SNP_RESP_RECV_LOCAL_RSPXFWDXWB 0x20
+UMASK_SNP_RESP_RECV_LOCAL_RSPCNFLCT 0x40
+UMASK_SNP_RESP_RECV_LOCAL_OTHER 0x80
+
+EVENT_STALL_NO_SBO_CREDIT 0x6C BBOX
+UMASK_STALL_NO_SBO_CREDIT_SBO0_AD 0x01
+UMASK_STALL_NO_SBO_CREDIT_SBO1_AD 0x02
+UMASK_STALL_NO_SBO_CREDIT_SBO0_BL 0x04
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL 0x08
+
+EVENT_TAD_REQUESTS_G0 0x1B BBOX
+UMASK_TAD_REQUESTS_G0_REGION0 0x01
+UMASK_TAD_REQUESTS_G0_REGION1 0x02
+UMASK_TAD_REQUESTS_G0_REGION2 0x04
+UMASK_TAD_REQUESTS_G0_REGION3 0x08
+UMASK_TAD_REQUESTS_G0_REGION4 0x10
+UMASK_TAD_REQUESTS_G0_REGION5 0x20
+UMASK_TAD_REQUESTS_G0_REGION6 0x40
+UMASK_TAD_REQUESTS_G0_REGION7 0x80
+
+EVENT_TAD_REQUESTS_G1 0x1C BBOX
+UMASK_TAD_REQUESTS_G1_REGION8 0x01
+UMASK_TAD_REQUESTS_G1_REGION9 0x02
+UMASK_TAD_REQUESTS_G1_REGION10 0x04
+UMASK_TAD_REQUESTS_G1_REGION11 0x08
+
+EVENT_TRACKER_CYCLES_FULL 0x02 BBOX
+UMASK_TRACKER_CYCLES_FULL_GP 0x01
+UMASK_TRACKER_CYCLES_FULL_ALL 0x02
+
+EVENT_TRACKER_CYCLES_NE 0x03 BBOX
+UMASK_TRACKER_CYCLES_NE_LOCAL 0x01
+UMASK_TRACKER_CYCLES_NE_REMOTE 0x02
+UMASK_TRACKER_CYCLES_NE_ALL 0x03
+
+EVENT_TRACKER_OCCUPANCY 0x04 BBOX
+UMASK_TRACKER_OCCUPANCY_READS_LOCAL 0x04
+UMASK_TRACKER_OCCUPANCY_READS_REMOTE 0x08
+UMASK_TRACKER_OCCUPANCY_WRITES_LOCAL 0x10
+UMASK_TRACKER_OCCUPANCY_WRITES_REMOTE 0x20
+UMASK_TRACKER_OCCUPANCY_INVITOE_LOCAL 0x40
+UMASK_TRACKER_OCCUPANCY_INVITOE_REMOTE 0x80
+
+EVENT_TRACKER_PENDING_OCCUPANCY 0x05 BBOX
+UMASK_TRACKER_PENDING_OCCUPANCY_LOCAL 0x01
+UMASK_TRACKER_PENDING_OCCUPANCY_REMOTE 0x02
+
+EVENT_TXR_AD_CYCLES_FULL 0x2A BBOX
+UMASK_TXR_AD_CYCLES_FULL_SCHED0 0x01
+UMASK_TXR_AD_CYCLES_FULL_SCHED1 0x02
+UMASK_TXR_AD_CYCLES_FULL_ALL 0x03
+
+EVENT_TXR_AK 0x0E BBOX
+UMASK_TXR_AK 0x00
+
+EVENT_TXR_AK_CYCLES_FULL 0x32 BBOX
+UMASK_TXR_AK_CYCLES_FULL_SCHED0 0x01
+UMASK_TXR_AK_CYCLES_FULL_SCHED1 0x02
+UMASK_TXR_AK_CYCLES_FULL_ALL 0x03
+
+EVENT_TXR_BL 0x10 BBOX
+UMASK_TXR_BL_DRS_CACHE 0x01
+UMASK_TXR_BL_DRS_CORE 0x02
+UMASK_TXR_BL_DRS_QPI 0x04
+
+EVENT_TXR_BL_CYCLES_FULL 0x36 BBOX
+UMASK_TXR_BL_CYCLES_FULL_SCHED0 0x01
+UMASK_TXR_BL_CYCLES_FULL_SCHED1 0x02
+UMASK_TXR_BL_CYCLES_FULL_ALL 0x03
+
+EVENT_TXR_BL_OCCUPANCY 0x34 BBOX
+UMASK_TXR_BL_OCCUPANCY 0x00
+
+EVENT_TXR_STARVED 0x6D BBOX
+UMASK_TXR_STARVED_AK 0x01
+UMASK_TXR_STARVED_BL 0x02
+
+EVENT_DRAM_CLOCKTICKS 0x00 MBOX
+UMASK_DRAM_CLOCKTICKS 0x00
+
+EVENT_ACT_COUNT 0x01 MBOX
+UMASK_ACT_COUNT_RD 0x01
+UMASK_ACT_COUNT_WR 0x02
+UMASK_ACT_COUNT_BYP 0x08
+
+EVENT_BYP_CMDS 0xA1 MBOX
+UMASK_BYP_CMDS_ACT 0x01
+UMASK_BYP_CMDS_CAS 0x02
+UMASK_BYP_CMDS_PRE 0x04
+
+EVENT_CAS_COUNT 0x04 MBOX
+UMASK_CAS_COUNT_RD_REG 0x01
+UMASK_CAS_COUNT_RD_UNDERFILL 0x02
+UMASK_CAS_COUNT_RD 0x03
+UMASK_CAS_COUNT_RD_WMM 0x10
+UMASK_CAS_COUNT_RD_RMM 0x20
+UMASK_CAS_COUNT_WR_WMM 0x04
+UMASK_CAS_COUNT_WR_RMM 0x08
+UMASK_CAS_COUNT_WR 0x0C
+UMASK_CAS_COUNT_ALL 0x0F
+
+EVENT_DRAM_PRE_ALL 0x06 MBOX
+UMASK_DRAM_PRE_ALL 0x00
+
+EVENT_DRAM_REFRESH 0x05 MBOX
+UMASK_DRAM_REFRESH_PANIC 0x02
+UMASK_DRAM_REFRESH_HIGH 0x04
+
+EVENT_ECC_CORRECTABLE_ERRORS 0x09 MBOX
+UMASK_ECC_CORRECTABLE_ERRORS 0x00
+
+EVENT_MAJOR_MODES 0x07 MBOX
+UMASK_MAJOR_MODES_READ 0x01
+UMASK_MAJOR_MODES_WRITE 0x02
+UMASK_MAJOR_MODES_PARTIAL 0x03
+UMASK_MAJOR_MODES_ISOCH 0x04
+
+EVENT_POWER_CHANNEL_DLLOFF 0x84 MBOX
+UMASK_POWER_CHANNEL_DLLOFF 0x00
+
+EVENT_POWER_CHANNEL_PPD 0x85 MBOX
+UMASK_POWER_CHANNEL_PPD 0x00
+
+EVENT_POWER_CKE_CYCLES 0x83 MBOX
+UMASK_POWER_CKE_CYCLES_RANK0 0x01
+UMASK_POWER_CKE_CYCLES_RANK1 0x02
+UMASK_POWER_CKE_CYCLES_RANK2 0x04
+UMASK_POWER_CKE_CYCLES_RANK3 0x08
+UMASK_POWER_CKE_CYCLES_RANK4 0x10
+UMASK_POWER_CKE_CYCLES_RANK5 0x20
+UMASK_POWER_CKE_CYCLES_RANK6 0x40
+UMASK_POWER_CKE_CYCLES_RANK7 0x80
+
+EVENT_POWER_CRITICAL_THROTTLE_CYCLES 0x86 MBOX
+UMASK_POWER_CRITICAL_THROTTLE_CYCLES 0x00
+
+EVENT_POWER_PCU_THROTTLING 0x42 MBOX
+UMASK_POWER_PCU_THROTTLING 0x00
+
+EVENT_POWER_SELF_REFRESH 0x43 MBOX
+UMASK_POWER_SELF_REFRESH 0x00
+
+EVENT_POWER_THROTTLE_CYCLES 0x41 MBOX
+UMASK_POWER_THROTTLE_CYCLES_RANK0 0x01
+UMASK_POWER_THROTTLE_CYCLES_RANK1 0x02
+UMASK_POWER_THROTTLE_CYCLES_RANK2 0x04
+UMASK_POWER_THROTTLE_CYCLES_RANK3 0x08
+UMASK_POWER_THROTTLE_CYCLES_RANK4 0x10
+UMASK_POWER_THROTTLE_CYCLES_RANK5 0x20
+UMASK_POWER_THROTTLE_CYCLES_RANK6 0x40
+UMASK_POWER_THROTTLE_CYCLES_RANK7 0x80
+
+EVENT_PREEMPTION 0x08 MBOX
+UMASK_PREEMPTION_RD_PREEMPT_RD 0x01
+UMASK_PREEMPTION_RD_PREEMPT_WR 0x02
+
+EVENT_PRE_COUNT 0x02 MBOX
+UMASK_PRE_COUNT_PAGE_MISS 0x01
+UMASK_PRE_COUNT_PAGE_CLOSE 0x02
+UMASK_PRE_COUNT_RD 0x04
+UMASK_PRE_COUNT_WR 0x08
+UMASK_PRE_COUNT_BYP 0x10
+
+EVENT_RD_CAS_PRIO 0xA0 MBOX
+UMASK_RD_CAS_PRIO_LOW 0x01
+UMASK_RD_CAS_PRIO_MED 0x02
+UMASK_RD_CAS_PRIO_HIGH 0x04
+UMASK_RD_CAS_PRIO_PANIC 0x08
+
+EVENT_RD_CAS_RANK0 0xB0 MBOX
+UMASK_RD_CAS_RANK0_BANK0 0x00
+UMASK_RD_CAS_RANK0_BANK1 0x01
+UMASK_RD_CAS_RANK0_BANK2 0x02
+UMASK_RD_CAS_RANK0_BANK3 0x03
+UMASK_RD_CAS_RANK0_BANK4 0x04
+UMASK_RD_CAS_RANK0_BANK5 0x05
+UMASK_RD_CAS_RANK0_BANK6 0x06
+UMASK_RD_CAS_RANK0_BANK7 0x07
+UMASK_RD_CAS_RANK0_BANK8 0x08
+UMASK_RD_CAS_RANK0_BANK9 0x09
+UMASK_RD_CAS_RANK0_BANK10 0x0A
+UMASK_RD_CAS_RANK0_BANK11 0x0B
+UMASK_RD_CAS_RANK0_BANK12 0x0C
+UMASK_RD_CAS_RANK0_BANK13 0x0D
+UMASK_RD_CAS_RANK0_BANK14 0x0E
+UMASK_RD_CAS_RANK0_BANK15 0x0F
+UMASK_RD_CAS_RANK0_ALLBANKS 0x10
+UMASK_RD_CAS_RANK0_BANKG0 0x11
+UMASK_RD_CAS_RANK0_BANKG1 0x12
+UMASK_RD_CAS_RANK0_BANKG2 0x13
+UMASK_RD_CAS_RANK0_BANKG3 0x14
+
+EVENT_RD_CAS_RANK1 0xB1 MBOX
+UMASK_RD_CAS_RANK1_BANK0 0x00
+UMASK_RD_CAS_RANK1_BANK1 0x01
+UMASK_RD_CAS_RANK1_BANK2 0x02
+UMASK_RD_CAS_RANK1_BANK3 0x03
+UMASK_RD_CAS_RANK1_BANK4 0x04
+UMASK_RD_CAS_RANK1_BANK5 0x05
+UMASK_RD_CAS_RANK1_BANK6 0x06
+UMASK_RD_CAS_RANK1_BANK7 0x07
+UMASK_RD_CAS_RANK1_BANK8 0x08
+UMASK_RD_CAS_RANK1_BANK9 0x09
+UMASK_RD_CAS_RANK1_BANK10 0x0A
+UMASK_RD_CAS_RANK1_BANK11 0x0B
+UMASK_RD_CAS_RANK1_BANK12 0x0C
+UMASK_RD_CAS_RANK1_BANK13 0x0D
+UMASK_RD_CAS_RANK1_BANK14 0x0E
+UMASK_RD_CAS_RANK1_BANK15 0x0F
+UMASK_RD_CAS_RANK1_ALLBANKS 0x10
+UMASK_RD_CAS_RANK1_BANKG0 0x11
+UMASK_RD_CAS_RANK1_BANKG1 0x12
+UMASK_RD_CAS_RANK1_BANKG2 0x13
+UMASK_RD_CAS_RANK1_BANKG3 0x14
+
+EVENT_RD_CAS_RANK2 0xB2 MBOX
+UMASK_RD_CAS_RANK2_BANK0 0x00
+UMASK_RD_CAS_RANK2_BANK1 0x01
+UMASK_RD_CAS_RANK2_BANK2 0x02
+UMASK_RD_CAS_RANK2_BANK3 0x03
+UMASK_RD_CAS_RANK2_BANK4 0x04
+UMASK_RD_CAS_RANK2_BANK5 0x05
+UMASK_RD_CAS_RANK2_BANK6 0x06
+UMASK_RD_CAS_RANK2_BANK7 0x07
+UMASK_RD_CAS_RANK2_BANK8 0x08
+UMASK_RD_CAS_RANK2_BANK9 0x09
+UMASK_RD_CAS_RANK2_BANK10 0x0A
+UMASK_RD_CAS_RANK2_BANK11 0x0B
+UMASK_RD_CAS_RANK2_BANK12 0x0C
+UMASK_RD_CAS_RANK2_BANK13 0x0D
+UMASK_RD_CAS_RANK2_BANK14 0x0E
+UMASK_RD_CAS_RANK2_BANK15 0x0F
+UMASK_RD_CAS_RANK2_ALLBANKS 0x10
+UMASK_RD_CAS_RANK2_BANKG0 0x11
+UMASK_RD_CAS_RANK2_BANKG1 0x12
+UMASK_RD_CAS_RANK2_BANKG2 0x13
+UMASK_RD_CAS_RANK2_BANKG3 0x14
+
+EVENT_RD_CAS_RANK3 0xB3 MBOX
+UMASK_RD_CAS_RANK3_BANK0 0x00
+UMASK_RD_CAS_RANK3_BANK1 0x01
+UMASK_RD_CAS_RANK3_BANK2 0x02
+UMASK_RD_CAS_RANK3_BANK3 0x03
+UMASK_RD_CAS_RANK3_BANK4 0x04
+UMASK_RD_CAS_RANK3_BANK5 0x05
+UMASK_RD_CAS_RANK3_BANK6 0x06
+UMASK_RD_CAS_RANK3_BANK7 0x07
+UMASK_RD_CAS_RANK3_BANK8 0x08
+UMASK_RD_CAS_RANK3_BANK9 0x09
+UMASK_RD_CAS_RANK3_BANK10 0x0A
+UMASK_RD_CAS_RANK3_BANK11 0x0B
+UMASK_RD_CAS_RANK3_BANK12 0x0C
+UMASK_RD_CAS_RANK3_BANK13 0x0D
+UMASK_RD_CAS_RANK3_BANK14 0x0E
+UMASK_RD_CAS_RANK3_BANK15 0x0F
+UMASK_RD_CAS_RANK3_ALLBANKS 0x10
+UMASK_RD_CAS_RANK3_BANKG0 0x11
+UMASK_RD_CAS_RANK3_BANKG1 0x12
+UMASK_RD_CAS_RANK3_BANKG2 0x13
+UMASK_RD_CAS_RANK3_BANKG3 0x14
+
+EVENT_RD_CAS_RANK4 0xB4 MBOX
+UMASK_RD_CAS_RANK4_BANK0 0x00
+UMASK_RD_CAS_RANK4_BANK1 0x01
+UMASK_RD_CAS_RANK4_BANK2 0x02
+UMASK_RD_CAS_RANK4_BANK3 0x03
+UMASK_RD_CAS_RANK4_BANK4 0x04
+UMASK_RD_CAS_RANK4_BANK5 0x05
+UMASK_RD_CAS_RANK4_BANK6 0x06
+UMASK_RD_CAS_RANK4_BANK7 0x07
+UMASK_RD_CAS_RANK4_BANK8 0x08
+UMASK_RD_CAS_RANK4_BANK9 0x09
+UMASK_RD_CAS_RANK4_BANK10 0x0A
+UMASK_RD_CAS_RANK4_BANK11 0x0B
+UMASK_RD_CAS_RANK4_BANK12 0x0C
+UMASK_RD_CAS_RANK4_BANK13 0x0D
+UMASK_RD_CAS_RANK4_BANK14 0x0E
+UMASK_RD_CAS_RANK4_BANK15 0x0F
+UMASK_RD_CAS_RANK4_ALLBANKS 0x10
+UMASK_RD_CAS_RANK4_BANKG0 0x11
+UMASK_RD_CAS_RANK4_BANKG1 0x12
+UMASK_RD_CAS_RANK4_BANKG2 0x13
+UMASK_RD_CAS_RANK4_BANKG3 0x14
+
+EVENT_RD_CAS_RANK5 0xB5 MBOX
+UMASK_RD_CAS_RANK5_BANK0 0x00
+UMASK_RD_CAS_RANK5_BANK1 0x01
+UMASK_RD_CAS_RANK5_BANK2 0x02
+UMASK_RD_CAS_RANK5_BANK3 0x03
+UMASK_RD_CAS_RANK5_BANK4 0x04
+UMASK_RD_CAS_RANK5_BANK5 0x05
+UMASK_RD_CAS_RANK5_BANK6 0x06
+UMASK_RD_CAS_RANK5_BANK7 0x07
+UMASK_RD_CAS_RANK5_BANK8 0x08
+UMASK_RD_CAS_RANK5_BANK9 0x09
+UMASK_RD_CAS_RANK5_BANK10 0x0A
+UMASK_RD_CAS_RANK5_BANK11 0x0B
+UMASK_RD_CAS_RANK5_BANK12 0x0C
+UMASK_RD_CAS_RANK5_BANK13 0x0D
+UMASK_RD_CAS_RANK5_BANK14 0x0E
+UMASK_RD_CAS_RANK5_BANK15 0x0F
+UMASK_RD_CAS_RANK5_ALLBANKS 0x10
+UMASK_RD_CAS_RANK5_BANKG0 0x11
+UMASK_RD_CAS_RANK5_BANKG1 0x12
+UMASK_RD_CAS_RANK5_BANKG2 0x13
+UMASK_RD_CAS_RANK5_BANKG3 0x14
+
+EVENT_RD_CAS_RANK6 0xB6 MBOX
+UMASK_RD_CAS_RANK6_BANK0 0x00
+UMASK_RD_CAS_RANK6_BANK1 0x01
+UMASK_RD_CAS_RANK6_BANK2 0x02
+UMASK_RD_CAS_RANK6_BANK3 0x03
+UMASK_RD_CAS_RANK6_BANK4 0x04
+UMASK_RD_CAS_RANK6_BANK5 0x05
+UMASK_RD_CAS_RANK6_BANK6 0x06
+UMASK_RD_CAS_RANK6_BANK7 0x07
+UMASK_RD_CAS_RANK6_BANK8 0x08
+UMASK_RD_CAS_RANK6_BANK9 0x09
+UMASK_RD_CAS_RANK6_BANK10 0x0A
+UMASK_RD_CAS_RANK6_BANK11 0x0B
+UMASK_RD_CAS_RANK6_BANK12 0x0C
+UMASK_RD_CAS_RANK6_BANK13 0x0D
+UMASK_RD_CAS_RANK6_BANK14 0x0E
+UMASK_RD_CAS_RANK6_BANK15 0x0F
+UMASK_RD_CAS_RANK6_ALLBANKS 0x10
+UMASK_RD_CAS_RANK6_BANKG0 0x11
+UMASK_RD_CAS_RANK6_BANKG1 0x12
+UMASK_RD_CAS_RANK6_BANKG2 0x13
+UMASK_RD_CAS_RANK6_BANKG3 0x14
+
+EVENT_RD_CAS_RANK7 0xB7 MBOX
+UMASK_RD_CAS_RANK7_BANK0 0x00
+UMASK_RD_CAS_RANK7_BANK1 0x01
+UMASK_RD_CAS_RANK7_BANK2 0x02
+UMASK_RD_CAS_RANK7_BANK3 0x03
+UMASK_RD_CAS_RANK7_BANK4 0x04
+UMASK_RD_CAS_RANK7_BANK5 0x05
+UMASK_RD_CAS_RANK7_BANK6 0x06
+UMASK_RD_CAS_RANK7_BANK7 0x07
+UMASK_RD_CAS_RANK7_BANK8 0x08
+UMASK_RD_CAS_RANK7_BANK9 0x09
+UMASK_RD_CAS_RANK7_BANK10 0x0A
+UMASK_RD_CAS_RANK7_BANK11 0x0B
+UMASK_RD_CAS_RANK7_BANK12 0x0C
+UMASK_RD_CAS_RANK7_BANK13 0x0D
+UMASK_RD_CAS_RANK7_BANK14 0x0E
+UMASK_RD_CAS_RANK7_BANK15 0x0F
+UMASK_RD_CAS_RANK7_ALLBANKS 0x10
+UMASK_RD_CAS_RANK7_BANKG0 0x11
+UMASK_RD_CAS_RANK7_BANKG1 0x12
+UMASK_RD_CAS_RANK7_BANKG2 0x13
+UMASK_RD_CAS_RANK7_BANKG3 0x14
+
+EVENT_RPQ_CYCLES_NE 0x11 MBOX
+UMASK_RPQ_CYCLES_NE 0x00
+
+EVENT_RPQ_INSERTS 0x10 MBOX
+UMASK_RPQ_INSERTS 0x00
+
+EVENT_VMSE_MXB_WR_OCCUPANCY 0x91 MBOX
+UMASK_VMSE_MXB_WR_OCCUPANCY 0x00
+
+EVENT_VMSE_WR_PUSH 0x90 MBOX
+UMASK_VMSE_WR_PUSH_WMM 0x01
+UMASK_VMSE_WR_PUSH_RMM 0x02
+
+EVENT_WMM_TO_RMM 0xC0 MBOX
+UMASK_WMM_TO_RMM_LOW_THRESH 0x01
+UMASK_WMM_TO_RMM_STARVE 0x02
+UMASK_WMM_TO_RMM_VMSE_RETRY 0x04
+
+# Undocumented event, mentioned in metrics table but not defined
+EVENT_WPQ_INSERTS 0x20 MBOX
+UMASK_WPQ_INSERTS 0x00
+
+EVENT_WPQ_CYCLES_FULL 0x22 MBOX
+UMASK_WPQ_CYCLES_FULL 0x00
+
+EVENT_WPQ_CYCLES_NE 0x21 MBOX
+UMASK_WPQ_CYCLES_NE 0x00
+
+EVENT_WPQ_READ_HIT 0x23 MBOX
+UMASK_WPQ_READ_HIT 0x00
+
+EVENT_WPQ_WRITE_HIT 0x24 MBOX
+UMASK_WPQ_WRITE_HIT 0x00
+
+EVENT_WRONG_MM 0xC1 MBOX
+UMASK_WRONG_MM 0x00
+
+EVENT_WR_CAS_RANK0 0xB8 MBOX
+UMASK_WR_CAS_RANK0_BANK0 0x00
+UMASK_WR_CAS_RANK0_BANK1 0x01
+UMASK_WR_CAS_RANK0_BANK2 0x02
+UMASK_WR_CAS_RANK0_BANK3 0x03
+UMASK_WR_CAS_RANK0_BANK4 0x04
+UMASK_WR_CAS_RANK0_BANK5 0x05
+UMASK_WR_CAS_RANK0_BANK6 0x06
+UMASK_WR_CAS_RANK0_BANK7 0x07
+UMASK_WR_CAS_RANK0_BANK8 0x08
+UMASK_WR_CAS_RANK0_BANK9 0x09
+UMASK_WR_CAS_RANK0_BANK10 0x0A
+UMASK_WR_CAS_RANK0_BANK11 0x0B
+UMASK_WR_CAS_RANK0_BANK12 0x0C
+UMASK_WR_CAS_RANK0_BANK13 0x0D
+UMASK_WR_CAS_RANK0_BANK14 0x0E
+UMASK_WR_CAS_RANK0_BANK15 0x0F
+UMASK_WR_CAS_RANK0_ALLBANKS 0x10
+UMASK_WR_CAS_RANK0_BANKG0 0x11
+UMASK_WR_CAS_RANK0_BANKG1 0x12
+UMASK_WR_CAS_RANK0_BANKG2 0x13
+UMASK_WR_CAS_RANK0_BANKG3 0x14
+
+EVENT_WR_CAS_RANK1 0xB9 MBOX
+UMASK_WR_CAS_RANK1_BANK0 0x00
+UMASK_WR_CAS_RANK1_BANK1 0x01
+UMASK_WR_CAS_RANK1_BANK2 0x02
+UMASK_WR_CAS_RANK1_BANK3 0x03
+UMASK_WR_CAS_RANK1_BANK4 0x04
+UMASK_WR_CAS_RANK1_BANK5 0x05
+UMASK_WR_CAS_RANK1_BANK6 0x06
+UMASK_WR_CAS_RANK1_BANK7 0x07
+UMASK_WR_CAS_RANK1_BANK8 0x08
+UMASK_WR_CAS_RANK1_BANK9 0x09
+UMASK_WR_CAS_RANK1_BANK10 0x0A
+UMASK_WR_CAS_RANK1_BANK11 0x0B
+UMASK_WR_CAS_RANK1_BANK12 0x0C
+UMASK_WR_CAS_RANK1_BANK13 0x0D
+UMASK_WR_CAS_RANK1_BANK14 0x0E
+UMASK_WR_CAS_RANK1_BANK15 0x0F
+UMASK_WR_CAS_RANK1_ALLBANKS 0x10
+UMASK_WR_CAS_RANK1_BANKG0 0x11
+UMASK_WR_CAS_RANK1_BANKG1 0x12
+UMASK_WR_CAS_RANK1_BANKG2 0x13
+UMASK_WR_CAS_RANK1_BANKG3 0x14
+
+EVENT_WR_CAS_RANK2 0xBA MBOX
+UMASK_WR_CAS_RANK2_BANK0 0x00
+UMASK_WR_CAS_RANK2_BANK1 0x01
+UMASK_WR_CAS_RANK2_BANK2 0x02
+UMASK_WR_CAS_RANK2_BANK3 0x03
+UMASK_WR_CAS_RANK2_BANK4 0x04
+UMASK_WR_CAS_RANK2_BANK5 0x05
+UMASK_WR_CAS_RANK2_BANK6 0x06
+UMASK_WR_CAS_RANK2_BANK7 0x07
+UMASK_WR_CAS_RANK2_BANK8 0x08
+UMASK_WR_CAS_RANK2_BANK9 0x09
+UMASK_WR_CAS_RANK2_BANK10 0x0A
+UMASK_WR_CAS_RANK2_BANK11 0x0B
+UMASK_WR_CAS_RANK2_BANK12 0x0C
+UMASK_WR_CAS_RANK2_BANK13 0x0D
+UMASK_WR_CAS_RANK2_BANK14 0x0E
+UMASK_WR_CAS_RANK2_BANK15 0x0F
+UMASK_WR_CAS_RANK2_ALLBANKS 0x10
+UMASK_WR_CAS_RANK2_BANKG0 0x11
+UMASK_WR_CAS_RANK2_BANKG1 0x12
+UMASK_WR_CAS_RANK2_BANKG2 0x13
+UMASK_WR_CAS_RANK2_BANKG3 0x14
+
+EVENT_WR_CAS_RANK3 0xBB MBOX
+UMASK_WR_CAS_RANK3_BANK0 0x00
+UMASK_WR_CAS_RANK3_BANK1 0x01
+UMASK_WR_CAS_RANK3_BANK2 0x02
+UMASK_WR_CAS_RANK3_BANK3 0x03
+UMASK_WR_CAS_RANK3_BANK4 0x04
+UMASK_WR_CAS_RANK3_BANK5 0x05
+UMASK_WR_CAS_RANK3_BANK6 0x06
+UMASK_WR_CAS_RANK3_BANK7 0x07
+UMASK_WR_CAS_RANK3_BANK8 0x08
+UMASK_WR_CAS_RANK3_BANK9 0x09
+UMASK_WR_CAS_RANK3_BANK10 0x0A
+UMASK_WR_CAS_RANK3_BANK11 0x0B
+UMASK_WR_CAS_RANK3_BANK12 0x0C
+UMASK_WR_CAS_RANK3_BANK13 0x0D
+UMASK_WR_CAS_RANK3_BANK14 0x0E
+UMASK_WR_CAS_RANK3_BANK15 0x0F
+UMASK_WR_CAS_RANK3_ALLBANKS 0x10
+UMASK_WR_CAS_RANK3_BANKG0 0x11
+UMASK_WR_CAS_RANK3_BANKG1 0x12
+UMASK_WR_CAS_RANK3_BANKG2 0x13
+UMASK_WR_CAS_RANK3_BANKG3 0x14
+
+EVENT_WR_CAS_RANK4 0xBC MBOX
+UMASK_WR_CAS_RANK4_BANK0 0x00
+UMASK_WR_CAS_RANK4_BANK1 0x01
+UMASK_WR_CAS_RANK4_BANK2 0x02
+UMASK_WR_CAS_RANK4_BANK3 0x03
+UMASK_WR_CAS_RANK4_BANK4 0x04
+UMASK_WR_CAS_RANK4_BANK5 0x05
+UMASK_WR_CAS_RANK4_BANK6 0x06
+UMASK_WR_CAS_RANK4_BANK7 0x07
+UMASK_WR_CAS_RANK4_BANK8 0x08
+UMASK_WR_CAS_RANK4_BANK9 0x09
+UMASK_WR_CAS_RANK4_BANK10 0x0A
+UMASK_WR_CAS_RANK4_BANK11 0x0B
+UMASK_WR_CAS_RANK4_BANK12 0x0C
+UMASK_WR_CAS_RANK4_BANK13 0x0D
+UMASK_WR_CAS_RANK4_BANK14 0x0E
+UMASK_WR_CAS_RANK4_BANK15 0x0F
+UMASK_WR_CAS_RANK4_ALLBANKS 0x10
+UMASK_WR_CAS_RANK4_BANKG0 0x11
+UMASK_WR_CAS_RANK4_BANKG1 0x12
+UMASK_WR_CAS_RANK4_BANKG2 0x13
+UMASK_WR_CAS_RANK4_BANKG3 0x14
+
+EVENT_WR_CAS_RANK5 0xBD MBOX
+UMASK_WR_CAS_RANK5_BANK0 0x00
+UMASK_WR_CAS_RANK5_BANK1 0x01
+UMASK_WR_CAS_RANK5_BANK2 0x02
+UMASK_WR_CAS_RANK5_BANK3 0x03
+UMASK_WR_CAS_RANK5_BANK4 0x04
+UMASK_WR_CAS_RANK5_BANK5 0x05
+UMASK_WR_CAS_RANK5_BANK6 0x06
+UMASK_WR_CAS_RANK5_BANK7 0x07
+UMASK_WR_CAS_RANK5_BANK8 0x08
+UMASK_WR_CAS_RANK5_BANK9 0x09
+UMASK_WR_CAS_RANK5_BANK10 0x0A
+UMASK_WR_CAS_RANK5_BANK11 0x0B
+UMASK_WR_CAS_RANK5_BANK12 0x0C
+UMASK_WR_CAS_RANK5_BANK13 0x0D
+UMASK_WR_CAS_RANK5_BANK14 0x0E
+UMASK_WR_CAS_RANK5_BANK15 0x0F
+UMASK_WR_CAS_RANK5_ALLBANKS 0x10
+UMASK_WR_CAS_RANK5_BANKG0 0x11
+UMASK_WR_CAS_RANK5_BANKG1 0x12
+UMASK_WR_CAS_RANK5_BANKG2 0x13
+UMASK_WR_CAS_RANK5_BANKG3 0x14
+
+EVENT_WR_CAS_RANK6 0xBE MBOX
+UMASK_WR_CAS_RANK6_BANK0 0x00
+UMASK_WR_CAS_RANK6_BANK1 0x01
+UMASK_WR_CAS_RANK6_BANK2 0x02
+UMASK_WR_CAS_RANK6_BANK3 0x03
+UMASK_WR_CAS_RANK6_BANK4 0x04
+UMASK_WR_CAS_RANK6_BANK5 0x05
+UMASK_WR_CAS_RANK6_BANK6 0x06
+UMASK_WR_CAS_RANK6_BANK7 0x07
+UMASK_WR_CAS_RANK6_BANK8 0x08
+UMASK_WR_CAS_RANK6_BANK9 0x09
+UMASK_WR_CAS_RANK6_BANK10 0x0A
+UMASK_WR_CAS_RANK6_BANK11 0x0B
+UMASK_WR_CAS_RANK6_BANK12 0x0C
+UMASK_WR_CAS_RANK6_BANK13 0x0D
+UMASK_WR_CAS_RANK6_BANK14 0x0E
+UMASK_WR_CAS_RANK6_BANK15 0x0F
+UMASK_WR_CAS_RANK6_ALLBANKS 0x10
+UMASK_WR_CAS_RANK6_BANKG0 0x11
+UMASK_WR_CAS_RANK6_BANKG1 0x12
+UMASK_WR_CAS_RANK6_BANKG2 0x13
+UMASK_WR_CAS_RANK6_BANKG3 0x14
+
+EVENT_WR_CAS_RANK7 0xBF MBOX
+UMASK_WR_CAS_RANK7_BANK0 0x00
+UMASK_WR_CAS_RANK7_BANK1 0x01
+UMASK_WR_CAS_RANK7_BANK2 0x02
+UMASK_WR_CAS_RANK7_BANK3 0x03
+UMASK_WR_CAS_RANK7_BANK4 0x04
+UMASK_WR_CAS_RANK7_BANK5 0x05
+UMASK_WR_CAS_RANK7_BANK6 0x06
+UMASK_WR_CAS_RANK7_BANK7 0x07
+UMASK_WR_CAS_RANK7_BANK8 0x08
+UMASK_WR_CAS_RANK7_BANK9 0x09
+UMASK_WR_CAS_RANK7_BANK10 0x0A
+UMASK_WR_CAS_RANK7_BANK11 0x0B
+UMASK_WR_CAS_RANK7_BANK12 0x0C
+UMASK_WR_CAS_RANK7_BANK13 0x0D
+UMASK_WR_CAS_RANK7_BANK14 0x0E
+UMASK_WR_CAS_RANK7_BANK15 0x0F
+UMASK_WR_CAS_RANK7_ALLBANKS 0x10
+UMASK_WR_CAS_RANK7_BANKG0 0x11
+UMASK_WR_CAS_RANK7_BANKG1 0x12
+UMASK_WR_CAS_RANK7_BANKG2 0x13
+UMASK_WR_CAS_RANK7_BANKG3 0x14
+
+EVENT_PBOX_CLOCKTICKS 0x01 PBOX
+UMASK_PBOX_CLOCKTICKS 0x00
+
+EVENT_IIO_CREDIT 0x2D PBOX
+UMASK_IIO_CREDIT_PRQ_QPI0 0x01
+UMASK_IIO_CREDIT_PRQ_QPI1 0x02
+UMASK_IIO_CREDIT_ISOCH_QPI0 0x04
+UMASK_IIO_CREDIT_ISOCH_QPI1 0x08
+
+EVENT_RING_AD_USED 0x07 PBOX
+UMASK_RING_AD_USED_CW_EVEN 0x01
+UMASK_RING_AD_USED_CW_ODD 0x02
+UMASK_RING_AD_USED_CW 0x03
+UMASK_RING_AD_USED_CCW_EVEN 0x04
+UMASK_RING_AD_USED_CCW_ODD 0x08
+UMASK_RING_AD_USED_CCW 0x0C
+UMASK_RING_AD_USED_ANY 0x0F
+
+EVENT_RING_AK_BOUNCES 0x12 PBOX
+UMASK_RING_AK_BOUNCES_UP 0x01
+UMASK_RING_AK_BOUNCES_DN 0x02
+
+EVENT_RING_AK_USED 0x08 PBOX
+UMASK_RING_AK_USED_CW_EVEN 0x01
+UMASK_RING_AK_USED_CW_ODD 0x02
+UMASK_RING_AK_USED_CW 0x03
+UMASK_RING_AK_USED_CCW_EVEN 0x04
+UMASK_RING_AK_USED_CCW_ODD 0x08
+UMASK_RING_AK_USED_CCW 0x0C
+UMASK_RING_AK_USED_ANY 0x0F
+
+EVENT_RING_BL_USED 0x09 PBOX
+UMASK_RING_BL_USED_CW_EVEN 0x01
+UMASK_RING_BL_USED_CW_ODD 0x02
+UMASK_RING_BL_USED_CW 0x03
+UMASK_RING_BL_USED_CCW_EVEN 0x04
+UMASK_RING_BL_USED_CCW_ODD 0x08
+UMASK_RING_BL_USED_CCW 0x0C
+UMASK_RING_BL_USED_ANY 0x0F
+
+EVENT_RING_IV_USED 0x0A PBOX
+UMASK_RING_IV_USED_CW 0x03
+UMASK_RING_IV_USED_CCW 0x0C
+UMASK_RING_IV_USED_ANY 0x0F
+
+EVENT_RXR_CYCLES_NE 0x10 PBOX
+UMASK_RXR_CYCLES_NE_NCB 0x10
+UMASK_RXR_CYCLES_NE_NCS 0x20
+
+EVENT_RXR_INSERTS 0x11 PBOX
+UMASK_RXR_INSERTS_NCB 0x10
+UMASK_RXR_INSERTS_NCS 0x20
+
+EVENT_RXR_OCCUPANCY 0x13 PBOX
+UMASK_RXR_OCCUPANCY_DRS 0x08
+
+EVENT_SBO0_CREDITS_ACQUIRED 0x28 PBOX
+UMASK_SBO0_CREDITS_ACQUIRED_AD 0x01
+UMASK_SBO0_CREDITS_ACQUIRED_BL 0x02
+
+EVENT_STALL_NO_SBO_CREDIT 0x2C PBOX
+UMASK_STALL_NO_SBO_CREDIT_SBO0_AD 0x01
+UMASK_STALL_NO_SBO_CREDIT_SBO1_AD 0x02
+UMASK_STALL_NO_SBO_CREDIT_SBO0_BL 0x04
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL 0x08
+
+EVENT_TXR_NACK_CW 0x26 PBOX
+UMASK_TXR_NACK_CW_DN_AD 0x01
+UMASK_TXR_NACK_CW_DN_BL 0x02
+UMASK_TXR_NACK_CW_DN_AK 0x04
+UMASK_TXR_NACK_CW_UP_AD 0x08
+UMASK_TXR_NACK_CW_UP_BL 0x10
+UMASK_TXR_NACK_CW_UP_AK 0x20
+UMASK_TXR_NACK_CW_AD 0x09
+UMASK_TXR_NACK_CW_BL 0x12
+UMASK_TXR_NACK_CW_AK 0x24
+
+EVENT_CACHE_TOTAL_OCCUPANCY 0x12 IBOX
+UMASK_CACHE_TOTAL_OCCUPANCY_ANY 0x01
+UMASK_CACHE_TOTAL_OCCUPANCY_SOURCE 0x02
+
+EVENT_COHERENT_OPS 0x13 IBOX
+UMASK_COHERENT_OPS_PCIRDCUR 0x01
+UMASK_COHERENT_OPS_CRD 0x02
+UMASK_COHERENT_OPS_DRD 0x04
+UMASK_COHERENT_OPS_RFO 0x08
+UMASK_COHERENT_OPS_PCITOM 0x10
+UMASK_COHERENT_OPS_PCIDCAHINT 0x20
+UMASK_COHERENT_OPS_WBMTOI 0x40
+UMASK_COHERENT_OPS_CLFLUSH 0x80
+
+EVENT_MISC0 0x14 IBOX
+UMASK_MISC0_FAST_REQ 0x01
+UMASK_MISC0_FAST_REJ 0x02
+UMASK_MISC0_2ND_RD_INSERT 0x04
+UMASK_MISC0_2ND_WR_INSERT 0x08
+UMASK_MISC0_2ND_ATOMIC_INSERT 0x10
+UMASK_MISC0_FAST_XFER 0x20
+UMASK_MISC0_PF_ACK_HINT 0x40
+UMASK_MISC0_PF_TIMEOUT 0x80
+
+EVENT_MISC1 0x15 IBOX
+UMASK_MISC1_SLOW_I 0x01
+UMASK_MISC1_SLOW_S 0x02
+UMASK_MISC1_SLOW_E 0x04
+UMASK_MISC1_SLOW_M 0x08
+UMASK_MISC1_LOST_FWD 0x10
+UMASK_MISC1_SEC_RCVD_INVLD 0x20
+UMASK_MISC1_SEC_RCVD_VLD 0x40
+UMASK_MISC1_DATA_THROTTLE 0x80
+
+EVENT_SNOOP_RESP 0x17 IBOX
+UMASK_SNOOP_RESP_MISS 0x01
+UMASK_SNOOP_RESP_HIT_I 0x02
+UMASK_SNOOP_RESP_HIT_ES 0x04
+UMASK_SNOOP_RESP_HIT_M 0x08
+UMASK_SNOOP_RESP_SNPCODE 0x10
+UMASK_SNOOP_RESP_SNPDATA 0x20
+UMASK_SNOOP_RESP_SNPINV 0x40
+
+EVENT_TRANSACTIONS 0x16 IBOX
+UMASK_TRANSACTIONS_READS 0x01
+UMASK_TRANSACTIONS_WRITES 0x02
+UMASK_TRANSACTIONS_RD_PREF 0x04
+UMASK_TRANSACTIONS_WR_PREF 0x08
+UMASK_TRANSACTIONS_ATOMIC 0x10
+UMASK_TRANSACTIONS_OTHER 0x20
+UMASK_TRANSACTIONS_ORDERINGQ 0x40
+
+EVENT_RXR_AK_INSERTS 0x0A IBOX
+UMASK_RXR_AK_INSERTS 0x00
+
+EVENT_RXR_BL_DRS_CYCLES_FULL 0x04 IBOX
+UMASK_RXR_BL_DRS_CYCLES_FULL 0x00
+
+EVENT_RXR_BL_DRS_INSERTS 0x01 IBOX
+UMASK_RXR_BL_DRS_INSERTS 0x00
+
+EVENT_RXR_BL_DRS_OCCUPANCY 0x07 IBOX
+UMASK_RXR_BL_DRS_OCCUPANCY 0x00
+
+EVENT_RXR_BL_NCB_CYCLES_FULL 0x05 IBOX
+UMASK_RXR_BL_NCB_CYCLES_FULL 0x00
+
+EVENT_RXR_BL_NCB_INSERTS 0x02 IBOX
+UMASK_RXR_BL_NCB_INSERTS 0x00
+
+EVENT_RXR_BL_NCB_OCCUPANCY 0x08 IBOX
+UMASK_RXR_BL_NCB_OCCUPANCY 0x00
+
+EVENT_RXR_BL_NCS_CYCLES_FULL 0x06 IBOX
+UMASK_RXR_BL_NCS_CYCLES_FULL 0x00
+
+EVENT_RXR_BL_NCS_INSERTS 0x03 IBOX
+UMASK_RXR_BL_NCS_INSERTS 0x00
+
+EVENT_RXR_BL_NCS_OCCUPANCY 0x09 IBOX
+UMASK_RXR_BL_NCS_OCCUPANCY 0x00
+
+EVENT_TXR_AD_STALL_CREDIT_CYCLES 0x18 IBOX
+UMASK_TXR_AD_STALL_CREDIT_CYCLES 0x00
+
+EVENT_TXR_BL_STALL_CREDIT_CYCLES 0x19 IBOX
+UMASK_TXR_BL_STALL_CREDIT_CYCLES 0x00
+
+EVENT_TXR_DATA_INSERTS_NCB 0x0E IBOX
+UMASK_TXR_DATA_INSERTS_NCB 0x00
+
+EVENT_TXR_DATA_INSERTS_NCS 0x0F IBOX
+UMASK_TXR_DATA_INSERTS_NCS 0x00
+
+EVENT_TXR_REQUEST_OCCUPANCY 0x0D IBOX
+UMASK_TXR_REQUEST_OCCUPANCY 0x00
+
+EVENT_RBOX_CLOCKTICKS 0x01 RBOX
+UMASK_RBOX_CLOCKTICKS 0x00
+
+EVENT_C_LO_AD_CREDITS_EMPTY 0x22 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO0 0x01
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO1 0x02
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO2 0x04
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO3 0x08
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO4 0x10
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO5 0x20
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO6 0x40
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO7 0x80
+
+EVENT_C_HI_AD_CREDITS_EMPTY 0x1F RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO8 0x01
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO9 0x02
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO10 0x04
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO11 0x08
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO12 0x10
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO13 0x20
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO14_16 0x40
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO15_17 0x80
+
+EVENT_HA_R2_BL_CREDITS_EMPTY_LO 0x2D RBOX0C0|RBOX1C0
+UMASK_HA_R2_BL_CREDITS_EMPTY_LO_HA0 0x01
+UMASK_HA_R2_BL_CREDITS_EMPTY_LO_HA1 0x02
+UMASK_HA_R2_BL_CREDITS_EMPTY_LO_R2_NCB 0x01
+UMASK_HA_R2_BL_CREDITS_EMPTY_LO_R2_NCS 0x01
+
+EVENT_HA_R2_BL_CREDITS_EMPTY_HI 0x2D RBOX0C1|RBOX1C1
+UMASK_HA_R2_BL_CREDITS_EMPTY_HI_HA0 0x01
+UMASK_HA_R2_BL_CREDITS_EMPTY_HI_HA1 0x02
+UMASK_HA_R2_BL_CREDITS_EMPTY_HI_R2_NCB 0x01
+UMASK_HA_R2_BL_CREDITS_EMPTY_HI_R2_NCS 0x01
+
+EVENT_QPI0_AD_CREDITS_EMPTY 0x20 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI0_AD_CREDITS_EMPTY_VNA 0x01
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_QPI1_AD_CREDITS_EMPTY 0x2E RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI1_AD_CREDITS_EMPTY_VNA 0x01
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_QPI0_BL_CREDITS_EMPTY 0x21 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI0_BL_CREDITS_EMPTY_VNA 0x01
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_QPI1_BL_CREDITS_EMPTY 0x2F RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI1_BL_CREDITS_EMPTY_VNA 0x01
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_RING_AD_USED 0x07 RBOX
+UMASK_RING_AD_USED_CW_EVEN 0x01
+UMASK_RING_AD_USED_CW_ODD 0x02
+UMASK_RING_AD_USED_CW 0x03
+UMASK_RING_AD_USED_CCW_EVEN 0x04
+UMASK_RING_AD_USED_CCW_ODD 0x08
+UMASK_RING_AD_USED_CCW 0x0C
+UMASK_RING_AD_USED_ANY 0x0F
+
+EVENT_RING_AK_USED 0x08 RBOX
+UMASK_RING_AK_USED_CW_EVEN 0x01
+UMASK_RING_AK_USED_CW_ODD 0x02
+UMASK_RING_AK_USED_CW 0x03
+UMASK_RING_AK_USED_CCW_EVEN 0x04
+UMASK_RING_AK_USED_CCW_ODD 0x08
+UMASK_RING_AK_USED_CCW 0x0C
+UMASK_RING_AK_USED_ANY 0x0F
+
+EVENT_RING_BL_USED 0x09 RBOX
+UMASK_RING_BL_USED_CW_EVEN 0x01
+UMASK_RING_BL_USED_CW_ODD 0x02
+UMASK_RING_BL_USED_CW 0x03
+UMASK_RING_BL_USED_CCW_EVEN 0x04
+UMASK_RING_BL_USED_CCW_ODD 0x08
+UMASK_RING_BL_USED_CCW 0x0C
+UMASK_RING_BL_USED_ANY 0x0F
+
+EVENT_RING_IV_USED 0x0A RBOX
+UMASK_RING_IV_USED_CW 0x03
+UMASK_RING_IV_USED_CCW 0x0C
+UMASK_RING_IV_USED_ANY 0x0F
+
+EVENT_RING_SINK_STARVED 0x0E RBOX
+UMASK_RING_SINK_STARVED_AK 0x02
+
+EVENT_RXR_CYCLES_NE 0x10 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_CYCLES_NE_HOM 0x01
+UMASK_RXR_CYCLES_NE_SNP 0x02
+UMASK_RXR_CYCLES_NE_NDR 0x04
+
+EVENT_RXR_CYCLES_NE_VN1 0x14 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_CYCLES_NE_VN1_HOM 0x01
+UMASK_RXR_CYCLES_NE_VN1_SNP 0x02
+UMASK_RXR_CYCLES_NE_VN1_NDR 0x04
+UMASK_RXR_CYCLES_NE_VN1_DRS 0x08
+UMASK_RXR_CYCLES_NE_VN1_NCB 0x10
+UMASK_RXR_CYCLES_NE_VN1_NCS 0x20
+
+EVENT_RXR_INSERTS 0x11 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_INSERTS_HOM 0x01
+UMASK_RXR_INSERTS_SNP 0x02
+UMASK_RXR_INSERTS_NDR 0x04
+UMASK_RXR_INSERTS_DRS 0x08
+UMASK_RXR_INSERTS_NCB 0x10
+UMASK_RXR_INSERTS_NCS 0x20
+
+EVENT_RXR_INSERTS_VN1 0x15 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_INSERTS_VN1_HOM 0x01
+UMASK_RXR_INSERTS_VN1_SNP 0x02
+UMASK_RXR_INSERTS_VN1_NDR 0x04
+UMASK_RXR_INSERTS_VN1_DRS 0x08
+UMASK_RXR_INSERTS_VN1_NCB 0x10
+UMASK_RXR_INSERTS_VN1_NCS 0x20
+
+EVENT_RXR_OCCUPANCY_VN1 0x13 RBOX0C0|RBOX1C0
+UMASK_RXR_OCCUPANCY_VN1_HOM 0x01
+UMASK_RXR_OCCUPANCY_VN1_SNP 0x02
+UMASK_RXR_OCCUPANCY_VN1_NDR 0x04
+UMASK_RXR_OCCUPANCY_VN1_DRS 0x08
+UMASK_RXR_OCCUPANCY_VN1_NCB 0x10
+UMASK_RXR_OCCUPANCY_VN1_NCS 0x20
+
+EVENT_TXR_NACK 0x26 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_NACK_DN_AD 0x01
+UMASK_TXR_NACK_DN_BL 0x02
+UMASK_TXR_NACK_DN_AK 0x04
+UMASK_TXR_NACK_UP_AD 0x08
+UMASK_TXR_NACK_UP_BL 0x10
+UMASK_TXR_NACK_UP_AK 0x20
+
+EVENT_SBO0_CREDITS_ACQUIRED 0x28 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_SBO0_CREDITS_ACQUIRED_AD 0x01
+UMASK_SBO0_CREDITS_ACQUIRED_BL 0x02
+
+EVENT_SBO1_CREDITS_ACQUIRED 0x29 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_SBO1_CREDITS_ACQUIRED_AD 0x01
+UMASK_SBO1_CREDITS_ACQUIRED_BL 0x02
+
+EVENT_STALL_NO_SBO_CREDIT 0x2C RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_STALL_NO_SBO_CREDIT_SBO0_AD 0x01
+UMASK_STALL_NO_SBO_CREDIT_SBO1_AD 0x02
+UMASK_STALL_NO_SBO_CREDIT_SBO0_BL 0x04
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL 0x08
+
+EVENT_VN0_CREDITS_USED 0x36 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_USED_HOM 0x01
+UMASK_VN0_CREDITS_USED_SNP 0x02
+UMASK_VN0_CREDITS_USED_NDR 0x04
+UMASK_VN0_CREDITS_USED_DRS 0x08
+UMASK_VN0_CREDITS_USED_NCB 0x10
+UMASK_VN0_CREDITS_USED_NCS 0x20
+
+EVENT_VN0_CREDITS_REJECT 0x37 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_REJECT_HOM 0x01
+UMASK_VN0_CREDITS_REJECT_SNP 0x02
+UMASK_VN0_CREDITS_REJECT_NDR 0x04
+UMASK_VN0_CREDITS_REJECT_DRS 0x08
+UMASK_VN0_CREDITS_REJECT_NCB 0x10
+UMASK_VN0_CREDITS_REJECT_NCS 0x20
+
+EVENT_VN1_CREDITS_USED 0x38 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN1_CREDITS_USED_HOM 0x01
+UMASK_VN1_CREDITS_USED_SNP 0x02
+UMASK_VN1_CREDITS_USED_NDR 0x04
+UMASK_VN1_CREDITS_USED_DRS 0x08
+UMASK_VN1_CREDITS_USED_NCB 0x10
+UMASK_VN1_CREDITS_USED_NCS 0x20
+
+EVENT_VN1_CREDITS_REJECT 0x39 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN1_CREDITS_REJECT_HOM 0x01
+UMASK_VN1_CREDITS_REJECT_SNP 0x02
+UMASK_VN1_CREDITS_REJECT_NDR 0x04
+UMASK_VN1_CREDITS_REJECT_DRS 0x08
+UMASK_VN1_CREDITS_REJECT_NCB 0x10
+UMASK_VN1_CREDITS_REJECT_NCS 0x20
+
+EVENT_VNA_CREDITS_ACQUIRED 0x33 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_ACQUIRED_AD 0x01
+UMASK_VNA_CREDITS_ACQUIRED_BL 0x04
+
+EVENT_VNA_CREDITS_REJECT 0x34 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_REJECT_HOM 0x01
+UMASK_VNA_CREDITS_REJECT_SNP 0x02
+UMASK_VNA_CREDITS_REJECT_NDR 0x04
+UMASK_VNA_CREDITS_REJECT_DRS 0x08
+UMASK_VNA_CREDITS_REJECT_NCB 0x10
+UMASK_VNA_CREDITS_REJECT_NCS 0x20
+
+EVENT_QBOX_CLOCKTICKS 0x14 QBOX
+UMASK_QBOX_CLOCKTICKS 0x00
+
+EVENT_CTO_COUNT 0x38 QBOX
+OPTIONS_CTO_COUNT EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MASK2_MASK|EVENT_OPTION_MASK3_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK|EVENT_OPTION_MATCH2_MASK|EVENT_OPTION_MATCH3_MASK
+UMASK_CTO_COUNT 0x00 0x01 0x00
+
+EVENT_DIRECT2CORE 0x13 QBOX
+UMASK_DIRECT2CORE_SUCCESS_RBT_HIT 0x01
+UMASK_DIRECT2CORE_FAILURE_CREDITS 0x02
+UMASK_DIRECT2CORE_FAILURE_RBT_HIT 0x04
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT 0x08
+UMASK_DIRECT2CORE_FAILURE_MISS 0x10
+UMASK_DIRECT2CORE_FAILURE_CREDITS_MISS 0x20
+UMASK_DIRECT2CORE_FAILURE_RBT_MISS 0x40
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT_MISS 0x80
+
+EVENT_L1_POWER_CYCLES 0x12 QBOX
+UMASK_L1_POWER_CYCLES 0x00
+
+EVENT_RXL0P_POWER_CYCLES 0x10 QBOX
+UMASK_RXL0P_POWER_CYCLES 0x00
+
+EVENT_RXL0_POWER_CYCLES 0x0F QBOX
+UMASK_RXL0_POWER_CYCLES 0x00
+
+EVENT_TXL0P_POWER_CYCLES 0x0D QBOX
+UMASK_TXL0P_POWER_CYCLES 0x00
+
+EVENT_TXL0_POWER_CYCLES 0x0C QBOX
+UMASK_TXL0_POWER_CYCLES 0x00
+
+EVENT_RXL_BYPASSED 0x09 QBOX
+UMASK_RXL_BYPASSED 0x00
+
+EVENT_TXL_BYPASSED 0x05 QBOX
+UMASK_TXL_BYPASSED 0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VN0 0x1E QBOX
+UMASK_RXL_CREDITS_CONSUMED_VN0_DRS 0x01 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCB 0x02 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCS 0x04 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_HOM 0x08 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_SNP 0x10 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_NDR 0x20 0x01 0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VN1 0x39 QBOX
+UMASK_RXL_CREDITS_CONSUMED_VN1_DRS 0x01 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_NCB 0x02 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_NCS 0x04 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_HOM 0x08 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_SNP 0x10 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_NDR 0x20 0x01 0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VNA 0x1D QBOX
+UMASK_RXL_CREDITS_CONSUMED_VNA 0x00 0x01 0x00
+
+EVENT_RXL_CYCLES_NE 0x0A QBOX
+UMASK_RXL_CYCLES_NE 0x00
+
+EVENT_TXL_CYCLES_NE 0x06 QBOX
+UMASK_TXL_CYCLES_NE 0x00
+
+EVENT_RXL_FLITS_G1 0x02 QBOX
+UMASK_RXL_FLITS_G1_SNP 0x01 0x01 0x00
+UMASK_RXL_FLITS_G1_HOM_REQ 0x02 0x01 0x00
+UMASK_RXL_FLITS_G1_HOM_NONREQ 0x04 0x01 0x00
+UMASK_RXL_FLITS_G1_HOM 0x06 0x01 0x00
+UMASK_RXL_FLITS_G1_DRS_DATA 0x08 0x01 0x00
+UMASK_RXL_FLITS_G1_DRS_NONDATA 0x10 0x01 0x00
+UMASK_RXL_FLITS_G1_DRS 0x18 0x01 0x00
+
+EVENT_RXL_FLITS_G2 0x03 QBOX
+UMASK_RXL_FLITS_G2_NDR_AD 0x01 0x01 0x00
+UMASK_RXL_FLITS_G2_NDR_AK 0x02 0x01 0x00
+UMASK_RXL_FLITS_G2_NCB_DATA 0x04 0x01 0x00
+UMASK_RXL_FLITS_G2_NCB_NONDATA 0x08 0x01 0x00
+UMASK_RXL_FLITS_G2_NCB 0x0C 0x01 0x00
+UMASK_RXL_FLITS_G2_NCS 0x10 0x01 0x00
+
+EVENT_RXL_FLITS_G0 0x01 QBOX
+UMASK_RXL_FLITS_G0_IDLE 0x01
+UMASK_RXL_FLITS_G0_DATA 0x02
+UMASK_RXL_FLITS_G0_NON_DATA 0x04
+
+EVENT_TXL_FLITS_G0 0x00 QBOX
+UMASK_TXL_FLITS_G0_IDLE 0x01
+UMASK_TXL_FLITS_G0_DATA 0x02
+UMASK_TXL_FLITS_G0_NON_DATA 0x04
+
+EVENT_TXL_FLITS_G1 0x00 QBOX
+UMASK_TXL_FLITS_G1_SNP 0x01 0x01 0x00
+UMASK_TXL_FLITS_G1_HOM_REQ 0x02 0x01 0x00
+UMASK_TXL_FLITS_G1_HOM_NONREQ 0x04 0x01 0x00
+UMASK_TXL_FLITS_G1_HOM 0x06 0x01 0x00
+UMASK_TXL_FLITS_G1_DRS_DATA 0x08 0x01 0x00
+UMASK_TXL_FLITS_G1_DRS_NONDATA 0x10 0x01 0x00
+UMASK_TXL_FLITS_G1_DRS 0x18 0x01 0x00
+
+EVENT_TXL_FLITS_G2 0x01 QBOX
+UMASK_TXL_FLITS_G2_NDR_AD 0x01 0x01 0x00
+UMASK_TXL_FLITS_G2_NDR_AK 0x02 0x01 0x00
+UMASK_TXL_FLITS_G2_NCB_DATA 0x04 0x01 0x00
+UMASK_TXL_FLITS_G2_NCB_NONDATA 0x08 0x01 0x00
+UMASK_TXL_FLITS_G2_NCB 0x0C 0x01 0x00
+UMASK_TXL_FLITS_G2_NCS 0x10 0x01 0x00
+
+EVENT_RXL_INSERTS 0x08 QBOX
+UMASK_RXL_INSERTS 0x00
+
+EVENT_TXL_INSERTS 0x04 QBOX
+UMASK_TXL_INSERTS 0x00
+
+EVENT_RXL_INSERTS_DRS 0x09 QBOX
+UMASK_RXL_INSERTS_DRS_VN0 0x01 0x01 0x00
+UMASK_RXL_INSERTS_DRS_VN1 0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_HOM 0x0C QBOX
+UMASK_RXL_INSERTS_HOM_VN0 0x01 0x01 0x00
+UMASK_RXL_INSERTS_HOM_VN1 0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_NCB 0x0A QBOX
+UMASK_RXL_INSERTS_NCB_VN0 0x01 0x01 0x00
+UMASK_RXL_INSERTS_NCB_VN1 0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_NCS 0x0B QBOX
+UMASK_RXL_INSERTS_NCS_VN0 0x01 0x01 0x00
+UMASK_RXL_INSERTS_NCS_VN1 0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_NDR 0x0E QBOX
+UMASK_RXL_INSERTS_NDR_VN0 0x01 0x01 0x00
+UMASK_RXL_INSERTS_NDR_VN1 0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_SNP 0x0D QBOX
+UMASK_RXL_INSERTS_SNP_VN0 0x01 0x01 0x00
+UMASK_RXL_INSERTS_SNP_VN1 0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY 0x0B QBOX
+UMASK_RXL_OCCUPANCY 0x00
+
+EVENT_TXL_OCCUPANCY 0x07 QBOX
+UMASK_TXL_OCCUPANCY 0x00
+
+EVENT_RXL_OCCUPANCY_DRS 0x15 QBOX
+UMASK_RXL_OCCUPANCY_DRS_VN0 0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_DRS_VN1 0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_HOM 0x18 QBOX
+UMASK_RXL_OCCUPANCY_HOM_VN0 0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_HOM_VN1 0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_NCB 0x16 QBOX
+UMASK_RXL_OCCUPANCY_NCB_VN0 0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_NCB_VN1 0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_NCS 0x17 QBOX
+UMASK_RXL_OCCUPANCY_NCS_VN0 0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_NCS_VN1 0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_NDR 0x1A QBOX
+UMASK_RXL_OCCUPANCY_NDR_VN0 0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_NDR_VN1 0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_SNP 0x19 QBOX
+UMASK_RXL_OCCUPANCY_SNP_VN0 0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_SNP_VN1 0x02 0x01 0x00
+
+EVENT_TXR_AD_HOM_CREDIT_ACQUIRED 0x26 QBOX
+UMASK_TXR_AD_HOM_CREDIT_ACQUIRED_VN0 0x01 0x01 0x00
+UMASK_TXR_AD_HOM_CREDIT_ACQUIRED_VN1 0x02 0x01 0x00
+
+EVENT_TXR_AD_HOM_CREDIT_OCCUPANCY 0x22 QBOX
+UMASK_TXR_AD_HOM_CREDIT_OCCUPANCY_VN0 0x01 0x01 0x00
+UMASK_TXR_AD_HOM_CREDIT_OCCUPANCY_VN1 0x02 0x01 0x00
+
+EVENT_TXR_AD_NDR_CREDIT_ACQUIRED 0x28 QBOX
+UMASK_TXR_AD_NDR_CREDIT_ACQUIRED_VN0 0x01 0x01 0x00
+UMASK_TXR_AD_NDR_CREDIT_ACQUIRED_VN1 0x02 0x01 0x00
+
+EVENT_TXR_AD_NDR_CREDIT_OCCUPANCY 0x24 QBOX
+UMASK_TXR_AD_NDR_CREDIT_OCCUPANCY_VN0 0x01 0x01 0x00
+UMASK_TXR_AD_NDR_CREDIT_OCCUPANCY_VN1 0x02 0x01 0x00
+
+EVENT_TXR_AD_SNP_CREDIT_ACQUIRED 0x27 QBOX
+UMASK_TXR_AD_SNP_CREDIT_ACQUIRED_VN0 0x01 0x01 0x00
+UMASK_TXR_AD_SNP_CREDIT_ACQUIRED_VN1 0x02 0x01 0x00
+
+EVENT_TXR_AD_SNP_CREDIT_OCCUPANCY 0x23 QBOX
+UMASK_TXR_AD_SNP_CREDIT_OCCUPANCY_VN0 0x01 0x01 0x00
+UMASK_TXR_AD_SNP_CREDIT_OCCUPANCY_VN1 0x02 0x01 0x00
+
+EVENT_TXR_AK_NDR_CREDIT_ACQUIRED 0x29 QBOX
+UMASK_TXR_AK_NDR_CREDIT_ACQUIRED 0x00 0x01 0x00
+
+EVENT_TXR_AK_NDR_CREDIT_OCCUPANCY 0x25 QBOX
+UMASK_TXR_AK_NDR_CREDIT_OCCUPANCY 0x00 0x01 0x00
+
+EVENT_TXR_BL_DRS_CREDIT_ACQUIRED 0x2A QBOX
+UMASK_TXR_BL_DRS_CREDIT_ACQUIRED_VN0 0x01 0x01 0x00
+UMASK_TXR_BL_DRS_CREDIT_ACQUIRED_VN1 0x02 0x01 0x00
+UMASK_TXR_BL_DRS_CREDIT_ACQUIRED_VN_SHR 0x04 0x01 0x00
+
+EVENT_TXR_BL_DRS_CREDIT_OCCUPANCY 0x1F QBOX
+UMASK_TXR_BL_DRS_CREDIT_OCCUPANCY_VN0 0x01 0x01 0x00
+UMASK_TXR_BL_DRS_CREDIT_OCCUPANCY_VN1 0x02 0x01 0x00
+UMASK_TXR_BL_DRS_CREDIT_OCCUPANCY_VN_SHR 0x04 0x01 0x00
+
+EVENT_TXR_BL_NCB_CREDIT_ACQUIRED 0x2B QBOX
+UMASK_TXR_BL_NCB_CREDIT_ACQUIRED_VN0 0x01 0x01 0x00
+UMASK_TXR_BL_NCB_CREDIT_ACQUIRED_VN1 0x02 0x01 0x00
+
+EVENT_TXR_BL_NCB_CREDIT_OCCUPANCY 0x20 QBOX
+UMASK_TXR_BL_NCB_CREDIT_OCCUPANCY_VN0 0x01 0x01 0x00
+UMASK_TXR_BL_NCB_CREDIT_OCCUPANCY_VN1 0x02 0x01 0x00
+
+EVENT_TXR_BL_NCS_CREDIT_ACQUIRED 0x2C QBOX
+UMASK_TXR_BL_NCS_CREDIT_ACQUIRED_VN0 0x01 0x01 0x00
+UMASK_TXR_BL_NCS_CREDIT_ACQUIRED_VN1 0x02 0x01 0x00
+
+EVENT_TXR_BL_NCS_CREDIT_OCCUPANCY 0x21 QBOX
+UMASK_TXR_BL_NCS_CREDIT_OCCUPANCY_VN0 0x01 0x01 0x00
+UMASK_TXR_BL_NCS_CREDIT_OCCUPANCY_VN1 0x02 0x01 0x00
+
+EVENT_VNA_CREDIT_RETURNS 0x1C QBOX
+UMASK_VNA_CREDIT_RETURNS 0x00 0x01 0x00
+
+EVENT_VNA_CREDIT_RETURN_OCCUPANCY 0x1B QBOX
+UMASK_VNA_CREDIT_RETURN_OCCUPANCY 0x00 0x01 0x00
+
+EVENT_QPI_RATE 0x00 QBOX0FIX0|QBOX1FIX0
+UMASK_QPI_RATE 0x00
+
+EVENT_QPI_RX_IDLE 0x01 QBOX0FIX1|QBOX1FIX1
+UMASK_QPI_RX_IDLE 0x00
+
+EVENT_QPI_RX_LLR 0x02 QBOX0FIX2|QBOX1FIX2
+UMASK_QPI_RX_LLR 0x00
diff --git a/src/includes/perfmon_haswell_counters.h b/src/includes/perfmon_haswell_counters.h
index 3dc7247..4964994 100644
--- a/src/includes/perfmon_haswell_counters.h
+++ b/src/includes/perfmon_haswell_counters.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_haswell_counters.h
*
- * Description: Counter Header File of perfmon module for Haswell.
+ * Description: Counter Header File of perfmon module for Intel Haswell.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -28,26 +29,56 @@
* =======================================================================================
*/
-#define NUM_COUNTERS_HASWELL 12
-#define NUM_COUNTERS_UNCORE_HASWELL 4
+#define NUM_COUNTERS_HASWELL 23
#define NUM_COUNTERS_CORE_HASWELL 8
+#define NUM_COUNTERS_UNCORE_HASWELL 15
-static PerfmonCounterMap haswell_counter_map[NUM_COUNTERS_HASWELL] = {
+#define HAS_VALID_OPTIONS_FIXED EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK
+#define HAS_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+ EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define HAS_VALID_OPTIONS_CBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
+#define HAS_VALID_OPTIONS_UBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
+
+static RegisterMap haswell_counter_map[NUM_COUNTERS_HASWELL] = {
/* Fixed Counters: instructions retired, cycles unhalted core */
- {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
- {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
- {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+ {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, HAS_VALID_OPTIONS_FIXED},
+ {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, HAS_VALID_OPTIONS_FIXED},
+ {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, HAS_VALID_OPTIONS_FIXED},
/* PMC Counters: 4 48bit wide */
- {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
- {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
- {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0},
- {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0},
+ {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, HAS_VALID_OPTIONS_PMC},
+ {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, HAS_VALID_OPTIONS_PMC},
+ {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, HAS_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK},
+ {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, HAS_VALID_OPTIONS_PMC},
/* Temperature Sensor*/
- {"TMP0", PMC7, THERMAL, 0, 0, 0, 0},
+ {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
/* RAPL counters */
- {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
- {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0},
- {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0},
- {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0},
+ {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"CBOX0C0", PMC12, CBOX0, MSR_UNC_CBO_0_PERFEVTSEL0, MSR_UNC_CBO_0_CTR0, 0, 0, HAS_VALID_OPTIONS_CBOX},
+ {"CBOX0C1", PMC13, CBOX0, MSR_UNC_CBO_0_PERFEVTSEL1, MSR_UNC_CBO_0_CTR1, 0, 0, HAS_VALID_OPTIONS_CBOX},
+ {"CBOX1C0", PMC14, CBOX1, MSR_UNC_CBO_1_PERFEVTSEL0, MSR_UNC_CBO_1_CTR0, 0, 0, HAS_VALID_OPTIONS_CBOX},
+ {"CBOX1C1", PMC15, CBOX1, MSR_UNC_CBO_1_PERFEVTSEL1, MSR_UNC_CBO_1_CTR1, 0, 0, HAS_VALID_OPTIONS_CBOX},
+ {"CBOX2C0", PMC16, CBOX2, MSR_UNC_CBO_2_PERFEVTSEL0, MSR_UNC_CBO_2_CTR0, 0, 0, HAS_VALID_OPTIONS_CBOX},
+ {"CBOX2C1", PMC17, CBOX2, MSR_UNC_CBO_2_PERFEVTSEL1, MSR_UNC_CBO_2_CTR1, 0, 0, HAS_VALID_OPTIONS_CBOX},
+ {"CBOX3C0", PMC18, CBOX3, MSR_UNC_CBO_3_PERFEVTSEL0, MSR_UNC_CBO_3_CTR0, 0, 0, HAS_VALID_OPTIONS_CBOX},
+ {"CBOX3C1", PMC19, CBOX3, MSR_UNC_CBO_3_PERFEVTSEL1, MSR_UNC_CBO_3_CTR1, 0, 0, HAS_VALID_OPTIONS_CBOX},
+ {"UBOX0", PMC20, UBOX, MSR_UNC_ARB_PERFEVTSEL0, MSR_UNC_ARB_CTR0, 0, 0, HAS_VALID_OPTIONS_UBOX},
+ {"UBOX1", PMC21, UBOX, MSR_UNC_ARB_PERFEVTSEL1, MSR_UNC_ARB_CTR1, 0, 0, HAS_VALID_OPTIONS_UBOX},
+ {"UBOXFIX", PMC22, UBOXFIX, MSR_UNC_PERF_FIXED_CTRL, MSR_UNC_PERF_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
};
+
+static BoxMap haswell_box_map[NUM_UNITS] = {
+ [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+ [THERMAL] = {0, 0, 0, 0, 0, 0, 8},
+ [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+ [POWER] = {0, 0, 0, 0, 0, 0, 32},
+ [CBOX0] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+ [CBOX1] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+ [CBOX2] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+ [CBOX3] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+ [UBOX] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 1, 0, MSR_DEV, 44},
+ [UBOXFIX] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 44},
+};
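
The per-counter option masks introduced above (HAS_VALID_OPTIONS_PMC, HAS_VALID_OPTIONS_CBOX, and so on) are plain bit-ORs of option flags, and the last field of each RegisterMap entry records which options a counter accepts. Below is a minimal standalone sketch of how such a mask would be tested; the flag names and bit values are invented for illustration and are not the likwid definitions, only the AND-test pattern mirrors the header.

/* Hypothetical sketch: option flags ORed into a per-counter mask and
 * checked before an option is applied.  Names and bit positions are
 * made up for this example. */
#include <stdint.h>
#include <stdio.h>

#define OPT_EDGE      (1ULL << 0)
#define OPT_INVERT    (1ULL << 1)
#define OPT_THRESHOLD (1ULL << 2)
#define OPT_ANYTHREAD (1ULL << 3)

#define VALID_OPTIONS_PMC  (OPT_EDGE | OPT_INVERT | OPT_THRESHOLD | OPT_ANYTHREAD)
#define VALID_OPTIONS_CBOX (OPT_EDGE | OPT_INVERT | OPT_THRESHOLD)

static int option_allowed(uint64_t counter_mask, uint64_t option_bit)
{
    return (counter_mask & option_bit) != 0;
}

int main(void)
{
    /* ANYTHREAD is valid on the core PMCs but not on the C-box counters */
    printf("PMC  + ANYTHREAD: %d\n", option_allowed(VALID_OPTIONS_PMC,  OPT_ANYTHREAD));
    printf("CBOX + ANYTHREAD: %d\n", option_allowed(VALID_OPTIONS_CBOX, OPT_ANYTHREAD));
    return 0;
}
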
diff --git a/src/includes/perfmon_haswell_events.txt b/src/includes/perfmon_haswell_events.txt
index f958a3a..bc5a37d 100644
--- a/src/includes/perfmon_haswell_events.txt
+++ b/src/includes/perfmon_haswell_events.txt
@@ -1,16 +1,17 @@
# =======================================================================================
-#
+#
# Filename: perfmon_haswell_events.txt
-#
-# Description: Event list for Intel Ivy Bridge
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
+#
+# Description: Event list for Intel Haswell
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -46,6 +47,8 @@ UMASK_INSTR_RETIRED_ANY 0x00
EVENT_CPU_CLK_UNHALTED 0x00 FIXC1
UMASK_CPU_CLK_UNHALTED_CORE 0x00
+DEFAULT_OPTIONS_CPU_CLK_UNHALTED_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLK_UNHALTED_ANY 0x00
EVENT_CPU_CLK_UNHALTED 0x00 FIXC2
UMASK_CPU_CLK_UNHALTED_REF 0x00
@@ -54,15 +57,15 @@ EVENT_LD_BLOCKS 0x03 PMC
UMASK_LD_BLOCKS_STORE_FORWARD 0x02
UMASK_LD_BLOCKS_NO_SR 0x08
-EVENT_MISALIGN_MEM_REF 0x05 PMC
+EVENT_MISALIGN_MEM_REF 0x05 PMC
UMASK_MISALIGN_MEM_REF_LOADS 0x01
UMASK_MISALIGN_MEM_REF_STORES 0x02
UMASK_MISALIGN_MEM_REF_ANY 0x03
-EVENT_LD_BLOCKS_PARTIAL 0x07 PMC
+EVENT_LD_BLOCKS_PARTIAL 0x07 PMC
UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS 0x01
-EVENT_DTLB_LOAD_MISSES 0x08 PMC
+EVENT_DTLB_LOAD_MISSES 0x08 PMC
UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK 0x01
UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_4K 0x02
UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_LARGE 0x04
@@ -73,125 +76,211 @@ UMASK_DTLB_LOAD_MISSES_STLB_HIT_2M 0x40
UMASK_DTLB_LOAD_MISSES_STLB_HIT 0x60
UMASK_DTLB_LOAD_MISSES_PDE_CACHE_MISS 0x80
-EVENT_INT_MISC 0x0D PMC
-UMASK_INT_MISC_RECOVERY_CYCLES 0x03 0x01
+EVENT_INT_MISC 0x0D PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_INT_MISC_RECOVERY_CYCLES 0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=0x1
+UMASK_INT_MISC_RECOVERY_COUNT 0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES_ANY EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_INT_MISC_RECOVERY_CYCLES_ANY 0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT_ANY EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_EDGE=0x1
+UMASK_INT_MISC_RECOVERY_COUNT_ANY 0x03
EVENT_UOPS_ISSUED 0x0E PMC
UMASK_UOPS_ISSUED_ANY 0x01
UMASK_UOPS_ISSUED_FLAGS_MERGE 0x10
UMASK_UOPS_ISSUED_SLOW_LEA 0x20
UMASK_UOPS_ISSUED_SINGLE_MUL 0x40
-
-EVENT_L2_RQSTS 0x24 PMC
-UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD_MISS 0x21
-UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD_HIT 0x41
-UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD 0xE1
-UMASK_L2_RQSTS_RFO_HIT 0x42
-UMASK_L2_RQSTS_RFO_MISS 0x22
-UMASK_L2_RQSTS_ALL_RFO 0xE2
-UMASK_L2_RQSTS_CODE_RD_HIT 0x44
-UMASK_L2_RQSTS_CODE_RD_MISS 0x24
-UMASK_L2_RQSTS_ALL_DEMAND_MISS 0x27
-UMASK_L2_RQSTS_ALL_DEMAND_REFERENCES 0xE7
-UMASK_L2_RQSTS_ALL_CODE_RD 0xE4
-UMASK_L2_RQSTS_L2_PF_HIT 0x50
-UMASK_L2_RQSTS_L2_PF_MISS 0x30
-UMASK_L2_RQSTS_ALL_PF 0xF8
-UMASK_L2_RQSTS_MISS 0x3F
-UMASK_L2_RQSTS_REFERENCES 0xFF
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_ARITH_DIVIDER_UOPS 0x14 PMC
+UMASK_ARITH_DIVIDER_CYCLES 0x01
+UMASK_ARITH_DIVIDER_UOPS 0x02
+
+EVENT_L2_RQSTS 0x24 PMC
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD_MISS 0x21
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD_HIT 0x41
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD 0xE1
+UMASK_L2_RQSTS_RFO_HIT 0x42
+UMASK_L2_RQSTS_RFO_MISS 0x22
+UMASK_L2_RQSTS_ALL_RFO 0xE2
+UMASK_L2_RQSTS_CODE_RD_HIT 0x44
+UMASK_L2_RQSTS_CODE_RD_MISS 0x24
+UMASK_L2_RQSTS_ALL_DEMAND_MISS 0x27
+UMASK_L2_RQSTS_ALL_DEMAND_REFERENCES 0xE7
+UMASK_L2_RQSTS_ALL_CODE_RD 0xE4
+UMASK_L2_RQSTS_L2_PF_HIT 0x50
+UMASK_L2_RQSTS_L2_PF_MISS 0x30
+UMASK_L2_RQSTS_ALL_PF 0xF8
+UMASK_L2_RQSTS_MISS 0x3F
+UMASK_L2_RQSTS_REFERENCES 0xFF
EVENT_L2_DEMAND_RQST_WB_HIT 0x27 PMC
-UMASK_L2_DEMAND_RQST_WB_HIT 0x50
+UMASK_L2_DEMAND_RQST_WB_HIT 0x50
-EVENT_LONGEST_LAT_CACHE_REFERENCE 0x2E PMC
+EVENT_LONGEST_LAT_CACHE 0x2E PMC
UMASK_LONGEST_LAT_CACHE_REFERENCE 0x4F
UMASK_LONGEST_LAT_CACHE_MISS 0x41
EVENT_CPU_CLOCK_UNHALTED 0x3C PMC
UMASK_CPU_CLOCK_UNHALTED_THREAD_P 0x00
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_THREAD_P_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P_ANY 0x00
UMASK_CPU_CLOCK_UNHALTED_REF_XCLK 0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_REF_XCLK_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK_ANY 0x01
+UMASK_CPU_CLOCK_THREAD_UNHALTED_ONE_THREAD_ACTIVE 0x02
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES 0x00
-EVENT_L1D_PEND_MISS 0x48 PMC1
+EVENT_L1D_PEND_MISS 0x48 PMC2
UMASK_L1D_PEND_MISS_PENDING 0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES_ANY EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_PEND_MISS_PENDING_CYCLES_ANY 0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_PEND_MISS_PENDING_CYCLES 0x01
-EVENT_DTLB_STORE_MISSES 0x49 PMC
-UMASK_DTLB_STORE_MISSES_MISS_CAUSES_A_WALK 0x01
+EVENT_L1D_PEND_MISS_REQUEST_FB_FULL 0x48 PMC
+UMASK_L1D_PEND_MISS_REQUEST_FB_FULL 0x02
+DEFAULT_OPTIONS_L1D_PEND_MISS_FB_FULL EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_PEND_MISS_FB_FULL 0x02
+
+
+EVENT_DTLB_STORE_MISSES 0x49 PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK 0x01
UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_4K 0x02
UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_LARGE 0x04
UMASK_DTLB_STORE_MISSES_WALK_COMPLETED 0x0E
-UMASK_DTLB_STORE_MISSES_WALK_DURATION 0x10
+UMASK_DTLB_STORE_MISSES_WALK_DURATION 0x10
UMASK_DTLB_STORE_MISSES_STLB_HIT_4K 0x20
-UMASK_DTLB_STORE_MISSES_STLB_HIT_LARGE 0x40
-UMASK_DTLB_STORE_MISSES_STLB_HIT 0x60
-UMASK_DTLB_STORE_MISSES_PDE_CACHE_MISS 0x80
+UMASK_DTLB_STORE_MISSES_STLB_HIT_2M 0x40
+UMASK_DTLB_STORE_MISSES_STLB_HIT 0x60
+UMASK_DTLB_STORE_MISSES_PDE_CACHE_MISS 0x80
-EVENT_LOAD_HIT_PRE 0x4C PMC
+EVENT_LOAD_HIT_PRE 0x4C PMC
UMASK_LOAD_HIT_PRE_SW_PF 0x01
UMASK_LOAD_HIT_PRE_HW_PF 0x02
-EVENT_L1D 0x51 PMC
+EVENT_EPT_WALK_CYCLES 0x4F PMC
+UMASK_EPT_WALK_CYCLES 0x10
+
+EVENT_L1D 0x51 PMC
UMASK_L1D_REPLACEMENT 0x01
UMASK_L1D_ALLOCATED_IN_M 0x02
UMASK_L1D_M_EVICT 0x04
UMASK_L1D_ALL_M_REPLACEMENT 0x08
+EVENT_TX_MEM 0x54 PMC
+UMASK_TX_MEM_ABORT_CONFLICT 0x01
+UMASK_TX_MEM_ABORT_CAPACITY_WRITE 0x02
+UMASK_TX_MEM_ABORT_HLE_STORE_TO_ELIDED_LOCK 0x04
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_NOT_EMPTY 0x08
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_MISMATCH 0x10
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGNMENT 0x20
+UMASK_TX_MEM_HLE_ELISION_BUFFER_FULL 0x40
+
EVENT_MOVE_ELIMINATION 0x58 PMC
UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED 0x04
UMASK_MOVE_ELIMINATION_SIMD_NOT_ELIMINATED 0x08
UMASK_MOVE_ELIMINATION_INT_ELIMINATED 0x01
UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED 0x02
-EVENT_CPL_CYCLES 0x5C PMC
+EVENT_CPL_CYCLES 0x5C PMC
UMASK_CPL_CYCLES_RING0 0x01
-UMASK_CPL_CYCLES_RING123 0x02
-
-EVENT_RS_EVENTS 0x5E PMC
+UMASK_CPL_CYCLES_RING123 0x02
+DEFAULT_OPTIONS_CPL_CYCLES_RING0_TRANS EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_CPL_CYCLES_RING0_TRANS 0x01
+
+EVENT_TX_EXEC 0x5D PMC
+UMASK_TX_EXEC_MISC1 0x01
+UMASK_TX_EXEC_MISC2 0x02
+UMASK_TX_EXEC_MISC3 0x04
+UMASK_TX_EXEC_MISC4 0x08
+UMASK_TX_EXEC_MISC5 0x10
+
+EVENT_RS_EVENTS 0x5E PMC
UMASK_RS_EVENTS_EMPTY_CYCLES 0x01
+DEFAULT_OPTIONS_RS_EVENTS_EMPTY_END EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_EDGE=1
+UMASK_RS_EVENTS_EMPTY_END 0x01
-EVENT_OFFCORE_REQUESTS_OUTSTANDING 0x60 PMC
+# Errata HSW62: May be unreliable in SMT mode
+EVENT_OFFCORE_REQUESTS_OUTSTANDING 0x60 PMC
UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD 0x01
UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD 0x02
-UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO 0x04
-UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD 0x08
-
-EVENT_CACHE_LOCK_CYCLES 0x63 PMC
-UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
-UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION 0x02
-
-EVENT_IDQ 0x79 PMC
-UMASK_IDQ_EMPTY 0x02
-UMASK_IDQ_MITE_UOPS 0x04
-UMASK_IDQ_MITE_UOPS_CYCLES 0x04 0x00 0x01
-UMASK_IDQ_DSB_UOPS 0x08
-UMASK_IDQ_DSB_UOPS_CYCLES 0x08 0x00 0x01
-UMASK_IDQ_MS_DSB_UOPS 0x10
-UMASK_IDQ_MS_DSB_UOPS_CYCLES 0x10 0x00 0x01
-UMASK_IDQ_MS_MITE_UOPS 0x20
-UMASK_IDQ_MS_MITE_UOPS_CYCLES 0x20 0x00 0x01
-UMASK_IDQ_MS_UOPS 0x30
-UMASK_IDQ_MS_UOPS_CYCLES 0x30 0x00 0x01
-UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS 0x18 0x00 0x01
-UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS 0x18 0x00 0x01
-UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS 0x18 0x00 0x04
-UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS 0x24 0x00 0x01
-UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS 0x24 0x00 0x04
-UMASK_IDQ_MITE_ALL_UOPS 0x3C
-
-EVENT_ICACHE 0x80 PMC
-UMASK_ICACHE_HITS 0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO 0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD 0x08
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_DATA_RD EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_DATA_RD 0x01
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DATA_RD EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DATA_RD 0x08
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD_C6 EVENT_OPTION_THRESHOLD=0x6
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD_C6 0x01
+
+EVENT_LOCK_CYCLES 0x63 PMC
+UMASK_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
+DEFAULT_OPTIONS_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_COUNT EVENT_OPTION_EDGE=0x1
+UMASK_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_COUNT 0x01
+UMASK_LOCK_CYCLES_CACHE_LOCK_DURATION 0x02
+DEFAULT_OPTIONS_LOCK_CYCLES_CACHE_LOCK_COUNT EVENT_OPTION_EDGE=0x1
+UMASK_LOCK_CYCLES_CACHE_LOCK_COUNT 0x02
+
+EVENT_IDQ 0x79 PMC
+UMASK_IDQ_EMPTY 0x02
+UMASK_IDQ_MITE_UOPS 0x04
+DEFAULT_OPTIONS_IDQ_MITE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MITE_CYCLES 0x04
+UMASK_IDQ_DSB_UOPS 0x08
+DEFAULT_OPTIONS_IDQ_DSB_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_DSB_CYCLES 0x08
+UMASK_IDQ_MS_DSB_UOPS 0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MS_DSB_CYCLES 0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_OCCUR EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_DSB_OCCUR 0x10
+UMASK_IDQ_MS_MITE_UOPS 0x20
+DEFAULT_OPTIONS_IDQ_MS_MITE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MS_MITE_CYCLES 0x20
+UMASK_IDQ_MS_UOPS 0x30
+DEFAULT_OPTIONS_IDQ_MS_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MS_CYCLES 0x30
+DEFAULT_OPTIONS_IDQ_MS_SWITCHES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_SWITCHES 0x30
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS 0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS 0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS 0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS 0x24
+UMASK_IDQ_MITE_ALL_UOPS 0x3C
+
+EVENT_ICACHE 0x80 PMC
+UMASK_ICACHE_HIT 0x01
UMASK_ICACHE_MISSES 0x02
UMASK_ICACHE_ACCESSES 0x03
UMASK_ICACHE_IFETCH_STALL 0x04
-EVENT_ITLB_MISSES 0x85 PMC
-UMASK_ITLB_MISSES_CAUSES_A_WALK 0x01
-UMASK_ITLB_MISSES_WALK_COMPLETED_4K 0x02
+EVENT_ITLB_MISSES 0x85 PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK 0x01
+UMASK_ITLB_MISSES_WALK_COMPLETED_4K 0x02
UMASK_ITLB_MISSES_WALK_COMPLETED_LARGE 0x04
-UMASK_ITLB_MISSES_WALK_COMPLETED 0x0E
-UMASK_ITLB_MISSES_WALK_DURATION 0x10
-UMASK_ITLB_MISSES_STLB_HIT_4K 0x20
-UMASK_ITLB_MISSES_STLB_HIT_2M 0x40
-UMASK_ITLB_MISSES_STLB_HIT 0x60
+UMASK_ITLB_MISSES_WALK_COMPLETED 0x0E
+UMASK_ITLB_MISSES_WALK_DURATION 0x10
+UMASK_ITLB_MISSES_STLB_HIT_4K 0x20
+UMASK_ITLB_MISSES_STLB_HIT_2M 0x40
+UMASK_ITLB_MISSES_STLB_HIT 0x60
EVENT_ILD_STALL 0x87 PMC
UMASK_ILD_STALL_LCP 0x01
@@ -201,25 +290,51 @@ EVENT_BR_INST_EXEC 0x88 PMC
UMASK_BR_INST_EXEC_COND_TAKEN 0x81
UMASK_BR_INST_EXEC_COND_NON_TAKEN 0x41
UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN 0x82
+UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN 0x42
UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN 0x44
UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN 0x88
+UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN 0x48
UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN 0x50
UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
+UMASK_BR_INST_EXEC_ALL_CONDITIONAL 0xC1
+UMASK_BR_INST_EXEC_ALL_DIRECT_JMP 0xC2
+UMASK_BR_INST_EXEC_ALL_DIRECT_NEAR_CALL 0xD0
+UMASK_BR_INST_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET 0xC4
+UMASK_BR_INST_EXEC_ALL_INDIRECT_NEAR_RETURN 0xC8
UMASK_BR_INST_EXEC_ALL_BRANCHES 0xFF
EVENT_BR_MISP_EXEC 0x89 PMC
UMASK_BR_MISP_EXEC_COND_TAKEN 0x81
UMASK_BR_MISP_EXEC_COND_NON_TAKEN 0x41
UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN 0x44
UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN 0x88
+UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN 0x48
UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN 0x50
UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
+UMASK_BR_MISP_EXEC_ALL_CONDITIONAL 0xC1
+UMASK_BR_MISP_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET 0xC4
UMASK_BR_MISP_EXEC_ALL_BRANCHES 0xFF
EVENT_IDQ_UOPS_NOT_DELIVERED 0x9C PMC
UMASK_IDQ_UOPS_NOT_DELIVERED_CORE 0x01
-
-EVENT_UOPS_EXECUTED_PORT 0xA1 PMC
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x3
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x2
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK 0x01
+
+EVENT_UOPS_EXECUTED_PORT 0xA1 PMC
UMASK_UOPS_EXECUTED_PORT_PORT_0 0x01
UMASK_UOPS_EXECUTED_PORT_PORT_1 0x02
UMASK_UOPS_EXECUTED_PORT_PORT_2 0x04
@@ -228,44 +343,111 @@ UMASK_UOPS_EXECUTED_PORT_PORT_4 0x10
UMASK_UOPS_EXECUTED_PORT_PORT_5 0x20
UMASK_UOPS_EXECUTED_PORT_PORT_6 0x40
UMASK_UOPS_EXECUTED_PORT_PORT_7 0x80
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_0_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_0_CORE 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_1_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_1_CORE 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_2_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_2_CORE 0x04
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_3_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_3_CORE 0x08
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_4_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_4_CORE 0x10
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_5_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_5_CORE 0x20
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_6_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_6_CORE 0x40
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_7_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_7_CORE 0x80
+UMASK_UOPS_EXECUTED_PORT_ARITH_PORTS 0x63
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_ARITH_PORTS_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_ARITH_PORTS_CORE 0x63
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_DATA_PORTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_DATA_PORTS 0x9C
EVENT_RESOURCE_STALLS 0xA2 PMC
UMASK_RESOURCE_STALLS_ANY 0x01
UMASK_RESOURCE_STALLS_RS 0x04
-UMASK_RESOURCE_STALLS_SB 0x08
+UMASK_RESOURCE_STALLS_SB 0x08
UMASK_RESOURCE_STALLS_ROB 0x10
EVENT_CYCLE_ACTIVITY 0xA3 PMC
-UMASK_CYCLE_ACTIVITY_CYCLES_L2_PENDING 0x01
-UMASK_CYCLE_ACTIVITY_CYCLES_L2_PENDING_CYCLES 0x01 0x00 0x02
-UMASK_CYCLE_ACTIVITY_CYCLES_LDM_PENDING 0x02
-UMASK_CYCLE_ACTIVITY_CYCLES_LDM_PENDING_CYCLES 0x01 0x00 0x02
+# Errata HSW62: May be unreliable in SMT mode
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L2_PENDING EVENT_OPTION_THRESHOLD=0x1
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_PENDING 0x01
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_MEM_ANY EVENT_OPTION_THRESHOLD=0x1
+UMASK_CYCLE_ACTIVITY_CYCLES_MEM_ANY 0x02
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE EVENT_OPTION_THRESHOLD=0x4
+UMASK_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE 0x04
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L2_PENDING EVENT_OPTION_THRESHOLD=0x5
UMASK_CYCLE_ACTIVITY_STALLS_L2_PENDING 0x05
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_MEM_ANY EVENT_OPTION_THRESHOLD=0x6
+UMASK_CYCLE_ACTIVITY_STALLS_MEM_ANY 0x06
EVENT_CYCLE_ACTIVITY_CYCLES 0xA3 PMC2
-UMASK_CYCLE_ACTIVITY_CYCLES_L1D_PENDING 0x08
-UMASK_CYCLE_ACTIVITY_CYCLES_L1D_PENDING_CYCLES 0x08 0x00 0x08
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L1D_PENDING EVENT_OPTION_THRESHOLD=0x8
+UMASK_CYCLE_ACTIVITY_CYCLES_L1D_PENDING 0x08
EVENT_CYCLE_ACTIVITY_STALLS 0xA3 PMC2
-UMASK_CYCLE_ACTIVITY_STALLS_L1D_PENDING 0x0C
-UMASK_CYCLE_ACTIVITY_STALLS_L1D_PENDING_CYCLES 0x0C 0x00 0x0C
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L1D_PENDING EVENT_OPTION_THRESHOLD=0xC
+UMASK_CYCLE_ACTIVITY_STALLS_L1D_PENDING 0x0C
+
+EVENT_LSD_UOPS 0xA8 PMC
+UMASK_LSD_UOPS 0x01
+DEFAULT_OPTIONS_LSD_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_LSD_CYCLES_4_UOPS 0x01
+DEFAULT_OPTIONS_LSD_CYCLES_ACTIVE EVENT_OPTION_THRESHOLD=0x1
+UMASK_LSD_CYCLES_ACTIVE 0x01
-EVENT_LSD_UOPS 0xA8 PMC
-UMASK_LSD_UOPS 0x01
+EVENT_DSB2MITE_SWITCHES 0xAB PMC
+UMASK_DSB2MITE_SWITCHES_COUNT 0x01
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
-EVENT_ITLB 0xAE PMC
-UMASK_ITLB_ITLB_FLUSH 0x01
+EVENT_ITLB 0xAE PMC
+UMASK_ITLB_ITLB_FLUSH 0x01
-EVENT_OFFCORE_REQUESTS 0xB0 PMC
+EVENT_OFFCORE_REQUESTS 0xB0 PMC
UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD 0x01
UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD 0x02
UMASK_OFFCORE_REQUESTS_DEMAND_RFO 0x04
UMASK_OFFCORE_REQUESTS_ALL_DATA_RD 0x08
-EVENT_UOPS_EXECUTED 0xB1 PMC
-UMASK_UOPS_EXECUTED_CORE 0x02
-
-EVENT_PAGE_WALKER_LOADS 0xBC PMC
+EVENT_UOPS_EXECUTED 0xB1 PMC
+UMASK_UOPS_EXECUTED_THREAD 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0xB2 PMC
+UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0x01
+
+EVENT_PAGE_WALKER_LOADS 0xBC PMC
UMASK_PAGE_WALKER_LOADS_DTLB_L1 0x11
UMASK_PAGE_WALKER_LOADS_ITLB_L1 0x21
UMASK_PAGE_WALKER_LOADS_DTLB_L2 0x12
@@ -274,14 +456,25 @@ UMASK_PAGE_WALKER_LOADS_DTLB_L3 0x14
UMASK_PAGE_WALKER_LOADS_ITLB_L3 0x24
UMASK_PAGE_WALKER_LOADS_DTLB_MEMORY 0x18
UMASK_PAGE_WALKER_LOADS_ITLB_MEMORY 0x28
-
-EVENT_TLB_FLUSH 0xBD PMC
+UMASK_PAGE_WALKER_LOADS_EPT_DTLB_L1 0x41
+UMASK_PAGE_WALKER_LOADS_EPT_ITLB_L1 0x81
+UMASK_PAGE_WALKER_LOADS_EPT_DTLB_L2 0x42
+UMASK_PAGE_WALKER_LOADS_EPT_ITLB_L2 0x82
+UMASK_PAGE_WALKER_LOADS_EPT_DTLB_L3 0x44
+UMASK_PAGE_WALKER_LOADS_EPT_ITLB_L3 0x84
+UMASK_PAGE_WALKER_LOADS_EPT_DTLB_MEMORY 0x48
+UMASK_PAGE_WALKER_LOADS_EPT_ITLB_MEMORY 0x88
+
+EVENT_TLB_FLUSH 0xBD PMC
UMASK_TLB_FLUSH_DTLB_THREAD 0x01
UMASK_TLB_FLUSH_STLB_ANY 0x20
-EVENT_INST_RETIRED 0xC0 PMC1
+EVENT_INST_RETIRED_ANY 0xC0 PMC
UMASK_INST_RETIRED_ANY_P 0x00
+EVENT_INST_RETIRED_PREC 0xC0 PMC1
+UMASK_INST_RETIRED_PREC_DIST 0x01
+
EVENT_OTHER_ASSISTS 0xC1 PMC
UMASK_OTHER_ASSISTS_AVX_TO_SSE 0x08
UMASK_OTHER_ASSISTS_SSE_TO_AVX 0x10
@@ -289,9 +482,28 @@ UMASK_OTHER_ASSISTS_ANY_WB_ASSIST 0x40
EVENT_UOPS_RETIRED 0xC2 PMC
UMASK_UOPS_RETIRED_ALL 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL 0x01
UMASK_UOPS_RETIRED_RETIRE_SLOTS 0x02
-
-EVENT_MACHINE_CLEARS 0xC3 PMC
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_MACHINE_CLEARS 0xC3 PMC
+UMASK_MACHINE_CLEARS_CYCLES 0x01
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_MACHINE_CLEARS_COUNT 0x01
UMASK_MACHINE_CLEARS_MEMORY_ORDERING 0x02
UMASK_MACHINE_CLEARS_SMC 0x04
UMASK_MACHINE_CLEARS_MASKMOV 0x20
@@ -306,35 +518,63 @@ UMASK_BR_INST_RETIRED_NOT_TAKEN 0x10
UMASK_BR_INST_RETIRED_NEAR_TAKEN 0x20
UMASK_BR_INST_RETIRED_FAR_BRANCH 0x40
-EVENT_BR_MISP_RETIRED 0xC5 PMC
-UMASK_BR_MISP_RETIRED_ALL_BRANCHES_1 0x00
-UMASK_BR_MISP_RETIRED_CONDITIONAL 0x01
-UMASK_BR_MISP_RETIRED_ALL_BRANCHES_2 0x04
-UMASK_BR_MISP_RETIRED_NEAR_NOT_TAKEN 0x10
-UMASK_BR_MISP_RETIRED_NEAR_TAKEN 0x20
-
-EVENT_FP_ASSIST 0xCA PMC
-UMASK_FP_ASSIST_X87_OUTPUT 0x02
-UMASK_FP_ASSIST_X87_INPUT 0x04
-UMASK_FP_ASSIST_SIMD_OUTPUT 0x08
-UMASK_FP_ASSIST_SIMD_INPUT 0x10
+EVENT_BR_MISP_RETIRED 0xC5 PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL 0x01
+UMASK_BR_MISP_RETIRED_NEAR_NOT_TAKEN 0x10
+UMASK_BR_MISP_RETIRED_NEAR_TAKEN 0x20
+
+EVENT_AVX_INSTS 0xC6 PMC
+UMASK_AVX_INSTS_LOADS 0x01
+UMASK_AVX_INSTS_STORES 0x02
+UMASK_AVX_INSTS_CALC 0x04
+UMASK_AVX_INSTS_ALL 0x07
+
+
+EVENT_HLE_RETIRED 0xC8 PMC
+UMASK_HLE_RETIRED_START 0x01
+UMASK_HLE_RETIRED_COMMIT 0x02
+UMASK_HLE_RETIRED_ABORTED 0x04
+UMASK_HLE_RETIRED_ABORTED_MISC1 0x08
+UMASK_HLE_RETIRED_ABORTED_MISC2 0x10
+UMASK_HLE_RETIRED_ABORTED_MISC3 0x20
+# Errata HSW65: May overcount
+UMASK_HLE_RETIRED_ABORTED_MISC4 0x40
+UMASK_HLE_RETIRED_ABORTED_MISC5 0x80
+
+EVENT_RTM_RETIRED 0xC9 PMC
+UMASK_RTM_RETIRED_START 0x01
+UMASK_RTM_RETIRED_COMMIT 0x02
+UMASK_RTM_RETIRED_ABORTED 0x04
+UMASK_RTM_RETIRED_ABORTED_MISC1 0x08
+UMASK_RTM_RETIRED_ABORTED_MISC2 0x10
+UMASK_RTM_RETIRED_ABORTED_MISC3 0x20
+# Errata HSW65: May overcount
+UMASK_RTM_RETIRED_ABORTED_MISC4 0x40
+UMASK_RTM_RETIRED_ABORTED_MISC5 0x80
+
+
+EVENT_FP_ASSIST 0xCA PMC
+UMASK_FP_ASSIST_X87_OUTPUT 0x02
+UMASK_FP_ASSIST_X87_INPUT 0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT 0x08
+UMASK_FP_ASSIST_SIMD_INPUT 0x10
+DEFAULT_OPTIONS_FP_ASSIST_ANY EVENT_OPTION_THRESHOLD=0x1
UMASK_FP_ASSIST_ANY 0x1E
EVENT_ROB_MISC_EVENT_LBR_INSERTS 0xCC PMC
UMASK_ROB_MISC_EVENT_LBR_INSERTS 0x20
-EVENT_MEM_TRANS_RETIRED_LOAD_LAT 0xCD PMC
-UMASK_MEM_TRANS_RETIRED_LOAD_LATENCY 0x01
-
-EVENT_MEM_UOP_RETIRED 0xD0 PMC
-UMASK_MEM_UOP_RETIRED_LOADS 0x81
-UMASK_MEM_UOP_RETIRED_STORES 0x82
-UMASK_MEM_UOP_RETIRED_LOADS_STLB_MISS 0x11
-UMASK_MEM_UOP_RETIRED_STORES_STLB_MISS 0x12
-UMASK_MEM_UOP_RETIRED_LOADS_LOCK 0x21
-UMASK_MEM_UOP_RETIRED_STORES_LOCK 0x22
-UMASK_MEM_UOP_RETIRED_LOADS_SPLIT 0x41
-UMASK_MEM_UOP_RETIRED_STORES_SPLIT 0x42
+EVENT_MEM_UOPS_RETIRED 0xD0 PMC
+UMASK_MEM_UOPS_RETIRED_LOADS 0x81
+UMASK_MEM_UOPS_RETIRED_STORES 0x82
+UMASK_MEM_UOPS_RETIRED_ALL 0x83
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS 0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK 0x21
+UMASK_MEM_UOPS_RETIRED_STORES_LOCK 0x22
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT 0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT 0x42
EVENT_MEM_LOAD_UOPS_RETIRED 0xD1 PMC
UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT 0x01
@@ -347,6 +587,9 @@ UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT 0x04
UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS 0x20
UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL 0x24
UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB 0x40
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL_MISS 0x38
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL_HIT 0x07
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL_ALL 0x3F
EVENT_MEM_LOAD_UOPS_L3_HIT_RETIRED 0xD2 PMC
UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_MISS 0x01
@@ -354,8 +597,8 @@ UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HIT 0x02
UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM 0x04
UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_NONE 0x08
-EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED 0xD3 PMC
-UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM 0x01
+EVENT_MEM_LOAD_UOPS_L3_MISS_RETIRED 0xD3 PMC
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_LOCAL_DRAM 0x01
EVENT_BACLEARS 0xE6 PMC
UMASK_BACLEARS_ANY 0x1F
@@ -364,55 +607,113 @@ EVENT_L2_TRANS 0xF0 PMC
UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
UMASK_L2_TRANS_RFO 0x02
UMASK_L2_TRANS_CODE_RD 0x04
-UMASK_L2_TRANS_ALL_PREF 0x08
+UMASK_L2_TRANS_ALL_PF 0x08
UMASK_L2_TRANS_L1D_WB 0x10
UMASK_L2_TRANS_L2_FILL 0x20
UMASK_L2_TRANS_L2_WB 0x40
UMASK_L2_TRANS_ALL_REQUESTS 0x80
EVENT_L2_LINES_IN 0xF1 PMC
-UMASK_L2_LINES_IN_I 0x01
-UMASK_L2_LINES_IN_S 0x02
-UMASK_L2_LINES_IN_E 0x04
+UMASK_L2_LINES_IN_I 0x01
+UMASK_L2_LINES_IN_S 0x02
+UMASK_L2_LINES_IN_E 0x04
UMASK_L2_LINES_IN_ALL 0x07
EVENT_L2_LINES_OUT 0xF2 PMC
UMASK_L2_LINES_OUT_DEMAND_CLEAN 0x05
UMASK_L2_LINES_OUT_DEMAND_DIRTY 0x06
-EVENT_TX_MEM_ABORT_CONFLICT 0x54 PMC
-UMASK_TX_MEM_ABORT_CONFLICT 0x01
-UMASK_TX_MEM_ABORT_CAPACITY 0x02
-UMASK_TX_MEM_ABORT_HLE_STORE_TO_ELIDED_LOCK 0x04
-UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_NOT_EMPTY 0x08
-UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_MISMATCH 0x10
-UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_UNSUPP_ALIGNMENT 0x20
-UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_FULL 0x40
-
-EVENT_TX_EXEC 0x5D PMC
-UMASK_TX_EXEC_MISC1 0x01
-UMASK_TX_EXEC_MISC2 0x02
-UMASK_TX_EXEC_MISC3 0x04
-UMASK_TX_EXEC_MISC4 0x08
-UMASK_TX_EXEC_MISC5 0x10
-
-
-EVENT_HLE_RETIRED 0xC8 PMC
-UMASK_HLE_RETIRED_START 0x01
-UMASK_HLE_RETIRED_COMMIT 0x02
-UMASK_HLE_RETIRED_ABORTED 0x04
-UMASK_HLE_RETIRED_ABORTED_MISC1 0x08
-UMASK_HLE_RETIRED_ABORTED_MISC2 0x10
-UMASK_HLE_RETIRED_ABORTED_MISC3 0x20
-UMASK_HLE_RETIRED_ABORTED_MISC4 0x40
-UMASK_HLE_RETIRED_ABORTED_MISC5 0x80
-
-EVENT_RTM_RETIRED 0xC9 PMC
-UMASK_RTM_RETIRED_START 0x01
-UMASK_RTM_RETIRED_COMMIT 0x02
-UMASK_RTM_RETIRED_ABORTED 0x04
-UMASK_RTM_RETIRED_ABORTED_MISC1 0x08
-UMASK_RTM_RETIRED_ABORTED_MISC2 0x10
-UMASK_RTM_RETIRED_ABORTED_MISC3 0x20
-UMASK_RTM_RETIRED_ABORTED_MISC4 0x40
-UMASK_RTM_RETIRED_ABORTED_MISC5 0x80
+EVENT_OFFCORE_RESPONSE_0 0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1 0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_CACHE_LOOKUP 0x34 CBOX
+UMASK_CACHE_LOOKUP_M 0x01
+UMASK_CACHE_LOOKUP_E 0x02
+UMASK_CACHE_LOOKUP_S 0x04
+UMASK_CACHE_LOOKUP_I 0x08
+UMASK_CACHE_LOOKUP_READ_FILTER 0x10
+UMASK_CACHE_LOOKUP_WRITE_FILTER 0x20
+UMASK_CACHE_LOOKUP_EXTSNP_FILTER 0x40
+UMASK_CACHE_LOOKUP_ANY_REQUEST_FILTER 0x80
+UMASK_CACHE_LOOKUP_READ_M 0x11
+UMASK_CACHE_LOOKUP_WRITE_M 0x21
+UMASK_CACHE_LOOKUP_EXTSNP_M 0x41
+UMASK_CACHE_LOOKUP_ANY_M 0x81
+UMASK_CACHE_LOOKUP_READ_E 0x12
+UMASK_CACHE_LOOKUP_WRITE_E 0x22
+UMASK_CACHE_LOOKUP_EXTSNP_E 0x42
+UMASK_CACHE_LOOKUP_ANY_E 0x82
+UMASK_CACHE_LOOKUP_READ_S 0x14
+UMASK_CACHE_LOOKUP_WRITE_S 0x24
+UMASK_CACHE_LOOKUP_EXTSNP_S 0x44
+UMASK_CACHE_LOOKUP_ANY_S 0x84
+UMASK_CACHE_LOOKUP_READ_ES 0x16
+UMASK_CACHE_LOOKUP_WRITE_ES 0x26
+UMASK_CACHE_LOOKUP_EXTSNP_ES 0x46
+UMASK_CACHE_LOOKUP_ANY_ES 0x86
+UMASK_CACHE_LOOKUP_READ_I 0x18
+UMASK_CACHE_LOOKUP_WRITE_I 0x28
+UMASK_CACHE_LOOKUP_EXTSNP_I 0x48
+UMASK_CACHE_LOOKUP_ANY_I 0x88
+UMASK_CACHE_LOOKUP_READ_MESI 0x1F
+UMASK_CACHE_LOOKUP_WRITE_MESI 0x2F
+UMASK_CACHE_LOOKUP_EXTSNP_MESI 0x4F
+UMASK_CACHE_LOOKUP_ANY_MESI 0x8F
+
+EVENT_XSNP_RESPONSE 0x22 CBOX
+UMASK_XSNP_RESPONSE_MISS_EXTERNAL 0x21
+UMASK_XSNP_RESPONSE_MISS_XCORE 0x41
+UMASK_XSNP_RESPONSE_MISS_EVICTION 0x81
+UMASK_XSNP_RESPONSE_HIT_EXTERNAL 0x24
+UMASK_XSNP_RESPONSE_HIT_XCORE 0x44
+UMASK_XSNP_RESPONSE_HIT_EVICTION 0x84
+UMASK_XSNP_RESPONSE_HITM_EXTERNAL 0x28
+UMASK_XSNP_RESPONSE_HITM_XCORE 0x48
+UMASK_XSNP_RESPONSE_HITM_EVICTION 0x88
+
+EVENT_TRK_OCCUPANCY_ALL 0x80 UBOX0
+UMASK_TRK_OCCUPANCY_ALL 0x01
+
+EVENT_TRK_REQUESTS 0x81 UBOX
+UMASK_TRK_REQUESTS_ALL 0x01
+UMASK_TRK_REQUESTS_WRITES 0x20
+
+EVENT_COH_TRK_OCCUPANCY 0x83 UBOX0
+UMASK_COH_TRK_OCCUPANCY 0x01
+
+EVENT_COH_TRK_REQUESTS 0x84 UBOX
+UMASK_COH_TRK_REQUESTS_ALL 0x01
+
+EVENT_UNCORE_CLOCK 0x00 UBOXFIX
+UMASK_UNCORE_CLOCK 0x01
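
Most of the new Haswell entries above only add a DEFAULT_OPTIONS_* line (threshold, invert, edge, anythread) on top of an existing event/umask pair. The sketch below shows how such a line would be folded into an IA32_PERFEVTSELx value using the architectural bit layout (event select in bits 7:0, umask in 15:8, USR=16, EDGE=18, ANYTHREAD=21, EN=22, INV=23, CMASK in 31:24); it illustrates the encoding only and is not the likwid setup code.

/* Minimal sketch (not likwid code): building an IA32_PERFEVTSELx value
 * from an event line and its DEFAULT_OPTIONS. */
#include <stdint.h>
#include <stdio.h>

static uint64_t perfevtsel(uint8_t event, uint8_t umask,
                           int edge, int anythread, int invert, uint8_t cmask)
{
    uint64_t v = 0;
    v |= (uint64_t)event;             /* event select             */
    v |= (uint64_t)umask << 8;        /* unit mask                */
    v |= 1ULL << 16;                  /* USR: count in user mode  */
    v |= 1ULL << 22;                  /* EN: enable the counter   */
    if (edge)      v |= 1ULL << 18;
    if (anythread) v |= 1ULL << 21;
    if (invert)    v |= 1ULL << 23;
    v |= (uint64_t)cmask << 24;       /* counter mask (threshold) */
    return v;
}

int main(void)
{
    /* UOPS_ISSUED_STALL_CYCLES: event 0x0E, umask 0x01,
     * EVENT_OPTION_THRESHOLD=0x1, EVENT_OPTION_INVERT=1  ->  0x1c1010e */
    printf("0x%llx\n", (unsigned long long)perfevtsel(0x0E, 0x01, 0, 0, 1, 0x1));
    return 0;
}
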
diff --git a/src/includes/perfmon_interlagos.h b/src/includes/perfmon_interlagos.h
index d28bb18..b922ce2 100644
--- a/src/includes/perfmon_interlagos.h
+++ b/src/includes/perfmon_interlagos.h
@@ -5,13 +5,14 @@
*
* Description: Header file of perfmon module for AMD Interlagos
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -29,219 +30,273 @@
*/
#include <perfmon_interlagos_events.h>
-#include <perfmon_interlagos_groups.h>
#include <perfmon_interlagos_counters.h>
+#include <error.h>
static int perfmon_numCountersInterlagos = NUM_COUNTERS_INTERLAGOS;
-static int perfmon_numGroupsInterlagos = NUM_GROUPS_INTERLAGOS;
static int perfmon_numArchEventsInterlagos = NUM_ARCH_EVENTS_INTERLAGOS;
-void perfmon_init_interlagos(PerfmonThread *thread)
+int perfmon_init_interlagos(int cpu_id)
+{
+ lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+ return 0;
+}
+
+int ilg_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
{
uint64_t flags = 0x0ULL;
- int cpu_id = thread->processorId;
-
- msr_write(cpu_id, MSR_AMD15_PERFEVTSEL0, 0x0ULL);
- msr_write(cpu_id, MSR_AMD15_PERFEVTSEL1, 0x0ULL);
- msr_write(cpu_id, MSR_AMD15_PERFEVTSEL2, 0x0ULL);
- msr_write(cpu_id, MSR_AMD15_PERFEVTSEL3, 0x0ULL);
- msr_write(cpu_id, MSR_AMD15_PERFEVTSEL4, 0x0ULL);
- msr_write(cpu_id, MSR_AMD15_PERFEVTSEL5, 0x0ULL);
-
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
- lock_acquire(
- (int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id)
- )
+
+ flags |= (1ULL<<16);
+ flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
+
+ if (event->numberOfOptions > 0)
+ {
+ for(int j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<17);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ if ((event->options[j].value & 0xFFULL) < 0x20)
+ {
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
{
- msr_write(cpu_id, MSR_AMD15_NB_PERFEVTSEL0, 0x0ULL);
- msr_write(cpu_id, MSR_AMD15_NB_PERFEVTSEL1, 0x0ULL);
- msr_write(cpu_id, MSR_AMD15_NB_PERFEVTSEL2, 0x0ULL);
- msr_write(cpu_id, MSR_AMD15_NB_PERFEVTSEL3, 0x0ULL);
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
}
-
- //flags |= (1<<16); /* user mode flag */
- /*msr_write(cpu_id, MSR_AMD15_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_AMD15_PERFEVTSEL1, flags);
- msr_write(cpu_id, MSR_AMD15_PERFEVTSEL2, flags);
- msr_write(cpu_id, MSR_AMD15_PERFEVTSEL3, flags);
- msr_write(cpu_id, MSR_AMD15_PERFEVTSEL4, flags);
- msr_write(cpu_id, MSR_AMD15_PERFEVTSEL5, flags);*/
+ return 0;
}
-
-void perfmon_setupCounterThread_interlagos(
- int thread_id,
- PerfmonEvent* event,
- PerfmonCounterIndex index)
+int ilg_uncore_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
{
- uint64_t flags;
- uint64_t reg = interlagos_counter_map[index].configRegister;
- int cpu_id = perfmon_threadData[thread_id].processorId;
- perfmon_threadData[thread_id].counters[index].init = TRUE;
-
- /* only one thread accesses Uncore */
- if ( (interlagos_counter_map[index].type == UNCORE) &&
- !(socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) )
+ uint64_t flags = 0x0ULL;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
{
- return;
+ return 0;
}
- flags = (1<<16);
- /* AMD uses a 12 bit Event mask: [35:32][7:0] */
flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
-
- if (perfmon_verbose)
+ if (flags != currentConfig[cpu_id][index])
{
- printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
- cpu_id,
- LLU_CAST reg,
- LLU_CAST flags);
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_UNCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
}
-
- msr_write(cpu_id, reg , flags);
+ return 0;
}
-void perfmon_startCountersThread_interlagos(int thread_id)
+int perfmon_setupCounterThread_interlagos(int thread_id, PerfmonEventSet* eventSet)
{
int haveLock = 0;
- uint64_t flags;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
- for ( int i=0; i<NUM_COUNTERS_INTERLAGOS; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- if (interlagos_counter_map[i].type == PMC)
- {
- msr_write(cpu_id, interlagos_counter_map[i].counterRegister , 0x0ULL);
- flags = msr_read(cpu_id, interlagos_counter_map[i].configRegister);
- flags |= (1<<22); /* enable flag */
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ eventSet->events[i].threadCounter[thread_id].init = TRUE;
+ switch(type)
+ {
+ case PMC:
+ ilg_pmc_setup(cpu_id, index, event);
+ break;
+ case UNCORE:
+ ilg_uncore_setup(cpu_id, index, event);
+ break;
+ default:
+ break;
+ }
+ }
+ return 0;
+}
- if (perfmon_verbose)
- {
- printf("perfmon_start_counters: Write Register 0x%llX , Flags: 0x%llX \n",
- LLU_CAST interlagos_counter_map[i].configRegister,
- LLU_CAST flags);
- }
- msr_write(cpu_id, interlagos_counter_map[i].configRegister , flags);
+int perfmon_startCountersThread_interlagos(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ uint64_t flags = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
}
- else if ( interlagos_counter_map[i].type == UNCORE )
+ RegisterIndex index = eventSet->events[i].index;
+ uint32_t counter = counter_map[index].counterRegister;
+ uint32_t reg = counter_map[index].configRegister;
+ eventSet->events[i].threadCounter[thread_id].startData = 0;
+ eventSet->events[i].threadCounter[thread_id].counterData = 0;
+ if (type == PMC || ((type == UNCORE) && (haveLock)))
{
- if(haveLock)
- {
- msr_write(cpu_id, interlagos_counter_map[i].counterRegister , 0x0ULL);
- flags = msr_read(cpu_id, interlagos_counter_map[i].configRegister);
- flags |= (1<<22); /* enable flag */
-
- if (perfmon_verbose)
- {
- printf("perfmon_start_counters: Write Register 0x%llX , Flags: 0x%llX \n",
- LLU_CAST interlagos_counter_map[i].configRegister,
- LLU_CAST flags);
- }
-
- msr_write(cpu_id, interlagos_counter_map[i].configRegister , flags);
- }
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+ flags |= (1<<22); /* enable flag */
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
}
}
}
+ return 0;
}
-void perfmon_stopCountersThread_interlagos(int thread_id)
+int perfmon_stopCountersThread_interlagos(int thread_id, PerfmonEventSet* eventSet)
{
- uint64_t flags;
+ uint64_t flags = 0x0ULL;
int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ uint64_t tmp;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
- for ( int i=0; i<NUM_COUNTERS_INTERLAGOS; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- if ( interlagos_counter_map[i].type == PMC )
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- flags = msr_read(cpu_id,interlagos_counter_map[i].configRegister);
- flags &= ~(1<<22); /* clear enable flag */
- msr_write(cpu_id, interlagos_counter_map[i].configRegister , flags);
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, interlagos_counter_map[i].counterRegister);
-
- if (perfmon_verbose)
- {
- printf("perfmon_stop_counters: Write Register 0x%llX , Flags: 0x%llX \n",
- LLU_CAST interlagos_counter_map[i].configRegister,
- LLU_CAST flags);
- printf("perfmon_stop_counters: Read Register 0x%llX , Flags: 0x%llX \n",
- LLU_CAST interlagos_counter_map[i].counterRegister,
- LLU_CAST perfmon_threadData[thread_id].counters[i].counterData);
- }
-
+ continue;
}
- else if (interlagos_counter_map[i].type == UNCORE)
+ tmp = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint32_t counter = counter_map[index].counterRegister;
+ uint32_t reg = counter_map[index].configRegister;
+ switch (type)
{
- if(haveLock)
- {
- flags = msr_read(cpu_id, interlagos_counter_map[i].configRegister);
+ case PMC:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
flags &= ~(1<<22); /* clear enable flag */
- msr_write(cpu_id, interlagos_counter_map[i].configRegister , flags);
-
- if (perfmon_verbose)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
+ break;
+ case UNCORE:
+ if (haveLock)
{
- printf("perfmon_stop_counters: Write Register 0x%llX , Flags: 0x%llX \n",
- LLU_CAST interlagos_counter_map[i].configRegister,
- LLU_CAST flags);
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+ flags &= ~(1<<22); /* clear enable flag */
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
}
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, interlagos_counter_map[i].counterRegister);
- }
+ break;
+ default:
+ break;
}
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(tmp, 0, box_map[type].regWidth);
}
}
+ return 0;
}
-void perfmon_readCountersThread_interlagos(int thread_id)
+int perfmon_readCountersThread_interlagos(int thread_id, PerfmonEventSet* eventSet)
{
int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ uint64_t tmp;
if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
-
- for (int i=0;i<NUM_COUNTERS_INTERLAGOS;i++)
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- if ( interlagos_counter_map[i].type == UNCORE )
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- if ( haveLock )
- {
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, interlagos_counter_map[i].counterRegister);
- }
+ continue;
}
- else
+ tmp = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint32_t counter = counter_map[index].counterRegister;
+ switch (type)
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, interlagos_counter_map[i].counterRegister);
+ case PMC:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
+ VERBOSEPRINTREG(cpu_id, counter, LLU_CAST tmp, READ_PMC);
+ break;
+ case UNCORE:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
+ VERBOSEPRINTREG(cpu_id, counter, LLU_CAST tmp, READ_UNCORE);
+ break;
+ default:
+ break;
}
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(tmp, 0, box_map[type].regWidth);
}
}
+ return 0;
}
+
+int perfmon_finalizeCountersThread_interlagos(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ uint32_t reg = counter_map[index].configRegister;
+ if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock))))
+ {
+ VERBOSEPRINTREG(cpu_id, reg, LLU_CAST 0x0ULL, CLEAR_CTRL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+ }
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
+ }
+ return 0;
+}
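
The packing in ilg_pmc_setup()/ilg_uncore_setup() above splits the 12-bit AMD event ID across the PERF_CTL register: bits [7:0] hold EventSelect[7:0], bits [35:32] hold EventSelect[11:8], and the unit mask sits in bits [15:8]. A small standalone sketch of just that encoding, with values chosen for illustration:

/* Standalone sketch of the 12-bit AMD family 15h event encoding used in
 * the setup routines above. */
#include <stdint.h>
#include <stdio.h>

static uint64_t amd15_encode(uint16_t event_id, uint8_t umask)
{
    uint64_t flags = 0;
    flags |= (uint64_t)(event_id >> 8) << 32;    /* EventSelect[11:8] */
    flags |= (uint64_t)umask << 8;               /* UnitMask          */
    flags |= event_id & ~0xF00U;                 /* EventSelect[7:0]  */
    return flags;
}

int main(void)
{
    /* UNC_LINK_TRANSMIT_BW_L3 (event 0x1F9) with umask 0x37 */
    uint64_t f = amd15_encode(0x1F9, 0x37);
    printf("0x%09llx\n", (unsigned long long)f);  /* 0x1000037f9 */
    return 0;
}
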
diff --git a/src/includes/perfmon_interlagos_counters.h b/src/includes/perfmon_interlagos_counters.h
index a593f5a..5f7ac2f 100644
--- a/src/includes/perfmon_interlagos_counters.h
+++ b/src/includes/perfmon_interlagos_counters.h
@@ -5,13 +5,14 @@
*
* Description: Counter Header File of perfmon module for AMD Interlagos
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -31,18 +32,24 @@
#define NUM_COUNTERS_INTERLAGOS 10
#define NUM_COUNTERS_CORE_INTERLAGOS 6
-static PerfmonCounterMap interlagos_counter_map[NUM_COUNTERS_INTERLAGOS] = {
+#define ILG_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD
+
+static RegisterMap interlagos_counter_map[NUM_COUNTERS_INTERLAGOS] = {
/* Core counters */
- {"PMC0",PMC0, PMC, MSR_AMD15_PERFEVTSEL0, MSR_AMD15_PMC0, 0, 0},
- {"PMC1",PMC1, PMC, MSR_AMD15_PERFEVTSEL1, MSR_AMD15_PMC1, 0, 0},
- {"PMC2",PMC2, PMC, MSR_AMD15_PERFEVTSEL2, MSR_AMD15_PMC2, 0, 0},
- {"PMC3",PMC3, PMC, MSR_AMD15_PERFEVTSEL3, MSR_AMD15_PMC3, 0, 0},
- {"PMC4",PMC4, PMC, MSR_AMD15_PERFEVTSEL4, MSR_AMD15_PMC4, 0, 0},
- {"PMC5",PMC5, PMC, MSR_AMD15_PERFEVTSEL5, MSR_AMD15_PMC5, 0, 0},
+ {"PMC0",PMC0, PMC, MSR_AMD15_PERFEVTSEL0, MSR_AMD15_PMC0, 0, 0, ILG_VALID_OPTIONS_PMC},
+ {"PMC1",PMC1, PMC, MSR_AMD15_PERFEVTSEL1, MSR_AMD15_PMC1, 0, 0, ILG_VALID_OPTIONS_PMC},
+ {"PMC2",PMC2, PMC, MSR_AMD15_PERFEVTSEL2, MSR_AMD15_PMC2, 0, 0, ILG_VALID_OPTIONS_PMC},
+ {"PMC3",PMC3, PMC, MSR_AMD15_PERFEVTSEL3, MSR_AMD15_PMC3, 0, 0, ILG_VALID_OPTIONS_PMC},
+ {"PMC4",PMC4, PMC, MSR_AMD15_PERFEVTSEL4, MSR_AMD15_PMC4, 0, 0, ILG_VALID_OPTIONS_PMC},
+ {"PMC5",PMC5, PMC, MSR_AMD15_PERFEVTSEL5, MSR_AMD15_PMC5, 0, 0, ILG_VALID_OPTIONS_PMC},
/* Northbridge counters */
{"UPMC0",PMC6, UNCORE, MSR_AMD15_NB_PERFEVTSEL0, MSR_AMD15_NB_PMC0, 0, 0},
- {"UPMC1",PMC7, UNCORE, MSR_AMD15_NB_PERFEVTSEL0, MSR_AMD15_NB_PMC0, 0, 0},
- {"UPMC2",PMC8, UNCORE, MSR_AMD15_NB_PERFEVTSEL0, MSR_AMD15_NB_PMC0, 0, 0},
- {"UPMC3",PMC9, UNCORE, MSR_AMD15_NB_PERFEVTSEL0, MSR_AMD15_NB_PMC0, 0, 0}
+ {"UPMC1",PMC7, UNCORE, MSR_AMD15_NB_PERFEVTSEL1, MSR_AMD15_NB_PMC1, 0, 0},
+ {"UPMC2",PMC8, UNCORE, MSR_AMD15_NB_PERFEVTSEL2, MSR_AMD15_NB_PMC2, 0, 0},
+ {"UPMC3",PMC9, UNCORE, MSR_AMD15_NB_PERFEVTSEL3, MSR_AMD15_NB_PMC3, 0, 0}
};
+static BoxMap interlagos_box_map[NUM_UNITS] = {
+ [PMC] = {0, 0, 0, 0, 0, 0, 48},
+ [UNCORE] = {0, 0, 0, 0, 0, 0, 48},
+};
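
The regWidth field added at the end of each BoxMap entry (48 bit here, 44 bit for the Haswell uncore boxes) is what the stop/read paths above rely on when they call field64(tmp, 0, box_map[type].regWidth). A minimal sketch of the equivalent masking; the helper name and values are illustrative, only the truncation mirrors that call:

/* Simplified stand-in for the regWidth handling: keep only the low
 * reg_width bits of a raw 64-bit MSR read. */
#include <stdint.h>
#include <stdio.h>

static uint64_t counter_value(uint64_t raw, int reg_width)
{
    return raw & ((1ULL << reg_width) - 1);
}

int main(void)
{
    uint64_t raw = 0xFFFF123456789ABCULL;     /* made-up raw MSR read */
    printf("48 bit: 0x%llx\n", (unsigned long long)counter_value(raw, 48));
    printf("44 bit: 0x%llx\n", (unsigned long long)counter_value(raw, 44));
    return 0;
}
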
diff --git a/src/includes/perfmon_interlagos_events.txt b/src/includes/perfmon_interlagos_events.txt
index 1fa0a44..3a79497 100644
--- a/src/includes/perfmon_interlagos_events.txt
+++ b/src/includes/perfmon_interlagos_events.txt
@@ -1,16 +1,17 @@
# =======================================================================================
-#
+#
# Filename: perfmon_interlagos_events.txt
-#
+#
# Description: Event list for AMD Interlagos
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -131,17 +132,23 @@ EVENT_UNIFIED_TLB_HIT 0x45 PMC0|PMC1|PMC2
UMASK_UNIFIED_TLB_HIT_4KB_DATA 0x01
UMASK_UNIFIED_TLB_HIT_2MB_DATA 0x02
UMASK_UNIFIED_TLB_HIT_1GB_DATA 0x04
+UMASK_UNIFIED_TLB_HIT_ANY_DATA 0x07
UMASK_UNIFIED_TLB_HIT_4KB_INSTR 0x10
UMASK_UNIFIED_TLB_HIT_2MB_INSTR 0x20
UMASK_UNIFIED_TLB_HIT_1GB_INSTR 0x40
+UMASK_UNIFIED_TLB_HIT_ANY_INSTR 0x70
+UMASK_UNIFIED_TLB_HIT_ANY 0x77
EVENT_UNIFIED_TLB_MISS 0x46 PMC0|PMC1|PMC2
UMASK_UNIFIED_TLB_MISS_4KB_DATA 0x01
UMASK_UNIFIED_TLB_MISS_2MB_DATA 0x02
UMASK_UNIFIED_TLB_MISS_1GB_DATA 0x04
+UMASK_UNIFIED_TLB_MISS_ANY_DATA 0x07
UMASK_UNIFIED_TLB_MISS_4KB_INSTR 0x10
UMASK_UNIFIED_TLB_MISS_2MB_INSTR 0x20
UMASK_UNIFIED_TLB_MISS_1GB_INSTR 0x40
+UMASK_UNIFIED_TLB_MISS_ANY_INSTR 0x70
+UMASK_UNIFIED_TLB_MISS_ANY 0x77
EVENT_MISALIGNED_ACCESS 0x47 PMC
UMASK_MISALIGNED_ACCESS 0x00
@@ -230,6 +237,7 @@ EVENT_ITLB_L1_MISS_L2_MISS 0x085 PMC0|PMC1|PMC2
UMASK_ITLB_L1_MISS_L2_MISS_4KB 0x01
UMASK_ITLB_L1_MISS_L2_MISS_2MB 0x02
UMASK_ITLB_L1_MISS_L2_MISS_1GB 0x04
+UMASK_ITLB_L1_MISS_L2_MISS_ANY 0x07
EVENT_PIPELINE_RESTART_DUE_TO_ISB 0x086 PMC0|PMC1|PMC2
UMASK_PIPELINE_RESTART_DUE_TO_ISB 0x00
@@ -387,6 +395,14 @@ UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_REMOTE_IO_MEM 0x92
UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_REMOTE_IO_IO 0x91
UMASK_UNC_CPU_REQUEST_TO_MEMORY_REMOTE_LOCAL_CPU_IO 0x64
UMASK_UNC_CPU_REQUEST_TO_MEMORY_REMOTE_LOCAL_IO_IO 0x61
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_ANY_CPU_MEM 0xB8
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_ANY_CPU_IO 0xB4
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_ANY_IO_MEM 0xB2
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_ANY_IO_IO 0xA1
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_ANY_LOCAL_CPU_IO 0xE4
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_ANY_LOCAL_IO_IO 0xE1
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_ANY_ANY_CPU_IO 0xF4
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_ANY_ANY_IO_IO 0xF1
EVENT_UNC_CACHE_BLOCK_COMMANDS 0x0EA UPMC
UMASK_UNC_CACHE_BLOCK_COMMANDS_VICTIM_BLOCK 0x01
@@ -420,21 +436,97 @@ UMASK_UNC_GART_EVENTS_MISS 0x04
UMASK_UNC_GART_EVENTS_REQUEST_WALK 0x08
UMASK_UNC_GART_EVENTS_MULTIPLE_WALK 0x80
-EVENT_UNC_LINK_TRANSMIT_BW_L0 0x0F6 UPMC
-UMASK_UNC_LINK_TRANSMIT_BW_L0_USE 0x17
+EVENT_UNC_LINK_TRANSMIT_BW_L0 0x0F6 UPMC
+UMASK_UNC_LINK_TRANSMIT_BW_L0_USE 0x37
UMASK_UNC_LINK_TRANSMIT_BW_L0_NOP 0x08
-
-EVENT_UNC_LINK_TRANSMIT_BW_L1 0x0F7 UPMC
-UMASK_UNC_LINK_TRANSMIT_BW_L1_USE 0x17
+UMASK_UNC_LINK_TRANSMIT_BW_L0_CMDS 0x00
+UMASK_UNC_LINK_TRANSMIT_BW_L0_DATA 0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L0_BUF_REL 0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L0_ADDR 0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L0_CRC 0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_USE 0x37
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_NOP 0x08
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_CMDS 0x00
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_DATA 0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_BUF_REL 0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_ADDR 0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_CRC 0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_USE 0xB7
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_NOP 0x88
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_CMDS 0x80
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_DATA 0x81
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_BUF_REL 0x82
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_ADDR 0x90
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_CRC 0xA0
+
+EVENT_UNC_LINK_TRANSMIT_BW_L1 0x0F7 UPMC
+UMASK_UNC_LINK_TRANSMIT_BW_L1_USE 0x37
UMASK_UNC_LINK_TRANSMIT_BW_L1_NOP 0x08
-
-EVENT_UNC_LINK_TRANSMIT_BW_L2 0x0F8 UPMC
-UMASK_UNC_LINK_TRANSMIT_BW_L2_USE 0x17
+UMASK_UNC_LINK_TRANSMIT_BW_L1_CMDS 0x00
+UMASK_UNC_LINK_TRANSMIT_BW_L1_DATA 0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L1_BUF_REL 0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L1_ADDR 0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L1_CRC 0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_USE 0x37
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_NOP 0x08
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_CMDS 0x00
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_DATA 0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_BUF_REL 0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_ADDR 0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_CRC 0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_USE 0xB7
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_NOP 0x88
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_CMDS 0x80
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_DATA 0x81
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_BUF_REL 0x82
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_ADDR 0x90
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_CRC 0xA0
+
+EVENT_UNC_LINK_TRANSMIT_BW_L2 0x0F8 UPMC
+UMASK_UNC_LINK_TRANSMIT_BW_L2_USE 0x37
UMASK_UNC_LINK_TRANSMIT_BW_L2_NOP 0x08
-
-EVENT_UNC_LINK_TRANSMIT_BW_L3 0x1F9 UPMC
-UMASK_UNC_LINK_TRANSMIT_BW_L3_USE 0x17
+UMASK_UNC_LINK_TRANSMIT_BW_L2_CMDS 0x00
+UMASK_UNC_LINK_TRANSMIT_BW_L2_DATA 0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L2_BUF_REL 0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L2_ADDR 0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L2_CRC 0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_USE 0x37
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_NOP 0x08
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_CMDS 0x00
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_DATA 0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_BUF_REL 0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_ADDR 0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_CRC 0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_USE 0xB7
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_NOP 0x88
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_CMDS 0x80
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_DATA 0x81
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_BUF_REL 0x82
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_ADDR 0x90
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_CRC 0xA0
+
+EVENT_UNC_LINK_TRANSMIT_BW_L3 0x1F9 UPMC
+UMASK_UNC_LINK_TRANSMIT_BW_L3_USE 0x37
UMASK_UNC_LINK_TRANSMIT_BW_L3_NOP 0x08
+UMASK_UNC_LINK_TRANSMIT_BW_L3_CMDS 0x00
+UMASK_UNC_LINK_TRANSMIT_BW_L3_DATA 0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L3_BUF_REL 0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L3_ADDR 0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L3_CRC 0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_USE 0x37
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_NOP 0x08
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_CMDS 0x00
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_DATA 0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_BUF_REL 0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_ADDR 0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_CRC 0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_USE 0xB7
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_NOP 0x88
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_CMDS 0x80
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_DATA 0x81
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_BUF_REL 0x82
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_ADDR 0x90
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_CRC 0xA0
EVENT_UNC_CPU_TO_DRAM 0x1E0 UPMC
UMASK_UNC_CPU_TO_DRAM_LOCAL_TO_0 0x01
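
To make the event-list format above concrete: each "EVENT_<name> <id> <allowed counters>" line is followed by one or more "UMASK_<name>_<variant> <umask>" lines. The small C sketch below parses two such lines; the parser itself is an illustrative assumption (likwid turns these files into C headers at build time), only the line layout and the 0x45/0x77 values are taken from the file above.

/*
 * Sketch only: an assumed parser for the line layout shown above, not
 * likwid's build-time generator. It pulls the event id, the allowed
 * counters and a umask out of the two example lines.
 */
#include <stdio.h>

int main(void)
{
    const char *lines[] = {
        "EVENT_UNIFIED_TLB_HIT 0x45 PMC0|PMC1|PMC2",
        "UMASK_UNIFIED_TLB_HIT_ANY 0x77",
    };
    unsigned eventId = 0, umask = 0;
    char name[64] = "", limit[64] = "";

    for (size_t i = 0; i < sizeof(lines) / sizeof(lines[0]); i++)
    {
        if (sscanf(lines[i], "EVENT_%63s 0x%x %63s", name, &eventId, limit) == 3)
            printf("event %s: id=0x%02X usable on %s\n", name, eventId, limit);
        else if (sscanf(lines[i], "UMASK_%63s 0x%x", name, &umask) == 2)
            printf("  umask %s = 0x%02X\n", name, umask);
    }
    /* The setup code later in this patch combines the two values as
     * (umask << 8) | eventId when writing the event-select register. */
    return 0;
}
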
diff --git a/src/includes/perfmon_ivybridge.h b/src/includes/perfmon_ivybridge.h
index 0615c27..19e03d9 100644
--- a/src/includes/perfmon_ivybridge.h
+++ b/src/includes/perfmon_ivybridge.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_ivybridge.h
*
- * Description: Header File of perfmon module for Ivy Bridge.
+ * Description: Header File of perfmon module for Intel Ivy Bridge.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -28,777 +29,1490 @@
* =======================================================================================
*/
+
#include <perfmon_ivybridge_events.h>
-#include <perfmon_ivybridge_groups.h>
#include <perfmon_ivybridge_counters.h>
-
-
+#include <perfmon_ivybridgeEP_events.h>
+#include <perfmon_ivybridgeEP_counters.h>
+#include <error.h>
+#include <affinity.h>
+#include <limits.h>
+#include <topology.h>
+
+static int perfmon_numCountersIvybridgeEP = NUM_COUNTERS_IVYBRIDGEEP;
+static int perfmon_numCoreCountersIvybridgeEP = NUM_COUNTERS_CORE_IVYBRIDGEEP;
+static int perfmon_numArchEventsIvybridgeEP = NUM_ARCH_EVENTS_IVYBRIDGEEP;
static int perfmon_numCountersIvybridge = NUM_COUNTERS_IVYBRIDGE;
-static int perfmon_numGroupsIvybridge = NUM_GROUPS_IVYBRIDGE;
+static int perfmon_numCoreCountersIvybridge = NUM_COUNTERS_CORE_IVYBRIDGE;
static int perfmon_numArchEventsIvybridge = NUM_ARCH_EVENTS_IVYBRIDGE;
-#define OFFSET_PMC 3
+int ivb_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
+int ivbep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
+int (*ivy_cbox_setup)(int, RegisterIndex, PerfmonEvent*);
+
+int perfmon_init_ivybridge(int cpu_id)
+{
+ int ret;
+ uint64_t data = 0x0ULL;
+ lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+ lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+ HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL);
+ ret = HPMwrite(cpu_id, MSR_DEV, MSR_UNC_CBO_0_PERFEVTSEL0, 0x0ULL);
+ ret += HPMread(cpu_id, MSR_DEV, MSR_UNC_PERF_GLOBAL_CTRL, &data);
+ ret += HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PERF_GLOBAL_CTRL, 0x0ULL);
+ ret += HPMread(cpu_id, MSR_DEV, MSR_UNC_CBO_0_PERFEVTSEL0, &data);
+ if ((cpuid_info.model == IVYBRIDGE_EP))
+ {
+ ivy_cbox_setup = ivbep_cbox_setup;
+ }
+ else if ((ret == 0) && (data == 0x0ULL))
+ {
+ ivy_cbox_setup = ivb_cbox_setup;
+ }
+ return 0;
+}
+
+
+uint32_t ivb_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint32_t flags = 0x0UL;
+ flags |= (1ULL<<(1+(index*4)));
+ for(int j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<(index*4));
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+ flags |= (1ULL<<(2+(index*4)));
+ break;
+ default:
+ break;
+ }
+ }
+ return flags;
+}
+
+
+int ivb_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint32_t flags = 0x0UL;
+ uint64_t offcore_flags = 0x0ULL;
+ flags = (1ULL<<22)|(1ULL<<16);
+
+ /* Intel with standard 8 bit event mask: [7:0] */
+ flags |= (event->umask<<8) + event->eventId;
+
+ /* set custom cfg and cmask */
+ if ((event->cfgBits != 0) &&
+ (event->eventId != 0xB7) &&
+ (event->eventId != 0xBB))
+ {
+ flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+ }
+
+ if (event->numberOfOptions > 0)
+ {
+ for(int j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<17);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+ flags |= (1ULL<<21);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ case EVENT_OPTION_MATCH0:
+ offcore_flags |= (event->options[j].value & 0x8FFF);
+ break;
+ case EVENT_OPTION_MATCH1:
+ offcore_flags |= (event->options[j].value<<16);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (event->eventId == 0xB7)
+ {
+ if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+ {
+ offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+ }
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+ }
+ else if (event->eventId == 0xBB)
+ {
+ if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+ {
+ offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+ }
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int ivb_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint64_t flags = 0x0UL;
+ uint64_t filter = 0x0UL;
+ uint32_t reg = counter_map[index].configRegister;
+ PciDeviceIndex dev = counter_map[index].device;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(dev, cpu_id))
+ {
+ return -ENODEV;
+ }
+ flags = (1ULL<<22)|(1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for (int j=0;j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0x1FULL) << 24);
+ break;
+ case EVENT_OPTION_OPCODE:
+ filter = (event->options[j].value & 0x3FULL);
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH, flags, SETUP_OPCODE_FILTER);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH, filter));
+ break;
+ case EVENT_OPTION_MATCH0:
+ filter = ((event->options[j].value & 0xFFFFFFC0ULL));
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, filter, SETUP_ADDR0_FILTER);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, filter));
+ filter = (((event->options[j].value>>32) & 0x3FFFULL));
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, filter, SETUP_ADDR1_FILTER);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, filter));
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, reg, flags, SETUP_BBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int ivb_pci_box_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint64_t flags = 0x0UL;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(counter_map[index].device, cpu_id))
+ {
+ return -ENODEV;
+ }
+ flags = (1ULL<<22)|(1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for (int j=0;j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0x1FULL) << 24);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, counter_map[index].device, counter_map[index].configRegister,
+ flags, SETUP_BOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device,
+ counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int ivb_mboxfix_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint64_t flags = 0x0ULL;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(counter_map[index].device, cpu_id))
+ {
+ return -ENODEV;
+ }
+ flags = (1ULL<<22);
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, counter_map[index].device,
+ counter_map[index].configRegister, flags, SETUP_MBOXFIX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device,
+ counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int ivb_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event, PciDeviceIndex filterdev)
+{
+ uint64_t flags = 0x0UL;
+ uint32_t filterreg = 0x0U;
+ uint64_t filterval = 0x0ULL;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(counter_map[index].device, cpu_id))
+ {
+ return -ENODEV;
+ }
+ PciDeviceIndex dev = counter_map[index].device;
+ flags = (1ULL<<22)|(1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->cfgBits != 0x0)
+ {
+ flags = (1ULL<<21);
+ }
+ if (event->numberOfOptions > 0)
+ {
+ for (int j=0;j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0x1FULL) << 24);
+ break;
+ case EVENT_OPTION_MATCH0:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_QPI_PMON_MATCH_0;
+ filterval = event->options[j].value & 0x8003FFF8ULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_MATCH0);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MATCH1:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_QPI_PMON_MATCH_1;
+ filterval = event->options[j].value & 0x000F000FULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_MATCH1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MASK0:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_QPI_PMON_MASK_0;
+ filterval = event->options[j].value & 0x8003FFF8ULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_MASK0);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MASK1:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ filterreg = PCI_UNC_QPI_PMON_MASK_1;
+ filterval = event->options[j].value & 0x000F000FULL;
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_MASK1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_SBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int ivb_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint32_t flags = 0x0UL;
+ uint64_t mask = 0x0ULL;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ flags = (1ULL<<22)|(1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for (int j=0;j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0x1FULL) << 24);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_CBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int ivbep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint32_t flags = 0x0UL;
+ uint64_t mask = 0x0ULL;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ flags = (1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ RegisterType type = counter_map[index].type;
+ uint64_t filter0 = 0x0ULL;
+ uint64_t filter1 = 0x0ULL;
+ int state_set = 0;
+ for (int j=0;j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0x1FULL) << 24);
+ break;
+ case EVENT_OPTION_TID:
+ flags |= (1<<19);
+ filter0 |= (event->options[j].value & 0x1FULL);
+ break;
+ case EVENT_OPTION_STATE:
+ filter0 |= ((event->options[j].value & 0x3FULL) << 17);
+ state_set = 1;
+ break;
+ case EVENT_OPTION_NID:
+ mask = 0x0ULL;
+ for (int i=0; i<affinityDomains.numberOfNumaDomains;i++)
+ mask |= (1ULL<<i);
+
+ if (event->options[j].value & mask)
+ {
+ filter1 |= (event->options[j].value & 0xFFFFULL);
+ }
+ break;
+ case EVENT_OPTION_OPCODE:
+ filter1 |= ((event->options[j].value & 0x1FFULL) << 20);
+ break;
+ case EVENT_OPTION_MATCH0:
+ filter1 |= ((event->options[j].value & 0x3) << 30);
+ break;
+ default:
+ break;
+ }
+ }
+ if (state_set == 0 && event->eventId == 0x34)
+ {
+ filter0 |= (0x1FULL<<17);
+ }
+ if (filter0 != 0x0ULL)
+ {
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, filter0, SETUP_CBOX_FILTER0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, filter0));
+ }
+ if (filter1 != 0x0ULL)
+ {
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, filter1, SETUP_CBOX_FILTER1);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, filter1));
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_CBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int ivb_ubox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint32_t flags = 0x0UL;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ flags = (1ULL<<22)|(1ULL<<20);
+ if (cpuid_info.model == IVYBRIDGE_EP)
+ {
+ flags |= (1ULL<<17);
+ }
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for (int j=0;j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0x1F) << 24);
+ break;
+ case EVENT_OPTION_INVERT:
+ if (cpuid_info.model == IVYBRIDGE)
+ {
+ flags |= (1ULL<<23);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_UBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int ivb_uboxfix_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint32_t flags = 0x0UL;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ flags = (1ULL<<22)|(1ULL<<20);
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_UBOXFIX)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int ivb_wbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint32_t flags = 0x0UL;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ flags = (1ULL<<22)|(1ULL<<20);
+ flags |= event->eventId;
+ if (event->cfgBits != 0x0)
+ {
+ flags |= ((event->cfgBits & 0x1) << 21);
+ }
+ if (event->numberOfOptions > 0)
+ {
+ RegisterType type = counter_map[index].type;
+ for (int j=0;j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0x1F) << 24);
+ break;
+ case EVENT_OPTION_OCCUPANCY:
+ flags |= ((event->options[j].value & 0x3) << 14);
+ break;
+ case EVENT_OPTION_OCCUPANCY_INVERT:
+ flags |= (1ULL<<30);
+ break;
+ case EVENT_OPTION_OCCUPANCY_EDGE:
+ flags |= (1ULL<<31);
+ break;
+ case EVENT_OPTION_MATCH0:
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1,
+ event->options[j].value & 0xFFFFFFFFULL, SETUP_WBOX_FILTER);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+ box_map[type].filterRegister1,
+ event->options[j].value & 0xFFFFFFFFULL));
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_WBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int ivb_ibox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint32_t flags = 0x0UL;
+ PciDeviceIndex dev = counter_map[index].device;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(dev, cpu_id))
+ {
+ return -ENODEV;
+ }
+ flags = (1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for (int j=0;j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0xFFULL) << 24);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_IBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+
+int ivb_uncore_freeze(int cpu_id, PerfmonEventSet* eventSet, int flags)
+{
+ uint32_t freeze_reg = (cpuid_info.model == IVYBRIDGE_EP ? MSR_UNC_U_PMON_GLOBAL_CTL : MSR_UNC_PERF_GLOBAL_CTRL);
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (eventSet->regTypeMask & ~(0xF))
+ {
+ VERBOSEPRINTREG(cpu_id, freeze_reg, LLU_CAST (1ULL<<31), FREEZE_UNCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, freeze_reg, (1ULL<<31)));
+ }
+ if ((flags != FREEZE_FLAG_ONLYFREEZE) && (eventSet->regTypeMask & ~(0xF)))
+ {
+ for (int j=UNCORE; j<NUM_UNITS; j++)
+ {
+ if (eventSet->regTypeMask & REG_TYPE_MASK(j))
+ {
+ if ((box_map[j].ctrlRegister != 0x0) && (box_map[j].isPci))
+ {
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Clearing %s registers of %s, (flags == FREEZE_FLAG_CLEAR_CTL ? "control" : "counter"), RegisterTypeNames[j]);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[j].device,
+ box_map[j].ctrlRegister, flags));
+ }
+ else if (box_map[j].ctrlRegister != 0x0)
+ {
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Clearing %s registers of %s, (flags == FREEZE_FLAG_CLEAR_CTL ? "control" : "counter"), RegisterTypeNames[j]);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+ box_map[j].ctrlRegister, flags));
+ }
+ }
+ }
+ }
+ return 0;
+}
-void perfmon_init_ivybridge(PerfmonThread *thread)
+int ivb_uncore_unfreeze(int cpu_id, PerfmonEventSet* eventSet, int flags)
{
- uint64_t flags = 0x0ULL;
- int cpu_id = thread->processorId;
-
- /* Initialize registers */
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
- msr_write(cpu_id, MSR_PMC0, 0x0ULL);
- msr_write(cpu_id, MSR_PMC1, 0x0ULL);
- msr_write(cpu_id, MSR_PMC2, 0x0ULL);
- msr_write(cpu_id, MSR_PMC3, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
-
- /* initialize fixed counters
- * FIXED 0: Instructions retired
- * FIXED 1: Clocks unhalted core
- * FIXED 2: Clocks unhalted ref */
- //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
- /* Preinit of PERFEVSEL registers */
- //flags |= (1<<22); /* enable flag */
- //flags |= (1<<16); /* user mode flag */
-
- /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
-
- /* TODO Robust implementation which also works if stuff is not there */
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
- lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
- {
- if ( cpuid_info.model == IVYBRIDGE_EP )
+ uint32_t unfreeze_reg = (cpuid_info.model == IVYBRIDGE_EP ? MSR_UNC_U_PMON_GLOBAL_CTL : MSR_UNC_PERF_GLOBAL_CTRL);
+ uint32_t ovf_reg = (cpuid_info.model == IVYBRIDGE_EP ? MSR_UNC_U_PMON_GLOBAL_STATUS : MSR_UNC_PERF_GLOBAL_OVF_CTRL);
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if ((flags != FREEZE_FLAG_ONLYFREEZE) && (eventSet->regTypeMask & ~(0xF)))
+ {
+ for (int j=UNCORE; j<NUM_UNITS; j++)
{
- /* Only root can access pci address space in direct mode */
- if (accessClient_mode != DAEMON_AM_DIRECT)
+ if (eventSet->regTypeMask & REG_TYPE_MASK(j))
{
- uint32_t uflags = 0x10100U; /* enable freeze (bit 16), freeze (bit 8) */
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_BOX_CTL, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_BOX_CTL, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_BOX_CTL, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_BOX_CTL, uflags);
-
- uflags = 0x0U;
- uflags |= (1<<22); /* enable flag */
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTL_0, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTL_1, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTL_2, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTL_3, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTL_0, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTL_1, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTL_2, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTL_3, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTL_0, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTL_1, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTL_2, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTL_3, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTL_0, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTL_1, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTL_2, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTL_3, uflags);
-
- uflags |= (1<<19); /* reset fixed counter */
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_FIXED_CTL, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_FIXED_CTL, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_FIXED_CTL, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-
- /* iMC counters need to be manually reset to zero */
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-
-#if 0
- /* FIXME: Not yet tested/ working due to BIOS issues on test
- * machines */
-
- /* QPI registers can be zeroed with single write */
- uflags = 0x0103UL; /* freeze (bit 8), reset */
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0, PCI_UNC_QPI_PMON_BOX_CTL, uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1, PCI_UNC_QPI_PMON_BOX_CTL, uflags);
- uflags = 0x0UL;
- uflags |= (1UL<<22); /* enable flag */
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0, PCI_UNC_QPI_PMON_CTL_0, uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0, PCI_UNC_QPI_PMON_CTL_1, uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0, PCI_UNC_QPI_PMON_CTL_2, uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0, PCI_UNC_QPI_PMON_CTL_3, uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1, PCI_UNC_QPI_PMON_CTL_0, uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1, PCI_UNC_QPI_PMON_CTL_1, uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1, PCI_UNC_QPI_PMON_CTL_2, uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1, PCI_UNC_QPI_PMON_CTL_3, uflags);
-
-
- /* Cbo counters */
- uflags = 0xF0103UL; /*enable freeze (bit 8), reset */
- msr_write(cpu_id, MSR_UNC_C0_PMON_BOX_CTL, uflags);
- msr_write(cpu_id, MSR_UNC_C1_PMON_BOX_CTL, uflags);
- msr_write(cpu_id, MSR_UNC_C2_PMON_BOX_CTL, uflags);
- msr_write(cpu_id, MSR_UNC_C3_PMON_BOX_CTL, uflags);
-
- switch ( cpuid_topology.numCoresPerSocket )
+ if ((box_map[j].ctrlRegister != 0x0) && (box_map[j].isPci))
+ {
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Clearing %s registers of %s, (flags == FREEZE_FLAG_CLEAR_CTL ? "control" : "counter"), RegisterTypeNames[j]);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[j].device,
+ box_map[j].ctrlRegister, flags));
+ }
+ else if (box_map[j].ctrlRegister != 0x0)
{
- case 12:
- msr_write(cpu_id, MSR_UNC_C11_PMON_BOX_CTL, uflags);
- msr_write(cpu_id, MSR_UNC_C10_PMON_BOX_CTL, uflags);
- case 10:
- msr_write(cpu_id, MSR_UNC_C9_PMON_BOX_CTL, uflags);
- msr_write(cpu_id, MSR_UNC_C8_PMON_BOX_CTL, uflags);
- case 8:
- msr_write(cpu_id, MSR_UNC_C7_PMON_BOX_CTL, uflags);
- msr_write(cpu_id, MSR_UNC_C6_PMON_BOX_CTL, uflags);
- case 6:
- msr_write(cpu_id, MSR_UNC_C5_PMON_BOX_CTL, uflags);
- msr_write(cpu_id, MSR_UNC_C4_PMON_BOX_CTL, uflags);
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Clearing %s registers of %s, (flags == FREEZE_FLAG_CLEAR_CTL ? "control" : "counter"), RegisterTypeNames[j]);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+ box_map[j].ctrlRegister, flags));
}
-#endif
}
}
}
+ if (eventSet->regTypeMask & ~(0xF))
+ {
+ VERBOSEPRINTREG(cpu_id, ovf_reg, LLU_CAST 0x0ULL, CLEAR_UNCORE_OVF)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, ovf_reg, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, unfreeze_reg, LLU_CAST (1ULL<<29), UNFREEZE_UNCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, unfreeze_reg, (1ULL<<29)));
+ }
+ return 0;
}
-#define BOX_GATE_IVB(channel,label) \
- if (perfmon_verbose) { \
- printf("[%d] perfmon_setup_counter (##label): Write Register 0x%llX , Flags: 0x%llX \n", \
- cpu_id, \
- LLU_CAST reg, \
- LLU_CAST flags); \
- } \
- if(haveLock) { \
- uflags = (1UL<<22);\
- uflags |= (event->umask<<8) + event->eventId; \
- if (event->cfgBits == 0xFF) \
- { \
- uflags |= (1<<21); \
- } \
- pci_write(cpu_id, channel, reg, uflags); \
- }
-
-
-void perfmon_setupCounterThread_ivybridge(
+
+int perfmon_setupCounterThread_ivybridge(
int thread_id,
- PerfmonEvent* event,
- PerfmonCounterIndex index)
+ PerfmonEventSet* eventSet)
{
int haveLock = 0;
- uint64_t flags;
- uint32_t uflags;
- uint64_t reg = ivybridge_counter_map[index].configRegister;
- int cpu_id = perfmon_threadData[thread_id].processorId;
- uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
- uint64_t orig_fixed_flags = fixed_flags;
- perfmon_threadData[thread_id].counters[index].init = TRUE;
+ uint64_t fixed_flags = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
- switch (ivybridge_counter_map[index].type)
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
{
- case PMC:
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+ ivb_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE);
- //flags = msr_read(cpu_id,reg);
- //flags &= ~(0xFFFFU); /* clear lower 16bits */
- flags = (1<<22)|(1<<16);
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ eventSet->events[i].threadCounter[thread_id].init = TRUE;
+ switch (eventSet->events[i].type)
+ {
+ case PMC:
+ ivb_pmc_setup(cpu_id, index, event);
+ break;
+
+ case FIXED:
+ fixed_flags |= ivb_fixed_setup(cpu_id, index, event);
+ break;
+
+ case POWER:
+ break;
+
+ case MBOX0:
+ case MBOX1:
+ case MBOX2:
+ case MBOX3:
+ case MBOX4:
+ case MBOX5:
+ case MBOX6:
+ case MBOX7:
+ case PBOX:
+ case RBOX0:
+ case RBOX1:
+ ivb_pci_box_setup(cpu_id, index, event);
+ break;
+
+ case BBOX0:
+ case BBOX1:
+ ivb_bbox_setup(cpu_id, index, event);
+ break;
+
+ case MBOX0FIX:
+ case MBOX1FIX:
+ case MBOX2FIX:
+ case MBOX3FIX:
+ case MBOX4FIX:
+ case MBOX5FIX:
+ case MBOX6FIX:
+ case MBOX7FIX:
+ ivb_mboxfix_setup(cpu_id, index, event);
+ break;
+
+ case SBOX0:
+ ivb_sbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_0);
+ break;
+ case SBOX1:
+ ivb_sbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_1);
+ break;
+ case SBOX2:
+ ivb_sbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_2);
+ break;
+
+ case CBOX0:
+ case CBOX1:
+ case CBOX2:
+ case CBOX3:
+ case CBOX4:
+ case CBOX5:
+ case CBOX6:
+ case CBOX7:
+ case CBOX8:
+ case CBOX9:
+ case CBOX10:
+ case CBOX11:
+ case CBOX12:
+ case CBOX13:
+ case CBOX14:
+ ivy_cbox_setup(cpu_id, index, event);
+ break;
+
+ case UBOX:
+ ivb_ubox_setup(cpu_id, index, event);
+ break;
+ case UBOXFIX:
+ ivb_uboxfix_setup(cpu_id, index, event);
+ break;
+
+ case WBOX:
+ ivb_wbox_setup(cpu_id, index, event);
+ break;
+
+ case IBOX0:
+ case IBOX1:
+ ivb_ibox_setup(cpu_id, index, event);
+ break;
+
+ default:
+ break;
+ }
+ }
+ if (fixed_flags > 0x0)
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
+ }
+ return 0;
+}
- /* Intel with standard 8 bit event mask: [7:0] */
- flags |= (event->umask<<8) + event->eventId;
- if (event->cfgBits != 0) /* set custom cfg and cmask */
- {
- flags &= ~(0xFFFFU<<16); /* clear upper 16bits */
- flags |= ((event->cmask<<8) + event->cfgBits)<<16;
- }
+int perfmon_startCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ uint64_t fixed_flags = 0x0ULL;
+ uint64_t tmp = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if (perfmon_verbose)
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- printf("[%d] perfmon_setup_counter PMC: Write Register 0x%llX , Flags: 0x%llX \n",
- cpu_id,
- LLU_CAST reg,
- LLU_CAST flags);
+ continue;
}
+ tmp = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ eventSet->events[i].threadCounter[thread_id].startData = 0;
+ eventSet->events[i].threadCounter[thread_id].counterData = 0;
+ switch (type)
+ {
+ case PMC:
+ if (eventSet->regTypeMask & REG_TYPE_MASK(PMC))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+ fixed_flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr)); /* enable counter */
+ }
+ break;
+
+ case FIXED:
+ if (eventSet->regTypeMask & REG_TYPE_MASK(FIXED))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+ fixed_flags |= (1ULL<<(index+32)); /* enable fixed counter */
+ }
+ break;
- msr_write(cpu_id, reg , flags);
- break;
+ case POWER:
+ if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
+ {
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&tmp));
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST field64(tmp, 0, box_map[type].regWidth), START_POWER)
+ eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
+ }
+ break;
+
+ default:
+ break;
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = eventSet->events[i].threadCounter[thread_id].startData;
+ }
+ }
- case FIXED:
- fixed_flags |= (0x2ULL<<(index*4));
- break;
+ ivb_uncore_unfreeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTR);
- case POWER:
- break;
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST fixed_flags, UNFREEZE_PMC_AND_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, fixed_flags));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|fixed_flags));
+ }
+ return 0;
+}
- case MBOX0:
- BOX_GATE_IVB(PCI_IMC_DEVICE_CH_0,MBOX0);
- break;
- case MBOX1:
- BOX_GATE_IVB(PCI_IMC_DEVICE_CH_1,MBOX1);
- break;
- case MBOX2:
- BOX_GATE_IVB(PCI_IMC_DEVICE_CH_2,MBOX2);
- break;
+uint64_t ivb_uncore_read(int cpu_id, RegisterIndex index, PerfmonEvent *event, int flags)
+{
+ uint64_t result = 0x0ULL;
+ uint64_t tmp = 0x0ULL;
+ RegisterType type = counter_map[index].type;
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ uint64_t counter2 = counter_map[index].counterRegister2;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return result;
+ }
+ if (box_map[type].isPci && !HPMcheck(dev, cpu_id))
+ {
+ return result;
+ }
- case MBOX3:
- BOX_GATE_IVB(PCI_IMC_DEVICE_CH_3,MBOX3);
- break;
+ CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &tmp));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, tmp, UNCORE_READ);
- case SBOX0:
+ if (flags & FREEZE_FLAG_CLEAR_CTR)
+ {
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0U));
+ }
+ if (counter2 != 0x0)
+ {
+ result = (tmp<<32);
+ tmp = 0x0ULL;
+ CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter2, &tmp));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter2, tmp, UNCORE_READ);
+ result += (tmp & 0xFFFFFFFF);
+ if (flags & FREEZE_FLAG_CLEAR_CTR)
+ {
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter2, 0x0U));
+ }
+ }
+ else
+ {
+ result = tmp;
+ }
+ return result;
+}
- /* CTO_COUNT event requires programming of MATCH/MASK registers */
- if (event->eventId == 0x38)
- {
- if(haveLock)
- {
- //uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_0, reg);
- //uflags &= ~(0xFFFFU);
- uflags = (1UL<<22);
- uflags |= (1UL<<21) + event->eventId; /* Set extension bit */
- printf("UFLAGS 0x%x \n",uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0, reg, uflags);
-
- /* program MATCH0 */
- uflags = 0x0UL;
- uflags = (event->cmask<<13) + (event->umask<<8);
- printf("MATCH UFLAGS 0x%x \n",uflags);
- pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_0, PCI_UNC_QPI_PMON_MATCH_0, uflags);
-
- /* program MASK0 */
- uflags = 0x0UL;
- uflags = (0x3F<<12) + (event->cfgBits<<4);
- printf("MASK UFLAGS 0x%x \n",uflags);
- pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_0, PCI_UNC_QPI_PMON_MASK_0, uflags);
- }
- }
- else
+int ivb_uncore_overflow(int cpu_id, RegisterIndex index, PerfmonEvent *event,
+ int* overflows, uint64_t result, uint64_t cur_result,
+ int global_offset, int box_offset)
+{
+ int test_local = 0;
+ uint64_t ovf_values = 0x0ULL;
+ RegisterType type = counter_map[index].type;
+ PciDeviceIndex dev = counter_map[index].device;
+ if (result < cur_result)
+ {
+ if (global_offset != -1)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV,
+ MSR_UNC_U_PMON_GLOBAL_STATUS,
+ &ovf_values));
+ if (ovf_values & (1<<global_offset))
{
- BOX_GATE_IVB(PCI_QPI_DEVICE_PORT_0,SBOX0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+ MSR_UNC_U_PMON_GLOBAL_STATUS,
+ (1<<global_offset)));
+ test_local = 1;
}
+ }
+ else
+ {
+ test_local = 1;
+ }
- break;
-
- case SBOX1:
-
- /* CTO_COUNT event requires programming of MATCH/MASK registers */
- if (event->eventId == 0x38)
+ if (test_local)
+ {
+ ovf_values = 0x0ULL;
+ if (ivybridge_box_map[type].isPci)
{
- if(haveLock)
- {
- //uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_1, reg);
- //uflags &= ~(0xFFFFU);
- uflags = (1UL<<22);
- uflags |= (1UL<<21) + event->eventId; /* Set extension bit */
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1, reg, uflags);
-
- /* program MATCH0 */
- uflags = 0x0UL;
- uflags = (event->cmask<<13) + (event->umask<<8);
- pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_1, PCI_UNC_QPI_PMON_MATCH_0, uflags);
-
- /* program MASK0 */
- uflags = 0x0UL;
- uflags = (0x3F<<12) + (event->cfgBits<<4);
- pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_1, PCI_UNC_QPI_PMON_MASK_0, uflags);
- }
+ CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev,
+ box_map[type].statusRegister,
+ &ovf_values));
}
else
{
- BOX_GATE_IVB(PCI_QPI_DEVICE_PORT_0,SBOX0);
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV,
+ box_map[type].statusRegister,
+ &ovf_values));
}
- break;
-
- case CBOX0:
- case CBOX1:
- case CBOX2:
- case CBOX3:
- case CBOX4:
- case CBOX5:
- case CBOX6:
- case CBOX7:
- case CBOX8:
- case CBOX9:
- case CBOX10:
- case CBOX11:
-
- if(haveLock)
+ if (ovf_values & (1<<box_offset))
{
- perfmon_threadData[thread_id].counters[index].init = TRUE;
- uflags = 0x0U;
-
- /* set local enable flag */
- uflags |= 1<<22;
- /* Intel with standard 8 bit event mask: [7:0] */
- uflags |= (event->umask<<8) + event->eventId;
- msr_write(cpu_id, reg , uflags);
-
- if (perfmon_verbose)
+ (*overflows)++;
+ if (ivybridge_box_map[type].isPci)
+ {
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev,
+ box_map[type].statusRegister,
+ (1<<box_offset)));
+ }
+ else
{
- printf("[%d] perfmon_setup_counter: Write Register 0x%llX , uFlags: 0x%lX \n",
- cpu_id,
- LLU_CAST reg,
- (unsigned long) uflags);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+ box_map[type].statusRegister,
+ (1<<box_offset)));
}
}
- break;
-
- default:
- /* should never be reached */
- break;
- }
- if (fixed_flags != orig_fixed_flags)
- {
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+ }
}
+ return 0;
}
-#define CBOX_START(NUM) \
-if(haveLock) { \
- msr_write(cpu_id, MSR_UNC_C##NUM##_PMON_BOX_CTL, uflags); \
-}
-
-#define MBOX_START(NUM) \
-if(haveLock) { \
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_##NUM, PCI_UNC_MC_PMON_BOX_CTL, uflags); \
-}
-
-
-
-void perfmon_startCountersThread_ivybridge(int thread_id)
+int perfmon_stopCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSet)
{
+ uint64_t counter_result = 0x0ULL;
int haveLock = 0;
- uint64_t flags = 0x0ULL;
- uint32_t uflags = 0x10000UL; /* Clear freeze bit */
- int cpu_id = perfmon_threadData[thread_id].processorId;
-
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+ ivb_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTL);
- for ( int i=0; i<perfmon_numCountersIvybridge; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- switch (ivybridge_counter_map[i].type)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ counter_result= 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+ int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+ switch (type)
{
case PMC:
- msr_write(cpu_id, ivybridge_counter_map[i].counterRegister, 0x0ULL);
- flags |= (1<<(i-OFFSET_PMC)); /* enable counter */
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, counter1, &counter_result));
+ if (counter_result < *current)
+ {
+ uint64_t ovf_values = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+ if (ovf_values & (1<<index-cpuid_info.perf_num_fixed_ctr))
+ {
+ (*overflows)++;
+ }
+ }
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
break;
-
case FIXED:
- msr_write(cpu_id, ivybridge_counter_map[i].counterRegister, 0x0ULL);
- flags |= (1ULL<<(i+32)); /* enable fixed counter */
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, counter1, &counter_result));
+ if (counter_result < *current)
+ {
+ uint64_t ovf_values = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+ if (ovf_values & (1<<(index+32)))
+ {
+ (*overflows)++;
+ }
+ }
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
break;
case POWER:
- if(haveLock)
+ if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
{
- perfmon_threadData[thread_id].counters[i].counterData =
- power_read(cpu_id, ivybridge_counter_map[i].counterRegister);
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+ if (counter_result < *current)
+ {
+ (*overflows)++;
+ }
}
-
break;
- case MBOX0:
- MBOX_START(0);
+ case THERMAL:
+ CHECK_TEMP_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
break;
- case MBOX1:
- MBOX_START(1);
+ case SBOX0FIX:
+ case SBOX1FIX:
+ case SBOX2FIX:
+ if (haveLock && HPMcheck(dev, cpu_id))
+ {
+ CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &counter_result));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_SBOX_FIXED)
+ switch (extractBitField(counter_result,3,0))
+ {
+ case 0x2:
+ counter_result = 5600000000ULL;
+ break;
+ case 0x3:
+ counter_result = 6400000000ULL;
+ break;
+ case 0x4:
+ counter_result = 7200000000ULL;
+ break;
+ case 0x5:
+ counter_result = 8000000000ULL;
+ break;
+ case 0x6:
+ counter_result = 8800000000ULL;
+ break;
+ case 0x7:
+ counter_result = 9600000000ULL;
+ break;
+ default:
+ counter_result = 0x0ULL;
+ break;
+ }
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_SBOX_FIXED_REAL)
+ }
break;
+ case MBOX0:
+ case MBOX1:
case MBOX2:
- MBOX_START(2);
- break;
-
case MBOX3:
- MBOX_START(3);
+ case MBOX4:
+ case MBOX5:
+ case MBOX6:
+ case MBOX7:
+ counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_CLEAR_CTR);
+ ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+ *current, box_map[type].ovflOffset, getCounterTypeOffset(index)+1);
break;
- case MBOXFIX:
- break;
- case SBOX0:
- if(haveLock)
- {
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0, PCI_UNC_QPI_PMON_BOX_CTL, 0x0ULL);
- }
+ case MBOX0FIX:
+ case MBOX1FIX:
+ case MBOX2FIX:
+ case MBOX3FIX:
+ case MBOX4FIX:
+ case MBOX5FIX:
+ case MBOX6FIX:
+ case MBOX7FIX:
+ counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_CLEAR_CTR);
+ ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+ *current, box_map[type].ovflOffset, 0);
break;
- case SBOX1:
- if(haveLock)
- {
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1, PCI_UNC_QPI_PMON_BOX_CTL, 0x0ULL);
- }
+ case IBOX1:
+ counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_CLEAR_CTR);
+ ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+ *current, -1, getCounterTypeOffset(index)+2);
break;
+ case SBOX0:
+ case SBOX1:
+ case SBOX2:
case CBOX0:
- CBOX_START(0);
- break;
-
case CBOX1:
- CBOX_START(1);
- break;
-
case CBOX2:
- CBOX_START(2);
- break;
-
case CBOX3:
- CBOX_START(3);
- break;
-
case CBOX4:
- CBOX_START(4);
- break;
-
case CBOX5:
- CBOX_START(5);
- break;
-
case CBOX6:
- CBOX_START(6);
- break;
-
case CBOX7:
- CBOX_START(7);
- break;
-
case CBOX8:
- CBOX_START(8);
- break;
-
case CBOX9:
- CBOX_START(9);
- break;
-
case CBOX10:
- CBOX_START(10);
- break;
-
case CBOX11:
- CBOX_START(11);
+ case CBOX12:
+ case CBOX13:
+ case CBOX14:
+ case UBOX:
+ case UBOXFIX:
+ case BBOX0:
+ case BBOX1:
+ case WBOX:
+ case PBOX:
+ case RBOX0:
+ case RBOX1:
+ case RBOX2:
+ case IBOX0:
+ counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_CLEAR_CTR);
+ ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+ *current, box_map[type].ovflOffset, getCounterTypeOffset(index));
break;
default:
- /* should never be reached */
break;
}
+ *current = field64(counter_result, 0, box_map[type].regWidth);
}
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
}
- if (perfmon_verbose)
- {
- printf("perfmon_start_counters: Write Register 0x%X , \
- Flags: 0x%llX \n",MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
- printf("perfmon_start_counters: Write Register 0x%X , \
- Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
- }
-
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
- msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
-}
-
-#define CBOX_STOP(NUM) \
-if(haveLock) { \
- msr_write(cpu_id, MSR_UNC_C##NUM##_PMON_BOX_CTL, uflags); \
- perfmon_threadData[thread_id].counters[i].counterData = \
- msr_read(cpu_id, westmereEX_counter_map[i].counterRegister); \
-}
-
-#define MBOX_STOP(NUM) \
-if(haveLock) { \
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_##NUM , PCI_UNC_MC_PMON_BOX_CTL, uflags); \
- counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_##NUM , ivybridge_counter_map[i].counterRegister); \
- counter_result = (counter_result<<32) + pci_read(cpu_id, PCI_IMC_DEVICE_CH_##NUM , ivybridge_counter_map[i].counterRegister2); \
- perfmon_threadData[thread_id].counters[i].counterData = counter_result; \
-}
-
-#define SBOX_STOP(NUM) \
-if(haveLock) { \
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_##NUM , PCI_UNC_QPI_PMON_BOX_CTL, (1<<8)); \
- counter_result = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_##NUM , ivybridge_counter_map[i].counterRegister); \
- counter_result = (counter_result<<32) + pci_read(cpu_id, PCI_QPI_DEVICE_PORT_##NUM , ivybridge_counter_map[i].counterRegister2); \
- perfmon_threadData[thread_id].counters[i].counterData = counter_result; \
+ return 0;
}
-
-void perfmon_stopCountersThread_ivybridge(int thread_id)
+int perfmon_readCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSet)
{
- uint64_t flags;
- uint32_t uflags = 0x10100UL; /* Set freeze bit */
uint64_t counter_result = 0x0ULL;
+ uint64_t pmc_flags = 0x0ULL;
int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &pmc_flags));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+ ivb_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE);
- for ( int i=0; i < NUM_COUNTERS_IVYBRIDGE; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- switch (ivybridge_counter_map[i].type)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+ int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+ switch (type)
{
case PMC:
-
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ if (counter_result < *current)
+ {
+ uint64_t ovf_values = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+ if (ovf_values & (1<<index-cpuid_info.perf_num_fixed_ctr))
+ {
+ (*overflows)++;
+ }
+ }
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+ break;
case FIXED:
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, ivybridge_counter_map[i].counterRegister);
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ if (counter_result < *current)
+ {
+ uint64_t ovf_values = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+ if (ovf_values & (1<<(index+32)))
+ {
+ (*overflows)++;
+ }
+ }
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
break;
case POWER:
- if(haveLock)
+ if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
{
- perfmon_threadData[thread_id].counters[i].counterData =
- power_info.energyUnit *
- ( power_read(cpu_id, ivybridge_counter_map[i].counterRegister) -
- perfmon_threadData[thread_id].counters[i].counterData);
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+ if (counter_result < *current)
+ {
+ (*overflows)++;
+ }
}
break;
case THERMAL:
- perfmon_threadData[thread_id].counters[i].counterData =
- thermal_read(cpu_id);
+ CHECK_TEMP_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
break;
- case MBOX0:
- MBOX_STOP(0);
+ case SBOX0FIX:
+ case SBOX1FIX:
+ case SBOX2FIX:
+ if (haveLock && HPMcheck(dev, cpu_id))
+ {
+ CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &counter_result));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_SBOX_FIXED)
+ switch (extractBitField(counter_result,3,0))
+ {
+ case 0x2:
+ counter_result = 5600000000ULL;
+ break;
+ case 0x3:
+ counter_result = 6400000000ULL;
+ break;
+ case 0x4:
+ counter_result = 7200000000ULL;
+ break;
+ case 0x5:
+ counter_result = 8000000000ULL;
+ break;
+ case 0x6:
+ counter_result = 8800000000ULL;
+ break;
+ case 0x7:
+ counter_result = 9600000000ULL;
+ break;
+ default:
+ counter_result = 0x0ULL;
+ break;
+ }
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_SBOX_FIXED_REAL)
+ eventSet->events[i].threadCounter[thread_id].startData = 0;
+ }
break;
+ case MBOX0:
case MBOX1:
- MBOX_STOP(1);
- break;
-
case MBOX2:
- MBOX_STOP(2);
- break;
-
case MBOX3:
- MBOX_STOP(3);
+ case MBOX4:
+ case MBOX5:
+ case MBOX6:
+ case MBOX7:
+ counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_ONLYFREEZE);
+ ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+ *current, box_map[type].ovflOffset, getCounterTypeOffset(index)+1);
break;
- case MBOXFIX:
- if(haveLock)
- {
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-
- counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
- ivybridge_counter_map[i].counterRegister);
-
- counter_result = (counter_result<<32) +
- pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
- ivybridge_counter_map[i].counterRegister2);
- perfmon_threadData[thread_id].counters[i].counterData = counter_result;
- }
+ case MBOX0FIX:
+ case MBOX1FIX:
+ case MBOX2FIX:
+ case MBOX3FIX:
+ case MBOX4FIX:
+ case MBOX5FIX:
+ case MBOX6FIX:
+ case MBOX7FIX:
+ counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_ONLYFREEZE);
+ ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+ *current, box_map[type].ovflOffset, 0);
break;
- case SBOX0:
- SBOX_STOP(0);
+ case IBOX1:
+ counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_ONLYFREEZE);
+ ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+ *current, -1, getCounterTypeOffset(index)+2);
break;
+ case SBOX0:
case SBOX1:
- SBOX_STOP(1);
- break;
-
+ case SBOX2:
case CBOX0:
- CBOX_STOP(0);
- break;
-
case CBOX1:
- CBOX_STOP(1);
- break;
-
case CBOX2:
- CBOX_STOP(2);
- break;
-
case CBOX3:
- CBOX_STOP(3);
- break;
-
case CBOX4:
- CBOX_STOP(4);
- break;
-
case CBOX5:
- CBOX_STOP(5);
- break;
-
case CBOX6:
- CBOX_STOP(6);
- break;
-
case CBOX7:
- CBOX_STOP(7);
- break;
-
case CBOX8:
- CBOX_STOP(8);
- break;
-
case CBOX9:
- CBOX_STOP(9);
- break;
-
case CBOX10:
- CBOX_STOP(10);
- break;
-
case CBOX11:
- CBOX_STOP(11);
+ case CBOX12:
+ case CBOX13:
+ case CBOX14:
+ case UBOX:
+ case UBOXFIX:
+ case BBOX0:
+ case BBOX1:
+ case WBOX:
+ case PBOX:
+ case RBOX0:
+ case RBOX1:
+ case RBOX2:
+ case IBOX0:
+ counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_ONLYFREEZE);
+ ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+ *current, box_map[type].ovflOffset, getCounterTypeOffset(index));
break;
-
default:
/* should never be reached */
break;
}
+ *current = field64(counter_result, 0, box_map[type].regWidth);
}
}
- flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
- // printf ("Status: 0x%llX \n", LLU_CAST flags);
- if ( (flags & 0x3) || (flags & (0x3ULL<<32)) )
+ ivb_uncore_unfreeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE);
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
{
- printf ("Overflow occured \n");
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, pmc_flags));
}
+ return 0;
}
-void perfmon_readCountersThread_ivybridge(int thread_id)
+
+int perfmon_finalizeCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSet)
{
- uint64_t counter_result = 0x0ULL;
int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int haveTileLock = 0;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62);
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
- for ( int i=0; i<NUM_COUNTERS_IVYBRIDGE; i++ )
+ if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ haveTileLock = 1;
+ }
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- if ((ivybridge_counter_map[i].type == PMC) || (ivybridge_counter_map[i].type == FIXED))
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, ivybridge_counter_map[i].counterRegister);
+ continue;
}
- else
+ RegisterIndex index = eventSet->events[i].index;
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t reg = counter_map[index].configRegister;
+
+ switch(type)
{
- if(haveLock)
- {
- switch (ivybridge_counter_map[i].type)
+ case PMC:
+ ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+ if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))
{
- case POWER:
- perfmon_threadData[thread_id].counters[i].counterData =
- power_info.energyUnit *
- power_read(cpu_id, ivybridge_counter_map[i].counterRegister);
- break;
-
- case MBOX0:
- counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
- ivybridge_counter_map[i].counterRegister);
-
- counter_result = (counter_result<<32) +
- pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
- ivybridge_counter_map[i].counterRegister2);
-
- perfmon_threadData[thread_id].counters[i].counterData = counter_result;
- break;
-
- case MBOX1:
- counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
- ivybridge_counter_map[i].counterRegister);
-
- counter_result = (counter_result<<32) +
- pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
- ivybridge_counter_map[i].counterRegister2);
-
- perfmon_threadData[thread_id].counters[i].counterData = counter_result;
- break;
-
- case MBOX2:
- counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
- ivybridge_counter_map[i].counterRegister);
-
- counter_result = (counter_result<<32) +
- pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
- ivybridge_counter_map[i].counterRegister2);
-
- perfmon_threadData[thread_id].counters[i].counterData = counter_result;
- break;
-
- case MBOX3:
- counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
- ivybridge_counter_map[i].counterRegister);
-
- counter_result = (counter_result<<32) +
- pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
- ivybridge_counter_map[i].counterRegister2);
-
- perfmon_threadData[thread_id].counters[i].counterData = counter_result;
- break;
-
- default:
- /* should never be reached */
- break;
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
}
- }
+ else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+ }
+ break;
+ case FIXED:
+ ovf_values_core |= (1ULL<<(index+32));
+ break;
+ default:
+ break;
}
+ if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock))))
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL));
+ }
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
}
}
-}
+ if (haveLock && eventSet->regTypeMask & ~(0xFULL))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_U_PMON_GLOBAL_STATUS, LLU_CAST 0x0ULL, CLEAR_UNCORE_OVF)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_U_PMON_GLOBAL_STATUS, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_U_PMON_GLOBAL_CTL, LLU_CAST 0x0ULL, CLEAR_UNCORE_CTRL)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_U_PMON_GLOBAL_CTL, 0x0ULL));
+ }
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+ return 0;
+}
diff --git a/src/includes/perfmon_ivybridgeEP_counters.h b/src/includes/perfmon_ivybridgeEP_counters.h
new file mode 100644
index 0000000..896530c
--- /dev/null
+++ b/src/includes/perfmon_ivybridgeEP_counters.h
@@ -0,0 +1,316 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: perfmon_ivybridgeEP_counters.h
+ *
+ * Description: Counter header file of perfmon module for Intel Ivy Bridge EP.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_CORE_IVYBRIDGEEP 8
+#define NUM_COUNTERS_UNCORE_IVYBRIDGEEP 81
+#define NUM_COUNTERS_IVYBRIDGEEP 161
+
+#define IVBEP_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|\
+ EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define IVBEP_VALID_OPTIONS_FIXED EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK
+#define IVBEP_VALID_OPTIONS_UBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define IVBEP_VALID_OPTIONS_CBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+ EVENT_OPTION_TID_MASK|EVENT_OPTION_STATE_MASK|EVENT_OPTION_NID_MASK|\
+ EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_MATCH0_MASK
+#define IVBEP_VALID_OPTIONS_WBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+ EVENT_OPTION_OCCUPANCY_MASK|EVENT_OPTION_OCCUPANCY_FILTER_MASK|EVENT_OPTION_OCCUPANCY_EDGE_MASK|\
+ EVENT_OPTION_OCCUPANCY_INVERT_MASK
+#define IVBEP_VALID_OPTIONS_MBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define IVBEP_VALID_OPTIONS_SBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+ EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK|EVENT_OPTION_MASK0_MASK|\
+ EVENT_OPTION_MASK1_MASK
+#define IVBEP_VALID_OPTIONS_BBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+ EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_MATCH0_MASK
+#define IVBEP_VALID_OPTIONS_PBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define IVBEP_VALID_OPTIONS_RBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define IVBEP_VALID_OPTIONS_IBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK
+
+
+static RegisterMap ivybridgeEP_counter_map[NUM_COUNTERS_IVYBRIDGEEP] = {
+ /* Fixed Counters: instructions retired, cycles unhalted core */
+ {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, IVBEP_VALID_OPTIONS_FIXED},
+ {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, IVBEP_VALID_OPTIONS_FIXED},
+ {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, IVBEP_VALID_OPTIONS_FIXED},
+ /* PMC Counters: 4 48bit wide */
+ {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, IVBEP_VALID_OPTIONS_PMC},
+ {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, IVBEP_VALID_OPTIONS_PMC},
+ {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, IVBEP_VALID_OPTIONS_PMC},
+ {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, IVBEP_VALID_OPTIONS_PMC},
+ /* Temperature Sensor*/
+ {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ /* RAPL counters */
+ {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ /* CBOX counters, 44bits wide*/
+ {"CBOX0C0", PMC12, CBOX0, MSR_UNC_C0_PMON_CTL0, MSR_UNC_C0_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX0C1", PMC13, CBOX0, MSR_UNC_C0_PMON_CTL1, MSR_UNC_C0_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX0C2", PMC14, CBOX0, MSR_UNC_C0_PMON_CTL2, MSR_UNC_C0_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX0C3", PMC15, CBOX0, MSR_UNC_C0_PMON_CTL3, MSR_UNC_C0_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX1C0", PMC16, CBOX1, MSR_UNC_C1_PMON_CTL0, MSR_UNC_C1_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX1C1", PMC17, CBOX1, MSR_UNC_C1_PMON_CTL1, MSR_UNC_C1_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX1C2", PMC18, CBOX1, MSR_UNC_C1_PMON_CTL2, MSR_UNC_C1_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX1C3", PMC19, CBOX1, MSR_UNC_C1_PMON_CTL3, MSR_UNC_C1_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX2C0", PMC20, CBOX2, MSR_UNC_C2_PMON_CTL0, MSR_UNC_C2_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX2C1", PMC21, CBOX2, MSR_UNC_C2_PMON_CTL1, MSR_UNC_C2_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX2C2", PMC22, CBOX2, MSR_UNC_C2_PMON_CTL2, MSR_UNC_C2_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX2C3", PMC23, CBOX2, MSR_UNC_C2_PMON_CTL3, MSR_UNC_C2_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX3C0", PMC24, CBOX3, MSR_UNC_C3_PMON_CTL0, MSR_UNC_C3_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX3C1", PMC25, CBOX3, MSR_UNC_C3_PMON_CTL1, MSR_UNC_C3_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX3C2", PMC26, CBOX3, MSR_UNC_C3_PMON_CTL2, MSR_UNC_C3_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX3C3", PMC27, CBOX3, MSR_UNC_C3_PMON_CTL3, MSR_UNC_C3_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX4C0", PMC28, CBOX4, MSR_UNC_C4_PMON_CTL0, MSR_UNC_C4_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX4C1", PMC29, CBOX4, MSR_UNC_C4_PMON_CTL1, MSR_UNC_C4_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX4C2", PMC30, CBOX4, MSR_UNC_C4_PMON_CTL2, MSR_UNC_C4_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX4C3", PMC31, CBOX4, MSR_UNC_C4_PMON_CTL3, MSR_UNC_C4_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX5C0", PMC32, CBOX5, MSR_UNC_C5_PMON_CTL0, MSR_UNC_C5_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX5C1", PMC33, CBOX5, MSR_UNC_C5_PMON_CTL1, MSR_UNC_C5_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX5C2", PMC34, CBOX5, MSR_UNC_C5_PMON_CTL2, MSR_UNC_C5_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX5C3", PMC35, CBOX5, MSR_UNC_C5_PMON_CTL3, MSR_UNC_C5_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX6C0", PMC36, CBOX6, MSR_UNC_C6_PMON_CTL0, MSR_UNC_C6_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX6C1", PMC37, CBOX6, MSR_UNC_C6_PMON_CTL1, MSR_UNC_C6_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX6C2", PMC38, CBOX6, MSR_UNC_C6_PMON_CTL2, MSR_UNC_C6_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX6C3", PMC39, CBOX6, MSR_UNC_C6_PMON_CTL3, MSR_UNC_C6_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX7C0", PMC40, CBOX7, MSR_UNC_C7_PMON_CTL0, MSR_UNC_C7_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX7C1", PMC41, CBOX7, MSR_UNC_C7_PMON_CTL1, MSR_UNC_C7_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX7C2", PMC42, CBOX7, MSR_UNC_C7_PMON_CTL2, MSR_UNC_C7_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX7C3", PMC43, CBOX7, MSR_UNC_C7_PMON_CTL3, MSR_UNC_C7_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX8C0", PMC44, CBOX8, MSR_UNC_C8_PMON_CTL0, MSR_UNC_C8_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX8C1", PMC45, CBOX8, MSR_UNC_C8_PMON_CTL1, MSR_UNC_C8_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX8C2", PMC46, CBOX8, MSR_UNC_C8_PMON_CTL2, MSR_UNC_C8_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX8C3", PMC47, CBOX8, MSR_UNC_C8_PMON_CTL3, MSR_UNC_C8_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX9C0", PMC48, CBOX9, MSR_UNC_C9_PMON_CTL0, MSR_UNC_C9_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX9C1", PMC49, CBOX9, MSR_UNC_C9_PMON_CTL1, MSR_UNC_C9_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX9C2", PMC50, CBOX9, MSR_UNC_C9_PMON_CTL2, MSR_UNC_C9_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX9C3", PMC51, CBOX9, MSR_UNC_C9_PMON_CTL3, MSR_UNC_C9_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX10C0", PMC52, CBOX10, MSR_UNC_C10_PMON_CTL0, MSR_UNC_C10_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX10C1", PMC53, CBOX10, MSR_UNC_C10_PMON_CTL1, MSR_UNC_C10_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX10C2", PMC54, CBOX10, MSR_UNC_C10_PMON_CTL2, MSR_UNC_C10_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX10C3", PMC55, CBOX10, MSR_UNC_C10_PMON_CTL3, MSR_UNC_C10_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX11C0", PMC56, CBOX11, MSR_UNC_C11_PMON_CTL0, MSR_UNC_C11_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX11C1", PMC57, CBOX11, MSR_UNC_C11_PMON_CTL1, MSR_UNC_C11_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX11C2", PMC58, CBOX11, MSR_UNC_C11_PMON_CTL2, MSR_UNC_C11_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX11C3", PMC59, CBOX11, MSR_UNC_C11_PMON_CTL3, MSR_UNC_C11_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX12C0", PMC60, CBOX12, MSR_UNC_C12_PMON_CTL0, MSR_UNC_C12_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX12C1", PMC61, CBOX12, MSR_UNC_C12_PMON_CTL1, MSR_UNC_C12_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX12C2", PMC62, CBOX12, MSR_UNC_C12_PMON_CTL2, MSR_UNC_C12_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX12C3", PMC63, CBOX12, MSR_UNC_C12_PMON_CTL3, MSR_UNC_C12_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX13C0", PMC64, CBOX13, MSR_UNC_C13_PMON_CTL0, MSR_UNC_C13_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX13C1", PMC65, CBOX13, MSR_UNC_C13_PMON_CTL1, MSR_UNC_C13_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX13C2", PMC66, CBOX13, MSR_UNC_C13_PMON_CTL2, MSR_UNC_C13_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX13C3", PMC67, CBOX13, MSR_UNC_C13_PMON_CTL3, MSR_UNC_C13_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX14C0", PMC68, CBOX14, MSR_UNC_C14_PMON_CTL0, MSR_UNC_C14_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX14C1", PMC69, CBOX14, MSR_UNC_C14_PMON_CTL1, MSR_UNC_C14_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX14C2", PMC70, CBOX14, MSR_UNC_C14_PMON_CTL2, MSR_UNC_C14_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ {"CBOX14C3", PMC71, CBOX14, MSR_UNC_C14_PMON_CTL3, MSR_UNC_C14_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+ /* Uncore management Counters: 2 48bit wide counters */
+ {"UBOX0", PMC72, UBOX, MSR_UNC_U_PMON_CTL0, MSR_UNC_U_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_UBOX},
+ {"UBOX1", PMC73, UBOX, MSR_UNC_U_PMON_CTL1, MSR_UNC_U_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_UBOX},
+ {"UBOXFIX", PMC74, UBOXFIX, MSR_UNC_U_UCLK_FIXED_CTL, MSR_UNC_U_UCLK_FIXED_CTR, 0, 0, 0},
+ /* PCU Counters: 4 48bit wide counters */
+ {"WBOX0", PMC75, WBOX, MSR_UNC_PCU_PMON_CTL0, MSR_UNC_PCU_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_WBOX},
+ {"WBOX1", PMC76, WBOX, MSR_UNC_PCU_PMON_CTL1, MSR_UNC_PCU_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_WBOX},
+ {"WBOX2", PMC77, WBOX, MSR_UNC_PCU_PMON_CTL2, MSR_UNC_PCU_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_WBOX},
+ {"WBOX3", PMC78, WBOX, MSR_UNC_PCU_PMON_CTL3, MSR_UNC_PCU_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_WBOX},
+ {"WBOX0FIX", PMC79, WBOX0FIX, 0, MSR_UNC_PCU_PMON_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"WBOX1FIX", PMC80, WBOX1FIX, 0, MSR_UNC_PCU_PMON_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+ /* IMC Counters: 4 48bit wide per memory channel, split in two reads */
+ {"MBOX0C0",PMC81, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_0, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX0C1",PMC82, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_0, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX0C2",PMC83, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_0, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX0C3",PMC84, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_0, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX0FIX",PMC85, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+ {"MBOX1C0",PMC86, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_1, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX1C1",PMC87, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_1, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX1C2",PMC88, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_1, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX1C3",PMC89, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_1, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX1FIX",PMC90, MBOX1FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_1, EVENT_OPTION_NONE_MASK},
+ {"MBOX2C0",PMC91, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_2, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX2C1",PMC92, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_2, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX2C2",PMC93, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_2, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX2C3",PMC94, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_2, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX2FIX",PMC95, MBOX2FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_2, EVENT_OPTION_NONE_MASK},
+ {"MBOX3C0",PMC96, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_3, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX3C1",PMC97, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_3, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX3C2",PMC98, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_3, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX3C3",PMC99, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_3, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX3FIX",PMC100, MBOX3FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_3, EVENT_OPTION_NONE_MASK},
+ {"MBOX4C0",PMC101, MBOX4, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_0, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX4C1",PMC102, MBOX4, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_0, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX4C2",PMC103, MBOX4, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_0, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX4C3",PMC104, MBOX4, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_0, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX4FIX",PMC105, MBOX4FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_0, EVENT_OPTION_NONE_MASK},
+ {"MBOX5C0",PMC106, MBOX5, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_1, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX5C1",PMC107, MBOX5, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_1, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX5C2",PMC108, MBOX5, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_1, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX5C3",PMC109, MBOX5, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_1, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX5FIX",PMC110, MBOX5FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_1, EVENT_OPTION_NONE_MASK},
+ {"MBOX6C0",PMC111, MBOX6, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_2, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX6C1",PMC112, MBOX6, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_2, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX6C2",PMC113, MBOX6, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_2, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX6C3",PMC114, MBOX6, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_2, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX6FIX",PMC115, MBOX6FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_2, EVENT_OPTION_NONE_MASK},
+ {"MBOX7C0",PMC116, MBOX7, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_3, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX7C1",PMC117, MBOX7, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_3, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX7C2",PMC118, MBOX7, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_3, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX7C3",PMC119, MBOX7, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_3, IVBEP_VALID_OPTIONS_MBOX},
+ {"MBOX7FIX",PMC120, MBOX7FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_3, EVENT_OPTION_NONE_MASK},
+ /* QPI counters: four 48bit wide per port, split in two reads */
+ {"SBOX0C0",PMC121, SBOX0, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_0, IVBEP_VALID_OPTIONS_SBOX},
+ {"SBOX0C1",PMC122, SBOX0, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_0, IVBEP_VALID_OPTIONS_SBOX},
+ {"SBOX0C2",PMC123, SBOX0, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_0, IVBEP_VALID_OPTIONS_SBOX},
+ {"SBOX0C3",PMC124, SBOX0, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_0, IVBEP_VALID_OPTIONS_SBOX},
+ {"SBOX1C0",PMC125, SBOX1, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_1, IVBEP_VALID_OPTIONS_SBOX},
+ {"SBOX1C1",PMC126, SBOX1, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_1, IVBEP_VALID_OPTIONS_SBOX},
+ {"SBOX1C2",PMC127, SBOX1, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_1, IVBEP_VALID_OPTIONS_SBOX},
+ {"SBOX1C3",PMC128, SBOX1, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_1, IVBEP_VALID_OPTIONS_SBOX},
+ {"SBOX2C0",PMC129, SBOX2, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_2, IVBEP_VALID_OPTIONS_SBOX},
+ {"SBOX2C1",PMC130, SBOX2, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_2, IVBEP_VALID_OPTIONS_SBOX},
+ {"SBOX2C2",PMC131, SBOX2, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_2, IVBEP_VALID_OPTIONS_SBOX},
+ {"SBOX2C3",PMC132, SBOX2, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_2, IVBEP_VALID_OPTIONS_SBOX},
+ {"SBOX0FIX",PMC133, SBOX0FIX, 0, PCI_UNC_QPI_RATE_STATUS, 0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+ {"SBOX1FIX",PMC134, SBOX1FIX, 0, PCI_UNC_QPI_RATE_STATUS, 0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+ {"SBOX2FIX",PMC135, SBOX2FIX, 0, PCI_UNC_QPI_RATE_STATUS, 0, PCI_QPI_MISC_DEVICE_PORT_2, EVENT_OPTION_NONE_MASK},
+ /* HA counters: four 48bit wide per HA unit, split in two reads */
+ {"BBOX0C0", PMC136, BBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_0, IVBEP_VALID_OPTIONS_BBOX},
+ {"BBOX0C1", PMC137, BBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_0, IVBEP_VALID_OPTIONS_BBOX},
+ {"BBOX0C2", PMC138, BBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_0, IVBEP_VALID_OPTIONS_BBOX},
+ {"BBOX0C3", PMC139, BBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_0, IVBEP_VALID_OPTIONS_BBOX},
+ {"BBOX1C0", PMC140, BBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_1, IVBEP_VALID_OPTIONS_BBOX},
+ {"BBOX1C1", PMC141, BBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_1, IVBEP_VALID_OPTIONS_BBOX},
+ {"BBOX1C2", PMC142, BBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_1, IVBEP_VALID_OPTIONS_BBOX},
+ {"BBOX1C3", PMC143, BBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_1, IVBEP_VALID_OPTIONS_BBOX},
+ /* R2PCIe counters: four 44bit wide per unit, split in two reads */
+ {"PBOX0", PMC144, PBOX, PCI_UNC_R2PCIE_PMON_CTL_0, PCI_UNC_R2PCIE_PMON_CTR_0_A, PCI_UNC_R2PCIE_PMON_CTR_0_B, PCI_R2PCIE_DEVICE, IVBEP_VALID_OPTIONS_PBOX},
+ {"PBOX1", PMC145, PBOX, PCI_UNC_R2PCIE_PMON_CTL_1, PCI_UNC_R2PCIE_PMON_CTR_1_A, PCI_UNC_R2PCIE_PMON_CTR_1_B, PCI_R2PCIE_DEVICE, IVBEP_VALID_OPTIONS_PBOX},
+ {"PBOX2", PMC146, PBOX, PCI_UNC_R2PCIE_PMON_CTL_2, PCI_UNC_R2PCIE_PMON_CTR_2_A, PCI_UNC_R2PCIE_PMON_CTR_2_B, PCI_R2PCIE_DEVICE, IVBEP_VALID_OPTIONS_PBOX},
+ {"PBOX3", PMC147, PBOX, PCI_UNC_R2PCIE_PMON_CTL_3, PCI_UNC_R2PCIE_PMON_CTR_3_A, PCI_UNC_R2PCIE_PMON_CTR_3_B, PCI_R2PCIE_DEVICE, IVBEP_VALID_OPTIONS_PBOX},
+ /* R3QPI counters: three 44bit wide per link, split in two reads */
+ {"RBOX0C0", PMC148, RBOX0, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_0, IVBEP_VALID_OPTIONS_RBOX},
+ {"RBOX0C1", PMC149, RBOX0, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_0, IVBEP_VALID_OPTIONS_RBOX},
+ {"RBOX0C2", PMC150, RBOX0, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_0, IVBEP_VALID_OPTIONS_RBOX},
+ {"RBOX1C0", PMC151, RBOX1, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_1, IVBEP_VALID_OPTIONS_RBOX},
+ {"RBOX1C1", PMC152, RBOX1, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_1, IVBEP_VALID_OPTIONS_RBOX},
+ {"RBOX1C2", PMC153, RBOX1, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_1, IVBEP_VALID_OPTIONS_RBOX},
+ {"RBOX2C0", PMC154, RBOX2, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_2, IVBEP_VALID_OPTIONS_RBOX},
+ {"RBOX2C1", PMC155, RBOX2, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_2, IVBEP_VALID_OPTIONS_RBOX},
+ {"RBOX2C2", PMC156, RBOX2, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_2, IVBEP_VALID_OPTIONS_RBOX},
+ /* IRP counters: four 44bit wide in total, two per IBOX */
+ {"IBOX0C0", PMC157, IBOX0, PCI_UNC_IRP0_PMON_CTL_0, PCI_UNC_IRP0_PMON_CTR_0, 0, PCI_IRP_DEVICE, IVBEP_VALID_OPTIONS_IBOX},
+ {"IBOX0C1", PMC158, IBOX0, PCI_UNC_IRP0_PMON_CTL_1, PCI_UNC_IRP0_PMON_CTR_1, 0, PCI_IRP_DEVICE, IVBEP_VALID_OPTIONS_IBOX},
+ {"IBOX1C0", PMC159, IBOX1, PCI_UNC_IRP1_PMON_CTL_0, PCI_UNC_IRP1_PMON_CTR_0, 0, PCI_IRP_DEVICE, IVBEP_VALID_OPTIONS_IBOX},
+ {"IBOX1C1", PMC160, IBOX1, PCI_UNC_IRP1_PMON_CTL_1, PCI_UNC_IRP1_PMON_CTR_1, 0, PCI_IRP_DEVICE, IVBEP_VALID_OPTIONS_IBOX},
+};
+
+static BoxMap ivybridgeEP_box_map[NUM_UNITS] = {
+ [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+ [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+ [THERMAL] = {0, 0, 0, 0, 0, MSR_DEV, 8},
+ [POWER] = {0, 0, 0, 0, 0, MSR_DEV, 32},
+ [MBOX0] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 20, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+ [MBOX0FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 20, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+ [MBOX1] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 20, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+ [MBOX1FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 20, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+ [MBOX2] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 20, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+ [MBOX2FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 20, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+ [MBOX3] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 20, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+ [MBOX3FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 20, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+ [MBOX4] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 21, 1, PCI_IMC_DEVICE_1_CH_0, 48},
+ [MBOX4FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 21, 1, PCI_IMC_DEVICE_1_CH_0, 48},
+ [MBOX5] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 21, 1, PCI_IMC_DEVICE_1_CH_1, 48},
+ [MBOX5FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 21, 1, PCI_IMC_DEVICE_1_CH_1, 48},
+ [MBOX6] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 21, 1, PCI_IMC_DEVICE_1_CH_2, 48},
+ [MBOX6FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 21, 1, PCI_IMC_DEVICE_1_CH_2, 48},
+ [MBOX7] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 21, 1, PCI_IMC_DEVICE_1_CH_3, 48},
+ [MBOX7FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 21, 1, PCI_IMC_DEVICE_1_CH_3, 48},
+ [CBOX0] = {MSR_UNC_C0_PMON_BOX_CTL, 0, 0, 3, 0, 0, 44, MSR_UNC_C0_PMON_BOX_FILTER, MSR_UNC_C0_PMON_BOX_FILTER1},
+ [CBOX1] = {MSR_UNC_C1_PMON_BOX_CTL, 0, 0, 4, 0, 0, 44, MSR_UNC_C1_PMON_BOX_FILTER, MSR_UNC_C1_PMON_BOX_FILTER1},
+ [CBOX2] = {MSR_UNC_C2_PMON_BOX_CTL, 0, 0, 5, 0, 0, 44, MSR_UNC_C2_PMON_BOX_FILTER, MSR_UNC_C2_PMON_BOX_FILTER1},
+ [CBOX3] = {MSR_UNC_C3_PMON_BOX_CTL, 0, 0, 6, 0, 0, 44, MSR_UNC_C3_PMON_BOX_FILTER, MSR_UNC_C3_PMON_BOX_FILTER1},
+ [CBOX4] = {MSR_UNC_C4_PMON_BOX_CTL, 0, 0, 7, 0, 0, 44, MSR_UNC_C4_PMON_BOX_FILTER, MSR_UNC_C4_PMON_BOX_FILTER1},
+ [CBOX5] = {MSR_UNC_C5_PMON_BOX_CTL, 0, 0, 8, 0, 0, 44, MSR_UNC_C5_PMON_BOX_FILTER, MSR_UNC_C5_PMON_BOX_FILTER1},
+ [CBOX6] = {MSR_UNC_C6_PMON_BOX_CTL, 0, 0, 9, 0, 0, 44, MSR_UNC_C6_PMON_BOX_FILTER, MSR_UNC_C6_PMON_BOX_FILTER1},
+ [CBOX7] = {MSR_UNC_C7_PMON_BOX_CTL, 0, 0, 10, 0, 0, 44, MSR_UNC_C7_PMON_BOX_FILTER, MSR_UNC_C7_PMON_BOX_FILTER1},
+ [CBOX8] = {MSR_UNC_C8_PMON_BOX_CTL, 0, 0, 11, 0, 0, 44, MSR_UNC_C8_PMON_BOX_FILTER, MSR_UNC_C8_PMON_BOX_FILTER1},
+ [CBOX9] = {MSR_UNC_C9_PMON_BOX_CTL, 0, 0, 12, 0, 0, 44, MSR_UNC_C9_PMON_BOX_FILTER, MSR_UNC_C9_PMON_BOX_FILTER1},
+ [CBOX10] = {MSR_UNC_C10_PMON_BOX_CTL, 0, 0, 13, 0, 0, 44, MSR_UNC_C10_PMON_BOX_FILTER, MSR_UNC_C10_PMON_BOX_FILTER1},
+ [CBOX11] = {MSR_UNC_C11_PMON_BOX_CTL, 0, 0, 14, 0, 0, 44, MSR_UNC_C11_PMON_BOX_FILTER, MSR_UNC_C11_PMON_BOX_FILTER1},
+ [CBOX12] = {MSR_UNC_C12_PMON_BOX_CTL, 0, 0, 15, 0, 0, 44, MSR_UNC_C12_PMON_BOX_FILTER, MSR_UNC_C12_PMON_BOX_FILTER1},
+ [CBOX13] = {MSR_UNC_C13_PMON_BOX_CTL, 0, 0, 16, 0, 0, 44, MSR_UNC_C13_PMON_BOX_FILTER, MSR_UNC_C13_PMON_BOX_FILTER1},
+ [CBOX14] = {MSR_UNC_C14_PMON_BOX_CTL, 0, 0, 17, 0, 0, 44, MSR_UNC_C14_PMON_BOX_FILTER, MSR_UNC_C14_PMON_BOX_FILTER1},
+ [BBOX0] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 18, 1, PCI_HA_DEVICE_0, 48},
+ [BBOX1] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 19, 1, PCI_HA_DEVICE_1, 48},
+ [SBOX0] = {PCI_UNC_QPI_PMON_BOX_CTL, PCI_UNC_QPI_PMON_BOX_STATUS, PCI_UNC_QPI_PMON_BOX_STATUS, 22, 1, PCI_QPI_DEVICE_PORT_0, 48},
+ [SBOX1] = {PCI_UNC_QPI_PMON_BOX_CTL, PCI_UNC_QPI_PMON_BOX_STATUS, PCI_UNC_QPI_PMON_BOX_STATUS, 23, 1, PCI_QPI_DEVICE_PORT_1, 48},
+ [SBOX2] = {PCI_UNC_QPI_PMON_BOX_CTL, PCI_UNC_QPI_PMON_BOX_STATUS, PCI_UNC_QPI_PMON_BOX_STATUS, -1, 1, PCI_QPI_DEVICE_PORT_2, 48},
+ [SBOX0FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_0, 64},
+ [SBOX1FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_0, 64},
+ [SBOX2FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_2, 64},
+ [WBOX] = {MSR_UNC_PCU_PMON_BOX_CTL, MSR_UNC_PCU_PMON_BOX_STATUS, MSR_UNC_PCU_PMON_BOX_STATUS, 2, 0, 0, 48, MSR_UNC_PCU_PMON_BOX_FILTER},
+ [WBOX0FIX] = {0, 0, 0, 0, 0, 0, 64},
+ [WBOX1FIX] = {0, 0, 0, 0, 0, 0, 64},
+ [UBOX] = {0, MSR_UNC_U_PMON_BOX_STATUS, MSR_UNC_U_PMON_BOX_STATUS, 1, 0, 0, 44},
+ [UBOXFIX] = {0, MSR_UNC_U_PMON_BOX_STATUS, MSR_UNC_U_PMON_BOX_STATUS, 0, 0, 0, 44},
+ [PBOX] = {PCI_UNC_R2PCIE_PMON_BOX_CTL, PCI_UNC_R2PCIE_PMON_BOX_STATUS, PCI_UNC_R2PCIE_PMON_BOX_STATUS, 26, 1,PCI_R2PCIE_DEVICE, 44},
+ [RBOX0] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 24, 1,PCI_R3QPI_DEVICE_LINK_0, 44},
+ [RBOX1] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 25, 1,PCI_R3QPI_DEVICE_LINK_1, 44},
+ [RBOX2] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, -1, 1,PCI_R3QPI_DEVICE_LINK_2, 44},
+ [IBOX0] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, -1, 1, PCI_IRP_DEVICE, 44},
+ [IBOX1] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, -1, 1, PCI_IRP_DEVICE, 44},
+};
+
+static PciDevice ivybridgeEP_pci_devices[MAX_NUM_PCI_DEVICES] = {
+ [MSR_DEV] = {NODEVTYPE, "", "", ""},
+ [PCI_R3QPI_DEVICE_LINK_0] = {R3QPI, "13.5", "PCI_R3QPI_DEVICE_LINK_0", "RBOX0", 0x0e36},
+ [PCI_R3QPI_DEVICE_LINK_1] = {R3QPI, "13.6", "PCI_R3QPI_DEVICE_LINK_1", "RBOX1", 0x0e37},
+ [PCI_R3QPI_DEVICE_LINK_2] = {R3QPI, "12.5", "PCI_R3QPI_DEVICE_LINK_2", "RBOX2", 0x0e3e},
+ [PCI_R2PCIE_DEVICE] = {R2PCIE, "13.1", "PCI_R2PCIE_DEVICE", "PBOX0", 0x0e34},
+ [PCI_IMC_DEVICE_0_CH_0] = {IMC, "10.4", "PCI_IMC_DEVICE_0_CH_0", "MBOX0", 0x0eb4},
+ [PCI_IMC_DEVICE_0_CH_1] = {IMC, "10.5", "PCI_IMC_DEVICE_0_CH_1", "MBOX1", 0x0eb5},
+ [PCI_IMC_DEVICE_0_CH_2] = {IMC, "10.0", "PCI_IMC_DEVICE_0_CH_2", "MBOX2", 0x0eb0},
+ [PCI_IMC_DEVICE_0_CH_3] = {IMC, "10.1", "PCI_IMC_DEVICE_0_CH_3", "MBOX3", 0x0eb1},
+ [PCI_HA_DEVICE_0] = {HA, "0e.1", "PCI_HA_DEVICE_0", "BBOX0", 0x0e30},
+ [PCI_HA_DEVICE_1] = {HA, "1c.1", "PCI_HA_DEVICE_1", "BBOX1", 0x0e38},
+ [PCI_IMC_DEVICE_1_CH_0] = {IMC, "1e.4", "PCI_IMC_DEVICE_1_CH_0", "MBOX4", 0x0ef4},
+ [PCI_IMC_DEVICE_1_CH_1] = {IMC, "1e.5", "PCI_IMC_DEVICE_1_CH_1", "MBOX5", 0x0ef5},
+ [PCI_IMC_DEVICE_1_CH_2] = {IMC, "1e.0", "PCI_IMC_DEVICE_1_CH_2", "MBOX6", 0x0ef0},
+ [PCI_IMC_DEVICE_1_CH_3] = {IMC, "1e.1", "PCI_IMC_DEVICE_1_CH_3", "MBOX7", 0x0ef1},
+ [PCI_IRP_DEVICE] = {IRP, "05.6", "PCI_IRP_DEVICE", NULL, 0x0e39},
+ [PCI_QPI_DEVICE_PORT_0] = {QPI, "08.2", "PCI_QPI_DEVICE_PORT_0", "SBOX0", 0x0e32},
+ [PCI_QPI_DEVICE_PORT_1] = {QPI, "09.2", "PCI_QPI_DEVICE_PORT_1", "SBOX1", 0x0e33},
+ [PCI_QPI_DEVICE_PORT_2] = {QPI, "0a.2", "PCI_QPI_DEVICE_PORT_2", "SBOX2", 0x0e3a},
+ [PCI_QPI_MASK_DEVICE_PORT_0] = {QPI, "08.6", "PCI_QPI_MASK_DEVICE_PORT_0", NULL, 0x0e86},
+ [PCI_QPI_MASK_DEVICE_PORT_1] = {QPI, "09.6", "PCI_QPI_MASK_DEVICE_PORT_1", NULL, 0x0e96},
+ [PCI_QPI_MASK_DEVICE_PORT_2] = {QPI, "0a.6", "PCI_QPI_MASK_DEVICE_PORT_2", NULL, 0x0ec6},
+ [PCI_QPI_MISC_DEVICE_PORT_0] = {QPI, "08.0", "PCI_QPI_MISC_DEVICE_PORT_0/1", "SBOX01FIX",0x0e80},
+ [PCI_QPI_MISC_DEVICE_PORT_2] = {QPI, "0a.0", "PCI_QPI_MISC_DEVICE_PORT_2", "SBOX2FIX", 0x0ec0},
+};
diff --git a/src/includes/perfmon_ivybridgeEP_events.txt b/src/includes/perfmon_ivybridgeEP_events.txt
new file mode 100644
index 0000000..e71e1cf
--- /dev/null
+++ b/src/includes/perfmon_ivybridgeEP_events.txt
@@ -0,0 +1,2072 @@
+# =======================================================================================
+#
+# Filename: perfmon_ivybridgeEP_events.txt
+#
+# Description: Event list for Intel Ivy Bridge EP/EN/EX
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
+# Project: likwid
+#
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, either version 3 of the License, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE 0x00 TMP0
+UMASK_TEMP_CORE 0x00
+
+EVENT_PWR_PKG_ENERGY 0x00 PWR0
+UMASK_PWR_PKG_ENERGY 0x00
+
+EVENT_PWR_PP0_ENERGY 0x00 PWR1
+UMASK_PWR_PP0_ENERGY 0x00
+
+EVENT_PWR_DRAM_ENERGY 0x00 PWR3
+UMASK_PWR_DRAM_ENERGY 0x00
+
+EVENT_INSTR_RETIRED 0x00 FIXC0
+UMASK_INSTR_RETIRED_ANY 0x00
+
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE 0x00
+
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC2
+UMASK_CPU_CLK_UNHALTED_REF 0x00
+
+EVENT_LD_BLOCKS 0x03 PMC
+UMASK_LD_BLOCKS_STORE_FORWARD 0x02
+UMASK_LD_BLOCKS_NO_SR 0x08
+
+EVENT_MISALIGN_MEM_REF 0x05 PMC
+UMASK_MISALIGN_MEM_REF_LOADS 0x01
+UMASK_MISALIGN_MEM_REF_STORES 0x02
+UMASK_MISALIGN_MEM_REF_ANY 0x03
+
+EVENT_LD_BLOCKS_PARTIAL 0x07 PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS 0x01
+
+EVENT_DTLB_LOAD_MISSES 0x08 PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK 0x81
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED 0x82
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION 0x84
+
+EVENT_INT_MISC 0x0D PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_INT_MISC_RECOVERY_CYCLES 0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_RECOVERY_COUNT 0x03
+
+EVENT_UOPS_ISSUED 0x0E PMC
+UMASK_UOPS_ISSUED_ANY 0x01
+UMASK_UOPS_ISSUED_FLAGS_MERGE 0x10
+UMASK_UOPS_ISSUED_SLOW_LEA 0x20
+UMASK_UOPS_ISSUED_SINGLE_MUL 0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ANY 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_FLAGS_MERGE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_FLAGS_MERGE 0x10
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SLOW_LEA EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SLOW_LEA 0x20
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SINGLE_MUL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SINGLE_MUL 0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_FP_COMP_OPS_EXE 0x10 PMC
+UMASK_FP_COMP_OPS_EXE_X87 0x01
+UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE 0x10
+UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE 0x20
+UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE 0x40
+UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE 0x80
+
+EVENT_SIMD_FP_256_PACKED 0x11 PMC
+UMASK_SIMD_FP_256_PACKED_SINGLE 0x01
+UMASK_SIMD_FP_256_PACKED_DOUBLE 0x02
+
+EVENT_ARITH 0x14 PMC
+UMASK_ARITH_FPU_DIV_ACTIVE 0x01
+DEFAULT_OPTIONS_ARITH_NUM_DIV EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_ARITH_NUM_DIV 0x01
+
+EVENT_L2_RQSTS 0x24 PMC
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_HIT 0x01
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD 0x03
+UMASK_L2_RQSTS_RFO_HITS 0x04
+UMASK_L2_RQSTS_RFO_MISS 0x08
+UMASK_L2_RQSTS_RFO_ANY 0x0C
+UMASK_L2_RQSTS_CODE_RD_HITS 0x10
+UMASK_L2_RQSTS_CODE_RD_MISS 0x20
+UMASK_L2_RQSTS_ALL_CODE_CODE_RD 0x30
+UMASK_L2_RQSTS_PF_HIT 0x40
+UMASK_L2_RQSTS_PF_MISS 0x80
+UMASK_L2_RQSTS_ALL_PF 0xC0
+UMASK_L2_RQSTS_MISS 0xAA
+
+EVENT_L2_STORE_LOCK_RQSTS 0x27 PMC
+UMASK_L2_STORE_LOCK_RQSTS_MISS 0x01
+UMASK_L2_STORE_LOCK_RQSTS_HIT_M 0x08
+UMASK_L2_STORE_LOCK_RQSTS_ALL 0x0F
+
+EVENT_L1D_WB_RQST 0x28 PMC
+UMASK_L1D_WB_RQST_HIT_E 0x04
+UMASK_L1D_WB_RQST_HIT_M 0x08
+UMASK_L1D_WB_RQST_ALL 0x0F
+
+EVENT_L3_LAT_CACHE 0x2E PMC
+UMASK_L3_LAT_CACHE_REFERENCE 0x4F
+UMASK_L3_LAT_CACHE_MISS 0x41
+
+EVENT_CPU_CLOCK_UNHALTED 0x3C PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P 0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_P 0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES 0x00
+
+EVENT_L1D_PEND_MISS 0x48 PMC1
+UMASK_L1D_PEND_MISS_PENDING 0x01
+
+EVENT_DTLB_STORE_MISSES 0x49 PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK 0x01
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED 0x02
+UMASK_DTLB_STORE_MISSES_WALK_DURATION 0x04
+UMASK_DTLB_STORE_MISSES_STLB_HIT 0x10
+
+EVENT_LOAD_HIT_PRE 0x4C PMC
+UMASK_LOAD_HIT_PRE_SW_PF 0x01
+UMASK_LOAD_HIT_PRE_HW_PF 0x02
+
+EVENT_L1D 0x51 PMC
+UMASK_L1D_REPLACEMENT 0x01
+UMASK_L1D_ALLOCATED_IN_M 0x02
+UMASK_L1D_M_EVICT 0x04
+UMASK_L1D_ALL_M_REPLACEMENT 0x08
+
+EVENT_MOVE_ELIMINATION 0x58 PMC
+UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED 0x04
+UMASK_MOVE_ELIMINATION_SIMD_NOT_ELIMINATED 0x08
+UMASK_MOVE_ELIMINATION_INT_ELIMINATED 0x01
+UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED 0x02
+
+EVENT_CPL_CYCLES 0x5C PMC
+UMASK_CPL_CYCLES_RING0 0x01
+UMASK_CPL_CYCLES_RING123 0x02
+
+EVENT_RS_EVENTS 0x5E PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES 0x01
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING 0x60 PMC
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD 0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD 0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO 0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD 0x08
+
+EVENT_CACHE_LOCK_CYCLES 0x63 PMC
+UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
+DEFAULT_OPTIONS_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_COUNT EVENT_OPTION_EDGE=1
+UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_COUNT 0x01
+UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION 0x02
+DEFAULT_OPTIONS_CACHE_LOCK_CYCLES_CACHE_LOCK_COUNT EVENT_OPTION_EDGE=1
+UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_COUNT 0x02
+
+EVENT_IDQ 0x79 PMC
+UMASK_IDQ_EMPTY 0x02
+UMASK_IDQ_MITE_UOPS 0x04
+UMASK_IDQ_DSB_UOPS 0x08
+UMASK_IDQ_MS_DSB_UOPS 0x10
+UMASK_IDQ_MS_MITE_UOPS 0x20
+UMASK_IDQ_MS_UOPS 0x30
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS 0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS 0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS 0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS 0x24
+UMASK_IDQ_ALL_MITE_ALL_UOPS 0x3C
+
+EVENT_ICACHE 0x80 PMC
+UMASK_ICACHE_HITS 0x01
+UMASK_ICACHE_MISSES 0x02
+UMASK_ICACHE_ACCESSES 0x03
+UMASK_ICACHE_IFETCH_STALL 0x04
+
+EVENT_ITLB_MISSES 0x85 PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK 0x01
+UMASK_ITLB_MISSES_WALK_COMPLETED 0x02
+UMASK_ITLB_MISSES_WALK_DURATION 0x04
+UMASK_ITLB_MISSES_STLB_HIT 0x10
+
+EVENT_ILD_STALL 0x87 PMC
+UMASK_ILD_STALL_LCP 0x01
+UMASK_ILD_STALL_IQ_FULL 0x04
+
+EVENT_BR_INST_EXEC 0x88 PMC
+UMASK_BR_INST_EXEC_COND_TAKEN 0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN 0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN 0x82
+UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN 0x42
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN 0x44
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN 0x88
+UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN 0x48
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN 0x50
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
+UMASK_BR_INST_EXEC_ALL_BRANCHES 0xFF
+
+EVENT_BR_MISP_EXEC 0x89 PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN 0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN 0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN 0x44
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN 0x88
+UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN 0x48
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN 0x50
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
+UMASK_BR_MISP_EXEC_ALL_BRANCHES 0xFF
+
+EVENT_IDQ_UOPS_NOT_DELIVERED 0x9C PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x3
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x2
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK 0x01
+
+EVENT_UOPS_DISPATCHED_PORT 0xA1 PMC
+UMASK_UOPS_DISPATCHED_PORT_PORT_0 0x01
+UMASK_UOPS_DISPATCHED_PORT_PORT_1 0x02
+UMASK_UOPS_DISPATCHED_PORT_PORT_2_LD 0x04
+UMASK_UOPS_DISPATCHED_PORT_PORT_2_STA 0x08
+UMASK_UOPS_DISPATCHED_PORT_PORT_2 0x0C
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_LD 0x10
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_STA 0x20
+UMASK_UOPS_DISPATCHED_PORT_PORT_3 0x30
+UMASK_UOPS_DISPATCHED_PORT_PORT_4 0x40
+UMASK_UOPS_DISPATCHED_PORT_PORT_5 0x80
+DEFAULT_OPTIONS_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE 0x83
+UMASK_UOPS_DISPATCHED_PORT_DATA_PORTS 0x7C
+UMASK_UOPS_DISPATCHED_PORT_ALL_PORTS 0xFF
+
+EVENT_RESOURCE_STALLS 0xA2 PMC
+UMASK_RESOURCE_STALLS_ANY 0x01
+UMASK_RESOURCE_STALLS_RS 0x04
+UMASK_RESOURCE_STALLS_SB 0x08
+UMASK_RESOURCE_STALLS_ROB 0x10
+
+EVENT_CYCLE_ACTIVITY 0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_L2_PENDING EVENT_OPTION_THRESHOLD=0x01
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_PENDING 0x01
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_LDM_PENDING EVENT_OPTION_THRESHOLD=0x02
+UMASK_CYCLE_ACTIVITY_CYCLES_LDM_PENDING 0x02
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_L1D_PENDING EVENT_OPTION_THRESHOLD=0x08
+UMASK_CYCLE_ACTIVITY_L1D_PENDING 0x08
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE EVENT_OPTION_THRESHOLD=0x04
+UMASK_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE 0x04
+
+EVENT_DSB2MITE_SWITCHES 0xAB PMC
+UMASK_DSB2MITE_SWITCHES_COUNT 0x01
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
+
+EVENT_DSB_FILL 0xAC PMC
+UMASK_DSB_FILL_EXCEED_DSB_LINES 0x08
+
+EVENT_ITLB 0xAE PMC
+UMASK_ITLB_ITLB_FLUSH 0x01
+
+EVENT_OFFCORE_REQUESTS 0xB0 PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD 0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD 0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO 0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD 0x08
+
+EVENT_UOPS_EXECUTED 0xB1 PMC
+UMASK_UOPS_EXECUTED_THREAD 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_TLB_FLUSH 0xBD PMC
+UMASK_TLB_FLUSH_DTLB_THREAD 0x01
+UMASK_TLB_FLUSH_STLB_ANY 0x20
+
+EVENT_INST_RETIRED 0xC0 PMC1
+UMASK_INST_RETIRED_ANY_P 0x00
+UMASK_INST_RETIRED_ALL 0x01
+
+EVENT_OTHER_ASSISTS 0xC1 PMC
+UMASK_OTHER_ASSISTS_AVX_STORE 0x08
+UMASK_OTHER_ASSISTS_AVX_TO_SSE 0x10
+UMASK_OTHER_ASSISTS_SSE_TO_AVX 0x20
+
+EVENT_UOPS_RETIRED 0xC2 PMC
+UMASK_UOPS_RETIRED_ALL 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL 0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_MACHINE_CLEARS 0xC3 PMC
+UMASK_MACHINE_CLEARS_CYCLES 0x01
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_MACHINE_CLEARS_COUNT 0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING 0x02
+UMASK_MACHINE_CLEARS_SMC 0x04
+UMASK_MACHINE_CLEARS_MASKMOV 0x20
+
+EVENT_BR_INST_RETIRED 0xC4 PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL 0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL 0x02
+UMASK_BR_INST_RETIRED_NEAR_RETURN 0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN 0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN 0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH 0x40
+
+EVENT_BR_MISP_RETIRED 0xC5 PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL 0x01
+UMASK_BR_MISP_RETIRED_NEAR_CALL 0x02
+UMASK_BR_MISP_RETIRED_NOT_TAKEN 0x10
+UMASK_BR_MISP_RETIRED_TAKEN 0x20
+
+EVENT_FP_ASSIST 0xCA PMC
+UMASK_FP_ASSIST_X87_OUTPUT 0x02
+UMASK_FP_ASSIST_X87_INPUT 0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT 0x08
+UMASK_FP_ASSIST_SIMD_INPUT 0x10
+UMASK_FP_ASSIST_ANY 0x1E
+
+EVENT_ROB_MISC_EVENT_LBR_INSERTS 0xCC PMC
+UMASK_ROB_MISC_EVENT_LBR_INSERTS 0x20
+
+EVENT_MEM_UOPS_RETIRED 0xD0 PMC
+UMASK_MEM_UOPS_RETIRED_LOADS 0x81
+UMASK_MEM_UOPS_RETIRED_STORES 0x82
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS 0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK 0x21
+UMASK_MEM_UOPS_RETIRED_STORES_LOCK 0x22
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT 0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT 0x42
+
+EVENT_MEM_LOAD_UOPS_RETIRED 0xD1 PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT 0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS 0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL 0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT 0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS 0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL 0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT 0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS 0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL 0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB 0x40
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL 0x7F
+
+EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED 0xD2 PMC
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS 0x01
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT 0x02
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM 0x04
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE 0x08
+
+EVENT_BACLEARS 0xE6 PMC
+UMASK_BACLEARS_ANY 0x1F
+
+EVENT_L2_TRANS 0xF0 PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO 0x02
+UMASK_L2_TRANS_CODE_RD 0x04
+UMASK_L2_TRANS_ALL_PREF 0x08
+UMASK_L2_TRANS_L1D_WB 0x10
+UMASK_L2_TRANS_L2_FILL 0x20
+UMASK_L2_TRANS_L2_WB 0x40
+UMASK_L2_TRANS_ALL_REQUESTS 0x80
+
+EVENT_L2_LINES_IN 0xF1 PMC
+UMASK_L2_LINES_IN_I 0x01
+UMASK_L2_LINES_IN_S 0x02
+UMASK_L2_LINES_IN_E 0x04
+UMASK_L2_LINES_IN_ALL 0x07
+
+EVENT_L2_LINES_OUT 0xF2 PMC
+UMASK_L2_LINES_OUT_DEMAND_CLEAN 0x01
+UMASK_L2_LINES_OUT_DEMAND_DIRTY 0x02
+UMASK_L2_LINES_OUT_PF_CLEAN 0x04
+UMASK_L2_LINES_OUT_PF_DIRTY 0x08
+UMASK_L2_LINES_OUT_DIRTY_ALL 0x0A
+UMASK_L2_LINES_OUT_CLEAN_ALL 0x05
+UMASK_L2_LINES_OUT_ALL 0x0F
+
+EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED 0xD3 PMC
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM 0x01
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_DRAM 0x0C
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM 0x10
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_FWD 0x20
+
+EVENT_OFFCORE_RESPONSE_0 0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x78040
+UMASK_OFFCORE_RESPONSE_0_LOCAL_DRAM 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x7FF80
+UMASK_OFFCORE_RESPONSE_0_REMOTE_DRAM 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1 0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x78040
+UMASK_OFFCORE_RESPONSE_1_LOCAL_DRAM 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x7FF80
+UMASK_OFFCORE_RESPONSE_1_REMOTE_DRAM 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_CBOX_CLOCKTICKS 0x00 CBOX
+UMASK_CBOX_CLOCKTICKS 0x00
+
+EVENT_COUNTER0_OCCUPANCY 0x1F CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1|CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX8C2|CBOX9C2|CBOX10C2|CBOX11C2|CBOX12C2|CBOX13C2|CBOX14C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3|CBOX8C3|CBOX9C3|CBOX10C3|CBOX11C3|CBOX12C3|CBOX13C3|CBOX14C3
+UMASK_COUNTER0_OCCUPANCY 0x00
+
+EVENT_LLC_LOOKUP 0x34 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+DEFAULT_OPTIONS_LLC_LOOKUP EVENT_OPTION_STATE=0x1F
+OPTIONS_LLC_LOOKUP_DATA_READ EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_DATA_READ 0x03
+OPTIONS_LLC_LOOKUP_WRITE EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_WRITE 0x05
+OPTIONS_LLC_LOOKUP_REMOTE_SNOOP EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_REMOTE_SNOOP 0x09
+OPTIONS_LLC_LOOKUP_ANY EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_ANY 0x11
+OPTIONS_LLC_LOOKUP_NID EVENT_OPTION_NID_MASK|EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_NID 0x41
+
+EVENT_LLC_VICTIMS 0x37 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_LLC_VICTIMS_M_STATE 0x01
+UMASK_LLC_VICTIMS_E_STATE 0x02
+UMASK_LLC_VICTIMS_S_STATE 0x04
+UMASK_LLC_VICTIMS_ANY 0x07
+UMASK_LLC_VICTIMS_MISS 0x08
+OPTIONS_LLC_VICTIMS_NID EVENT_OPTION_NID_MASK
+UMASK_LLC_VICTIMS_NID 0x40
+
+EVENT_CBO_MISC 0x39 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_CBO_MISC_RSPI_WAS_FSE 0x01
+UMASK_CBO_MISC_WC_ALIASING 0x02
+UMASK_CBO_MISC_STARTED 0x04
+UMASK_CBO_MISC_RFO_HIT_S 0x08
+
+EVENT_RING_AD_USED 0x1B CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX8C2|CBOX9C2|CBOX10C2|CBOX11C2|CBOX12C2|CBOX13C2|CBOX14C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3|CBOX8C3|CBOX9C3|CBOX10C3|CBOX11C3|CBOX12C3|CBOX13C3|CBOX14C3
+UMASK_RING_AD_USED_0_UP_EVEN 0x01
+UMASK_RING_AD_USED_0_UP_ODD 0x02
+UMASK_RING_AD_USED_0_DOWN_EVEN 0x04
+UMASK_RING_AD_USED_0_DOWN_ODD 0x08
+UMASK_RING_AD_USED_1_UP_EVEN 0x10
+UMASK_RING_AD_USED_1_UP_ODD 0x20
+UMASK_RING_AD_USED_1_DOWN_EVEN 0x40
+UMASK_RING_AD_USED_1_DOWN_ODD 0x80
+UMASK_RING_AD_USED_DOWN 0xCC
+UMASK_RING_AD_USED_UP 0x33
+UMASK_RING_AD_USED_ANY 0xFF
+
+EVENT_RING_AK_USED 0x1C CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX8C2|CBOX9C2|CBOX10C2|CBOX11C2|CBOX12C2|CBOX13C2|CBOX14C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3|CBOX8C3|CBOX9C3|CBOX10C3|CBOX11C3|CBOX12C3|CBOX13C3|CBOX14C3
+UMASK_RING_AK_USED_0_UP_EVEN 0x01
+UMASK_RING_AK_USED_0_UP_ODD 0x02
+UMASK_RING_AK_USED_0_DOWN_EVEN 0x04
+UMASK_RING_AK_USED_0_DOWN_ODD 0x08
+UMASK_RING_AK_USED_1_UP_EVEN 0x10
+UMASK_RING_AK_USED_1_UP_ODD 0x20
+UMASK_RING_AK_USED_1_DOWN_EVEN 0x40
+UMASK_RING_AK_USED_1_DOWN_ODD 0x80
+UMASK_RING_AK_USED_DOWN 0xCC
+UMASK_RING_AK_USED_UP 0x33
+UMASK_RING_AK_USED_ANY 0xFF
+
+EVENT_RING_BL_USED 0x1D CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX8C2|CBOX9C2|CBOX10C2|CBOX11C2|CBOX12C2|CBOX13C2|CBOX14C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3|CBOX8C3|CBOX9C3|CBOX10C3|CBOX11C3|CBOX12C3|CBOX13C3|CBOX14C3
+UMASK_RING_BL_USED_0_UP_EVEN 0x01
+UMASK_RING_BL_USED_0_UP_ODD 0x02
+UMASK_RING_BL_USED_0_DOWN_EVEN 0x04
+UMASK_RING_BL_USED_0_DOWN_ODD 0x08
+UMASK_RING_BL_USED_1_UP_EVEN 0x10
+UMASK_RING_BL_USED_1_UP_ODD 0x20
+UMASK_RING_BL_USED_1_DOWN_EVEN 0x40
+UMASK_RING_BL_USED_1_DOWN_ODD 0x80
+UMASK_RING_BL_USED_DOWN 0xCC
+UMASK_RING_BL_USED_UP 0x33
+UMASK_RING_BL_USED_ANY 0xFF
+
+EVENT_RING_BOUNCES 0x05 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RING_BOUNCES_AK_IRQ 0x02
+UMASK_RING_BOUNCES_AK_CORE 0x04
+UMASK_RING_BOUNCES_BL_CORE 0x08
+UMASK_RING_BOUNCES_IV_CORE 0x01
+
+EVENT_RING_IV_USED 0x1E CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX8C2|CBOX9C2|CBOX10C2|CBOX11C2|CBOX12C2|CBOX13C2|CBOX14C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3|CBOX8C3|CBOX9C3|CBOX10C3|CBOX11C3|CBOX12C3|CBOX13C3|CBOX14C3
+UMASK_RING_IV_USED_ANY 0xFF
+UMASK_RING_IV_USED_UP 0x33
+UMASK_RING_IV_USED_DOWN 0xCC
+
+EVENT_RING_SRC_THRTL 0x07 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RING_SRC_THRTL 0x00
+
+EVENT_RXR_EXT_STARVED 0x12 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RXR_EXT_STARVED_IRQ 0x01
+UMASK_RXR_EXT_STARVED_IPQ 0x02
+UMASK_RXR_EXT_STARVED_PRQ 0x04
+UMASK_RXR_EXT_STARVED_ISMQ_BIDS 0x08
+
+EVENT_RXR_INSERTS 0x13 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RXR_INSERTS_IRQ 0x01
+UMASK_RXR_INSERTS_IRQ_REJECTED 0x02
+UMASK_RXR_INSERTS_IPQ 0x04
+UMASK_RXR_INSERTS_VFIFO 0x10
+
+EVENT_RXR_IPQ_RETRY 0x31 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RXR_IPQ_RETRY_ANY 0x01
+UMASK_RXR_IPQ_RETRY_FULL 0x02
+UMASK_RXR_IPQ_RETRY_ADDR_CONFLICT 0x04
+UMASK_RXR_IPQ_RETRY_QPI_CREDITS 0x10
+
+EVENT_RXR_IRQ_RETRY 0x32 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RXR_IRQ_RETRY_ANY 0x01
+UMASK_RXR_IRQ_RETRY_FULL 0x02
+UMASK_RXR_IRQ_RETRY_ADDR_CONFLICT 0x04
+UMASK_RXR_IRQ_RETRY_RTID 0x08
+UMASK_RXR_IRQ_RETRY_QPI_CREDITS 0x10
+UMASK_RXR_IRQ_RETRY_HO_CREDITS 0x20
+
+EVENT_RXR_ISMQ_RETRY 0x33 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RXR_ISMQ_RETRY_ANY 0x01
+UMASK_RXR_ISMQ_RETRY_FULL 0x02
+UMASK_RXR_ISMQ_RETRY_RTID 0x08
+UMASK_RXR_ISMQ_RETRY_QPI_CREDITS 0x10
+UMASK_RXR_ISMQ_RETRY_HO_CREDITS 0x20
+UMASK_RXR_ISMQ_RETRY_WB_CREDITS 0x80
+
+EVENT_RXR_OCCUPANCY 0x11 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0
+UMASK_RXR_OCCUPANCY_IRQ 0x01
+UMASK_RXR_OCCUPANCY_IRQ_REJECTED 0x02
+UMASK_RXR_OCCUPANCY_IPQ 0x04
+UMASK_RXR_OCCUPANCY_VFIFO 0x10
+
+EVENT_TOR_INSERTS 0x35 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_TOR_INSERTS_OPCODE 0x01
+UMASK_TOR_INSERTS_MISS_OPCODE 0x03
+UMASK_TOR_INSERTS_EVICTION 0x04
+UMASK_TOR_INSERTS_ALL 0x08
+UMASK_TOR_INSERTS_WB 0x10
+UMASK_TOR_INSERTS_MISS_ALL 0x0A
+UMASK_TOR_INSERTS_MISS_LOCAL 0x2A
+UMASK_TOR_INSERTS_MISS_LOCAL_OPCODE 0x23
+UMASK_TOR_INSERTS_NID_OPCODE 0x41
+UMASK_TOR_INSERTS_NID_EVICTION 0x44
+UMASK_TOR_INSERTS_NID_ALL 0x48
+UMASK_TOR_INSERTS_NID_WB 0x50
+UMASK_TOR_INSERTS_NID_MISS_OPCODE 0x43
+UMASK_TOR_INSERTS_NID_MISS_ALL 0x4A
+UMASK_TOR_INSERTS_REMOTE_OPCODE 0x81
+UMASK_TOR_INSERTS_MISS_REMOTE_OPCODE 0x83
+UMASK_TOR_INSERTS_REMOTE 0x88
+UMASK_TOR_INSERTS_MISS_REMOTE 0x8A
+
+EVENT_TOR_OCCUPANCY 0x36 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0
+UMASK_TOR_OCCUPANCY_OPCODE 0x01
+UMASK_TOR_OCCUPANCY_MISS_OPCODE 0x03
+UMASK_TOR_OCCUPANCY_EVICTION 0x04
+UMASK_TOR_OCCUPANCY_ALL 0x08
+UMASK_TOR_OCCUPANCY_MISS_ALL 0x0A
+UMASK_TOR_OCCUPANCY_WB 0x10
+UMASK_TOR_OCCUPANCY_LOCAL_OPCODE 0x21
+UMASK_TOR_OCCUPANCY_MISS_LOCAL_OPCODE 0x23
+UMASK_TOR_OCCUPANCY_LOCAL 0x28
+UMASK_TOR_OCCUPANCY_MISS_LOCAL 0x2A
+UMASK_TOR_OCCUPANCY_NID_OPCODE 0x41
+UMASK_TOR_OCCUPANCY_NID_EVICTION 0x44
+UMASK_TOR_OCCUPANCY_NID_ALL 0x48
+UMASK_TOR_OCCUPANCY_NID_MISS_OPCODE 0x43
+UMASK_TOR_OCCUPANCY_NID_MISS_ALL 0x4A
+UMASK_TOR_OCCUPANCY_NID_WB 0x50
+UMASK_TOR_OCCUPANCY_REMOTE_OPCODE 0x81
+UMASK_TOR_OCCUPANCY_MISS_REMOTE_OPCODE 0x83
+UMASK_TOR_OCCUPANCY_REMOTE 0x88
+UMASK_TOR_OCCUPANCY_MISS_REMOTE 0x8A
+
+EVENT_TXR_ADS_USED 0x04 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_TXR_ADS_USED_AD 0x01
+UMASK_TXR_ADS_USED_AK 0x02
+UMASK_TXR_ADS_USED_BL 0x04
+
+EVENT_TXR_INSERTS 0x02 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_TXR_INSERTS_AD_CACHE 0x01
+UMASK_TXR_INSERTS_AK_CACHE 0x02
+UMASK_TXR_INSERTS_BL_CACHE 0x04
+UMASK_TXR_INSERTS_IV_CACHE 0x08
+UMASK_TXR_INSERTS_AD_CORE 0x10
+UMASK_TXR_INSERTS_AK_CORE 0x20
+UMASK_TXR_INSERTS_BL_CORE 0x40
+
+EVENT_DRAM_CLOCKTICKS 0x00 MBOX0FIX|MBOX1FIX|MBOX2FIX|MBOX3FIX
+UMASK_DRAM_CLOCKTICKS 0x00
+
+EVENT_ACT_COUNT 0x01 MBOX
+UMASK_ACT_COUNT_RD 0x01
+UMASK_ACT_COUNT_WR 0x02
+UMASK_ACT_COUNT_BYP 0x08
+
+EVENT_BYP_CMDS 0xA1 MBOX
+UMASK_BYP_CMDS_ACT 0x01
+UMASK_BYP_CMDS_CAS 0x02
+UMASK_BYP_CMDS_PRE 0x04
+
+EVENT_CAS_COUNT 0x04 MBOX
+UMASK_CAS_COUNT_RD_REG 0x01
+UMASK_CAS_COUNT_RD_UNDERFILL 0x02
+UMASK_CAS_COUNT_RD 0x03
+UMASK_CAS_COUNT_WR_WMM 0x04
+UMASK_CAS_COUNT_WR_RMM 0x08
+UMASK_CAS_COUNT_WR 0x0C
+UMASK_CAS_COUNT_ALL 0x0F
+UMASK_CAS_COUNT_RD_WMM 0x01
+UMASK_CAS_COUNT_RD_RMM 0x02
+
+EVENT_DRAM_PRE_ALL 0x06 MBOX
+UMASK_DRAM_PRE_ALL 0x00
+
+EVENT_DRAM_REFRESH 0x05 MBOX
+UMASK_DRAM_REFRESH_PANIC 0x02
+UMASK_DRAM_REFRESH_HIGH 0x04
+
+EVENT_ECC_CORRECTABLE_ERRORS 0x09 MBOX
+UMASK_ECC_CORRECTABLE_ERRORS 0x00
+
+EVENT_MAJOR_MODES 0x07 MBOX
+UMASK_MAJOR_MODES_READ 0x01
+UMASK_MAJOR_MODES_WRITE 0x02
+UMASK_MAJOR_MODES_PARTIAL 0x04
+UMASK_MAJOR_MODES_ISOCH 0x08
+
+EVENT_POWER_CHANNEL_DLLOFF 0x84 MBOX
+UMASK_POWER_CHANNEL_DLLOFF 0x00
+
+EVENT_POWER_CHANNEL_PPD 0x85 MBOX
+UMASK_POWER_CHANNEL_PPD 0x00
+
+EVENT_POWER_CKE_CYCLES 0x83 MBOX
+UMASK_POWER_CKE_CYCLES_RANK0 0x01
+UMASK_POWER_CKE_CYCLES_RANK1 0x02
+UMASK_POWER_CKE_CYCLES_RANK2 0x04
+UMASK_POWER_CKE_CYCLES_RANK3 0x08
+UMASK_POWER_CKE_CYCLES_RANK4 0x10
+UMASK_POWER_CKE_CYCLES_RANK5 0x20
+UMASK_POWER_CKE_CYCLES_RANK6 0x40
+UMASK_POWER_CKE_CYCLES_RANK7 0x80
+
+EVENT_POWER_CRITICAL_THROTTLE_CYCLES 0x86 MBOX
+UMASK_POWER_CRITICAL_THROTTLE_CYCLES 0x00
+
+EVENT_POWER_PCU_THROTTLING 0x42 MBOX
+UMASK_POWER_PCU_THROTTLING 0x00
+
+EVENT_POWER_SELF_REFRESH 0x43 MBOX
+UMASK_POWER_SELF_REFRESH 0x00
+
+EVENT_POWER_THROTTLE_CYCLES 0x41 MBOX
+UMASK_POWER_THROTTLE_CYCLES_RANK0 0x01
+UMASK_POWER_THROTTLE_CYCLES_RANK1 0x02
+UMASK_POWER_THROTTLE_CYCLES_RANK2 0x04
+UMASK_POWER_THROTTLE_CYCLES_RANK3 0x08
+UMASK_POWER_THROTTLE_CYCLES_RANK4 0x10
+UMASK_POWER_THROTTLE_CYCLES_RANK5 0x20
+UMASK_POWER_THROTTLE_CYCLES_RANK6 0x40
+UMASK_POWER_THROTTLE_CYCLES_RANK7 0x80
+
+EVENT_PREEMPTION 0x08 MBOX
+UMASK_PREEMPTION_RD_PREEMPT_RD 0x01
+UMASK_PREEMPTION_RD_PREEMPT_WR 0x02
+
+EVENT_PRE_COUNT 0x02 MBOX
+UMASK_PRE_COUNT_PAGE_MISS 0x01
+UMASK_PRE_COUNT_PAGE_CLOSE 0x02
+
+EVENT_RD_CAS_PRIO 0xA0 MBOX
+UMASK_RD_CAS_PRIO_LOW 0x01
+UMASK_RD_CAS_PRIO_MED 0x02
+UMASK_RD_CAS_PRIO_HIGH 0x04
+UMASK_RD_CAS_PRIO_PANIC 0x08
+
+EVENT_RD_CAS_RANK0 0xB0 MBOX
+UMASK_RD_CAS_RANK0_BANK0 0x01
+UMASK_RD_CAS_RANK0_BANK1 0x02
+UMASK_RD_CAS_RANK0_BANK2 0x04
+UMASK_RD_CAS_RANK0_BANK3 0x08
+UMASK_RD_CAS_RANK0_BANK4 0x10
+UMASK_RD_CAS_RANK0_BANK5 0x20
+UMASK_RD_CAS_RANK0_BANK6 0x40
+UMASK_RD_CAS_RANK0_BANK7 0x80
+
+EVENT_RD_CAS_RANK1 0xB1 MBOX
+UMASK_RD_CAS_RANK1_BANK0 0x01
+UMASK_RD_CAS_RANK1_BANK1 0x02
+UMASK_RD_CAS_RANK1_BANK2 0x04
+UMASK_RD_CAS_RANK1_BANK3 0x08
+UMASK_RD_CAS_RANK1_BANK4 0x10
+UMASK_RD_CAS_RANK1_BANK5 0x20
+UMASK_RD_CAS_RANK1_BANK6 0x40
+UMASK_RD_CAS_RANK1_BANK7 0x80
+
+EVENT_RD_CAS_RANK2 0xB2 MBOX
+UMASK_RD_CAS_RANK2_BANK0 0x01
+UMASK_RD_CAS_RANK2_BANK1 0x02
+UMASK_RD_CAS_RANK2_BANK2 0x04
+UMASK_RD_CAS_RANK2_BANK3 0x08
+UMASK_RD_CAS_RANK2_BANK4 0x10
+UMASK_RD_CAS_RANK2_BANK5 0x20
+UMASK_RD_CAS_RANK2_BANK6 0x40
+UMASK_RD_CAS_RANK2_BANK7 0x80
+
+EVENT_RD_CAS_RANK3 0xB3 MBOX
+UMASK_RD_CAS_RANK3_BANK0 0x01
+UMASK_RD_CAS_RANK3_BANK1 0x02
+UMASK_RD_CAS_RANK3_BANK2 0x04
+UMASK_RD_CAS_RANK3_BANK3 0x08
+UMASK_RD_CAS_RANK3_BANK4 0x10
+UMASK_RD_CAS_RANK3_BANK5 0x20
+UMASK_RD_CAS_RANK3_BANK6 0x40
+UMASK_RD_CAS_RANK3_BANK7 0x80
+
+EVENT_RD_CAS_RANK4 0xB4 MBOX
+UMASK_RD_CAS_RANK4_BANK0 0x01
+UMASK_RD_CAS_RANK4_BANK1 0x02
+UMASK_RD_CAS_RANK4_BANK2 0x04
+UMASK_RD_CAS_RANK4_BANK3 0x08
+UMASK_RD_CAS_RANK4_BANK4 0x10
+UMASK_RD_CAS_RANK4_BANK5 0x20
+UMASK_RD_CAS_RANK4_BANK6 0x40
+UMASK_RD_CAS_RANK4_BANK7 0x80
+
+EVENT_RD_CAS_RANK5 0xB5 MBOX
+UMASK_RD_CAS_RANK5_BANK0 0x01
+UMASK_RD_CAS_RANK5_BANK1 0x02
+UMASK_RD_CAS_RANK5_BANK2 0x04
+UMASK_RD_CAS_RANK5_BANK3 0x08
+UMASK_RD_CAS_RANK5_BANK4 0x10
+UMASK_RD_CAS_RANK5_BANK5 0x20
+UMASK_RD_CAS_RANK5_BANK6 0x40
+UMASK_RD_CAS_RANK5_BANK7 0x80
+
+EVENT_RD_CAS_RANK6 0xB6 MBOX
+UMASK_RD_CAS_RANK6_BANK0 0x01
+UMASK_RD_CAS_RANK6_BANK1 0x02
+UMASK_RD_CAS_RANK6_BANK2 0x04
+UMASK_RD_CAS_RANK6_BANK3 0x08
+UMASK_RD_CAS_RANK6_BANK4 0x10
+UMASK_RD_CAS_RANK6_BANK5 0x20
+UMASK_RD_CAS_RANK6_BANK6 0x40
+UMASK_RD_CAS_RANK6_BANK7 0x80
+
+EVENT_RD_CAS_RANK7 0xB7 MBOX
+UMASK_RD_CAS_RANK7_BANK0 0x01
+UMASK_RD_CAS_RANK7_BANK1 0x02
+UMASK_RD_CAS_RANK7_BANK2 0x04
+UMASK_RD_CAS_RANK7_BANK3 0x08
+UMASK_RD_CAS_RANK7_BANK4 0x10
+UMASK_RD_CAS_RANK7_BANK5 0x20
+UMASK_RD_CAS_RANK7_BANK6 0x40
+UMASK_RD_CAS_RANK7_BANK7 0x80
+
+EVENT_RPQ_CYCLES_NE 0x11 MBOX
+UMASK_RPQ_CYCLES_NE 0x00
+
+EVENT_RPQ_INSERTS 0x10 MBOX
+UMASK_RPQ_INSERTS 0x00
+
+EVENT_VMSE_MXB_WR_OCCUPANCY 0x91 MBOX
+UMASK_VMSE_MXB_WR_OCCUPANCY 0x00
+
+EVENT_VMSE_WR_PUSH 0x90 MBOX
+UMASK_VMSE_WR_PUSH 0x00
+
+EVENT_WMM_TO_RMM 0xC0 MBOX
+UMASK_WMM_TO_RMM 0x00
+
+EVENT_WPQ_CYCLES_FULL 0x22 MBOX
+UMASK_WPQ_CYCLES_FULL 0x00
+
+EVENT_WPQ_CYCLES_NE 0x21 MBOX
+UMASK_WPQ_CYCLES_NE 0x00
+
+EVENT_WPQ_INSERTS 0x20 MBOX
+UMASK_WPQ_INSERTS 0x00
+
+EVENT_WPQ_READ_HIT 0x23 MBOX
+UMASK_WPQ_READ_HIT 0x00
+
+EVENT_WPQ_WRITE_HIT 0x24 MBOX
+UMASK_WPQ_WRITE_HIT 0x00
+
+EVENT_WRONG_MM 0xC1 MBOX
+UMASK_WRONG_MM 0x00
+
+EVENT_WR_CAS_RANK0 0xB8 MBOX
+UMASK_WR_CAS_RANK0_BANK0 0x01
+UMASK_WR_CAS_RANK0_BANK1 0x02
+UMASK_WR_CAS_RANK0_BANK2 0x04
+UMASK_WR_CAS_RANK0_BANK3 0x08
+UMASK_WR_CAS_RANK0_BANK4 0x10
+UMASK_WR_CAS_RANK0_BANK5 0x20
+UMASK_WR_CAS_RANK0_BANK6 0x40
+UMASK_WR_CAS_RANK0_BANK7 0x80
+
+EVENT_WR_CAS_RANK1 0xB9 MBOX
+UMASK_WR_CAS_RANK1_BANK0 0x01
+UMASK_WR_CAS_RANK1_BANK1 0x02
+UMASK_WR_CAS_RANK1_BANK2 0x04
+UMASK_WR_CAS_RANK1_BANK3 0x08
+UMASK_WR_CAS_RANK1_BANK4 0x10
+UMASK_WR_CAS_RANK1_BANK5 0x20
+UMASK_WR_CAS_RANK1_BANK6 0x40
+UMASK_WR_CAS_RANK1_BANK7 0x80
+
+EVENT_WR_CAS_RANK2 0xBA MBOX
+UMASK_WR_CAS_RANK2_BANK0 0x01
+UMASK_WR_CAS_RANK2_BANK1 0x02
+UMASK_WR_CAS_RANK2_BANK2 0x04
+UMASK_WR_CAS_RANK2_BANK3 0x08
+UMASK_WR_CAS_RANK2_BANK4 0x10
+UMASK_WR_CAS_RANK2_BANK5 0x20
+UMASK_WR_CAS_RANK2_BANK6 0x40
+UMASK_WR_CAS_RANK2_BANK7 0x80
+
+EVENT_WR_CAS_RANK3 0xBB MBOX
+UMASK_WR_CAS_RANK3_BANK0 0x01
+UMASK_WR_CAS_RANK3_BANK1 0x02
+UMASK_WR_CAS_RANK3_BANK2 0x04
+UMASK_WR_CAS_RANK3_BANK3 0x08
+UMASK_WR_CAS_RANK3_BANK4 0x10
+UMASK_WR_CAS_RANK3_BANK5 0x20
+UMASK_WR_CAS_RANK3_BANK6 0x40
+UMASK_WR_CAS_RANK3_BANK7 0x80
+
+EVENT_WR_CAS_RANK4 0xBC MBOX
+UMASK_WR_CAS_RANK4_BANK0 0x01
+UMASK_WR_CAS_RANK4_BANK1 0x02
+UMASK_WR_CAS_RANK4_BANK2 0x04
+UMASK_WR_CAS_RANK4_BANK3 0x08
+UMASK_WR_CAS_RANK4_BANK4 0x10
+UMASK_WR_CAS_RANK4_BANK5 0x20
+UMASK_WR_CAS_RANK4_BANK6 0x40
+UMASK_WR_CAS_RANK4_BANK7 0x80
+
+EVENT_WR_CAS_RANK5 0xBD MBOX
+UMASK_WR_CAS_RANK5_BANK0 0x01
+UMASK_WR_CAS_RANK5_BANK1 0x02
+UMASK_WR_CAS_RANK5_BANK2 0x04
+UMASK_WR_CAS_RANK5_BANK3 0x08
+UMASK_WR_CAS_RANK5_BANK4 0x10
+UMASK_WR_CAS_RANK5_BANK5 0x20
+UMASK_WR_CAS_RANK5_BANK6 0x40
+UMASK_WR_CAS_RANK5_BANK7 0x80
+
+EVENT_WR_CAS_RANK6 0xBE MBOX
+UMASK_WR_CAS_RANK6_BANK0 0x01
+UMASK_WR_CAS_RANK6_BANK1 0x02
+UMASK_WR_CAS_RANK6_BANK2 0x04
+UMASK_WR_CAS_RANK6_BANK3 0x08
+UMASK_WR_CAS_RANK6_BANK4 0x10
+UMASK_WR_CAS_RANK6_BANK5 0x20
+UMASK_WR_CAS_RANK6_BANK6 0x40
+UMASK_WR_CAS_RANK6_BANK7 0x80
+
+EVENT_WR_CAS_RANK7 0xBF MBOX
+UMASK_WR_CAS_RANK7_BANK0 0x01
+UMASK_WR_CAS_RANK7_BANK1 0x02
+UMASK_WR_CAS_RANK7_BANK2 0x04
+UMASK_WR_CAS_RANK7_BANK3 0x08
+UMASK_WR_CAS_RANK7_BANK4 0x10
+UMASK_WR_CAS_RANK7_BANK5 0x20
+UMASK_WR_CAS_RANK7_BANK6 0x40
+UMASK_WR_CAS_RANK7_BANK7 0x80
+
+EVENT_QPI_RATE 0x00 SBOX0FIX|SBOX1FIX|SBOX2FIX
+UMASK_QPI_RATE 0x00
+
+EVENT_SBOX_CLOCKTICKS 0x14 SBOX0|SBOX1|SBOX2
+UMASK_SBOX_CLOCKTICKS 0x00
+
+EVENT_CTO_COUNT 0x38 SBOX0|SBOX1|SBOX2
+OPTIONS_CTO_COUNT EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_CTO_COUNT 0x00 0x01
+
+EVENT_DIRECT2CORE 0x13 SBOX0|SBOX1|SBOX2
+OPTIONS_DIRECT2CORE_SUCCESS_RBT_HIT EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_SUCCESS_RBT_HIT 0x01
+OPTIONS_DIRECT2CORE_FAILURE_CREDITS EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_CREDITS 0x02
+OPTIONS_DIRECT2CORE_FAILURE_RBT_HIT EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_RBT_HIT 0x04
+OPTIONS_DIRECT2CORE_FAILURE_CREDITS_RBT EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT 0x08
+OPTIONS_DIRECT2CORE_FAILURE_MISS EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_MISS 0x10
+OPTIONS_DIRECT2CORE_FAILURE_CREDITS_MISS EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_CREDITS_MISS 0x20
+OPTIONS_DIRECT2CORE_FAILURE_RBT_MISS EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_RBT_MISS 0x40
+OPTIONS_DIRECT2CORE_FAILURE_CREDITS_RBT_MISS EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT_MISS 0x80
+
+EVENT_L1_POWER_CYCLES 0x12 SBOX0|SBOX1|SBOX2
+UMASK_L1_POWER_CYCLES 0x00
+
+EVENT_RXL0P_POWER_CYCLES 0x10 SBOX0|SBOX1|SBOX2
+UMASK_RXL0P_POWER_CYCLES 0x00
+
+EVENT_RXL0_POWER_CYCLES 0x0F SBOX0|SBOX1|SBOX2
+UMASK_RXL0_POWER_CYCLES 0x00
+
+EVENT_RXL_BYPASSED 0x09 SBOX0|SBOX1|SBOX2
+UMASK_RXL_BYPASSED 0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VN0 0x1E SBOX0|SBOX1|SBOX2
+UMASK_RXL_CREDITS_CONSUMED_VN0_DRS 0x01 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCB 0x02 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCS 0x04 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_HOM 0x08 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_SNP 0x10 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_NDR 0x20 0x01
+
+EVENT_RXL_CREDITS_CONSUMED_VN1 0x39 SBOX0|SBOX1|SBOX2
+UMASK_RXL_CREDITS_CONSUMED_VN1_DRS 0x01 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_NCB 0x02 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_NCS 0x04 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_HOM 0x08 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_SNP 0x10 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_NDR 0x20 0x01
+
+EVENT_RXL_CREDITS_CONSUMED_VNA 0x1D SBOX0|SBOX1|SBOX2
+UMASK_RXL_CREDITS_CONSUMED_VNA 0x00 0x01
+
+EVENT_RXL_CYCLES_NE 0x0A SBOX0|SBOX1|SBOX2
+UMASK_RXL_CYCLES_NE 0x00
+
+EVENT_RXL_FLITS_G0 0x01 SBOX0|SBOX1|SBOX2
+UMASK_RXL_FLITS_G0_IDLE 0x01
+UMASK_RXL_FLITS_G0_DATA 0x02
+UMASK_RXL_FLITS_G0_NON_DATA 0x04
+
+EVENT_RXL_FLITS_G1 0x02 SBOX0|SBOX1|SBOX2
+UMASK_RXL_FLITS_G1_SNP 0x01 0x01
+UMASK_RXL_FLITS_G1_HOM_REQ 0x02 0x01
+UMASK_RXL_FLITS_G1_HOM_NONREQ 0x04 0x01
+UMASK_RXL_FLITS_G1_HOM 0x06 0x01
+UMASK_RXL_FLITS_G1_DRS_DATA 0x08 0x01
+UMASK_RXL_FLITS_G1_DRS_NONDATA 0x10 0x01
+UMASK_RXL_FLITS_G1_DRS 0x18 0x01
+
+EVENT_RXL_FLITS_G2 0x03 SBOX0|SBOX1|SBOX2
+UMASK_RXL_FLITS_G2_NDR_AD 0x01 0x01
+UMASK_RXL_FLITS_G2_NDR_AK 0x02 0x01
+UMASK_RXL_FLITS_G2_NCB_DATA 0x04 0x01
+UMASK_RXL_FLITS_G2_NCB_NONDATA 0x08 0x01
+UMASK_RXL_FLITS_G2_NCB 0x0C 0x01
+UMASK_RXL_FLITS_G2_NCS 0x10 0x01
+
+EVENT_RXL_INSERTS 0x08 SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS 0x00
+
+EVENT_RXL_INSERTS_DRS 0x09 SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_DRS_VN0 0x01 0x01
+UMASK_RXL_INSERTS_DRS_VN1 0x02 0x01
+
+EVENT_RXL_INSERTS_HOM 0x0C SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_HOM_VN0 0x01 0x01
+UMASK_RXL_INSERTS_HOM_VN1 0x02 0x01
+
+EVENT_RXL_INSERTS_NCB 0x0A SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_NCB_VN0 0x01 0x01
+UMASK_RXL_INSERTS_NCB_VN1 0x02 0x01
+
+EVENT_RXL_INSERTS_NCS 0x0B SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_NCS_VN0 0x01 0x01
+UMASK_RXL_INSERTS_NCS_VN1 0x02 0x01
+
+EVENT_RXL_INSERTS_NDR 0x0E SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_NDR_VN0 0x01 0x01
+UMASK_RXL_INSERTS_NDR_VN1 0x02 0x01
+
+EVENT_RXL_INSERTS_SNP 0x0D SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_SNP_VN0 0x01 0x01
+UMASK_RXL_INSERTS_SNP_VN1 0x02 0x01
+
+EVENT_RXL_OCCUPANCY 0x0B SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY 0x00
+
+EVENT_RXL_OCCUPANCY_DRS 0x15 SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_DRS_VN0 0x01 0x01
+UMASK_RXL_OCCUPANCY_DRS_VN1 0x02 0x01
+
+EVENT_RXL_OCCUPANCY_HOM 0x18 SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_HOM_VN0 0x01 0x01
+UMASK_RXL_OCCUPANCY_HOM_VN1 0x02 0x01
+
+EVENT_RXL_OCCUPANCY_NCB 0x16 SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_NCB_VN0 0x01 0x01
+UMASK_RXL_OCCUPANCY_NCB_VN1 0x02 0x01
+
+EVENT_RXL_OCCUPANCY_NCS 0x17 SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_NCS_VN0 0x01 0x01
+UMASK_RXL_OCCUPANCY_NCS_VN1 0x02 0x01
+
+EVENT_RXL_OCCUPANCY_NDR 0x1A SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_NDR_VN0 0x01 0x01
+UMASK_RXL_OCCUPANCY_NDR_VN1 0x02 0x01
+
+EVENT_RXL_OCCUPANCY_SNP 0x19 SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_SNP_VN0 0x01 0x01
+UMASK_RXL_OCCUPANCY_SNP_VN1 0x02 0x01
+
+EVENT_TXL0P_POWER_CYCLES 0x0D SBOX0|SBOX1|SBOX2
+UMASK_TXL0P_POWER_CYCLES 0x00
+
+EVENT_TXL0_POWER_CYCLES 0x0C SBOX0|SBOX1|SBOX2
+UMASK_TXL0_POWER_CYCLES 0x00
+
+EVENT_TXL_BYPASSED 0x05 SBOX0|SBOX1|SBOX2
+UMASK_TXL_BYPASSED 0x00
+
+EVENT_TXL_CYCLES_NE 0x06 SBOX0|SBOX1|SBOX2
+UMASK_TXL_CYCLES_NE 0x00
+
+EVENT_TXL_FLITS_G0 0x00 SBOX0|SBOX1|SBOX2
+UMASK_TXL_FLITS_G0_DATA 0x02
+UMASK_TXL_FLITS_G0_NON_DATA 0x04
+
+EVENT_TXL_FLITS_G1 0x00 SBOX0|SBOX1|SBOX2
+UMASK_TXL_FLITS_G1_SNP 0x01 0x01
+UMASK_TXL_FLITS_G1_HOM_REQ 0x02 0x01
+UMASK_TXL_FLITS_G1_HOM_NONREQ 0x04 0x01
+UMASK_TXL_FLITS_G1_HOM 0x06 0x01
+UMASK_TXL_FLITS_G1_DRS_DATA 0x08 0x01
+UMASK_TXL_FLITS_G1_DRS_NONDATA 0x10 0x01
+UMASK_TXL_FLITS_G1_DRS 0x18 0x01
+
+EVENT_TXL_FLITS_G2 0x01 SBOX0|SBOX1|SBOX2
+UMASK_TXL_FLITS_G2_NDR_AD 0x01 0x01
+UMASK_TXL_FLITS_G2_NDR_AK 0x02 0x01
+UMASK_TXL_FLITS_G2_NCB_DATA 0x04 0x01
+UMASK_TXL_FLITS_G2_NCB_NONDATA 0x08 0x01
+UMASK_TXL_FLITS_G2_NCB 0x0C 0x01
+UMASK_TXL_FLITS_G2_NCS 0x10 0x01
+
+EVENT_TXL_INSERTS 0x04 SBOX0|SBOX1|SBOX2
+UMASK_TXL_INSERTS 0x00
+
+EVENT_TXL_OCCUPANCY 0x07 SBOX0|SBOX1|SBOX2
+UMASK_TXL_OCCUPANCY 0x00
+
+EVENT_TXL_AD_HOM_CREDIT_ACQUIRED 0x26 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_HOM_CREDIT_ACQUIRED_VN0 0x01 0x01
+UMASK_TXL_AD_HOM_CREDIT_ACQUIRED_VN1 0x02 0x01
+
+EVENT_TXL_AD_HOM_CREDIT_OCCUPANCY 0x22 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_HOM_CREDIT_OCCUPANCY_VN0 0x01 0x01
+UMASK_TXL_AD_HOM_CREDIT_OCCUPANCY_VN1 0x02 0x01
+
+EVENT_TXL_AD_NDR_CREDIT_ACQUIRED 0x28 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_NDR_CREDIT_ACQUIRED_VN0 0x01 0x01
+UMASK_TXL_AD_NDR_CREDIT_ACQUIRED_VN1 0x02 0x01
+
+EVENT_TXL_AD_NDR_CREDIT_OCCUPANCY 0x24 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_NDR_CREDIT_OCCUPANCY_VN0 0x01 0x01
+UMASK_TXL_AD_NDR_CREDIT_OCCUPANCY_VN1 0x02 0x01
+
+EVENT_TXL_AD_SNP_CREDIT_ACQUIRED 0x27 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_SNP_CREDIT_ACQUIRED_VN0 0x01 0x01
+UMASK_TXL_AD_SNP_CREDIT_ACQUIRED_VN1 0x02 0x01
+
+EVENT_TXL_AD_SNP_CREDIT_OCCUPANCY 0x23 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_SNP_CREDIT_OCCUPANCY_VN0 0x01 0x01
+UMASK_TXL_AD_SNP_CREDIT_OCCUPANCY_VN1 0x02 0x01
+
+EVENT_TXL_AK_NDR_CREDIT_ACQUIRED 0x29 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AK_NDR_CREDIT_ACQUIRED 0x00 0x01
+
+EVENT_TXL_AK_NDR_CREDIT_OCCUPANCY 0x25 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AK_NDR_CREDIT_OCCUPANCY 0x00 0x01
+
+EVENT_TXL_BL_DRS_CREDIT_ACQUIRED 0x2A SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_DRS_CREDIT_ACQUIRED_VN0 0x01 0x01
+UMASK_TXL_BL_DRS_CREDIT_ACQUIRED_VN1 0x02 0x01
+UMASK_TXL_BL_DRS_CREDIT_ACQUIRED_VN_SHR 0x04 0x01
+
+EVENT_TXL_BL_DRS_CREDIT_OCCUPANCY 0x1F SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_DRS_CREDIT_OCCUPANCY_VN0 0x01 0x01
+UMASK_TXL_BL_DRS_CREDIT_OCCUPANCY_VN1 0x02 0x01
+UMASK_TXL_BL_DRS_CREDIT_OCCUPANCY_VN_SHR 0x04 0x01
+
+EVENT_TXL_BL_NCB_CREDIT_ACQUIRED 0x2B SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_NCB_CREDIT_ACQUIRED_VN0 0x01 0x01
+UMASK_TXL_BL_NCB_CREDIT_ACQUIRED_VN1 0x02 0x01
+
+EVENT_TXL_BL_NCB_CREDIT_OCCUPANCY 0x20 SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_NCB_CREDIT_OCCUPANCY_VN0 0x01 0x01
+UMASK_TXL_BL_NCB_CREDIT_OCCUPANCY_VN1 0x02 0x01
+
+EVENT_TXL_BL_NCS_CREDIT_ACQUIRED 0x2C SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_NCS_CREDIT_ACQUIRED_VN0 0x01 0x01
+UMASK_TXL_BL_NCS_CREDIT_ACQUIRED_VN1 0x02 0x01
+
+EVENT_TXL_BL_NCS_CREDIT_OCCUPANCY 0x21 SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_NCS_CREDIT_OCCUPANCY_VN0 0x01 0x01
+UMASK_TXL_BL_NCS_CREDIT_OCCUPANCY_VN1 0x02 0x01
+
+EVENT_VNA_CREDIT_RETURNS 0x1C SBOX0|SBOX1|SBOX2
+UMASK_VNA_CREDIT_RETURNS 0x00 0x01
+
+EVENT_VNA_CREDIT_RETURN_OCCUPANCY 0x1B SBOX0|SBOX1|SBOX2
+UMASK_VNA_CREDIT_RETURN_OCCUPANCY 0x00 0x01
+
+EVENT_UNCORE_CLOCK 0x00 UBOXFIX
+UMASK_UNCORE_CLOCK 0x00
+
+EVENT_EVENT_MSG 0x42 UBOX
+UMASK_EVENT_MSG_VLW_RCVD 0x01
+UMASK_EVENT_MSG_MSI_RCVD 0x02
+UMASK_EVENT_MSG_IPI_RCVD 0x04
+UMASK_EVENT_MSG_DOORBELL_RCVD 0x08
+UMASK_EVENT_MSG_INT_PRIO 0x10
+
+EVENT_LOCK_CYCLES 0x44 UBOX
+UMASK_LOCK_CYCLES 0x00
+
+EVENT_PHOLD_CYCLES 0x45 UBOX
+UMASK_PHOLD_CYCLES_ASSERT_TO_ACK 0x01
+
+EVENT_RACU_REQUESTS 0x46 UBOX
+UMASK_RACU_REQUESTS 0x00
+
+EVENT_BBOX_CLOCKTICKS 0x00 BBOX0|BBOX1
+UMASK_BBOX_CLOCKTICKS 0x00
+
+EVENT_ADDR_OPC_MATCH 0x20 BBOX0|BBOX1
+OPTIONS_ADDR_OPC_MATCH_ADDR EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_ADDR 0x01
+OPTIONS_ADDR_OPC_MATCH_OPC EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_OPC 0x02
+OPTIONS_ADDR_OPC_MATCH_FILT EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_FILT 0x03
+OPTIONS_ADDR_OPC_MATCH_AD EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AD 0x02
+OPTIONS_ADDR_OPC_MATCH_BL EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_BL 0x02
+OPTIONS_ADDR_OPC_MATCH_AK EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AK 0x02
+
+EVENT_BT_BYPASS 0x52 BBOX0|BBOX1
+UMASK_BT_BYPASS 0x00
+
+EVENT_BT_CYCLES_NE 0x42 BBOX0|BBOX1
+UMASK_BT_CYCLES_NE 0x00
+
+EVENT_BT_OCCUPANCY 0x43 BBOX0|BBOX1
+UMASK_BT_OCCUPANCY_LOCAL 0x01
+UMASK_BT_OCCUPANCY_REMOTE 0x02
+UMASK_BT_OCCUPANCY_READS_LOCAL 0x04
+UMASK_BT_OCCUPANCY_READS_REMOTE 0x08
+UMASK_BT_OCCUPANCY_WRITES_LOCAL 0x10
+UMASK_BT_OCCUPANCY_WRITES_REMOTE 0x20
+
+EVENT_BYPASS_IMC 0x14 BBOX0|BBOX1
+UMASK_BYPASS_IMC_TAKEN 0x01
+UMASK_BYPASS_IMC_NOT_TAKEN 0x02
+
+EVENT_CONFLICT_CYCLES 0x0B BBOX0|BBOX1
+UMASK_CONFLICT_CYCLES_CONFLICT 0x02
+UMASK_CONFLICT_CYCLES_LAST 0x04
+UMASK_CONFLICT_CYCLES_ACKCNFLTS 0x08
+UMASK_CONFLICT_CYCLES_CMP_FWDS 0x10
+
+EVENT_DIRECT2CORE_COUNT 0x11 BBOX0|BBOX1
+UMASK_DIRECT2CORE_COUNT 0x00
+
+EVENT_DIRECT2CORE_CYCLES_DISABLED 0x12 BBOX0|BBOX1
+UMASK_DIRECT2CORE_CYCLES_DISABLED 0x00
+
+EVENT_DIRECT2CORE_TXN_OVERRIDE 0x13 BBOX0|BBOX1
+UMASK_DIRECT2CORE_TXN_OVERRIDE 0x00
+
+EVENT_DIRECTORY_LAT_OPT 0x41 BBOX0|BBOX1
+UMASK_DIRECTORY_LAT_OPT 0x00
+
+EVENT_DIRECTORY_LOOKUP 0x0C BBOX0|BBOX1
+UMASK_DIRECTORY_LOOKUP_SNP 0x01
+UMASK_DIRECTORY_LOOKUP_NO_SNP 0x02
+
+EVENT_DIRECTORY_UPDATE 0x0D BBOX0|BBOX1
+UMASK_DIRECTORY_UPDATE_SET 0x01
+UMASK_DIRECTORY_UPDATE_CLEAR 0x02
+UMASK_DIRECTORY_UPDATE_ANY 0x03
+
+EVENT_IGR_CREDITS_AD_QPI2 0x59 BBOX0|BBOX1
+UMASK_IGR_CREDITS_AD_QPI2 0x00
+
+EVENT_IGR_CREDITS_BL_QPI2 0x5A BBOX0|BBOX1
+UMASK_IGR_CREDITS_BL_QPI2 0x00
+
+EVENT_IGR_NO_CREDIT_CYCLES 0x22 BBOX0|BBOX1
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI0 0x01
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI1 0x02
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI0 0x04
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI1 0x08
+
+EVENT_IMC_READS 0x17 BBOX0|BBOX1
+UMASK_IMC_READS_NORMAL 0x01
+
+EVENT_IMC_RETRY 0x1E BBOX0|BBOX1
+UMASK_IMC_RETRY 0x00
+
+EVENT_IMC_WRITES 0x1A BBOX0|BBOX1
+UMASK_IMC_WRITES_FULL 0x01
+UMASK_IMC_WRITES_PARTIAL 0x02
+UMASK_IMC_WRITES_FULL_ISOCH 0x04
+UMASK_IMC_WRITES_PARTIAL_ISOCH 0x08
+UMASK_IMC_WRITES_ALL 0x0F
+
+EVENT_IODC_CONFLICTS 0x57 BBOX0|BBOX1
+UMASK_IODC_CONFLICTS_ANY 0x01
+UMASK_IODC_CONFLICTS_LAST 0x04
+
+EVENT_IODC_INSERTS 0x56 BBOX0|BBOX1
+UMASK_IODC_INSERTS 0x00
+
+EVENT_IODC_OLEN_WBMTOI 0x58 BBOX0|BBOX1
+UMASK_IODC_OLEN_WBMTOI 0x00
+
+EVENT_OSB 0x53 BBOX0|BBOX1
+UMASK_OSB_READS_LOCAL 0x02
+UMASK_OSB_INVITOE_LOCAL 0x04
+UMASK_OSB_REMOTE 0x08
+
+EVENT_OSB_EDR 0x54 BBOX0|BBOX1
+UMASK_OSB_EDR_ALL 0x01
+UMASK_OSB_EDR_READS_LOCAL_I 0x02
+UMASK_OSB_EDR_READS_REMOTE_I 0x04
+UMASK_OSB_EDR_READS_LOCAL_S 0x08
+UMASK_OSB_EDR_READS_REMOTE_S 0x10
+
+EVENT_REQUESTS 0x01 BBOX0|BBOX1
+UMASK_REQUESTS_READS_LOCAL 0x01
+UMASK_REQUESTS_READS_REMOTE 0x02
+UMASK_REQUESTS_READS 0x03
+UMASK_REQUESTS_WRITES_LOCAL 0x04
+UMASK_REQUESTS_WRITES_REMOTE 0x08
+UMASK_REQUESTS_WRITES 0x0C
+UMASK_REQUESTS_INVITOE_LOCAL 0x10
+UMASK_REQUESTS_INVITOE_REMOTE 0x20
+UMASK_REQUESTS_INVITOE 0x30
+
+EVENT_RING_AD_USED 0x3E BBOX
+UMASK_RING_AD_USED_CW_VR0_EVEN 0x01
+UMASK_RING_AD_USED_CW_VR0_ODD 0x02
+UMASK_RING_AD_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AD_USED_CCW_VR0_ODD 0x08
+UMASK_RING_AD_USED_CW_VR1_EVEN 0x10
+UMASK_RING_AD_USED_CW_VR1_ODD 0x20
+UMASK_RING_AD_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_AD_USED_CCW_VR1_ODD 0x80
+UMASK_RING_AD_USED_CW 0x33
+UMASK_RING_AD_USED_CCW 0xCC
+UMASK_RING_AD_USED_ANY 0xFF
+
+EVENT_RING_AK_USED 0x3F BBOX
+UMASK_RING_AK_USED_CW_VR0_EVEN 0x01
+UMASK_RING_AK_USED_CW_VR0_ODD 0x02
+UMASK_RING_AK_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AK_USED_CCW_VR0_ODD 0x08
+UMASK_RING_AK_USED_CW_VR1_EVEN 0x10
+UMASK_RING_AK_USED_CW_VR1_ODD 0x20
+UMASK_RING_AK_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_AK_USED_CCW_VR1_ODD 0x80
+UMASK_RING_AK_USED_CW 0x33
+UMASK_RING_AK_USED_CCW 0xCC
+UMASK_RING_AK_USED_ANY 0xFF
+
+EVENT_RING_BL_USED 0x40 BBOX
+UMASK_RING_BL_USED_CW_VR0_EVEN 0x01
+UMASK_RING_BL_USED_CW_VR0_ODD 0x02
+UMASK_RING_BL_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_BL_USED_CCW_VR0_ODD 0x08
+UMASK_RING_BL_USED_CW_VR1_EVEN 0x10
+UMASK_RING_BL_USED_CW_VR1_ODD 0x20
+UMASK_RING_BL_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_BL_USED_CCW_VR1_ODD 0x80
+UMASK_RING_BL_USED_CW 0x33
+UMASK_RING_BL_USED_CCW 0xCC
+UMASK_RING_BL_USED_ANY 0xFF
+
+EVENT_RPQ_CYCLES_NO_REG_CREDITS 0x15 BBOX0|BBOX1
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN0 0x01
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1 0x02
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2 0x04
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3 0x08
+
+EVENT_SNOOP_RESP 0x21 BBOX0|BBOX1
+UMASK_SNOOP_RESP_RSPI 0x01
+UMASK_SNOOP_RESP_RSPS 0x02
+UMASK_SNOOP_RESP_RSPIFWD 0x04
+UMASK_SNOOP_RESP_RSPSFWD 0x08
+UMASK_SNOOP_RESP_RSP_WB 0x10
+UMASK_SNOOP_RESP_RSP_FWD_WB 0x20
+UMASK_SNOOP_RESP_RSPCNFLCT 0x40
+
+EVENT_SNP_RESP_RECV_LOCAL 0x60 BBOX0|BBOX1
+UMASK_SNP_RESP_RECV_LOCAL_RSPI 0x01
+UMASK_SNP_RESP_RECV_LOCAL_RSPS 0x02
+UMASK_SNP_RESP_RECV_LOCAL_RSPIFWD 0x04
+UMASK_SNP_RESP_RECV_LOCAL_RSPSFWD 0x08
+UMASK_SNP_RESP_RECV_LOCAL_RSPXWB 0x10
+UMASK_SNP_RESP_RECV_LOCAL_RSPxFWDxWB 0x20
+UMASK_SNP_RESP_RECV_LOCAL_RSPCNFLCT 0x40
+UMASK_SNP_RESP_RECV_LOCAL_OTHER 0x80
+
+EVENT_TAD_REQUESTS_G0 0x1B BBOX0|BBOX1
+UMASK_TAD_REQUESTS_G0_REGION0 0x01
+UMASK_TAD_REQUESTS_G0_REGION1 0x02
+UMASK_TAD_REQUESTS_G0_REGION2 0x04
+UMASK_TAD_REQUESTS_G0_REGION3 0x08
+UMASK_TAD_REQUESTS_G0_REGION4 0x10
+UMASK_TAD_REQUESTS_G0_REGION5 0x20
+UMASK_TAD_REQUESTS_G0_REGION6 0x40
+UMASK_TAD_REQUESTS_G0_REGION7 0x80
+
+EVENT_TAD_REQUESTS_G1 0x1C BBOX0|BBOX1
+UMASK_TAD_REQUESTS_G1_REGION8 0x01
+UMASK_TAD_REQUESTS_G1_REGION9 0x02
+UMASK_TAD_REQUESTS_G1_REGION10 0x04
+UMASK_TAD_REQUESTS_G1_REGION11 0x08
+
+EVENT_TXR_AD_CYCLES_FULL 0x2A BBOX0|BBOX1
+UMASK_TXR_AD_CYCLES_FULL_SCHED0 0x01
+UMASK_TXR_AD_CYCLES_FULL_SCHED1 0x02
+UMASK_TXR_AD_CYCLES_FULL_ALL 0x03
+
+EVENT_TXR_AK 0x0E BBOX0|BBOX1
+UMASK_TXR_AK 0x00
+
+EVENT_TXR_AK_CYCLES_FULL 0x32 BBOX0|BBOX1
+UMASK_TXR_AK_CYCLES_FULL_SCHED0 0x01
+UMASK_TXR_AK_CYCLES_FULL_SCHED1 0x02
+UMASK_TXR_AK_CYCLES_FULL_ALL 0x03
+
+EVENT_TXR_BL 0x10 BBOX0|BBOX1
+UMASK_TXR_BL_DRS_CACHE 0x01
+UMASK_TXR_BL_DRS_CORE 0x02
+UMASK_TXR_BL_DRS_QPI 0x04
+
+EVENT_TXR_BL_CYCLES_FULL 0x36 BBOX0|BBOX1
+UMASK_TXR_BL_CYCLES_FULL_SCHED0 0x01
+UMASK_TXR_BL_CYCLES_FULL_SCHED1 0x02
+UMASK_TXR_BL_CYCLES_FULL_ALL 0x03
+
+EVENT_TXR_BL_OCCUPANCY 0x34 BBOX0|BBOX1
+UMASK_TXR_BL_OCCUPANCY_SCHED0 0x01
+UMASK_TXR_BL_OCCUPANCY_SCHED1 0x02
+
+EVENT_WPQ_CYCLES_NO_REG_CREDITS 0x18 BBOX0|BBOX1
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0 0x01
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1 0x02
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2 0x04
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3 0x08
+
+EVENT_CORES_IN_C3 0x00 WBOX0FIX
+UMASK_CORES_IN_C3 0x00
+
+EVENT_CORES_IN_C6 0x00 WBOX1FIX
+UMASK_CORES_IN_C6 0x00
+
+EVENT_WBOX_CLOCKTICKS 0x00 WBOX
+UMASK_WBOX_CLOCKTICKS 0x00
+
+EVENT_CORE0_TRANSITION_CYCLES 0x70 WBOX
+UMASK_CORE0_TRANSITION_CYCLES 0x00
+
+EVENT_CORE1_TRANSITION_CYCLES 0x71 WBOX
+UMASK_CORE1_TRANSITION_CYCLES 0x00
+
+EVENT_CORE2_TRANSITION_CYCLES 0x72 WBOX
+UMASK_CORE2_TRANSITION_CYCLES 0x00
+
+EVENT_CORE3_TRANSITION_CYCLES 0x73 WBOX
+UMASK_CORE3_TRANSITION_CYCLES 0x00
+
+EVENT_CORE4_TRANSITION_CYCLES 0x74 WBOX
+UMASK_CORE4_TRANSITION_CYCLES 0x00
+
+EVENT_CORE5_TRANSITION_CYCLES 0x75 WBOX
+UMASK_CORE5_TRANSITION_CYCLES 0x00
+
+EVENT_CORE6_TRANSITION_CYCLES 0x76 WBOX
+UMASK_CORE6_TRANSITION_CYCLES 0x00
+
+EVENT_CORE7_TRANSITION_CYCLES 0x77 WBOX
+UMASK_CORE7_TRANSITION_CYCLES 0x00
+
+EVENT_CORE8_TRANSITION_CYCLES 0x78 WBOX
+UMASK_CORE8_TRANSITION_CYCLES 0x00
+
+EVENT_CORE9_TRANSITION_CYCLES 0x79 WBOX
+UMASK_CORE9_TRANSITION_CYCLES 0x00
+
+EVENT_CORE10_TRANSITION_CYCLES 0x7A WBOX
+UMASK_CORE10_TRANSITION_CYCLES 0x00
+
+EVENT_CORE11_TRANSITION_CYCLES 0x7B WBOX
+UMASK_CORE11_TRANSITION_CYCLES 0x00
+
+EVENT_CORE12_TRANSITION_CYCLES 0x7C WBOX
+UMASK_CORE12_TRANSITION_CYCLES 0x00
+
+EVENT_CORE13_TRANSITION_CYCLES 0x7D WBOX
+UMASK_CORE13_TRANSITION_CYCLES 0x00
+
+EVENT_CORE14_TRANSITION_CYCLES 0x7E WBOX
+UMASK_CORE14_TRANSITION_CYCLES 0x00
+
+EVENT_DELAYED_C_STATE_ABORT_CORE0 0x17 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE0 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE1 0x18 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE1 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE2 0x19 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE2 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE3 0x1A WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE3 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE4 0x1B WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE4 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE5 0x1C WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE5 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE6 0x1D WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE6 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE7 0x1E WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE7 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE8 0x1F WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE8 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE9 0x20 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE9 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE10 0x21 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE10 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE11 0x22 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE11 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE12 0x23 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE12 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE13 0x24 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE13 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE14 0x25 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE14 0x00 0x01
+
+EVENT_DEMOTIONS_CORE0 0x1E WBOX
+OPTIONS_DEMOTIONS_CORE0 EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE0 0x00
+
+EVENT_DEMOTIONS_CORE1 0x1F WBOX
+OPTIONS_DEMOTIONS_CORE1 EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE1 0x00
+
+EVENT_DEMOTIONS_CORE2 0x20 WBOX
+OPTIONS_DEMOTIONS_CORE2 EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE2 0x00
+
+EVENT_DEMOTIONS_CORE3 0x21 WBOX
+OPTIONS_DEMOTIONS_CORE3 EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE3 0x00
+
+EVENT_DEMOTIONS_CORE4 0x22 WBOX
+OPTIONS_DEMOTIONS_CORE4 EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE4 0x00
+
+EVENT_DEMOTIONS_CORE5 0x23 WBOX
+OPTIONS_DEMOTIONS_CORE5 EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE5 0x00
+
+EVENT_DEMOTIONS_CORE6 0x24 WBOX
+OPTIONS_DEMOTIONS_CORE6 EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE6 0x00
+
+EVENT_DEMOTIONS_CORE7 0x25 WBOX
+OPTIONS_DEMOTIONS_CORE7 EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE7 0x00
+
+EVENT_DEMOTIONS_CORE8 0x40 WBOX
+OPTIONS_DEMOTIONS_CORE8 EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE8 0x00
+
+EVENT_DEMOTIONS_CORE9 0x41 WBOX
+OPTIONS_DEMOTIONS_CORE9 EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE9 0x00
+
+EVENT_DEMOTIONS_CORE10 0x42 WBOX
+OPTIONS_DEMOTIONS_CORE10 EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE10 0x00
+
+EVENT_DEMOTIONS_CORE11 0x43 WBOX
+OPTIONS_DEMOTIONS_CORE11 EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE11 0x00
+
+EVENT_DEMOTIONS_CORE12 0x44 WBOX
+OPTIONS_DEMOTIONS_CORE12 EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE12 0x00
+
+EVENT_DEMOTIONS_CORE13 0x45 WBOX
+OPTIONS_DEMOTIONS_CORE13 EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE13 0x00
+
+EVENT_DEMOTIONS_CORE14 0x46 WBOX
+OPTIONS_DEMOTIONS_CORE14 EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE14 0x00
+
+EVENT_FREQ_BAND0_CYCLES 0x0B WBOX
+OPTIONS_FREQ_BAND0_CYCLES EVENT_OPTION_MATCH0_MASK
+UMASK_FREQ_BAND0_CYCLES 0x00
+
+EVENT_FREQ_BAND1_CYCLES 0x0C WBOX
+OPTIONS_FREQ_BAND1_CYCLES EVENT_OPTION_MATCH0_MASK
+UMASK_FREQ_BAND1_CYCLES 0x00
+
+EVENT_FREQ_BAND2_CYCLES 0x0D WBOX
+OPTIONS_FREQ_BAND2_CYCLES EVENT_OPTION_MATCH0_MASK
+UMASK_FREQ_BAND2_CYCLES 0x00
+
+EVENT_FREQ_BAND3_CYCLES 0x0E WBOX
+OPTIONS_FREQ_BAND3_CYCLES EVENT_OPTION_MATCH0_MASK
+UMASK_FREQ_BAND3_CYCLES 0x00
+
+EVENT_FREQ_MAX_CURRENT_CYCLES 0x07 WBOX
+UMASK_FREQ_MAX_CURRENT_CYCLES 0x00
+
+EVENT_FREQ_MAX_LIMIT_THERMAL_CYCLES 0x04 WBOX
+UMASK_FREQ_MAX_LIMIT_THERMAL_CYCLES 0x00
+
+EVENT_FREQ_MAX_OS_CYCLES 0x06 WBOX
+UMASK_FREQ_MAX_OS_CYCLES 0x00
+
+EVENT_FREQ_MAX_POWER_CYCLES 0x05 WBOX
+UMASK_FREQ_MAX_POWER_CYCLES 0x00
+
+EVENT_FREQ_MIN_IO_P_CYCLES 0x61 WBOX
+UMASK_FREQ_MIN_IO_P_CYCLES 0x00
+
+EVENT_FREQ_MIN_PERF_P_CYCLES 0x02 WBOX
+UMASK_FREQ_MIN_PERF_P_CYCLES 0x00
+
+EVENT_FREQ_TRANS_CYCLES 0x60 WBOX
+UMASK_FREQ_TRANS_CYCLES 0x00
+
+EVENT_MEMORY_PHASE_SHEDDING_CYCLES 0x2F WBOX
+UMASK_MEMORY_PHASE_SHEDDING_CYCLES 0x00
+
+EVENT_PKG_C_EXIT_LATENCY 0x26 WBOX
+UMASK_PKG_C_EXIT_LATENCY 0x00 0x01
+
+EVENT_POWER_STATE_OCCUPANCY 0x80 WBOX
+UMASK_POWER_STATE_OCCUPANCY_CORES_C0 0x40
+UMASK_POWER_STATE_OCCUPANCY_CORES_C3 0x80
+UMASK_POWER_STATE_OCCUPANCY_CORES_C6 0xC0
+
+EVENT_PROCHOT_EXTERNAL_CYCLES 0x0A WBOX
+UMASK_PROCHOT_EXTERNAL_CYCLES 0x00
+
+EVENT_PROCHOT_INTERNAL_CYCLES 0x09 WBOX
+UMASK_PROCHOT_INTERNAL_CYCLES 0x00
+
+EVENT_TOTAL_TRANSITION_CYCLES 0x63 WBOX
+UMASK_TOTAL_TRANSITION_CYCLES 0x00
+
+EVENT_VOLT_TRANS_CYCLES_CHANGE 0x03 WBOX
+UMASK_VOLT_TRANS_CYCLES_CHANGE 0x00
+
+EVENT_VOLT_TRANS_CYCLES_DECREASE 0x02 WBOX
+UMASK_VOLT_TRANS_CYCLES_DECREASE 0x00
+
+EVENT_VOLT_TRANS_CYCLES_INCREASE 0x01 WBOX
+UMASK_VOLT_TRANS_CYCLES_INCREASE 0x00
+
+EVENT_VR_HOT_CYCLES 0x32 WBOX
+UMASK_VR_HOT_CYCLES 0x00
+
+EVENT_PBOX_CLOCKTICKS 0x01 PBOX
+UMASK_PBOX_CLOCKTICKS 0x00
+
+EVENT_RING_AD_USED 0x07 PBOX
+UMASK_RING_AD_USED_CW_VR0_EVEN 0x01
+UMASK_RING_AD_USED_CW_VR0_ODD 0x02
+UMASK_RING_AD_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AD_USED_CCW_VR0_ODD 0x08
+UMASK_RING_AD_USED_CW_VR1_EVEN 0x10
+UMASK_RING_AD_USED_CW_VR1_ODD 0x20
+UMASK_RING_AD_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_AD_USED_CCW_VR1_ODD 0x80
+UMASK_RING_AD_USED_CW 0x33
+UMASK_RING_AD_USED_CCW 0xCC
+UMASK_RING_AD_USED_ANY 0xFF
+
+EVENT_RING_AK_USED 0x08 PBOX
+UMASK_RING_AK_USED_CW_VR0_EVEN 0x01
+UMASK_RING_AK_USED_CW_VR0_ODD 0x02
+UMASK_RING_AK_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AK_USED_CCW_VR0_ODD 0x08
+UMASK_RING_AK_USED_CW_VR1_EVEN 0x10
+UMASK_RING_AK_USED_CW_VR1_ODD 0x20
+UMASK_RING_AK_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_AK_USED_CCW_VR1_ODD 0x80
+UMASK_RING_AK_USED_CW 0x33
+UMASK_RING_AK_USED_CCW 0xCC
+UMASK_RING_AK_USED_ANY 0xFF
+
+EVENT_RING_BL_USED 0x09 PBOX
+UMASK_RING_BL_USED_CW_VR0_EVEN 0x01
+UMASK_RING_BL_USED_CW_VR0_ODD 0x02
+UMASK_RING_BL_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_BL_USED_CCW_VR0_ODD 0x08
+UMASK_RING_BL_USED_CW_VR1_EVEN 0x10
+UMASK_RING_BL_USED_CW_VR1_ODD 0x20
+UMASK_RING_BL_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_BL_USED_CCW_VR1_ODD 0x80
+UMASK_RING_BL_USED_CW 0x33
+UMASK_RING_BL_USED_CCW 0xCC
+UMASK_RING_BL_USED_ANY 0xFF
+
+EVENT_RING_IV_USED 0x09 PBOX
+UMASK_RING_IV_USED_CW 0x33
+UMASK_RING_IV_USED_CCW 0xCC
+UMASK_RING_IV_USED_ANY 0xFF
+
+EVENT_RXR_AK_BOUNCES 0x12 PBOX
+UMASK_RXR_AK_BOUNCES_CW 0x01
+UMASK_RXR_AK_BOUNCES_CCW 0x02
+
+EVENT_RXR_CYCLES_NE 0x10 PBOX
+UMASK_RXR_CYCLES_NE_NCB 0x10
+UMASK_RXR_CYCLES_NE_NCS 0x20
+
+EVENT_RXR_INSERTS 0x11 PBOX
+UMASK_RXR_INSERTS_NCB 0x10
+UMASK_RXR_INSERTS_NCS 0x20
+
+EVENT_RXR_OCCUPANCY 0x13 PBOX
+UMASK_RXR_OCCUPANCY_DRS 0x08
+
+EVENT_TXR_CYCLES_FULL 0x25 PBOX
+UMASK_TXR_CYCLES_FULL_AD 0x01
+UMASK_TXR_CYCLES_FULL_AK 0x02
+UMASK_TXR_CYCLES_FULL_BL 0x04
+
+EVENT_TXR_CYCLES_NE 0x23 PBOX
+UMASK_TXR_CYCLES_NE_AD 0x01
+UMASK_TXR_CYCLES_NE_AK 0x02
+UMASK_TXR_CYCLES_NE_BL 0x04
+
+EVENT_TXR_NACK_CW 0x26 PBOX
+UMASK_TXR_NACK_CW_AD 0x01
+UMASK_TXR_NACK_CW_AK 0x02
+UMASK_TXR_NACK_CW_BL 0x04
+
+EVENT_TXR_NACK_CCW 0x28 PBOX
+UMASK_TXR_NACK_CCW_AD 0x01
+UMASK_TXR_NACK_CCW_AK 0x02
+UMASK_TXR_NACK_CCW_BL 0x04
+
+EVENT_RBOX_CLOCKTICKS 0x01 RBOX
+UMASK_RBOX_CLOCKTICKS 0x00
+
+EVENT_C_LO_AD_CREDITS_EMPTY 0x2B RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO0 0x01
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO1 0x02
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO2 0x04
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO3 0x08
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO4 0x10
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO5 0x20
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO6 0x40
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO7 0x80
+
+EVENT_C_HI_AD_CREDITS_EMPTY 0x2C RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO8 0x01
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO9 0x02
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO10 0x04
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO11 0x08
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO12 0x10
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO13 0x20
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO14 0x40
+
+EVENT_HA_R2_BL_CREDITS_EMPTY 0x2F RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_HA_R2_BL_CREDITS_EMPTY_HA0 0x01
+UMASK_HA_R2_BL_CREDITS_EMPTY_HA1 0x02
+UMASK_HA_R2_BL_CREDITS_EMPTY_R2_NCB 0x04
+UMASK_HA_R2_BL_CREDITS_EMPTY_R2_NCS 0x08
+
+EVENT_QPI0_AD_CREDITS_EMPTY 0x29 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI0_AD_CREDITS_EMPTY_VNA 0x01
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_QPI0_BL_CREDITS_EMPTY 0x2D RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI0_BL_CREDITS_EMPTY_VNA 0x01
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_QPI1_AD_CREDITS_EMPTY 0x2A RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI1_AD_CREDITS_EMPTY_VNA 0x01
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_QPI1_BL_CREDITS_EMPTY 0x2E RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI1_BL_CREDITS_EMPTY_VNA 0x01
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_RING_AD_USED 0x07 RBOX
+UMASK_RING_AD_USED_CW_VR0_EVEN 0x01
+UMASK_RING_AD_USED_CW_VR0_ODD 0x02
+UMASK_RING_AD_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AD_USED_CCW_VR0_ODD 0x08
+UMASK_RING_AD_USED_CW 0x33
+UMASK_RING_AD_USED_CCW 0xCC
+UMASK_RING_AD_USED_ANY 0xFF
+
+EVENT_RING_AK_USED 0x08 RBOX
+UMASK_RING_AK_USED_CW_VR0_EVEN 0x01
+UMASK_RING_AK_USED_CW_VR0_ODD 0x02
+UMASK_RING_AK_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AK_USED_CCW_VR0_ODD 0x08
+UMASK_RING_AK_USED_CW 0x33
+UMASK_RING_AK_USED_CCW 0xCC
+UMASK_RING_AK_USED_ANY 0xFF
+
+EVENT_RING_BL_USED 0x09 RBOX
+UMASK_RING_BL_USED_CW_VR0_EVEN 0x01
+UMASK_RING_BL_USED_CW_VR0_ODD 0x02
+UMASK_RING_BL_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_BL_USED_CCW_VR0_ODD 0x08
+UMASK_RING_BL_USED_CW 0x33
+UMASK_RING_BL_USED_CCW 0xCC
+UMASK_RING_BL_USED_ANY 0xFF
+
+EVENT_RING_IV_USED 0x0A RBOX
+UMASK_RING_IV_USED_CW 0x33
+UMASK_RING_IV_USED_CCW 0xCC
+UMASK_RING_IV_USED_ANY 0xFF
+
+EVENT_RXR_AD_BYPASSED 0x12 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_AD_BYPASSED 0x00
+
+EVENT_RXR_CYCLES_NE 0x10 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_CYCLES_NE_HOM 0x01
+UMASK_RXR_CYCLES_NE_SNP 0x02
+UMASK_RXR_CYCLES_NE_NDR 0x04
+
+EVENT_RXR_INSERTS 0x11 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_INSERTS_HOM 0x01
+UMASK_RXR_INSERTS_SNP 0x02
+UMASK_RXR_INSERTS_NDR 0x04
+UMASK_RXR_INSERTS_DRS 0x08
+UMASK_RXR_INSERTS_NCB 0x10
+UMASK_RXR_INSERTS_NCS 0x20
+
+EVENT_RXR_OCCUPANCY 0x13 RBOX0C0|RBOX1C0
+UMASK_RXR_OCCUPANCY_HOM 0x01
+UMASK_RXR_OCCUPANCY_SNP 0x02
+UMASK_RXR_OCCUPANCY_NDR 0x04
+UMASK_RXR_OCCUPANCY_DRS 0x08
+UMASK_RXR_OCCUPANCY_NCB 0x10
+UMASK_RXR_OCCUPANCY_NCS 0x20
+
+EVENT_TXR_CYCLES_FULL 0x25 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_CYCLES_FULL 0x00
+
+EVENT_TXR_CYCLES_NE 0x23 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_CYCLES_NE 0x00
+
+EVENT_TXR_NACK_CW 0x26 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_NACK_CW_AD 0x01
+UMASK_TXR_NACK_CW_AK 0x02
+UMASK_TXR_NACK_CW_BL 0x04
+
+EVENT_TXR_NACK_CCW 0x28 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_NACK_CCW_AD 0x01
+UMASK_TXR_NACK_CCW_AK 0x02
+UMASK_TXR_NACK_CCW_BL 0x04
+
+EVENT_VN0_CREDITS_REJECT 0x37 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_REJECT_HOM 0x01
+UMASK_VN0_CREDITS_REJECT_SNP 0x02
+UMASK_VN0_CREDITS_REJECT_NDR 0x04
+UMASK_VN0_CREDITS_REJECT_DRS 0x08
+UMASK_VN0_CREDITS_REJECT_NCB 0x10
+UMASK_VN0_CREDITS_REJECT_NCS 0x20
+
+EVENT_VN0_CREDITS_USED 0x36 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_USED_HOM 0x01
+UMASK_VN0_CREDITS_USED_SNP 0x02
+UMASK_VN0_CREDITS_USED_NDR 0x04
+UMASK_VN0_CREDITS_USED_DRS 0x08
+UMASK_VN0_CREDITS_USED_NCB 0x10
+UMASK_VN0_CREDITS_USED_NCS 0x20
+
+EVENT_VN1_CREDITS_REJECT 0x39 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN1_CREDITS_REJECT_HOM 0x01
+UMASK_VN1_CREDITS_REJECT_SNP 0x02
+UMASK_VN1_CREDITS_REJECT_NDR 0x04
+UMASK_VN1_CREDITS_REJECT_DRS 0x08
+UMASK_VN1_CREDITS_REJECT_NCB 0x10
+UMASK_VN1_CREDITS_REJECT_NCS 0x20
+
+EVENT_VN1_CREDITS_USED 0x38 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN1_CREDITS_USED_HOM 0x01
+UMASK_VN1_CREDITS_USED_SNP 0x02
+UMASK_VN1_CREDITS_USED_NDR 0x04
+UMASK_VN1_CREDITS_USED_DRS 0x08
+UMASK_VN1_CREDITS_USED_NCB 0x10
+UMASK_VN1_CREDITS_USED_NCS 0x20
+
+EVENT_VNA_CREDITS_ACQUIRED 0x33 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_ACQUIRED_AD 0x01
+UMASK_VNA_CREDITS_ACQUIRED_BL 0x04
+
+EVENT_VNA_CREDITS_REJECT 0x34 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_REJECT_HOM 0x01
+UMASK_VNA_CREDITS_REJECT_SNP 0x02
+UMASK_VNA_CREDITS_REJECT_NDR 0x04
+UMASK_VNA_CREDITS_REJECT_DRS 0x08
+UMASK_VNA_CREDITS_REJECT_NCB 0x10
+UMASK_VNA_CREDITS_REJECT_NCS 0x20
+
+EVENT_VNA_CREDIT_CYCLES_OUT 0x31 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDIT_CYCLES_OUT 0x00
+
+EVENT_VNA_CREDIT_CYCLES_USED 0x32 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDIT_CYCLES_USED 0x00
+
+EVENT_IBOX_CLOCKTICKS 0x00 IBOX
+UMASK_IBOX_CLOCKTICKS 0x00
+
+EVENT_ADDRESS_MATCH 0x17 IBOX
+UMASK_ADDRESS_MATCH_STALL_COUNT 0x01
+UMASK_ADDRESS_MATCH_MERGE_COUNT 0x02
+
+EVENT_CACHE_ACK_PENDING_OCCUPANCY 0x14 IBOX
+UMASK_CACHE_ACK_PENDING_OCCUPANCY_ANY 0x01
+UMASK_CACHE_ACK_PENDING_OCCUPANCY_SOURCE 0x02
+
+EVENT_CACHE_OWN_OCCUPANCY 0x13 IBOX
+UMASK_CACHE_OWN_OCCUPANCY_ANY 0x01
+UMASK_CACHE_OWN_OCCUPANCY_SOURCE 0x02
+
+EVENT_CACHE_READ_OCCUPANCY 0x10 IBOX
+UMASK_CACHE_READ_OCCUPANCY_ANY 0x01
+UMASK_CACHE_READ_OCCUPANCY_SOURCE 0x02
+
+EVENT_CACHE_TOTAL_OCCUPANCY 0x12 IBOX
+UMASK_CACHE_TOTAL_OCCUPANCY_ANY 0x01
+UMASK_CACHE_TOTAL_OCCUPANCY_SOURCE 0x02
+
+EVENT_CACHE_WRITE_OCCUPANCY 0x11 IBOX
+UMASK_CACHE_WRITE_OCCUPANCY_ANY 0x01
+UMASK_CACHE_WRITE_OCCUPANCY_SOURCE 0x02
+
+EVENT_RXR_AK_CYCLES_FULL 0x0B IBOX
+UMASK_RXR_AK_CYCLES_FULL 0x00
+
+EVENT_RXR_AK_INSERTS 0x0A IBOX
+UMASK_RXR_AK_INSERTS 0x00
+
+EVENT_RXR_AK_OCCUPANCY 0x0C IBOX
+UMASK_RXR_AK_OCCUPANCY 0x00
+
+EVENT_RXR_BL_DRS_CYCLES_FULL 0x04 IBOX
+UMASK_RXR_BL_DRS_CYCLES_FULL 0x00
+
+EVENT_RXR_BL_DRS_INSERTS 0x01 IBOX
+UMASK_RXR_BL_DRS_INSERTS 0x00
+
+EVENT_RXR_BL_DRS_OCCUPANCY 0x07 IBOX
+UMASK_RXR_BL_DRS_OCCUPANCY 0x00
+
+EVENT_RXR_BL_NCB_CYCLES_FULL 0x05 IBOX
+UMASK_RXR_BL_NCB_CYCLES_FULL 0x00
+
+EVENT_RXR_BL_NCB_INSERTS 0x02 IBOX
+UMASK_RXR_BL_NCB_INSERTS 0x00
+
+EVENT_RXR_BL_NCB_OCCUPANCY 0x08 IBOX
+UMASK_RXR_BL_NCB_OCCUPANCY 0x00
+
+EVENT_RXR_BL_NCS_CYCLES_FULL 0x06 IBOX
+UMASK_RXR_BL_NCS_CYCLES_FULL 0x00
+
+EVENT_RXR_BL_NCS_INSERTS 0x03 IBOX
+UMASK_RXR_BL_NCS_INSERTS 0x00
+
+EVENT_RXR_BL_NCS_OCCUPANCY 0x09 IBOX
+UMASK_RXR_BL_NCS_OCCUPANCY 0x00
+
+EVENT_TICKLES 0x16 IBOX
+UMASK_TICKLES_LOST_OWNERSHIP 0x01
+UMASK_TICKLES_TOP_OF_QUEUE 0x02
+
+EVENT_TRANSACTIONS 0x15 IBOX
+UMASK_TRANSACTIONS_READS 0x01
+UMASK_TRANSACTIONS_WRITES 0x02
+UMASK_TRANSACTIONS_RD_PREFETCHES 0x04
+UMASK_TRANSACTIONS_ORDERINGQ 0x08
+
+EVENT_TXR_AD_STALL_CREDIT_CYCLES 0x18 IBOX
+UMASK_TXR_AD_STALL_CREDIT_CYCLES 0x00
+
+EVENT_TXR_BL_STALL_CREDIT_CYCLES 0x19 IBOX
+UMASK_TXR_BL_STALL_CREDIT_CYCLES 0x00
+
+EVENT_TXR_DATA_INSERTS_NCB 0x0E IBOX
+UMASK_TXR_DATA_INSERTS_NCB 0x00
+
+EVENT_TXR_DATA_INSERTS_NCS 0x0F IBOX
+UMASK_TXR_DATA_INSERTS_NCS 0x00
+
+EVENT_TXR_REQUEST_OCCUPANCY 0x0D IBOX
+UMASK_TXR_REQUEST_OCCUPANCY 0x00
+
+EVENT_WRITE_ORDERING_STALL_CYCLES 0x1A IBOX
+UMASK_WRITE_ORDERING_STALL_CYCLES 0x00
diff --git a/src/includes/perfmon_ivybridge_counters.h b/src/includes/perfmon_ivybridge_counters.h
index e63dfb0..742b230 100644
--- a/src/includes/perfmon_ivybridge_counters.h
+++ b/src/includes/perfmon_ivybridge_counters.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_ivybridge_counters.h
*
- * Description: Counter header file of perfmon module for Ivy Bridge.
+ * Description: Counter header file of perfmon module for Intel Ivy Bridge.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
* Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -30,46 +31,57 @@
#define NUM_COUNTERS_CORE_IVYBRIDGE 8
#define NUM_COUNTERS_UNCORE_IVYBRIDGE 12
-#define NUM_COUNTERS_IVYBRIDGE 32
+#define NUM_COUNTERS_IVYBRIDGE 23
-static PerfmonCounterMap ivybridge_counter_map[NUM_COUNTERS_IVYBRIDGE] = {
+
+#define IVB_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|\
+ EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define IVB_VALID_OPTIONS_FIXED EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK
+#define IVB_VALID_OPTIONS_CBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
+#define IVB_VALID_OPTIONS_UBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
+
+static RegisterMap ivybridge_counter_map[NUM_COUNTERS_IVYBRIDGE] = {
/* Fixed Counters: instructions retired, cycles unhalted core */
- {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
- {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
- {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+ {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, IVB_VALID_OPTIONS_FIXED},
+ {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, IVB_VALID_OPTIONS_FIXED},
+ {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, IVB_VALID_OPTIONS_FIXED},
/* PMC Counters: 4 48bit wide */
- {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
- {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
- {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0},
- {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0},
+ {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, IVB_VALID_OPTIONS_PMC},
+ {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, IVB_VALID_OPTIONS_PMC},
+ {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, IVB_VALID_OPTIONS_PMC},
+ {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, IVB_VALID_OPTIONS_PMC},
/* Temperature Sensor*/
- {"TMP0", PMC7, THERMAL, 0, 0, 0, 0},
+ {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
/* RAPL counters */
- {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
- {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0},
- {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0},
- {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0},
- /* IMC Counters: 4 48bit wide per memory channel, split in two reads */
- {"MBOX0C0",PMC12, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_2},
- {"MBOX0C1",PMC13, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_2},
- {"MBOX0C2",PMC14, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_2},
- {"MBOX0C3",PMC15, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_2},
- {"MBOX0FIX",PMC16, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_2},
- {"MBOX1C0",PMC17, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_3},
- {"MBOX1C1",PMC18, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_3},
- {"MBOX1C2",PMC19, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_3},
- {"MBOX1C3",PMC20, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_3},
- {"MBOX1FIX",PMC21, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_3},
- {"MBOX2C0",PMC22, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_0},
- {"MBOX2C1",PMC23, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_0},
- {"MBOX2C2",PMC24, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_0},
- {"MBOX2C3",PMC25, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_0},
- {"MBOX2FIX",PMC26, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_0},
- {"MBOX3C0",PMC27, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_1},
- {"MBOX3C1",PMC28, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_1},
- {"MBOX3C2",PMC29, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_1},
- {"MBOX3C3",PMC30, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_1},
- {"MBOX3FIX",PMC31, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_1},
+ {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"CBOX0C0", PMC12, CBOX0, MSR_UNC_CBO_0_PERFEVTSEL0, MSR_UNC_CBO_0_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+ {"CBOX0C1", PMC13, CBOX0, MSR_UNC_CBO_0_PERFEVTSEL1, MSR_UNC_CBO_0_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+ {"CBOX1C0", PMC14, CBOX1, MSR_UNC_CBO_1_PERFEVTSEL0, MSR_UNC_CBO_1_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+ {"CBOX1C1", PMC15, CBOX1, MSR_UNC_CBO_1_PERFEVTSEL1, MSR_UNC_CBO_1_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+ {"CBOX2C0", PMC16, CBOX2, MSR_UNC_CBO_2_PERFEVTSEL0, MSR_UNC_CBO_2_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+ {"CBOX2C1", PMC17, CBOX2, MSR_UNC_CBO_2_PERFEVTSEL1, MSR_UNC_CBO_2_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+ {"CBOX3C0", PMC18, CBOX3, MSR_UNC_CBO_3_PERFEVTSEL0, MSR_UNC_CBO_3_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+ {"CBOX3C1", PMC19, CBOX3, MSR_UNC_CBO_3_PERFEVTSEL1, MSR_UNC_CBO_3_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+ {"UBOX0", PMC20, UBOX, MSR_UNC_ARB_PERFEVTSEL0, MSR_UNC_ARB_CTR0, 0, 0, IVB_VALID_OPTIONS_UBOX},
+ {"UBOX1", PMC21, UBOX, MSR_UNC_ARB_PERFEVTSEL1, MSR_UNC_ARB_CTR1, 0, 0, IVB_VALID_OPTIONS_UBOX},
+ {"UBOXFIX", PMC22, UBOXFIX, MSR_UNC_PERF_FIXED_CTRL, MSR_UNC_PERF_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+};
+
+
+static BoxMap ivybridge_box_map[NUM_UNITS] = {
+ [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+ [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+ [THERMAL] = {0, 0, 0, 0, 0, MSR_DEV, 8},
+ [POWER] = {0, 0, 0, 0, 0, MSR_DEV, 32},
+ [CBOX0] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+ [CBOX1] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+ [CBOX2] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+ [CBOX3] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+ [UBOX] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 1, 0, MSR_DEV, 44},
+ [UBOXFIX] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 44},
};
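
The extra last column in each RegisterMap entry is the set of EVENT_OPTION_* modifiers the counter accepts, and the last BoxMap field appears to record the usable bit width of each register (48 bits for the core PMCs, 44 for the C-Box/U-Box counters, 32 for the RAPL registers, 8 for the thermal status). A minimal sketch of applying such a width to a raw register read; the helper below is illustrative only and not upstream code (the patch itself uses a field64() routine for this, visible in the K10 changes further down):

    #include <assert.h>
    #include <stdint.h>

    /* Sketch only: truncate a raw MSR read to the bit width recorded in the
     * box map (48 for core PMCs, 44 for the C-Box/U-Box counters, 8 for the
     * thermal status register). The helper name is not upstream code. */
    static inline uint64_t truncate_to_width(uint64_t raw, int regWidth)
    {
        if (regWidth <= 0 || regWidth >= 64)
            return raw;
        return raw & ((1ULL << regWidth) - 1ULL);
    }

    int main(void)
    {
        /* Bits above 47 are not part of a 48-bit PMC value. */
        assert(truncate_to_width(0xFFFF123456789ABCULL, 48) == 0x123456789ABCULL);
        /* The 8-bit THERMAL entry keeps only the low byte. */
        assert(truncate_to_width(0x1234ULL, 8) == 0x34ULL);
        return 0;
    }
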
diff --git a/src/includes/perfmon_ivybridge_events.txt b/src/includes/perfmon_ivybridge_events.txt
index 5318ce6..1ff619a 100644
--- a/src/includes/perfmon_ivybridge_events.txt
+++ b/src/includes/perfmon_ivybridge_events.txt
@@ -1,16 +1,17 @@
# =======================================================================================
-#
+#
# Filename: perfmon_ivybridge_events.txt
-#
+#
# Description: Event list for Intel Ivy Bridge
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -26,8 +27,8 @@
#
# =======================================================================================
-EVENT_TEMP_CORE 0x00 TMP0
-UMASK_TEMP_CORE 0x00
+EVENT_TEMP_CORE 0x00 TMP0
+UMASK_TEMP_CORE 0x00
EVENT_PWR_PKG_ENERGY 0x00 PWR0
UMASK_PWR_PKG_ENERGY 0x00
@@ -35,101 +36,134 @@ UMASK_PWR_PKG_ENERGY 0x00
EVENT_PWR_PP0_ENERGY 0x00 PWR1
UMASK_PWR_PP0_ENERGY 0x00
-EVENT_PWR_DRAM_ENERGY 0x00 PWR3
-UMASK_PWR_DRAM_ENERGY 0x00
+EVENT_PWR_PP1_ENERGY 0x00 PWR2
+UMASK_PWR_PP1_ENERGY 0x00
-EVENT_INSTR_RETIRED 0x00 FIXC0
-UMASK_INSTR_RETIRED_ANY 0x00
+EVENT_PWR_DRAM_ENERGY 0x00 PWR3
+UMASK_PWR_DRAM_ENERGY 0x00
-EVENT_CPU_CLK_UNHALTED 0x00 FIXC1
-UMASK_CPU_CLK_UNHALTED_CORE 0x00
+EVENT_INSTR_RETIRED 0x00 FIXC0
+UMASK_INSTR_RETIRED_ANY 0x00
-EVENT_CPU_CLK_UNHALTED 0x00 FIXC2
-UMASK_CPU_CLK_UNHALTED_REF 0x00
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE 0x00
+
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC2
+UMASK_CPU_CLK_UNHALTED_REF 0x00
EVENT_LD_BLOCKS 0x03 PMC
UMASK_LD_BLOCKS_STORE_FORWARD 0x02
+UMASK_LD_BLOCKS_NO_SR 0x08
-EVENT_MISALIGN_MEM_REF 0x05 PMC
+EVENT_MISALIGN_MEM_REF 0x05 PMC
UMASK_MISALIGN_MEM_REF_LOADS 0x01
UMASK_MISALIGN_MEM_REF_STORES 0x02
UMASK_MISALIGN_MEM_REF_ANY 0x03
-EVENT_LD_BLOCKS_PARTIAL 0x07 PMC
-UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS 0x01 PMC
+EVENT_LD_BLOCKS_PARTIAL 0x07 PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS 0x01
EVENT_DTLB_LOAD_MISSES 0x08 PMC
UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK 0x81
UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED 0x82
UMASK_DTLB_LOAD_MISSES_WALK_DURATION 0x84
-EVENT_UOPS_ISSUED 0x0E PMC
-UMASK_UOPS_ISSUED_ANY 0x01
-UMASK_UOPS_ISSUED_FLAGS_MERGE 0x10
-UMASK_UOPS_ISSUED_SLOW_LEA 0x20
-UMASK_UOPS_ISSUED_SINGLE_MUL 0x40
-
-EVENT_FP_COMP_OPS_EXE 0x10 PMC
+EVENT_INT_MISC 0x0D PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_INT_MISC_RECOVERY_CYCLES 0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_RECOVERY_COUNT 0x03
+
+EVENT_UOPS_ISSUED 0x0E PMC
+UMASK_UOPS_ISSUED_ANY 0x01
+UMASK_UOPS_ISSUED_FLAGS_MERGE 0x10
+UMASK_UOPS_ISSUED_SLOW_LEA 0x20
+UMASK_UOPS_ISSUED_SINGLE_MUL 0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ANY 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_FLAGS_MERGE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_FLAGS_MERGE 0x10
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SLOW_LEA EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SLOW_LEA 0x20
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SINGLE_MUL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SINGLE_MUL 0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_FP_COMP_OPS_EXE 0x10 PMC
UMASK_FP_COMP_OPS_EXE_X87 0x01
UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE 0x10
UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE 0x20
UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE 0x40
UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE 0x80
-EVENT_SIMD_FP_256_PACKED 0x11 PMC
+EVENT_SIMD_FP_256_PACKED 0x11 PMC
UMASK_SIMD_FP_256_PACKED_SINGLE 0x01
UMASK_SIMD_FP_256_PACKED_DOUBLE 0x02
EVENT_ARITH 0x14 PMC
UMASK_ARITH_FPU_DIV_ACTIVE 0x01
-UMASK_ARITH_NUM_DIV 0x01 0xC5 0x01
+DEFAULT_OPTIONS_ARITH_NUM_DIV EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_ARITH_NUM_DIV 0x01
-EVENT_L2_RQSTS 0x24 PMC
+EVENT_L2_RQSTS 0x24 PMC
UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_HIT 0x01
UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD 0x03
-UMASK_L2_RQSTS_RFO_HITS 0x04
-UMASK_L2_RQSTS_RFO_MISS 0x08
-UMASK_L2_RQSTS_RFO_ANY 0x0C
-UMASK_L2_RQSTS_CODE_RD_HITS 0x10
-UMASK_L2_RQSTS_CODE_RD_MISS 0x20
-UMASK_L2_RQSTS_ALL_CODE_CODE_RD 0x30
-UMASK_L2_RQSTS_PF_HIT 0x40
-UMASK_L2_RQSTS_PF_MISS 0x80
-UMASK_L2_RQSTS_ALL_PF 0xC0
-UMASK_L2_RQSTS_MISS 0xAA
+UMASK_L2_RQSTS_RFO_HITS 0x04
+UMASK_L2_RQSTS_RFO_MISS 0x08
+UMASK_L2_RQSTS_RFO_ANY 0x0C
+UMASK_L2_RQSTS_CODE_RD_HITS 0x10
+UMASK_L2_RQSTS_CODE_RD_MISS 0x20
+UMASK_L2_RQSTS_ALL_CODE_CODE_RD 0x30
+UMASK_L2_RQSTS_PF_HIT 0x40
+UMASK_L2_RQSTS_PF_MISS 0x80
+UMASK_L2_RQSTS_ALL_PF 0xC0
+UMASK_L2_RQSTS_MISS 0xAA
EVENT_L2_STORE_LOCK_RQSTS 0x27 PMC
UMASK_L2_STORE_LOCK_RQSTS_MISS 0x01
-UMASK_L2_STORE_LOCK_RQSTS_HIT_M 0x08
+UMASK_L2_STORE_LOCK_RQSTS_HIT_M 0x08
UMASK_L2_STORE_LOCK_RQSTS_ALL 0x0F
EVENT_L1D_WB_RQST 0x28 PMC
-UMASK_L1D_WB_RQST_HIT_E 0x04
-UMASK_L1D_WB_RQST_HIT_M 0x08
-UMASK_L1D_WB_RQST_ALL 0x0F
+UMASK_L1D_WB_RQST_HIT_E 0x04
+UMASK_L1D_WB_RQST_HIT_M 0x08
+UMASK_L1D_WB_RQST_ALL 0x0F
EVENT_L3_LAT_CACHE 0x2E PMC
UMASK_L3_LAT_CACHE_REFERENCE 0x4F
UMASK_L3_LAT_CACHE_MISS 0x41
-EVENT_CPU_CLOCK_UNHALTED 0x3C PMC
+EVENT_CPU_CLOCK_UNHALTED 0x3C PMC
UMASK_CPU_CLOCK_UNHALTED_THREAD_P 0x00
UMASK_CPU_CLOCK_UNHALTED_REF_P 0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES 0x00
EVENT_L1D_PEND_MISS 0x48 PMC1
UMASK_L1D_PEND_MISS_PENDING 0x01
-EVENT_DTLB_STORE_MISSES 0x49 PMC
-UMASK_DTLB_STORE_MISSES_MISS_CAUSES_A_WALK 0x01
+EVENT_DTLB_STORE_MISSES 0x49 PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK 0x01
UMASK_DTLB_STORE_MISSES_WALK_COMPLETED 0x02
-UMASK_DTLB_STORE_MISSES_WALK_DURATION 0x04
+UMASK_DTLB_STORE_MISSES_WALK_DURATION 0x04
UMASK_DTLB_STORE_MISSES_STLB_HIT 0x10
-EVENT_LOAD_HIT_PRE 0x4C PMC
+EVENT_LOAD_HIT_PRE 0x4C PMC
UMASK_LOAD_HIT_PRE_SW_PF 0x01
UMASK_LOAD_HIT_PRE_HW_PF 0x02
-EVENT_L1D 0x51 PMC
+EVENT_L1D 0x51 PMC
UMASK_L1D_REPLACEMENT 0x01
UMASK_L1D_ALLOCATED_IN_M 0x02
UMASK_L1D_M_EVICT 0x04
@@ -142,40 +176,45 @@ UMASK_MOVE_ELIMINATION_INT_ELIMINATED 0x01
UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED 0x02
EVENT_CPL_CYCLES 0x5C PMC
-UMASK_CPL_CYCLES_RING0 0x01
-UMASK_CPL_CYCLES_RING123 0x02
+UMASK_CPL_CYCLES_RING0 0x01
+UMASK_CPL_CYCLES_RING123 0x02
-EVENT_RS_EVENTS 0x5E PMC
+EVENT_RS_EVENTS 0x5E PMC
UMASK_RS_EVENTS_EMPTY_CYCLES 0x01
-EVENT_DTLB_LOAD_MISSES_STLB 0x5F PMC
-UMASK_DTLB_LOAD_MISSES_STLB_HIT 0x04
-
-EVENT_OFFCORE_REQUESTS_OUTSTANDING 0x60 PMC
+EVENT_OFFCORE_REQUESTS_OUTSTANDING 0x60 PMC
UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD 0x01
UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD 0x02
-UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO 0x04
-UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD 0x08
-
-EVENT_CACHE_LOCK_CYCLES 0x63 PMC
-UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
-UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION 0x02
-
-EVENT_IDQ 0x79 PMC
-UMASK_IDQ_EMPTY 0x02
-UMASK_IDQ_MITE_UOPS 0x04
-UMASK_IDQ_DSB_UOPS 0x08
-UMASK_IDQ_MS_DSB_UOPS 0x10
-UMASK_IDQ_MS_MITE_UOPS 0x20
-UMASK_IDQ_MS_UOPS 0x30
-UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS 0x18 0x00 0x01
-UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS 0x18 0x00 0x04
-UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS 0x24 0x00 0x01
-UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS 0x24 0x00 0x04
-UMASK_IDQ_ALL_MITE_ALL_UOPS 0x3C
-
-EVENT_ICACHE 0x80 PMC
-UMASK_ICACHE_HITS 0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO 0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD 0x08
+
+EVENT_CACHE_LOCK_CYCLES 0x63 PMC
+UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
+DEFAULT_OPTIONS_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_COUNT EVENT_OPTION_EDGE=1
+UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_COUNT 0x01
+UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION 0x02
+DEFAULT_OPTIONS_CACHE_LOCK_CYCLES_CACHE_LOCK_COUNT EVENT_OPTION_EDGE=1
+UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_COUNT 0x02
+
+EVENT_IDQ 0x79 PMC
+UMASK_IDQ_EMPTY 0x02
+UMASK_IDQ_MITE_UOPS 0x04
+UMASK_IDQ_DSB_UOPS 0x08
+UMASK_IDQ_MS_DSB_UOPS 0x10
+UMASK_IDQ_MS_MITE_UOPS 0x20
+UMASK_IDQ_MS_UOPS 0x30
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS 0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS 0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS 0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS 0x24
+UMASK_IDQ_ALL_MITE_ALL_UOPS 0x3C
+
+EVENT_ICACHE 0x80 PMC
+UMASK_ICACHE_HITS 0x01
UMASK_ICACHE_MISSES 0x02
UMASK_ICACHE_ACCESSES 0x03
UMASK_ICACHE_IFETCH_STALL 0x04
@@ -184,7 +223,7 @@ EVENT_ITLB_MISSES 0x85 PMC
UMASK_ITLB_MISSES_CAUSES_A_WALK 0x01
UMASK_ITLB_MISSES_WALK_COMPLETED 0x02
UMASK_ITLB_MISSES_WALK_DURATION 0x04
-UMASK_ITLB_MISSES_STLB_HIT 0x10
+UMASK_ITLB_MISSES_STLB_HIT 0x10
EVENT_ILD_STALL 0x87 PMC
UMASK_ILD_STALL_LCP 0x01
@@ -201,9 +240,9 @@ UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN 0x88
UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN 0x48
UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN 0x50
-UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
-UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
-UMASK_BR_INST_EXEC_ALL_BRANCHES 0xFF
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
+UMASK_BR_INST_EXEC_ALL_BRANCHES 0xFF
EVENT_BR_MISP_EXEC 0x89 PMC
UMASK_BR_MISP_EXEC_COND_TAKEN 0x81
@@ -218,54 +257,100 @@ UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
UMASK_BR_MISP_EXEC_ALL_BRANCHES 0xFF
-EVENT_IDQ_UOPS_NOT_DELIVERED 0x9C PMC
-UMASK_IDQ_UOPS_NOT_DELIVERED_CORE 0x01
-
-EVENT_UOPS_DISPATCHED_PORT 0xA1 PMC
+EVENT_IDQ_UOPS_NOT_DELIVERED 0x9C PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x3
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x2
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK 0x01
+
+EVENT_UOPS_DISPATCHED_PORT 0xA1 PMC
UMASK_UOPS_DISPATCHED_PORT_PORT_0 0x01
UMASK_UOPS_DISPATCHED_PORT_PORT_1 0x02
UMASK_UOPS_DISPATCHED_PORT_PORT_2_LD 0x04
UMASK_UOPS_DISPATCHED_PORT_PORT_2_STA 0x08
UMASK_UOPS_DISPATCHED_PORT_PORT_2 0x0C
-UMASK_UOPS_DISPATCHED_PORT_PORT_3_LD 0x10
-UMASK_UOPS_DISPATCHED_PORT_PORT_3_STA 0x20
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_LD 0x10
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_STA 0x20
UMASK_UOPS_DISPATCHED_PORT_PORT_3 0x30
UMASK_UOPS_DISPATCHED_PORT_PORT_4 0x40
UMASK_UOPS_DISPATCHED_PORT_PORT_5 0x80
+DEFAULT_OPTIONS_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE 0x83
+UMASK_UOPS_DISPATCHED_PORT_DATA_PORTS 0x7C
+UMASK_UOPS_DISPATCHED_PORT_ALL_PORTS 0xFF
EVENT_RESOURCE_STALLS 0xA2 PMC
UMASK_RESOURCE_STALLS_ANY 0x01
UMASK_RESOURCE_STALLS_RS 0x04
-UMASK_RESOURCE_STALLS_B 0x08
+UMASK_RESOURCE_STALLS_SB 0x08
UMASK_RESOURCE_STALLS_ROB 0x10
-EVENT_CYCLE_ACTIVITY 0xA3 PMC
+EVENT_CYCLE_ACTIVITY 0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L2_PENDING EVENT_OPTION_THRESHOLD=0x01
UMASK_CYCLE_ACTIVITY_CYCLES_L2_PENDING 0x01
-UMASK_CYCLE_ACTIVITY_CYCLES_LDM_PENDING 0x02
-UMASK_CYCLE_ACTIVITY_L1D_PENDING 0x08
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_LDM_PENDING EVENT_OPTION_THRESHOLD=0x02
+UMASK_CYCLE_ACTIVITY_CYCLES_LDM_PENDING 0x02
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_L1D_PENDING EVENT_OPTION_THRESHOLD=0x08
+UMASK_CYCLE_ACTIVITY_L1D_PENDING 0x08
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE EVENT_OPTION_THRESHOLD=0x04
UMASK_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE 0x04
-EVENT_DSB2MITE_SWITCHES 0xAB PMC
-UMASK_DSB2MITE_SWITCHES_COUNT 0x01
-UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
+EVENT_DSB2MITE_SWITCHES 0xAB PMC
+UMASK_DSB2MITE_SWITCHES_COUNT 0x01
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
-EVENT_DSB_FILL 0xAC PMC
-UMASK_DSB_FILL_EXCEED_DSB_LINES 0x08
+EVENT_DSB_FILL 0xAC PMC
+UMASK_DSB_FILL_EXCEED_DSB_LINES 0x08
-EVENT_ITLB 0xAE PMC
-UMASK_ITLB_ITLB_FLUSH 0x01
+EVENT_ITLB 0xAE PMC
+UMASK_ITLB_ITLB_FLUSH 0x01
-EVENT_OFFCORE_REQUESTS 0xB0 PMC
+EVENT_OFFCORE_REQUESTS 0xB0 PMC
UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD 0x01
UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD 0x02
UMASK_OFFCORE_REQUESTS_DEMAND_RFO 0x04
UMASK_OFFCORE_REQUESTS_ALL_DATA_RD 0x08
-EVENT_UOPS_EXECUTED 0xB1 PMC
-UMASK_UOPS_EXECUTED_THREAD 0x01
-UMASK_UOPS_EXECUTED_CORE 0x02
-
-EVENT_TLB_FLUSH 0xBD PMC
+EVENT_UOPS_EXECUTED 0xB1 PMC
+UMASK_UOPS_EXECUTED_THREAD 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_TLB_FLUSH 0xBD PMC
UMASK_TLB_FLUSH_DTLB_THREAD 0x01
UMASK_TLB_FLUSH_STLB_ANY 0x20
@@ -274,15 +359,38 @@ UMASK_INST_RETIRED_ANY_P 0x00
UMASK_INST_RETIRED_ALL 0x01
EVENT_OTHER_ASSISTS 0xC1 PMC
-UMASK_OTHER_ASSISTS_AVX_STORE 0x08
-UMASK_OTHER_ASSISTS_AVX_TO_SSE 0x10
-UMASK_OTHER_ASSISTS_SSE_TO_AVX 0x20
-
-EVENT_UOPS_RETIRED 0xC2 PMC
-UMASK_UOPS_RETIRED_ALL 0x01
-UMASK_UOPS_RETIRED_RETIRE_SLOTS 0x02
-
-EVENT_MACHINE_CLEARS 0xC3 PMC
+UMASK_OTHER_ASSISTS_AVX_STORE 0x08
+UMASK_OTHER_ASSISTS_AVX_TO_SSE 0x10
+UMASK_OTHER_ASSISTS_SSE_TO_AVX 0x20
+
+EVENT_UOPS_RETIRED 0xC2 PMC
+UMASK_UOPS_RETIRED_ALL 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL 0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_MACHINE_CLEARS 0xC3 PMC
+UMASK_MACHINE_CLEARS_CYCLES 0x01
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_MACHINE_CLEARS_COUNT 0x01
UMASK_MACHINE_CLEARS_MEMORY_ORDERING 0x02
UMASK_MACHINE_CLEARS_SMC 0x04
UMASK_MACHINE_CLEARS_MASKMOV 0x20
@@ -291,7 +399,6 @@ EVENT_BR_INST_RETIRED 0xC4 PMC
UMASK_BR_INST_RETIRED_ALL_BRANCHES 0x00
UMASK_BR_INST_RETIRED_CONDITIONAL 0x01
UMASK_BR_INST_RETIRED_NEAR_CALL 0x02
-UMASK_BR_INST_RETIRED_ALL_BRANCHES 0x04
UMASK_BR_INST_RETIRED_NEAR_RETURN 0x08
UMASK_BR_INST_RETIRED_NOT_TAKEN 0x10
UMASK_BR_INST_RETIRED_NEAR_TAKEN 0x20
@@ -299,31 +406,30 @@ UMASK_BR_INST_RETIRED_FAR_BRANCH 0x40
EVENT_BR_MISP_RETIRED 0xC5 PMC
UMASK_BR_MISP_RETIRED_ALL_BRANCHES 0x00
-UMASK_BR_MISP_RETIRED_CONDITIONAL 0x01
+UMASK_BR_MISP_RETIRED_CONDITIONAL 0x01
UMASK_BR_MISP_RETIRED_NEAR_CALL 0x02
-UMASK_BR_MISP_RETIRED_ALL_BRANCHES 0x04
-UMASK_BR_MISP_RETIRED_NOT_TAKEN 0x10
-UMASK_BR_MISP_RETIRED_TAKEN 0x20
+UMASK_BR_MISP_RETIRED_NOT_TAKEN 0x10
+UMASK_BR_MISP_RETIRED_TAKEN 0x20
-EVENT_FP_ASSIST 0xCA PMC
-UMASK_FP_ASSIST_X87_OUTPUT 0x02
-UMASK_FP_ASSIST_X87_INPUT 0x04
-UMASK_FP_ASSIST_SIMD_OUTPUT 0x08
-UMASK_FP_ASSIST_SIMD_INPUT 0x10
-UMASK_FP_ASSIST_ANY 0x1E
+EVENT_FP_ASSIST 0xCA PMC
+UMASK_FP_ASSIST_X87_OUTPUT 0x02
+UMASK_FP_ASSIST_X87_INPUT 0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT 0x08
+UMASK_FP_ASSIST_SIMD_INPUT 0x10
+UMASK_FP_ASSIST_ANY 0x1E
EVENT_ROB_MISC_EVENT_LBR_INSERTS 0xCC PMC
UMASK_ROB_MISC_EVENT_LBR_INSERTS 0x20
-EVENT_MEM_UOP_RETIRED 0xD0 PMC
-UMASK_MEM_UOP_RETIRED_LOADS 0x81
-UMASK_MEM_UOP_RETIRED_STORES 0x82
-UMASK_MEM_UOP_RETIRED_LOADS_STLB_MISS 0x11
-UMASK_MEM_UOP_RETIRED_STORES_STLB_MISS 0x12
-UMASK_MEM_UOP_RETIRED_LOADS_LOCK 0x21
-UMASK_MEM_UOP_RETIRED_STORES_LOCK 0x22
-UMASK_MEM_UOP_RETIRED_LOADS_SPLIT 0x41
-UMASK_MEM_UOP_RETIRED_STORES_SPLIT 0x42
+EVENT_MEM_UOPS_RETIRED 0xD0 PMC
+UMASK_MEM_UOPS_RETIRED_LOADS 0x81
+UMASK_MEM_UOPS_RETIRED_STORES 0x82
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS 0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK 0x21
+UMASK_MEM_UOPS_RETIRED_STORES_LOCK 0x22
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT 0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT 0x42
EVENT_MEM_LOAD_UOPS_RETIRED 0xD1 PMC
UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT 0x01
@@ -336,326 +442,153 @@ UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT 0x04
UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS 0x20
UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL 0x24
UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB 0x40
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL 0x7F
-EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED 0xD2 PMC
+EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED 0xD2 PMC
UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS 0x01
UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT 0x02
UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM 0x04
UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE 0x08
-EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED 0xD3 PMC
-UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM 0x01
-
EVENT_BACLEARS 0xE6 PMC
UMASK_BACLEARS_ANY 0x1F
-EVENT_L2_TRANS 0xF0 PMC
-UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
-UMASK_L2_TRANS_RFO 0x02
-UMASK_L2_TRANS_CODE_RD 0x04
-UMASK_L2_TRANS_ALL_PREF 0x08
-UMASK_L2_TRANS_L1D_WB 0x10
-UMASK_L2_TRANS_L2_FILL 0x20
-UMASK_L2_TRANS_L2_WB 0x40
-UMASK_L2_TRANS_ALL_REQUESTS 0x80
-
-EVENT_L2_LINES_IN 0xF1 PMC
+EVENT_L2_TRANS 0xF0 PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO 0x02
+UMASK_L2_TRANS_CODE_RD 0x04
+UMASK_L2_TRANS_ALL_PREF 0x08
+UMASK_L2_TRANS_L1D_WB 0x10
+UMASK_L2_TRANS_L2_FILL 0x20
+UMASK_L2_TRANS_L2_WB 0x40
+UMASK_L2_TRANS_ALL_REQUESTS 0x80
+
+EVENT_L2_LINES_IN 0xF1 PMC
UMASK_L2_LINES_IN_I 0x01
-UMASK_L2_LINES_IN_S 0x02
+UMASK_L2_LINES_IN_S 0x02
UMASK_L2_LINES_IN_E 0x04
-UMASK_L2_LINES_IN_ALL 0x07
+UMASK_L2_LINES_IN_ALL 0x07
EVENT_L2_LINES_OUT 0xF2 PMC
UMASK_L2_LINES_OUT_DEMAND_CLEAN 0x01
UMASK_L2_LINES_OUT_DEMAND_DIRTY 0x02
-UMASK_L2_LINES_OUT_PF_CLEAN 0x04
-UMASK_L2_LINES_OUT_PF_DIRTY 0x08
-UMASK_L2_LINES_OUT_DIRTY_ALL 0x0A
-
-EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED 0xD3 PMC
-UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM 0x03
-UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_DRAM 0x0C
-UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM 0x10
-UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_FWD 0x20
-
-EVENT_DRAM_CLOCKTICKS 0x00 MBOX
-UMASK_DRAM_CLOCKTICKS 0x00
-
-EVENT_ACT_COUNT 0x01 MBOX
-UMASK_ACT_COUNT_RD 0x01
-UMASK_ACT_COUNT_WR 0x02
-UMASK_ACT_COUNT_BYP 0x08
-
-EVENT_BYP_CMDS 0xA1 MBOX
-UMASK_BYP_CMDS_ACT 0x01
-UMASK_BYP_CMDS_CAS 0x02
-UMASK_BYP_CMDS_PRE 0x04
-
-EVENT_CAS_COUNT 0x04 MBOX
-UMASK_CAS_COUNT_RD_REG 0x01
-UMASK_CAS_COUNT_RD_UNDERFILL 0x02
-UMASK_CAS_COUNT_RD 0x03
-UMASK_CAS_COUNT_WR_WMM 0x04
-UMASK_CAS_COUNT_WR_RMM 0x08
-UMASK_CAS_COUNT_WR 0x0C
-UMASK_CAS_COUNT_ALL 0x0F
-UMASK_CAS_COUNT_RD_WMM 0x01
-UMASK_CAS_COUNT_RD_RMM 0x02
-
-EVENT_DRAM_PRE_ALL 0x06 MBOX
-UMASK_DRAM_PRE_ALL 0x00
-
-EVENT_DRAM_REFRESH 0x05 MBOX
-UMASK_DRAM_REFRESH_PANIC 0x02
-UMASK_DRAM_REFRESH_HIGH 0x04
-
-EVENT_ECC_CORRECTABLE_ERRORS 0x09 MBOX
-UMASK_ECC_CORRECTABLE_ERRORS 0x00
-
-EVENT_MAJOR_MODES 0x07 MBOX
-UMASK_MAJOR_MODES_READ 0x01
-UMASK_MAJOR_MODES_WRITE 0x02
-UMASK_MAJOR_MODES_PARTIAL 0x04
-UMASK_MAJOR_MODES_ISOCH 0x08
-
-EVENT_POWER_CHANNEL_DLLOFF 0x84 MBOX
-UMASK_POWER_CHANNEL_DLLOFF 0x00
-
-EVENT_POWER_CHANNEL_PPD 0x85 MBOX
-UMASK_POWER_CHANNEL_PPD 0x00
-
-EVENT_POWER_CKE_CYCLES 0x83 MBOX
-UMASK_POWER_CKE_CYCLES_RANK0 0x01
-UMASK_POWER_CKE_CYCLES_RANK1 0x02
-UMASK_POWER_CKE_CYCLES_RANK2 0x04
-UMASK_POWER_CKE_CYCLES_RANK3 0x08
-UMASK_POWER_CKE_CYCLES_RANK4 0x10
-UMASK_POWER_CKE_CYCLES_RANK5 0x20
-UMASK_POWER_CKE_CYCLES_RANK6 0x40
-UMASK_POWER_CKE_CYCLES_RANK7 0x80
-
-EVENT_POWER_CRITICAL_THROTTLE_CYCLES 0x86 MBOX
-UMASK_POWER_CRITICAL_THROTTLE_CYCLES 0x00
-
-EVENT_POWER_PCU_THROTTLING 0x42 MBOX
-UMASK_POWER_PCU_THROTTLING 0x00
-
-EVENT_POWER_SELF_REFRESH 0x43 MBOX
-UMASK_POWER_SELF_REFRESH 0x00
-
-EVENT_POWER_THROTTLE_CYCLES 0x41 MBOX
-UMASK_POWER_THROTTLE_CYCLES_RANK0 0x01
-UMASK_POWER_THROTTLE_CYCLES_RANK1 0x02
-UMASK_POWER_THROTTLE_CYCLES_RANK2 0x04
-UMASK_POWER_THROTTLE_CYCLES_RANK3 0x08
-UMASK_POWER_THROTTLE_CYCLES_RANK4 0x10
-UMASK_POWER_THROTTLE_CYCLES_RANK5 0x20
-UMASK_POWER_THROTTLE_CYCLES_RANK6 0x40
-UMASK_POWER_THROTTLE_CYCLES_RANK7 0x80
-
-EVENT_PREEMPTION 0x08 MBOX
-UMASK_PREEMPTION_RD_PREEMPT_RD 0x01
-UMASK_PREEMPTION_RD_PREEMPT_WR 0x02
-
-EVENT_PRE_COUNT 0x02 MBOX
-UMASK_PRE_COUNT_PAGE_MISS 0x01
-UMASK_PRE_COUNT_PAGE_CLOSE 0x02
-
-EVENT_RD_CAS_PRIO 0xA0 MBOX
-UMASK_RD_CAS_PRIO_LOW 0x01
-UMASK_RD_CAS_PRIO_MED 0x02
-UMASK_RD_CAS_PRIO_HIGH 0x04
-UMASK_RD_CAS_PRIO_PANIC 0x08
-
-EVENT_RD_CAS_RANK0 0xB0 MBOX
-UMASK_RD_CAS_RANK0_BANK0 0x01
-UMASK_RD_CAS_RANK0_BANK1 0x02
-UMASK_RD_CAS_RANK0_BANK2 0x04
-UMASK_RD_CAS_RANK0_BANK3 0x08
-UMASK_RD_CAS_RANK0_BANK4 0x10
-UMASK_RD_CAS_RANK0_BANK5 0x20
-UMASK_RD_CAS_RANK0_BANK6 0x40
-UMASK_RD_CAS_RANK0_BANK7 0x80
-
-EVENT_RD_CAS_RANK1 0xB1 MBOX
-UMASK_RD_CAS_RANK1_BANK0 0x01
-UMASK_RD_CAS_RANK1_BANK1 0x02
-UMASK_RD_CAS_RANK1_BANK2 0x04
-UMASK_RD_CAS_RANK1_BANK3 0x08
-UMASK_RD_CAS_RANK1_BANK4 0x10
-UMASK_RD_CAS_RANK1_BANK5 0x20
-UMASK_RD_CAS_RANK1_BANK6 0x40
-UMASK_RD_CAS_RANK1_BANK7 0x80
-
-EVENT_RD_CAS_RANK2 0xB2 MBOX
-UMASK_RD_CAS_RANK2_BANK0 0x01
-UMASK_RD_CAS_RANK2_BANK1 0x02
-UMASK_RD_CAS_RANK2_BANK2 0x04
-UMASK_RD_CAS_RANK2_BANK3 0x08
-UMASK_RD_CAS_RANK2_BANK4 0x10
-UMASK_RD_CAS_RANK2_BANK5 0x20
-UMASK_RD_CAS_RANK2_BANK6 0x40
-UMASK_RD_CAS_RANK2_BANK7 0x80
-
-EVENT_RD_CAS_RANK3 0xB3 MBOX
-UMASK_RD_CAS_RANK3_BANK0 0x01
-UMASK_RD_CAS_RANK3_BANK1 0x02
-UMASK_RD_CAS_RANK3_BANK2 0x04
-UMASK_RD_CAS_RANK3_BANK3 0x08
-UMASK_RD_CAS_RANK3_BANK4 0x10
-UMASK_RD_CAS_RANK3_BANK5 0x20
-UMASK_RD_CAS_RANK3_BANK6 0x40
-UMASK_RD_CAS_RANK3_BANK7 0x80
-
-EVENT_RD_CAS_RANK4 0xB4 MBOX
-UMASK_RD_CAS_RANK4_BANK0 0x01
-UMASK_RD_CAS_RANK4_BANK1 0x02
-UMASK_RD_CAS_RANK4_BANK2 0x04
-UMASK_RD_CAS_RANK4_BANK3 0x08
-UMASK_RD_CAS_RANK4_BANK4 0x10
-UMASK_RD_CAS_RANK4_BANK5 0x20
-UMASK_RD_CAS_RANK4_BANK6 0x40
-UMASK_RD_CAS_RANK4_BANK7 0x80
-
-EVENT_RD_CAS_RANK5 0xB5 MBOX
-UMASK_RD_CAS_RANK5_BANK0 0x01
-UMASK_RD_CAS_RANK5_BANK1 0x02
-UMASK_RD_CAS_RANK5_BANK2 0x04
-UMASK_RD_CAS_RANK5_BANK3 0x08
-UMASK_RD_CAS_RANK5_BANK4 0x10
-UMASK_RD_CAS_RANK5_BANK5 0x20
-UMASK_RD_CAS_RANK5_BANK6 0x40
-UMASK_RD_CAS_RANK5_BANK7 0x80
-
-EVENT_RD_CAS_RANK6 0xB6 MBOX
-UMASK_RD_CAS_RANK6_BANK0 0x01
-UMASK_RD_CAS_RANK6_BANK1 0x02
-UMASK_RD_CAS_RANK6_BANK2 0x04
-UMASK_RD_CAS_RANK6_BANK3 0x08
-UMASK_RD_CAS_RANK6_BANK4 0x10
-UMASK_RD_CAS_RANK6_BANK5 0x20
-UMASK_RD_CAS_RANK6_BANK6 0x40
-UMASK_RD_CAS_RANK6_BANK7 0x80
-
-EVENT_RD_CAS_RANK7 0xB7 MBOX
-UMASK_RD_CAS_RANK7_BANK0 0x01
-UMASK_RD_CAS_RANK7_BANK1 0x02
-UMASK_RD_CAS_RANK7_BANK2 0x04
-UMASK_RD_CAS_RANK7_BANK3 0x08
-UMASK_RD_CAS_RANK7_BANK4 0x10
-UMASK_RD_CAS_RANK7_BANK5 0x20
-UMASK_RD_CAS_RANK7_BANK6 0x40
-UMASK_RD_CAS_RANK7_BANK7 0x80
-
-EVENT_RPQ_CYCLES_NE 0x11 MBOX
-UMASK_RPQ_CYCLES_NE 0x00
-
-EVENT_RPQ_INSERTS 0x10 MBOX
-UMASK_RPQ_INSERTS 0x00
-
-EVENT_VMSE_MXB_WR_OCCUPANCY 0x91 MBOX
-UMASK_VMSE_MXB_WR_OCCUPANCY 0x00
-
-EVENT_VMSE_WR_PUSH 0x90 MBOX
-UMASK_VMSE_WR_PUSH 0x00
-
-EVENT_WMM_TO_RMM 0xC0 MBOX
-UMASK_WMM_TO_RMM 0x00
-
-EVENT_WPQ_CYCLES_FULL 0x22 MBOX
-UMASK_WPQ_CYCLES_FULL 0x00
-
-EVENT_WPQ_CYCLES_NE 0x21 MBOX
-UMASK_WPQ_CYCLES_NE 0x00
-
-EVENT_WPQ_INSERTS 0x20 MBOX
-UMASK_WPQ_INSERTS 0x00
-
-EVENT_WPQ_READ_HIT 0x23 MBOX
-UMASK_WPQ_READ_HIT 0x00
-
-EVENT_WPQ_WRITE_HIT 0x24 MBOX
-UMASK_WPQ_WRITE_HIT 0x00
-
-EVENT_WRONG_MM 0xC1 MBOX
-UMASK_WRONG_MM 0x00
-
-EVENT_WR_CAS_RANK0 0xB8 MBOX
-UMASK_WR_CAS_RANK0_BANK0 0x01
-UMASK_WR_CAS_RANK0_BANK1 0x02
-UMASK_WR_CAS_RANK0_BANK2 0x04
-UMASK_WR_CAS_RANK0_BANK3 0x08
-UMASK_WR_CAS_RANK0_BANK4 0x10
-UMASK_WR_CAS_RANK0_BANK5 0x20
-UMASK_WR_CAS_RANK0_BANK6 0x40
-UMASK_WR_CAS_RANK0_BANK7 0x80
-
-EVENT_WR_CAS_RANK1 0xB9 MBOX
-UMASK_WR_CAS_RANK1_BANK0 0x01
-UMASK_WR_CAS_RANK1_BANK1 0x02
-UMASK_WR_CAS_RANK1_BANK2 0x04
-UMASK_WR_CAS_RANK1_BANK3 0x08
-UMASK_WR_CAS_RANK1_BANK4 0x10
-UMASK_WR_CAS_RANK1_BANK5 0x20
-UMASK_WR_CAS_RANK1_BANK6 0x40
-UMASK_WR_CAS_RANK1_BANK7 0x80
-
-EVENT_WR_CAS_RANK2 0xBA MBOX
-UMASK_WR_CAS_RANK2_BANK0 0x01
-UMASK_WR_CAS_RANK2_BANK1 0x02
-UMASK_WR_CAS_RANK2_BANK2 0x04
-UMASK_WR_CAS_RANK2_BANK3 0x08
-UMASK_WR_CAS_RANK2_BANK4 0x10
-UMASK_WR_CAS_RANK2_BANK5 0x20
-UMASK_WR_CAS_RANK2_BANK6 0x40
-UMASK_WR_CAS_RANK2_BANK7 0x80
-
-EVENT_WR_CAS_RANK3 0xBB MBOX
-UMASK_WR_CAS_RANK3_BANK0 0x01
-UMASK_WR_CAS_RANK3_BANK1 0x02
-UMASK_WR_CAS_RANK3_BANK2 0x04
-UMASK_WR_CAS_RANK3_BANK3 0x08
-UMASK_WR_CAS_RANK3_BANK4 0x10
-UMASK_WR_CAS_RANK3_BANK5 0x20
-UMASK_WR_CAS_RANK3_BANK6 0x40
-UMASK_WR_CAS_RANK3_BANK7 0x80
-
-EVENT_WR_CAS_RANK4 0xBC MBOX
-UMASK_WR_CAS_RANK4_BANK0 0x01
-UMASK_WR_CAS_RANK4_BANK1 0x02
-UMASK_WR_CAS_RANK4_BANK2 0x04
-UMASK_WR_CAS_RANK4_BANK3 0x08
-UMASK_WR_CAS_RANK4_BANK4 0x10
-UMASK_WR_CAS_RANK4_BANK5 0x20
-UMASK_WR_CAS_RANK4_BANK6 0x40
-UMASK_WR_CAS_RANK4_BANK7 0x80
-
-EVENT_WR_CAS_RANK5 0xBD MBOX
-UMASK_WR_CAS_RANK5_BANK0 0x01
-UMASK_WR_CAS_RANK5_BANK1 0x02
-UMASK_WR_CAS_RANK5_BANK2 0x04
-UMASK_WR_CAS_RANK5_BANK3 0x08
-UMASK_WR_CAS_RANK5_BANK4 0x10
-UMASK_WR_CAS_RANK5_BANK5 0x20
-UMASK_WR_CAS_RANK5_BANK6 0x40
-UMASK_WR_CAS_RANK5_BANK7 0x80
-
-EVENT_WR_CAS_RANK6 0xBE MBOX
-UMASK_WR_CAS_RANK6_BANK0 0x01
-UMASK_WR_CAS_RANK6_BANK1 0x02
-UMASK_WR_CAS_RANK6_BANK2 0x04
-UMASK_WR_CAS_RANK6_BANK3 0x08
-UMASK_WR_CAS_RANK6_BANK4 0x10
-UMASK_WR_CAS_RANK6_BANK5 0x20
-UMASK_WR_CAS_RANK6_BANK6 0x40
-UMASK_WR_CAS_RANK6_BANK7 0x80
-
-EVENT_WR_CAS_RANK7 0xBF MBOX
-UMASK_WR_CAS_RANK7_BANK0 0x01
-UMASK_WR_CAS_RANK7_BANK1 0x02
-UMASK_WR_CAS_RANK7_BANK2 0x04
-UMASK_WR_CAS_RANK7_BANK3 0x08
-UMASK_WR_CAS_RANK7_BANK4 0x10
-UMASK_WR_CAS_RANK7_BANK5 0x20
-UMASK_WR_CAS_RANK7_BANK6 0x40
-UMASK_WR_CAS_RANK7_BANK7 0x80
+UMASK_L2_LINES_OUT_PF_CLEAN 0x04
+UMASK_L2_LINES_OUT_PF_DIRTY 0x08
+UMASK_L2_LINES_OUT_DIRTY_ALL 0x0A
+UMASK_L2_LINES_OUT_CLEAN_ALL 0x05
+UMASK_L2_LINES_OUT_ALL 0x0F
+
+EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED 0xD3 PMC
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM 0x01
+
+EVENT_OFFCORE_RESPONSE_0 0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1 0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY 0x01 0x0F 0x10
+
+
+EVENT_CACHE_LOOKUP 0x34 CBOX
+UMASK_CACHE_LOOKUP_M 0x01
+UMASK_CACHE_LOOKUP_E 0x02
+UMASK_CACHE_LOOKUP_S 0x04
+UMASK_CACHE_LOOKUP_I 0x08
+UMASK_CACHE_LOOKUP_READ_FILTER 0x10
+UMASK_CACHE_LOOKUP_WRITE_FILTER 0x20
+UMASK_CACHE_LOOKUP_EXTSNP_FILTER 0x40
+UMASK_CACHE_LOOKUP_ANY_REQUEST_FILTER 0x80
+UMASK_CACHE_LOOKUP_READ_M 0x11
+UMASK_CACHE_LOOKUP_WRITE_M 0x21
+UMASK_CACHE_LOOKUP_EXTSNP_M 0x41
+UMASK_CACHE_LOOKUP_ANY_M 0x81
+UMASK_CACHE_LOOKUP_READ_E 0x12
+UMASK_CACHE_LOOKUP_WRITE_E 0x22
+UMASK_CACHE_LOOKUP_EXTSNP_E 0x42
+UMASK_CACHE_LOOKUP_ANY_E 0x82
+UMASK_CACHE_LOOKUP_READ_S 0x14
+UMASK_CACHE_LOOKUP_WRITE_S 0x24
+UMASK_CACHE_LOOKUP_EXTSNP_S 0x44
+UMASK_CACHE_LOOKUP_ANY_S 0x84
+UMASK_CACHE_LOOKUP_READ_ES 0x16
+UMASK_CACHE_LOOKUP_WRITE_ES 0x26
+UMASK_CACHE_LOOKUP_EXTSNP_ES 0x46
+UMASK_CACHE_LOOKUP_ANY_ES 0x86
+UMASK_CACHE_LOOKUP_READ_I 0x18
+UMASK_CACHE_LOOKUP_WRITE_I 0x28
+UMASK_CACHE_LOOKUP_EXTSNP_I 0x48
+UMASK_CACHE_LOOKUP_ANY_I 0x88
+UMASK_CACHE_LOOKUP_READ_MESI 0x1F
+UMASK_CACHE_LOOKUP_WRITE_MESI 0x2F
+UMASK_CACHE_LOOKUP_EXTSNP_MESI 0x4F
+UMASK_CACHE_LOOKUP_ANY_MESI 0x8F
+
+EVENT_XSNP_RESPONSE 0x22 CBOX
+UMASK_XSNP_RESPONSE_MISS 0x01
+UMASK_XSNP_RESPONSE_INVAL 0x02
+UMASK_XSNP_RESPONSE_HIT 0x04
+UMASK_XSNP_RESPONSE_HITM 0x08
+UMASK_XSNP_RESPONSE_INVAL_M 0x10
+UMASK_XSNP_RESPONSE_EXTERNAL_FILTER 0x20
+UMASK_XSNP_RESPONSE_XCORE_FILTER 0x40
+UMASK_XSNP_RESPONSE_EVICTION_FILTER 0x80
+UMASK_XSNP_RESPONSE_MISS_EXTERNAL 0x21
+UMASK_XSNP_RESPONSE_MISS_XCORE 0x41
+UMASK_XSNP_RESPONSE_MISS_EVICTION 0x81
+UMASK_XSNP_RESPONSE_INVAL_EXTERNAL 0x22
+UMASK_XSNP_RESPONSE_INVAL_XCORE 0x42
+UMASK_XSNP_RESPONSE_INVAL_EVICTION 0x82
+UMASK_XSNP_RESPONSE_HIT_EXTERNAL 0x24
+UMASK_XSNP_RESPONSE_HIT_XCORE 0x44
+UMASK_XSNP_RESPONSE_HIT_EVICTION 0x84
+UMASK_XSNP_RESPONSE_HITM_EXTERNAL 0x28
+UMASK_XSNP_RESPONSE_HITM_XCORE 0x48
+UMASK_XSNP_RESPONSE_HITM_EVICTION 0x88
+
+EVENT_TRK_OCCUPANCY_ALL 0x80 UBOX0
+UMASK_TRK_OCCUPANCY_ALL 0x01
+DEFAULT_OPTIONS_TRK_OCCUPANCY_CYCLES_WITH_ANY_REQUEST EVENT_OPTION_THRESHOLD=0x1
+UMASK_TRK_OCCUPANCY_CYCLES_WITH_ANY_REQUEST 0x01
+DEFAULT_OPTIONS_TRK_OCCUPANCY_CYCLES_OVER_HALF_FULL EVENT_OPTION_THRESHOLD=0xA
+UMASK_TRK_OCCUPANCY_CYCLES_OVER_HALF_FULL 0x01
+
+EVENT_TRK_REQUESTS 0x81 UBOX
+UMASK_TRK_REQUESTS_ALL 0x01
+UMASK_TRK_REQUESTS_WRITES 0x20
+UMASK_TRK_REQUESTS_EVICTIONS 0x80
+
+EVENT_COH_TRK_OCCUPANCY 0x83 UBOX0
+UMASK_COH_TRK_OCCUPANCY 0x01
+
+EVENT_COH_TRK_REQUESTS 0x84 UBOX
+UMASK_COH_TRK_REQUESTS_ALL 0x01
+
+EVENT_UNCORE_CLOCK 0x00 UBOXFIX
+UMASK_UNCORE_CLOCK 0x01
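
In this file format an EVENT_<name> line binds an event id to the counters that may carry it, UMASK_<name> lines add the unit mask, and a DEFAULT_OPTIONS_<name> line attaches option defaults to the UMASK entry of the same name. A minimal sketch of requesting two of the events listed above from user code via the perfmon API declared in likwid.h; the prototypes are quoted from memory of the 4.x interface and may need adjustment, and the measured region is only a placeholder:

    #include <stdio.h>
    #include <stdlib.h>
    #include <likwid.h>

    int main(void)
    {
        int cpus[] = {0};   /* measure on CPU 0 only */

        if (topology_init() < 0 || perfmon_init(1, cpus) < 0)
        {
            fprintf(stderr, "likwid initialization failed\n");
            return EXIT_FAILURE;
        }

        /* EVENT:COUNTER pairs; options either default from the
         * DEFAULT_OPTIONS_* lines or can be appended to the string. */
        int gid = perfmon_addEventSet("INSTR_RETIRED_ANY:FIXC0,L2_RQSTS_MISS:PMC0");
        if (gid < 0 || perfmon_setupCounters(gid) < 0)
        {
            fprintf(stderr, "event setup failed\n");
            return EXIT_FAILURE;
        }

        perfmon_startCounters();
        /* ... region of interest ... */
        perfmon_stopCounters();

        printf("INSTR_RETIRED_ANY: %f\n", perfmon_getResult(gid, 0, 0));
        printf("L2_RQSTS_MISS:     %f\n", perfmon_getResult(gid, 1, 0));

        perfmon_finalize();
        topology_finalize();
        return EXIT_SUCCESS;
    }

Linking with -llikwid against the installed library should be enough to build the sketch, assuming counter access (MSR kernel module or access daemon) is configured as described in INSTALL.
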
diff --git a/src/includes/perfmon_k10.h b/src/includes/perfmon_k10.h
index cc614af..2a7bc59 100644
--- a/src/includes/perfmon_k10.h
+++ b/src/includes/perfmon_k10.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_k10.h
*
- * Description: Header file of perfmon module for K10
+ * Description: Header file of perfmon module for AMD K10
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -29,119 +30,197 @@
*/
#include <perfmon_k10_events.h>
-#include <perfmon_k10_groups.h>
#include <perfmon_k10_counters.h>
+#include <error.h>
static int perfmon_numCountersK10 = NUM_COUNTERS_K10;
-static int perfmon_numGroupsK10 = NUM_GROUPS_K10;
static int perfmon_numArchEventsK10 = NUM_ARCH_EVENTS_K10;
-void perfmon_init_k10(PerfmonThread *thread)
+int perfmon_init_k10(int cpu_id)
{
- uint64_t flags = 0x0ULL;
- int cpu_id = thread->processorId;
+ return 0;
+}
- msr_write(cpu_id, MSR_AMD_PERFEVTSEL0, 0x0ULL);
- msr_write(cpu_id, MSR_AMD_PERFEVTSEL1, 0x0ULL);
- msr_write(cpu_id, MSR_AMD_PERFEVTSEL2, 0x0ULL);
- msr_write(cpu_id, MSR_AMD_PERFEVTSEL3, 0x0ULL);
+int k10_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
+{
+ uint64_t flags = 0x0ULL;
- //flags |= (1<<16); /* user mode flag */
+ flags |= (1ULL<<16);
+ flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
- /*msr_write(cpu_id, MSR_AMD_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_AMD_PERFEVTSEL1, flags);
- msr_write(cpu_id, MSR_AMD_PERFEVTSEL2, flags);
- msr_write(cpu_id, MSR_AMD_PERFEVTSEL3, flags);*/
+ if (event->numberOfOptions > 0)
+ {
+ for(int j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<17);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ if ((event->options[j].value & 0xFFULL) < 0x04ULL)
+ {
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
}
-
-void perfmon_setupCounterThread_k10(
- int thread_id,
- PerfmonEvent* event,
- PerfmonCounterIndex index)
+int perfmon_setupCounterThread_k10(int thread_id, PerfmonEventSet* eventSet)
{
- uint64_t flags;
- uint64_t reg = k10_counter_map[index].configRegister;
- int cpu_id = perfmon_threadData[thread_id].processorId;
- perfmon_threadData[thread_id].counters[index].init = TRUE;
-
- flags |= (1<<16);
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ if (type == PMC)
+ {
+ k10_pmc_setup(cpu_id, index, event);
+ eventSet->events[i].threadCounter[thread_id].init = TRUE;
+ }
+ }
+ return 0;
+}
- /* AMD uses a 12 bit Event mask: [35:32][7:0] */
- flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
+int perfmon_startCountersThread_k10(int thread_id, PerfmonEventSet* eventSet)
+{
+ uint64_t flags = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if (perfmon_verbose)
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
- cpu_id,
- LLU_CAST reg,
- LLU_CAST flags);
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ uint32_t reg = counter_map[index].configRegister;
+ uint32_t counter = counter_map[index].counterRegister;
+ eventSet->events[i].threadCounter[thread_id].startData = 0;
+ eventSet->events[i].threadCounter[thread_id].counterData = 0;
+ VERBOSEPRINTREG(cpu_id, counter, 0x0ULL, CLEAR_PMC);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+ VERBOSEPRINTREG(cpu_id, reg, flags, READ_PMC_CTRL);
+ flags |= (1ULL<<22); /* enable flag */
+ VERBOSEPRINTREG(cpu_id, reg, flags, START_PMC);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+ }
}
- msr_write(cpu_id, reg , flags);
+ return 0;
}
-void perfmon_startCountersThread_k10(int thread_id)
+int perfmon_stopCountersThread_k10(int thread_id, PerfmonEventSet* eventSet)
{
- uint64_t flags;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ uint64_t flags = 0x0ULL;
+ uint64_t tmp;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- for ( int i=0; i<NUM_COUNTERS_K10; i++)
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- msr_write(cpu_id, k10_counter_map[i].counterRegister , 0x0ULL);
- flags = msr_read(cpu_id, k10_counter_map[i].configRegister);
- flags |= (1<<22); /* enable flag */
-
- if (perfmon_verbose)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- printf("perfmon_start_counters: Write Register 0x%llX , Flags: 0x%llX \n",
- LLU_CAST k10_counter_map[i].configRegister,
- LLU_CAST flags);
+ continue;
}
-
- msr_write(cpu_id, k10_counter_map[i].configRegister , flags);
+ tmp = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint32_t reg = counter_map[index].configRegister;
+ uint32_t counter = counter_map[index].counterRegister;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+ VERBOSEPRINTREG(cpu_id, reg, flags, READ_PMC_CTRL);
+ flags &= ~(1ULL<<22); /* clear enable flag */
+ VERBOSEPRINTREG(cpu_id, reg, flags, STOP_PMC);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
+ VERBOSEPRINTREG(cpu_id, counter, tmp, READ_PMC);
+ if (tmp < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(tmp, 0, box_map[type].regWidth);
}
}
+ return 0;
}
-void perfmon_stopCountersThread_k10(int thread_id)
+int perfmon_readCountersThread_k10(int thread_id, PerfmonEventSet* eventSet)
{
- uint64_t flags;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ uint64_t tmp;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- for ( int i=0; i<NUM_COUNTERS_K10; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- flags = msr_read(cpu_id, k10_counter_map[i].configRegister);
- flags &= ~(1<<22); /* clear enable flag */
- msr_write(cpu_id, k10_counter_map[i].configRegister , flags);
-
- if (perfmon_verbose)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- printf("perfmon_stop_counters: Write Register 0x%llX , Flags: 0x%llX \n",
- LLU_CAST k10_counter_map[i].configRegister,
- LLU_CAST flags);
+ continue;
}
-
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, k10_counter_map[i].counterRegister);
+ tmp = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint32_t counter = counter_map[index].counterRegister;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
+ VERBOSEPRINTREG(cpu_id, counter, tmp, READ_PMC);
+ if (tmp < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(tmp, 0, box_map[type].regWidth);
}
}
+ return 0;
}
-void perfmon_readCountersThread_k10(int thread_id)
+
+int perfmon_finalizeCountersThread_k10(int thread_id, PerfmonEventSet* eventSet)
{
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- for ( int i=0; i<NUM_COUNTERS_K10; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, k10_counter_map[i].counterRegister);
+ continue;
}
+ RegisterIndex index = eventSet->events[i].index;
+ uint32_t reg = counter_map[index].configRegister;
+ if (reg)
+ {
+ VERBOSEPRINTREG(cpu_id, reg, 0x0ULL, CLEAR_CTRL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+ }
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
}
+ return 0;
}
-
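
The rewritten k10_pmc_setup() packs the event selector exactly as the removed comment described: AMD event ids are 12 bits wide, split across selector bits [35:32] and [7:0], the unit mask sits in bits [15:8], bit 16 enables user-mode counting, and the start routine later ORs in enable bit 22. A short self-contained check of that encoding, using DTLB_L2_MISS with the ALL umask (0x46, 0x07) from the K10 event list further down; the 12-bit id 0x1E0 at the end is a made-up value, used only to exercise the upper-nibble path:

    #include <assert.h>
    #include <stdint.h>

    /* Worked example of the PERFEVTSEL encoding built by k10_pmc_setup() above. */
    static uint64_t k10_encode(uint64_t eventId, uint64_t umask)
    {
        uint64_t flags = 0x0ULL;
        flags |= (1ULL << 16);                       /* user-mode counting */
        flags |= ((uint64_t)(eventId >> 8) << 32)    /* event bits [11:8] -> [35:32] */
               + (umask << 8)                        /* unit mask -> [15:8] */
               + (eventId & ~(0xF00U));              /* event bits [7:0]  -> [7:0] */
        return flags;
    }

    int main(void)
    {
        /* DTLB_L2_MISS with umask ALL (0x46 / 0x07) from the K10 event list. */
        uint64_t flags = k10_encode(0x46, 0x07);
        assert(flags == 0x10746ULL);

        /* perfmon_startCountersThread_k10() later ORs in the enable bit (22). */
        assert((flags | (1ULL << 22)) == 0x410746ULL);

        /* Hypothetical 12-bit event id 0x1E0 (not from the list), shown only
         * to illustrate how the upper nibble lands in selector bits [35:32]. */
        assert(k10_encode(0x1E0, 0x00) == 0x1000100E0ULL);
        return 0;
    }
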
diff --git a/src/includes/perfmon_k10_counters.h b/src/includes/perfmon_k10_counters.h
index d01be3d..e94e29a 100644
--- a/src/includes/perfmon_k10_counters.h
+++ b/src/includes/perfmon_k10_counters.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_k10_counters.h
*
- * Description: AMD K10 specific subroutines
+ * Description: AMD K10 performance counter definition. Also used for AMD K8.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -31,10 +32,15 @@
#define NUM_COUNTERS_K10 4
#define NUM_COUNTERS_CORE_K10 4
-static PerfmonCounterMap k10_counter_map[NUM_COUNTERS_K10] = {
- {"PMC0",PMC0, PMC, MSR_AMD_PERFEVTSEL0, MSR_AMD_PMC0, 0, 0},
- {"PMC1",PMC1, PMC, MSR_AMD_PERFEVTSEL1, MSR_AMD_PMC1, 0, 0},
- {"PMC2",PMC2, PMC, MSR_AMD_PERFEVTSEL2, MSR_AMD_PMC2, 0, 0},
- {"PMC3",PMC3, PMC, MSR_AMD_PERFEVTSEL3, MSR_AMD_PMC3, 0, 0}
+#define K10_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+
+static RegisterMap k10_counter_map[NUM_COUNTERS_K10] = {
+ {"PMC0",PMC0, PMC, MSR_AMD_PERFEVTSEL0, MSR_AMD_PMC0, 0, 0, K10_VALID_OPTIONS_PMC},
+ {"PMC1",PMC1, PMC, MSR_AMD_PERFEVTSEL1, MSR_AMD_PMC1, 0, 0, K10_VALID_OPTIONS_PMC},
+ {"PMC2",PMC2, PMC, MSR_AMD_PERFEVTSEL2, MSR_AMD_PMC2, 0, 0, K10_VALID_OPTIONS_PMC},
+ {"PMC3",PMC3, PMC, MSR_AMD_PERFEVTSEL3, MSR_AMD_PMC3, 0, 0, K10_VALID_OPTIONS_PMC}
};
+static BoxMap k10_box_map[NUM_UNITS] = {
+ [PMC] = {0, 0, 0, 0, 0, 0, 48}
+};
diff --git a/src/includes/perfmon_k10_events.txt b/src/includes/perfmon_k10_events.txt
index 64c20e9..d45d790 100644
--- a/src/includes/perfmon_k10_events.txt
+++ b/src/includes/perfmon_k10_events.txt
@@ -1,16 +1,17 @@
# =======================================================================================
-#
+#
# Filename: perfmon_k10_events.txt
-#
+#
# Description: Event list for AMD K10
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -132,17 +133,17 @@ UMASK_DATA_CACHE_EVICTED_PREFETCH_NTA 0x20
UMASK_DATA_CACHE_EVICTED_NOT_PREFETCH_NTA 0x40
UMASK_DATA_CACHE_EVICTED_ALL 0x1F
-EVENT_DTLB_L2_HIT 0x45 PMC
-UMASK_DTLB_L2_HIT_4K 0x01
-UMASK_DTLB_L2_HIT_2M 0x02
-UMASK_DTLB_L2_HIT_1G 0x04
-UMASK_DTLB_L2_HIT_ALL 0x07
+EVENT_DTLB_L2_HIT 0x45 PMC
+UMASK_DTLB_L2_HIT_4KB 0x01
+UMASK_DTLB_L2_HIT_2MB 0x02
+UMASK_DTLB_L2_HIT_1GB 0x04
+UMASK_DTLB_L2_HIT_ALL 0x07
-EVENT_DTLB_L2_MISS 0x46 PMC
-UMASK_DTLB_L2_MISS_4K 0x01
-UMASK_DTLB_L2_MISS_2M 0x02
-UMASK_DTLB_L2_MISS_1G 0x04
-UMASK_DTLB_L2_MISS_ALL 0x07
+EVENT_DTLB_L2_MISS 0x46 PMC
+UMASK_DTLB_L2_MISS_4KB 0x01
+UMASK_DTLB_L2_MISS_2MB 0x02
+UMASK_DTLB_L2_MISS_1GB 0x04
+UMASK_DTLB_L2_MISS_ALL 0x07
EVENT_MISALIGNED_ACCESS 0x47 PMC
UMASK_MISALIGNED_ACCESS 0x00
@@ -167,10 +168,11 @@ UMASK_PREFETCH_INSTRUCTION_DISPATCHED_NTA 0x04
EVENT_DCACHE_LOCK_MISS 0x4C PMC
UMASK_DCACHE_LOCK_MISS 0x02
-EVENT_DTLB_L1_HIT 0x4D PMC
-UMASK_DTLB_L1_HIT_4K 0x01
-UMASK_DTLB_L1_HIT_2M 0x02
-UMASK_DTLB_L1_HIT_1G 0x04
+EVENT_DTLB_L1_HIT 0x4D PMC
+UMASK_DTLB_L1_HIT_4KB 0x01
+UMASK_DTLB_L1_HIT_2MB 0x02
+UMASK_DTLB_L1_HIT_1GB 0x04
+UMASK_DTLB_L1_HIT_ANY 0x07
EVENT_SW_PREFETCH_HIT 0x52 PMC
UMASK_SW_PREFETCH_HIT_L1 0x01
@@ -238,9 +240,10 @@ UMASK_ICACHE_REFILLS_MEM 0x00
EVENT_ITLB_L2_HIT 0x84 PMC
UMASK_ITLB_L2_HIT 0x00
-EVENT_ITLB_L2_MISS 0x85 PMC
-UMASK_ITLB_L2_MISS_4K 0x01
-UMASK_ITLB_L2_MISS_2M 0x02
+EVENT_ITLB_L2_MISS 0x85 PMC
+UMASK_ITLB_L2_MISS_4KB 0x01
+UMASK_ITLB_L2_MISS_2MB 0x02
+UMASK_ITLB_L2_MISS_ANY 0x03
EVENT_PIPELINE_RESTART_STREAM_PROBE 0x86 PMC
UMASK_PIPELINE_RESTART_STREAM_PROBE 0x00
diff --git a/src/includes/perfmon_k8.h b/src/includes/perfmon_k8.h
index 9313168..513929b 100644
--- a/src/includes/perfmon_k8.h
+++ b/src/includes/perfmon_k8.h
@@ -3,17 +3,17 @@
*
* Filename: perfmon_k8.h
*
- * Description: Header File of perfmon module for K8 support.
- * Configures and reads out performance counters
- * on x86 based architectures. Supports multi threading.
+ * Description: Header File of perfmon module for AMD K8 support.
+ * The setup routines and registers are similar to AMD K10
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -31,10 +31,9 @@
*/
#include <perfmon_k8_events.h>
-#include <perfmon_k8_groups.h>
+#include <error.h>
-static int perfmon_numGroupsK8 = NUM_GROUPS_K8;
static int perfmon_numArchEventsK8 = NUM_ARCH_EVENTS_K8;
diff --git a/src/includes/perfmon_k8_events.txt b/src/includes/perfmon_k8_events.txt
index 127b56f..48d0614 100644
--- a/src/includes/perfmon_k8_events.txt
+++ b/src/includes/perfmon_k8_events.txt
@@ -1,16 +1,17 @@
# =======================================================================================
-#
+#
# Filename: perfmon_k8_events.txt
-#
+#
# Description: Event list for AMD K8
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -97,15 +98,17 @@ UMASK_DATA_CACHE_EVICTED_PREFETCH_NTA 0x20
UMASK_DATA_CACHE_EVICTED_NOT_PREFETCH_NTA 0x40
UMASK_DATA_CACHE_EVICTED_ALL 0x1F
-EVENT_DTLB_L2_HIT 0x45 PMC
-UMASK_DTLB_L2_HIT_4K 0x01
-UMASK_DTLB_L2_HIT_2M 0x02
-UMASK_DTLB_L2_HIT_1G 0x04
+EVENT_DTLB_L2_HIT 0x45 PMC
+UMASK_DTLB_L2_HIT_4KB 0x01
+UMASK_DTLB_L2_HIT_2MB 0x02
+UMASK_DTLB_L2_HIT_1GB 0x04
+UMASK_DTLB_L2_HIT_ANY 0x07
-EVENT_DTLB_L2_MISS 0x46 PMC
-UMASK_DTLB_L2_MISS_4K 0x01
-UMASK_DTLB_L2_MISS_2M 0x02
-UMASK_DTLB_L2_MISS_1G 0x04
+EVENT_DTLB_L2_MISS 0x46 PMC
+UMASK_DTLB_L2_MISS_4KB 0x01
+UMASK_DTLB_L2_MISS_2MB 0x02
+UMASK_DTLB_L2_MISS_1GB 0x04
+UMASK_DTLB_L2_MISS_ANY 0x07
EVENT_MISALIGNED_ACCESS 0x47 PMC
UMASK_MISALIGNED_ACCESS 0x00
@@ -178,9 +181,10 @@ UMASK_ICACHE_REFILLS_MEM 0x00
EVENT_ITLB_L2_HIT 0x84 PMC
UMASK_ITLB_L2_HIT 0x00
-EVENT_ITLB_L2_MISS 0x85 PMC
-UMASK_ITLB_L2_MISS_4K 0x01
-UMASK_ITLB_L2_MISS_2M 0x02
+EVENT_ITLB_L2_MISS 0x85 PMC
+UMASK_ITLB_L2_MISS_4KB 0x01
+UMASK_ITLB_L2_MISS_2MB 0x02
+UMASK_ITLB_L2_MISS_ANY 0x03
EVENT_PIPELINE_RESTART_STREAM_PROBE 0x86 PMC
UMASK_PIPELINE_RESTART_STREAM_PROBE 0x00
diff --git a/src/includes/perfmon_kabini.h b/src/includes/perfmon_kabini.h
index 018eb04..323e713 100644
--- a/src/includes/perfmon_kabini.h
+++ b/src/includes/perfmon_kabini.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_kabini.h
*
- * Description: Header file of perfmon module for AMD Family16
+ * Description: Header file of perfmon module for AMD Family 16
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -29,218 +30,331 @@
*/
#include <perfmon_kabini_events.h>
-#include <perfmon_kabini_groups.h>
#include <perfmon_kabini_counters.h>
+#include <error.h>
+#include <affinity.h>
static int perfmon_numCountersKabini = NUM_COUNTERS_KABINI;
-static int perfmon_numGroupsKabini = NUM_GROUPS_KABINI;
static int perfmon_numArchEventsKabini = NUM_ARCH_EVENTS_KABINI;
-void perfmon_init_kabini(PerfmonThread *thread)
+int perfmon_init_kabini(int cpu_id)
+{
+ lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+ lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+ return 0;
+}
+
+
+int k16_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
{
uint64_t flags = 0x0ULL;
- int cpu_id = thread->processorId;
- msr_write(cpu_id, MSR_AMD16_PERFEVTSEL0, 0x0ULL);
- msr_write(cpu_id, MSR_AMD16_PERFEVTSEL1, 0x0ULL);
- msr_write(cpu_id, MSR_AMD16_PERFEVTSEL2, 0x0ULL);
- msr_write(cpu_id, MSR_AMD16_PERFEVTSEL3, 0x0ULL);
+ flags |= (1ULL<<16);
+ flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
- lock_acquire(
- (int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id)
- )
+ if (event->numberOfOptions > 0)
{
- msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL0, 0x0ULL);
- msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL1, 0x0ULL);
- msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL2, 0x0ULL);
- msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL3, 0x0ULL);
+ for(int j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<17);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ if ((event->options[j].value & 0xFFULL) < 0x04)
+ {
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ }
+ break;
+ default:
+ break;
+ }
+ }
}
-
- //flags |= (1<<16); /* user mode flag */
- /*msr_write(cpu_id, MSR_AMD16_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_AMD16_PERFEVTSEL1, flags);
- msr_write(cpu_id, MSR_AMD16_PERFEVTSEL2, flags);
- msr_write(cpu_id, MSR_AMD16_PERFEVTSEL3, flags);*/
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
}
-
-void perfmon_setupCounterThread_kabini(
- int thread_id,
- PerfmonEvent* event,
- PerfmonCounterIndex index)
+int k16_uncore_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
{
uint64_t flags = 0x0ULL;
- uint64_t reg = kabini_counter_map[index].configRegister;
- int cpu_id = perfmon_threadData[thread_id].processorId;
- perfmon_threadData[thread_id].counters[index].init = TRUE;
- /* only one thread accesses Uncore */
- if ( (kabini_counter_map[index].type == UNCORE) &&
- !(socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) )
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
{
- return;
+ return 0;
}
- if (kabini_counter_map[index].type == PMC)
+ flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
+ if (flags != currentConfig[cpu_id][index])
{
- flags |= (1<<16);
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_UNCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
}
+ return 0;
+}
- /* AMD uses a 12 bit Event mask: [35:32][7:0] */
- flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
+int k16_cache_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
+{
+ uint64_t flags = 0x0ULL;
- if (perfmon_verbose)
+ if (tile_lock[affinity_thread2tile_lookup[cpu_id]] != cpu_id)
{
- printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
- cpu_id,
- LLU_CAST reg,
- LLU_CAST flags);
+ return 0;
}
- msr_write(cpu_id, reg , flags);
+ flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
+ if (event->numberOfOptions > 0)
+ {
+ for(int j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ if ((event->options[j].value & 0xFFULL) < 0x04)
+ {
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ }
+ break;
+ case EVENT_OPTION_TID:
+ flags |= (~((uint64_t)(event->options[j].value & 0xFULL))) << 56;
+ break;
+ case EVENT_OPTION_NID:
+ flags |= (~((uint64_t)(event->options[j].value & 0xFULL))) << 48;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_CBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int perfmon_setupCounterThread_kabini(int thread_id, PerfmonEventSet* eventSet)
+{
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ switch (type)
+ {
+ case PMC:
+ k16_pmc_setup(cpu_id, index, event);
+ break;
+ case UNCORE:
+ k16_uncore_setup(cpu_id, index, event);
+ break;
+ case CBOX0:
+ k16_cache_setup(cpu_id, index, event);
+ break;
+ default:
+ break;
+ }
+ eventSet->events[i].threadCounter[thread_id].init = TRUE;
+ }
+ return 0;
}
-void perfmon_startCountersThread_kabini(int thread_id)
+int perfmon_startCountersThread_kabini(int thread_id, PerfmonEventSet* eventSet)
{
- int haveLock = 0;
- uint64_t flags;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int haveSLock = 0;
+ int haveTLock = 0;
+ uint64_t flags = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveSLock = 1;
+ }
+ if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
{
- haveLock = 1;
+ haveTLock = 1;
}
- for ( int i=0; i<NUM_COUNTERS_KABINI; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- if (kabini_counter_map[i].type == PMC)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- msr_write(cpu_id, kabini_counter_map[i].counterRegister , 0x0ULL);
- flags = msr_read(cpu_id, kabini_counter_map[i].configRegister);
- flags |= (1<<22); /* enable flag */
-
- if (perfmon_verbose)
- {
- printf("perfmon_start_counters: Write Register 0x%llX , Flags: 0x%llX \n",
- LLU_CAST kabini_counter_map[i].configRegister,
- LLU_CAST flags);
- }
-
- msr_write(cpu_id, kabini_counter_map[i].configRegister , flags);
-
+ continue;
}
- else if ( kabini_counter_map[i].type == UNCORE )
+ RegisterIndex index = eventSet->events[i].index;
+ uint32_t reg = counter_map[index].configRegister;
+ uint32_t counter = counter_map[index].counterRegister;
+ eventSet->events[i].threadCounter[thread_id].startData = 0;
+ eventSet->events[i].threadCounter[thread_id].counterData = 0;
+ if ((type == PMC) ||
+ ((type == UNCORE) && (haveSLock)) ||
+ ((type == CBOX0) && (haveTLock)))
{
- if(haveLock)
- {
- msr_write(cpu_id, kabini_counter_map[i].counterRegister , 0x0ULL);
- flags = msr_read(cpu_id, kabini_counter_map[i].configRegister);
- flags |= (1<<22); /* enable flag */
-
- if (perfmon_verbose)
- {
- printf("perfmon_start_counters: Write Register 0x%llX , Flags: 0x%llX \n",
- LLU_CAST kabini_counter_map[i].configRegister,
- LLU_CAST flags);
- }
-
- msr_write(cpu_id, kabini_counter_map[i].configRegister , flags);
- }
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+ flags |= (1ULL<<22); /* enable flag */
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
}
}
}
+ return 0;
}
-void perfmon_stopCountersThread_kabini(int thread_id)
+int perfmon_stopCountersThread_kabini(int thread_id, PerfmonEventSet* eventSet)
{
- uint64_t flags;
- int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ uint64_t flags = 0x0ULL;
+ int haveSLock = 0;
+ int haveTLock = 0;
+ uint64_t counter_result = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveSLock = 1;
+ }
+ if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
{
- haveLock = 1;
+ haveTLock = 1;
}
- for ( int i=0; i<NUM_COUNTERS_KABINI; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- if ( kabini_counter_map[i].type == PMC )
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- flags = msr_read(cpu_id,kabini_counter_map[i].configRegister);
- flags &= ~(1<<22); /* clear enable flag */
- msr_write(cpu_id, kabini_counter_map[i].configRegister , flags);
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, kabini_counter_map[i].counterRegister);
-
- if (perfmon_verbose)
- {
- printf("perfmon_stop_counters: Write Register 0x%llX , Flags: 0x%llX \n",
- LLU_CAST kabini_counter_map[i].configRegister,
- LLU_CAST flags);
- printf("perfmon_stop_counters: Read Register 0x%llX , Flags: 0x%llX \n",
- LLU_CAST kabini_counter_map[i].counterRegister,
- LLU_CAST perfmon_threadData[thread_id].counters[i].counterData);
- }
-
+ continue;
}
- else if (kabini_counter_map[i].type == UNCORE)
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint32_t reg = counter_map[index].configRegister;
+ uint32_t counter = counter_map[index].counterRegister;
+ if ((type == PMC) ||
+ ((type == UNCORE) && (haveSLock)) ||
+ ((type == CBOX0) && (haveTLock)))
{
- if(haveLock)
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+ flags &= ~(1ULL<<22); /* clear enable flag */
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
{
- flags = msr_read(cpu_id, kabini_counter_map[i].configRegister);
- flags &= ~(1<<22); /* clear enable flag */
- msr_write(cpu_id, kabini_counter_map[i].configRegister , flags);
-
- if (perfmon_verbose)
- {
- printf("perfmon_stop_counters: Write Register 0x%llX , Flags: 0x%llX \n",
- LLU_CAST kabini_counter_map[i].configRegister,
- LLU_CAST flags);
- }
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, kabini_counter_map[i].counterRegister);
+ eventSet->events[i].threadCounter[thread_id].overflows++;
}
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
}
}
}
+ return 0;
}
-void perfmon_readCountersThread_kabini(int thread_id)
+int perfmon_readCountersThread_kabini(int thread_id, PerfmonEventSet* eventSet)
{
- int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int haveSLock = 0;
+ int haveTLock = 0;
+ uint64_t counter_result = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
- haveLock = 1;
+ haveSLock = 1;
+ }
+ if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+ {
+ haveTLock = 1;
}
-
- for (int i=0;i<NUM_COUNTERS_KABINI;i++)
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- if ( kabini_counter_map[i].type == UNCORE )
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- if ( haveLock )
- {
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, kabini_counter_map[i].counterRegister);
- }
+ continue;
}
- else
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint32_t counter = counter_map[index].counterRegister;
+
+ if ((type == PMC) ||
+ ((type == UNCORE) && (haveSLock)) ||
+ ((type == CBOX0) && (haveTLock)))
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, kabini_counter_map[i].counterRegister);
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+ VERBOSEPRINTREG(cpu_id, counter, counter_result, CLEAR_CTRL);
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
}
}
}
+ return 0;
}
+
+int perfmon_finalizeCountersThread_kabini(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveSLock = 0;
+ int haveTLock = 0;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveSLock = 1;
+ }
+ if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+ {
+ haveTLock = 1;
+ }
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ if ((type == PMC) ||
+ ((type == UNCORE) && (haveSLock)) ||
+ ((type == CBOX0) && (haveTLock)))
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, 0x0ULL, CLEAR_CTRL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, 0x0ULL));
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
+ }
+ }
+ return 0;
+}
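All three setup helpers above encode the AMD event select the same way: the 12-bit event ID is split across PERFEVTSEL bits [35:32] and [7:0], with the unit mask in [15:8]. A minimal standalone sketch of that encoding; the sample event ID and unit mask are arbitrary illustration values:

#include <stdint.h>
#include <stdio.h>

/* Split a 12-bit AMD event ID into PERFEVTSEL bits [35:32] and [7:0]
 * and place the unit mask in bits [15:8], mirroring the expression
 * used in k16_pmc_setup/k16_uncore_setup/k16_cache_setup above. */
static uint64_t amd_event_bits(uint16_t eventId, uint8_t umask)
{
    uint64_t flags = 0x0ULL;
    flags |= ((uint64_t)(eventId >> 8)) << 32;  /* event ID bits [11:8] -> [35:32] */
    flags |= ((uint64_t)umask) << 8;            /* unit mask            -> [15:8]  */
    flags |= (uint64_t)(eventId & ~(0xF00U));   /* event ID bits [7:0]  -> [7:0]   */
    return flags;
}

int main(void)
{
    /* Arbitrary example: 12-bit event 0x1D8 with unit mask 0x0F. */
    printf("PERFEVTSEL bits: 0x%016llx\n",
           (unsigned long long)amd_event_bits(0x1D8, 0x0F));
    return 0;
}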
diff --git a/src/includes/perfmon_kabini_counters.h b/src/includes/perfmon_kabini_counters.h
index 8662522..e303341 100644
--- a/src/includes/perfmon_kabini_counters.h
+++ b/src/includes/perfmon_kabini_counters.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_kabini_counters.h
*
- * Description: Counter Header File of perfmon module for AMD Family16
+ * Description: Counter Header File of perfmon module for AMD Family 16
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -28,19 +29,33 @@
* =======================================================================================
*/
-#define NUM_COUNTERS_KABINI 8
-#define NUM_COUNTERS_CORE_KABINI 4
+#define NUM_COUNTERS_KABINI 12
+#define NUM_COUNTERS_CORE_KABINI 8
-static PerfmonCounterMap kabini_counter_map[NUM_COUNTERS_KABINI] = {
+#define KAB_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD
+#define KAB_VALID_OPTIONS_CBOX EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD|EVENT_OPTION_TID_MASK|EVENT_OPTION_NID_MASK
+
+static RegisterMap kabini_counter_map[NUM_COUNTERS_KABINI] = {
/* Core counters */
{"PMC0",PMC0, PMC, MSR_AMD16_PERFEVTSEL0, MSR_AMD16_PMC0, 0, 0},
{"PMC1",PMC1, PMC, MSR_AMD16_PERFEVTSEL1, MSR_AMD16_PMC1, 0, 0},
{"PMC2",PMC2, PMC, MSR_AMD16_PERFEVTSEL2, MSR_AMD16_PMC2, 0, 0},
{"PMC3",PMC3, PMC, MSR_AMD16_PERFEVTSEL3, MSR_AMD16_PMC3, 0, 0},
+ /* L2 cache counters */
+ {"CPMC0",PMC4, CBOX0, MSR_AMD16_L2_PERFEVTSEL0, MSR_AMD16_L2_PMC0, 0, 0},
+ {"CPMC1",PMC5, CBOX0, MSR_AMD16_L2_PERFEVTSEL1, MSR_AMD16_L2_PMC1, 0, 0},
+ {"CPMC2",PMC6, CBOX0, MSR_AMD16_L2_PERFEVTSEL2, MSR_AMD16_L2_PMC2, 0, 0},
+ {"CPMC3",PMC7, CBOX0, MSR_AMD16_L2_PERFEVTSEL3, MSR_AMD16_L2_PMC3, 0, 0},
/* Northbridge counters */
- {"UPMC0",PMC4, UNCORE, MSR_AMD16_NB_PERFEVTSEL0, MSR_AMD16_NB_PMC0, 0, 0},
- {"UPMC1",PMC5, UNCORE, MSR_AMD16_NB_PERFEVTSEL1, MSR_AMD16_NB_PMC1, 0, 0},
- {"UPMC2",PMC6, UNCORE, MSR_AMD16_NB_PERFEVTSEL2, MSR_AMD16_NB_PMC2, 0, 0},
- {"UPMC3",PMC7, UNCORE, MSR_AMD16_NB_PERFEVTSEL3, MSR_AMD16_NB_PMC3, 0, 0}
+ {"UPMC0",PMC8, UNCORE, MSR_AMD16_NB_PERFEVTSEL0, MSR_AMD16_NB_PMC0, 0, 0},
+ {"UPMC1",PMC9, UNCORE, MSR_AMD16_NB_PERFEVTSEL1, MSR_AMD16_NB_PMC1, 0, 0},
+ {"UPMC2",PMC10, UNCORE, MSR_AMD16_NB_PERFEVTSEL2, MSR_AMD16_NB_PMC2, 0, 0},
+ {"UPMC3",PMC11, UNCORE, MSR_AMD16_NB_PERFEVTSEL3, MSR_AMD16_NB_PMC3, 0, 0}
+};
+
+static BoxMap kabini_box_map[NUM_UNITS] = {
+ [PMC] = {0, 0, 0, 0, 0, 0, 48},
+ [UNCORE] = {0, 0, 0, 0, 0, 0, 48},
+ [CBOX0] = {0, 0, 0, 0, 0, 0, 48},
};
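The BoxMap entries above declare a 48-bit counter width, which the stop and read paths use through field64(counter_result, 0, box_map[type].regWidth); a raw value that decreases between reads is counted as one overflow. A minimal standalone sketch of both steps; the field64 helper below is a local stand-in assumed to match the semantics of likwid's version:

#include <stdint.h>
#include <stdio.h>

/* Local stand-in assumed to behave like likwid's field64():
 * extract 'len' bits starting at bit 'start' of a 64-bit value. */
static uint64_t field64(uint64_t value, int start, int len)
{
    if (len >= 64)
        return value >> start;
    return (value >> start) & ((1ULL << len) - 1ULL);
}

int main(void)
{
    const int regWidth = 48;                    /* as in the kabini/k10 BoxMap entries */
    uint64_t previous  = 0x0000FFFFFFFFFFF0ULL; /* last stored counter value */
    uint64_t raw       = 0x0000000000000010ULL; /* new read: counter wrapped around */
    int overflows = 0;

    if (raw < previous)                         /* same heuristic as the stop/read paths */
        overflows++;

    printf("masked value: 0x%llx, overflows: %d\n",
           (unsigned long long)field64(raw, 0, regWidth), overflows);
    return 0;
}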
diff --git a/src/includes/perfmon_kabini_events.txt b/src/includes/perfmon_kabini_events.txt
index 9ccc726..a1bac4f 100644
--- a/src/includes/perfmon_kabini_events.txt
+++ b/src/includes/perfmon_kabini_events.txt
@@ -1,16 +1,16 @@
# =======================================================================================
-#
+#
# Filename: perfmon_kabini_events.txt
-#
+#
# Description: Event list for AMD Kabini
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: saravanan.ekanathan at amd.com
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: saravanan.ekanathan at amd.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -148,9 +148,10 @@ UMASK_PREFETCH_INSTR_DISPATCHED_NTA 0x04
EVENT_DCACHE_LOCK_MISS 0x4C PMC
UMASK_DCACHE_LOCK_MISS 0x02
-EVENT_DTLB_L1_HIT 0x4D PMC
-UMASK_DTLB_L1_HIT_4K 0x01
-UMASK_DTLB_L1_HIT_2M 0x02
+EVENT_DTLB_L1_HIT 0x4D PMC
+UMASK_DTLB_L1_HIT_4KB 0x01
+UMASK_DTLB_L1_HIT_2MB 0x02
+UMASK_DTLB_L1_HIT_ANY 0x03
EVENT_INEFFECTIVE_PREFETCHES 0x52 PMC
UMASK_INEFFECTIVE_PREFETCHES_DATA_CACHE 0x01
@@ -234,12 +235,13 @@ UMASK_INSTRUCTION_CACHE_L2_REFILLS 0x00
EVENT_INSTRUCTION_CACHE_SYSTEM_REFILLS 0x083 PMC
UMASK_INSTRUCTION_CACHE_SYSTEM_REFILLS 0x00
-EVENT_ITLB_L1_MISS_L2_HIT 0x084 PMC
-UMASK_ITLB_L1_MISS_L2_HIT 0x00
+EVENT_ITLB_L1_MISS_L2_HIT 0x084 PMC
+UMASK_ITLB_L1_MISS_L2_HIT 0x00
-EVENT_ITLB_L1_MISS_L2_MISS 0x085 PMC
+EVENT_ITLB_L1_MISS_L2_MISS 0x085 PMC
UMASK_ITLB_L1_MISS_L2_MISS_4KB 0x01
UMASK_ITLB_L1_MISS_L2_MISS_2MB 0x02
+UMASK_ITLB_L1_MISS_L2_MISS_ANY 0x03
EVENT_INSTRUCTION_FETCH_STALL 0x087 PMC
UMASK_INSTRUCTION_FETCH_STALL 0x00
diff --git a/src/includes/perfmon_nehalem.h b/src/includes/perfmon_nehalem.h
index b3e7907..6f23bd0 100644
--- a/src/includes/perfmon_nehalem.h
+++ b/src/includes/perfmon_nehalem.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_nehalem.h
*
- * Description: Header File of perfmon module for Nehalem.
+ * Description: Header File of perfmon module for Intel Nehalem.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -29,300 +30,593 @@
*/
#include <perfmon_nehalem_events.h>
-#include <perfmon_nehalem_groups.h>
#include <perfmon_nehalem_counters.h>
+#include <error.h>
+#include <affinity.h>
+
static int perfmon_numCountersNehalem = NUM_COUNTERS_NEHALEM;
-static int perfmon_numGroupsNehalem = NUM_GROUPS_NEHALEM;
static int perfmon_numArchEventsNehalem = NUM_ARCH_EVENTS_NEHALEM;
-#define OFFSET_PMC 3
-#define OFFSET_UPMC 7
-void perfmon_init_nehalem(PerfmonThread *thread)
+int perfmon_init_nehalem(int cpu_id)
{
- uint64_t flags = 0x0ULL;
- int cpu_id = thread->processorId;
-
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
- msr_write(cpu_id, MSR_PMC0, 0x0ULL);
- msr_write(cpu_id, MSR_PMC1, 0x0ULL);
- msr_write(cpu_id, MSR_PMC2, 0x0ULL);
- msr_write(cpu_id, MSR_PMC3, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
-
- /* initialize fixed counters
- * FIXED 0: Instructions retired
- * FIXED 1: Clocks unhalted core
- * FIXED 2: Clocks unhalted ref */
- //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
- // flags |= (1<<22); /* enable flag */
- // flags |= (1<<16); /* user mode flag */
- //setBit(flags,16); /* set user mode flag */
- //setBit(flags,22); /* set enable flag */
-
- /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
-
-
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
- lock_acquire(
- (int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id)
- )
- {
- /* UNCORE FIXED 0: Uncore cycles */
- msr_write(cpu_id, MSR_UNCORE_FIXED_CTR_CTRL, 0x01ULL);
- msr_write(cpu_id, MSR_UNCORE_FIXED_CTR_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_FIXED_CTR0, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL0, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL1, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL2, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL3, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL4, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL5, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL6, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL7, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PMC0, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PMC1, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PMC2, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PMC3, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PMC4, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PMC5, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PMC6, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PMC7, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_UNCORE_ADDR_OPCODE_MATCH, 0x0ULL);
- msr_write(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL);
-
- /* Preinit of PERFEVSEL registers */
- //clearBit(flags,16); /* set enable flag */
-
- /*msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL1, flags);
- msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL2, flags);
- msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL3, flags);
- msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL4, flags);
- msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL5, flags);
- msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL6, flags);
- msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL7, flags);*/
- }
+ lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+ lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+ return 0;
}
+uint32_t neh_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint32_t flags = (1ULL<<(1+(index*4)));
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<(index*4));
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+ flags |= (1ULL<<(2+(index*4)));
+ default:
+ break;
+ }
+ }
+ return flags;
+}
-void perfmon_setupCounterThread_nehalem(
- int thread_id,
- PerfmonEvent* event,
- PerfmonCounterIndex index)
+int neh_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
{
- int haveLock = 0;
+ int j;
uint64_t flags = 0x0ULL;
- uint64_t reg = nehalem_counter_map[index].configRegister;
- int cpu_id = perfmon_threadData[thread_id].processorId;
- uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ uint64_t offcore_flags = 0x0ULL;
+
+ flags = (1ULL<<22)|(1ULL<<16);
+ flags |= (event->umask<<8) + event->eventId;
+ /* set custom cfg and cmask */
+ if ((event->cfgBits != 0) &&
+ (event->eventId != 0xB7) &&
+ (event->eventId != 0xBB))
{
- haveLock = 1;
+ flags |= ((event->cmask<<8) + event->cfgBits)<<16;
}
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<17);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+ flags |= (1ULL<<21);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL)<<24;
+ break;
+ case EVENT_OPTION_MATCH0:
+ offcore_flags |= (event->options[j].value & 0xFF);
+ break;
+ case EVENT_OPTION_MATCH1:
+ offcore_flags |= (event->options[j].value & 0xF7)<<7;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ // Offcore event with additional configuration register
+ // cfgBits contain offset of "request type" bit
+ // cmask contain offset of "response type" bit
+ if (event->eventId == 0xB7)
+ {
+ if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+ {
+ offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+ }
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+ }
+ if ((event->eventId == 0xBB) &&
+ ((cpuid_info.model == NEHALEM_WESTMERE) || (cpuid_info.model == NEHALEM_WESTMERE_M)))
+ {
+ if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+ {
+ offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+ }
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
- perfmon_threadData[thread_id].counters[index].init = TRUE;
+int neh_uncore_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ uint64_t mask_flags = 0x0ULL;
- if ( nehalem_counter_map[index].type == PMC )
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
{
- flags = (1<<16)|(1<<22);
-
- /* Intel with standard 8 bit event mask: [7:0] */
- flags |= (event->umask<<8) + event->eventId;
+ return 0;
+ }
- if (event->cfgBits != 0) /* set custom cfg and cmask */
+ flags = (1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->cfgBits != 0x0 && event->eventId != 0x35) /* set custom cfg and cmask */
+ {
+ flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+ }
+ else if (event->cfgBits != 0x0 && event->eventId == 0x35) /* set custom cfg and cmask */
+ {
+ mask_flags |= ((uint64_t)event->cfgBits)<<61;
+ if (event->cmask != 0x0)
{
- flags &= ~(0xFFFFU<<16); /* clear upper 16bits */
- flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+ mask_flags |= ((uint64_t)event->cmask)<<40;
}
-
- msr_write(cpu_id, reg , flags);
-
- if (perfmon_verbose)
+ }
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
{
- printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
- cpu_id,
- LLU_CAST reg,
- LLU_CAST flags);
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+ flags |= (1ULL<<21);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL)<<24;
+ break;
+ case EVENT_OPTION_MATCH0:
+ mask_flags |= field64(event->options[j].value,3,37)<<2;
+ break;
+ case EVENT_OPTION_OPCODE:
+ mask_flags |= field64(event->options[j].value,0,8)<<40;
+ break;
+ default:
+ break;
+ }
}
}
- else if ( nehalem_counter_map[index].type == UNCORE )
+ if ((mask_flags != 0x0ULL) && (event->eventId == 0x35))
{
- if(haveLock)
+ if ((cpuid_info.model == NEHALEM_BLOOMFIELD) ||
+ (cpuid_info.model == NEHALEM_LYNNFIELD) ||
+ (cpuid_info.model == NEHALEM_LYNNFIELD_M))
{
- flags = (1<<22);
+ DEBUG_PLAIN_PRINT(DEBUGLEV_ONLY_ERROR, Register documented in SDM but ADDR_OPCODE_MATCH event not documented for Nehalem architectures);
+ }
+ VERBOSEPRINTREG(cpu_id, MSR_UNCORE_ADDR_OPCODE_MATCH, LLU_CAST mask_flags, SETUP_UNCORE_MATCH);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_ADDR_OPCODE_MATCH, mask_flags));
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_UNCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
- /* Intel with standard 8 bit event mask: [7:0] */
- flags |= (event->umask<<8) + event->eventId;
+int perfmon_setupCounterThread_nehalem(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ uint64_t fixed_flags = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if (event->cfgBits != 0) /* set custom cfg and cmask */
- {
- flags &= ~(0xFFFFU<<16); /* clear upper 16bits */
- flags |= ((event->cmask<<8) + event->cfgBits)<<16;
- }
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
- msr_write(cpu_id, reg , flags);
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL));
+ }
+ if (haveLock && (eventSet->regTypeMask & ~(0xF)))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
- if (perfmon_verbose)
- {
- printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
- cpu_id,
- LLU_CAST reg,
- LLU_CAST flags);
- }
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ eventSet->events[i].threadCounter[thread_id].init = TRUE;
+ switch (type)
+ {
+ case PMC:
+ neh_pmc_setup(cpu_id, index, event);
+ break;
+ case FIXED:
+ fixed_flags |= neh_fixed_setup(cpu_id, index, event);
+ break;
+ case UNCORE:
+ if (haveLock)
+ {
+ if (index < NUM_COUNTERS_UNCORE_NEHALEM-1)
+ {
+ neh_uncore_setup(cpu_id, index, event);
+ }
+ else
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_UNCORE_FIXED_CTR_CTRL, LLU_CAST 0x1ULL, SETUP_UPMCFIX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_FIXED_CTR_CTRL, 0x1ULL));
+ }
+ }
+ break;
+ default:
+ break;
}
}
- else if (nehalem_counter_map[index].type == FIXED)
+ if (fixed_flags != 0x0ULL)
{
- fixed_flags |= (0x2 <<(index*4));
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
}
+ return 0;
}
-void perfmon_startCountersThread_nehalem(int thread_id)
+int perfmon_startCountersThread_nehalem(int thread_id, PerfmonEventSet* eventSet)
{
int haveLock = 0;
uint64_t flags = 0x0ULL;
uint64_t uflags = 0x0ULL;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
- msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL);
- /* Fixed Uncore counter */
- uflags = 0x100000000ULL;
}
- for ( int i=0; i<NUM_PMC; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- if (nehalem_counter_map[i].type == PMC)
- {
- msr_write(cpu_id, nehalem_counter_map[i].counterRegister , 0x0ULL);
- flags |= (1<<(i-OFFSET_PMC)); /* enable counter */
- }
- else if (nehalem_counter_map[i].type == FIXED)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- msr_write(cpu_id, nehalem_counter_map[i].counterRegister , 0x0ULL);
- flags |= (1ULL<<(i+32)); /* enable fixed counter */
+ continue;
}
- else if (nehalem_counter_map[i].type == UNCORE)
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter = counter_map[index].counterRegister;
+ eventSet->events[i].threadCounter[thread_id].startData = 0;
+ eventSet->events[i].threadCounter[thread_id].counterData = 0;
+ switch(type)
{
- if(haveLock)
- {
- msr_write(cpu_id, nehalem_counter_map[i].counterRegister , 0x0ULL);
- uflags |= (1<<(i-OFFSET_UPMC)); /* enable uncore counter */
- }
+ case PMC:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+ flags |= (1ULL<<(index - cpuid_info.perf_num_fixed_ctr)); /* enable counter */
+ break;
+ case FIXED:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+ flags |= (1ULL<<(index+32)); /* enable fixed counter */
+ break;
+ case UNCORE:
+ if(haveLock)
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+ if (index < NUM_COUNTERS_UNCORE_NEHALEM-1)
+ {
+ uflags |= (1ULL<<(index-NUM_COUNTERS_CORE_NEHALEM)); /* enable uncore counter */
+ }
+ else
+ {
+ uflags |= (1ULL<<32);
+ }
+ }
+ break;
+ default:
+ break;
}
}
}
- if (perfmon_verbose)
+ if (haveLock && (uflags != 0x0ULL) && (eventSet->regTypeMask & ~(0xF)))
{
- printf("perfmon_start_counters: Write Register 0x%X , \
- Flags: 0x%llX \n",MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
- printf("perfmon_start_counters: Write Register 0x%X , \
- Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
+ VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags, UNFREEZE_UNCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, uflags));
}
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
- if (haveLock) msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, uflags);
- msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+ if ((flags != 0x0ULL) && (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_AND_FIXED);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|flags));
+ }
+ return 0;
}
-void perfmon_stopCountersThread_nehalem(int thread_id)
+#define NEH_CHECK_OVERFLOW(offset) \
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+ { \
+ uint64_t tmp = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &tmp)); \
+ if (tmp & (1ULL<<offset)) \
+ { \
+ eventSet->events[i].threadCounter[thread_id].overflows++; \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (tmp & (1ULL<<offset)))); \
+ } \
+ }
+
+#define NEH_CHECK_UNCORE_OVERFLOW(offset) \
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+ { \
+ uint64_t tmp = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_STATUS, &tmp)); \
+ if (tmp & (1ULL<<offset)) \
+ { \
+ eventSet->events[i].threadCounter[thread_id].overflows++; \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, (tmp & (1ULL<<offset)))); \
+ } \
+ }
+
+int perfmon_stopCountersThread_nehalem(int thread_id, PerfmonEventSet* eventSet)
{
- uint64_t flags;
int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
-
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+ uint64_t counter_result = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
- msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL);
}
- for ( int i=0; i<NUM_COUNTERS_NEHALEM; i++ )
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+
+ if (haveLock && (eventSet->regTypeMask & ~(0xF)))
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_UNCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- if (nehalem_counter_map[i].type == UNCORE)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- if(haveLock)
- {
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, nehalem_counter_map[i].counterRegister);
- }
+ continue;
}
- else
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter = counter_map[index].counterRegister;
+ switch (type)
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, nehalem_counter_map[i].counterRegister);
+ case PMC:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+ VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_PMC);
+ NEH_CHECK_OVERFLOW(index - cpuid_info.perf_num_fixed_ctr);
+ break;
+ case FIXED:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+ VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_FIXED);
+ NEH_CHECK_OVERFLOW(index + 32);
+ break;
+ case UNCORE:
+ if(haveLock)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+ VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_UNCORE);
+ if (index < NUM_COUNTERS_UNCORE_NEHALEM-1)
+ {
+ NEH_CHECK_UNCORE_OVERFLOW(index - NUM_COUNTERS_CORE_NEHALEM);
+ }
+ else
+ {
+ NEH_CHECK_UNCORE_OVERFLOW(32);
+ }
+ }
+ break;
+ default:
+ break;
}
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
}
}
+ return 0;
+}
+
+int perfmon_readCountersThread_nehalem(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ uint64_t counter_result = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ uint64_t pmc_flags = 0x0ULL;
+ uint64_t uncore_flags = 0x0ULL;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &pmc_flags));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
- flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
+ if (haveLock && (eventSet->regTypeMask & ~(0xF)))
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, &uncore_flags));
+ VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_UNCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
- if((flags & 0x3) || (flags & (0x3ULL<<32)) )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- printf ("Overflow occured \n");
- printf ("Status: 0x%llX \n", LLU_CAST flags);
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter = counter_map[index].counterRegister;
+ switch (type)
+ {
+ case PMC:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+ VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_PMC);
+ NEH_CHECK_OVERFLOW(index - cpuid_info.perf_num_fixed_ctr);
+ break;
+ case FIXED:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+ VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_FIXED);
+ NEH_CHECK_OVERFLOW(index + 32);
+ break;
+ case UNCORE:
+ if(haveLock)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+ VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_UNCORE);
+ if (index < NUM_COUNTERS_UNCORE_NEHALEM-1)
+ {
+ NEH_CHECK_UNCORE_OVERFLOW(index - NUM_COUNTERS_CORE_NEHALEM);
+ }
+ else
+ {
+ NEH_CHECK_UNCORE_OVERFLOW(32);
+ }
+ }
+ break;
+ default:
+ break;
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+ }
}
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, pmc_flags, UNFREEZE_PMC_AND_FIXED);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, pmc_flags));
+ }
+ if (haveLock && (eventSet->regTypeMask & ~(0xF)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, uncore_flags, UNFREEZE_UNCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, uncore_flags));
+ }
+ return 0;
}
-void perfmon_readCountersThread_nehalem(int thread_id)
+int perfmon_finalizeCountersThread_nehalem(int thread_id, PerfmonEventSet* eventSet)
{
int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int haveTileLock = 0;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62);
if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
+ if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+ {
+ haveTileLock = 1;
+ }
- for ( int i=0; i<NUM_COUNTERS_NEHALEM; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- if (nehalem_counter_map[i].type == UNCORE)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- if(haveLock)
- {
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, nehalem_counter_map[i].counterRegister);
- }
+ continue;
}
- else
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t reg = counter_map[index].configRegister;
+ PciDeviceIndex dev = counter_map[index].device;
+ switch (type)
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, nehalem_counter_map[i].counterRegister);
+ case PMC:
+ ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+ if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+ }
+ else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB) &&
+ ((cpuid_info.model == NEHALEM_WESTMERE) || (cpuid_info.model == NEHALEM_WESTMERE_M)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+ }
+ else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0x35) &&
+ ((cpuid_info.model == NEHALEM_WESTMERE) || (cpuid_info.model == NEHALEM_WESTMERE_M)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_UNCORE_ADDR_OPCODE_MATCH, 0x0ULL, CLEAR_UNCORE_MATCH);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_ADDR_OPCODE_MATCH, 0x0ULL));
+ }
+ break;
+ case FIXED:
+ ovf_values_core |= (1ULL<<(index+32));
+ break;
+ default:
+ break;
}
+ if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock))))
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+ }
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
}
}
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core, CLEAR_OVF_CTRL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, CLEAR_PMC_AND_FIXED_CTRL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+
+ if (haveLock && eventSet->regTypeMask & ~(0xFULL))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, 0x0ULL, CLEAR_UNCORE_OVF);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL, CLEAR_UNCORE_CTRL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+ return 0;
}
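For the OFFCORE_RESPONSE handling in neh_pmc_setup above: on event 0xB7 (and 0xBB on the Westmere models) the cfgBits and cmask fields carry the positions of the request-type and response-type bits, and when both are set they override any mask assembled from the MATCH0/MATCH1 options before the result is written to MSR_OFFCORE_RESP0/1. A minimal standalone sketch of that selection; the bit positions in the example are illustrative only:

#include <stdint.h>
#include <stdio.h>

/* Build the value destined for MSR_OFFCORE_RESP0/1: one request-type and
 * one response-type bit derived from cfgBits/cmask when both are set,
 * otherwise the mask assembled from the MATCH0/MATCH1 options. */
static uint64_t offcore_response_mask(uint8_t cfgBits, uint8_t cmask,
                                      uint64_t option_mask)
{
    if (cfgBits != 0xFF && cmask != 0xFF)   /* 0xFF appears to mark "not set" */
        return (1ULL << cfgBits) | (1ULL << cmask);
    return option_mask;                     /* fall back to MATCH0/MATCH1 mask */
}

int main(void)
{
    /* Illustrative positions: request-type bit 0, response-type bit 14. */
    printf("offcore mask: 0x%llx\n",
           (unsigned long long)offcore_response_mask(0, 14, 0x0ULL));
    /* With cfgBits/cmask unset, the option-supplied mask is used instead. */
    printf("offcore mask: 0x%llx\n",
           (unsigned long long)offcore_response_mask(0xFF, 0xFF, 0x4001ULL));
    return 0;
}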
diff --git a/src/includes/perfmon_nehalemEX.h b/src/includes/perfmon_nehalemEX.h
index ea632cf..b093ba9 100644
--- a/src/includes/perfmon_nehalemEX.h
+++ b/src/includes/perfmon_nehalemEX.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_nehalemEX.h
*
- * Description: Header File of perfmon module for Nehalem EX.
+ * Description: Header File of perfmon module for Intel Nehalem EX.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -29,839 +30,1248 @@
*/
#include <perfmon_nehalemEX_events.h>
-#include <perfmon_nehalemEX_groups.h>
+#include <perfmon_nehalemEX_counters.h>
+#include <perfmon_nehalemEX_westmereEX_common.h>
+#include <error.h>
+#include <affinity.h>
-#define NUM_COUNTERS_NEHALEMEX 7
-//static int perfmon_numCountersNehalemEX = NUM_COUNTERS_NEHALEMEX;
-static int perfmon_numGroupsNehalemEX = NUM_GROUPS_NEHALEMEX;
static int perfmon_numArchEventsNehalemEX = NUM_ARCH_EVENTS_NEHALEMEX;
+static int perfmon_numCountersNehalemEX = NUM_COUNTERS_NEHALEMEX;
/* This SUCKS: There are only subtle difference between NehalemEX
- * and Westmere EX Uncore. Still one of them is that one field is
- * 1 bit shifted. Thank you Intel for this mess!!! Do you want
+ * and Westmere EX Uncore. Still one of them is that one field is
+ * 1 bit shifted. Thank you Intel for this mess!!! Do you want
* to change the register definitions for every architecture?*/
+int perfmon_init_nehalemEX(int cpu_id)
+{
+ lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+ lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+ return 0;
+}
-void perfmon_init_nehalemEX(PerfmonThread *thread)
+uint32_t nex_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
{
- uint64_t flags = 0x0ULL;
- int cpu_id = thread->processorId;
- perfmon_verbose = 1;
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
- msr_write(cpu_id, MSR_PMC0, 0x0ULL);
- msr_write(cpu_id, MSR_PMC1, 0x0ULL);
- msr_write(cpu_id, MSR_PMC2, 0x0ULL);
- msr_write(cpu_id, MSR_PMC3, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
-
- /* initialize fixed counters
- * FIXED 0: Instructions retired
- * FIXED 1: Clocks unhalted core
- * FIXED 2: Clocks unhalted ref */
- //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
- /* Preinit of PERFEVSEL registers */
- //flags |= (1<<22); /* enable flag */
- //flags |= (1<<16); /* user mode flag */
-
- /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
-
- /* Initialize uncore */
- /* MBOX */
- thread->counters[PMC7].id = 0;
- thread->counters[PMC8].id = 1;
- thread->counters[PMC9].id = 2;
- thread->counters[PMC10].id = 3;
- thread->counters[PMC11].id = 4;
- thread->counters[PMC12].id = 5;
- westmereEX_PMunits[MBOX0].ctrlRegister = MSR_M0_PMON_BOX_CTRL;
- westmereEX_PMunits[MBOX0].statusRegister = MSR_M0_PMON_BOX_STATUS;
- westmereEX_PMunits[MBOX0].ovflRegister = MSR_M0_PMON_BOX_OVF_CTRL;
-
- thread->counters[PMC13].id = 0;
- thread->counters[PMC14].id = 1;
- thread->counters[PMC15].id = 2;
- thread->counters[PMC16].id = 3;
- thread->counters[PMC17].id = 4;
- thread->counters[PMC18].id = 5;
- westmereEX_PMunits[MBOX1].ctrlRegister = MSR_M1_PMON_BOX_CTRL;
- westmereEX_PMunits[MBOX1].statusRegister = MSR_M1_PMON_BOX_STATUS;
- westmereEX_PMunits[MBOX1].ovflRegister = MSR_M1_PMON_BOX_OVF_CTRL;
-
- /* BBOX */
- thread->counters[PMC19].id = 0;
- thread->counters[PMC20].id = 1;
- thread->counters[PMC21].id = 2;
- thread->counters[PMC22].id = 3;
- westmereEX_PMunits[BBOX0].ctrlRegister = MSR_B0_PMON_BOX_CTRL;
- westmereEX_PMunits[BBOX0].statusRegister = MSR_B0_PMON_BOX_STATUS;
- westmereEX_PMunits[BBOX0].ovflRegister = MSR_B0_PMON_BOX_OVF_CTRL;
-
- thread->counters[PMC23].id = 0;
- thread->counters[PMC24].id = 1;
- thread->counters[PMC25].id = 2;
- thread->counters[PMC26].id = 3;
- westmereEX_PMunits[BBOX1].ctrlRegister = MSR_B1_PMON_BOX_CTRL;
- westmereEX_PMunits[BBOX1].statusRegister = MSR_B1_PMON_BOX_STATUS;
- westmereEX_PMunits[BBOX1].ovflRegister = MSR_B1_PMON_BOX_OVF_CTRL;
-
- /* RBOX */
- thread->counters[PMC27].id = 0;
- thread->counters[PMC28].id = 1;
- thread->counters[PMC29].id = 2;
- thread->counters[PMC30].id = 3;
- thread->counters[PMC31].id = 4;
- thread->counters[PMC32].id = 5;
- thread->counters[PMC33].id = 6;
- thread->counters[PMC34].id = 7;
- westmereEX_PMunits[RBOX0].ctrlRegister = MSR_R0_PMON_BOX_CTRL;
- westmereEX_PMunits[RBOX0].statusRegister = MSR_R0_PMON_BOX_STATUS;
- westmereEX_PMunits[RBOX0].ovflRegister = MSR_R0_PMON_BOX_OVF_CTRL;
-
- thread->counters[PMC35].id = 0;
- thread->counters[PMC36].id = 1;
- thread->counters[PMC37].id = 2;
- thread->counters[PMC38].id = 3;
- thread->counters[PMC39].id = 4;
- thread->counters[PMC40].id = 5;
- thread->counters[PMC41].id = 6;
- thread->counters[PMC42].id = 7;
- westmereEX_PMunits[RBOX1].ctrlRegister = MSR_R1_PMON_BOX_CTRL;
- westmereEX_PMunits[RBOX1].statusRegister = MSR_R1_PMON_BOX_STATUS;
- westmereEX_PMunits[RBOX1].ovflRegister = MSR_R1_PMON_BOX_OVF_CTRL;
-
- /* WBOX */
- thread->counters[PMC43].id = 0;
- thread->counters[PMC44].id = 1;
- thread->counters[PMC45].id = 2;
- thread->counters[PMC46].id = 3;
- thread->counters[PMC47].id = 31;
- westmereEX_PMunits[WBOX].ctrlRegister = MSR_W_PMON_BOX_CTRL;
- westmereEX_PMunits[WBOX].statusRegister = MSR_W_PMON_BOX_STATUS;
- westmereEX_PMunits[WBOX].ovflRegister = MSR_W_PMON_BOX_OVF_CTRL;
-
- thread->counters[PMC48].id = 0;
- westmereEX_PMunits[UBOX].ctrlRegister = MSR_U_PMON_GLOBAL_CTRL;
- westmereEX_PMunits[UBOX].statusRegister = MSR_U_PMON_GLOBAL_STATUS;
- westmereEX_PMunits[UBOX].ovflRegister = MSR_U_PMON_GLOBAL_OVF_CTRL;
-
- /* Set IDs for all CBOXes */
- for (int i=PMC49; i<=PMC88; i+= 5)
- {
- for(int j=0; j<5; j++)
+ int j;
+ uint32_t flags = (1ULL<<(1+(index*4)));
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
{
- thread->counters[i].id = j;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<(index*4));
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<(2+(index*4)));
+                break;
+ default:
+ break;
}
}
- westmereEX_PMunits[CBOX0].ctrlRegister = MSR_C0_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX0].statusRegister = MSR_C0_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX0].ovflRegister = MSR_C0_PMON_BOX_OVF_CTRL;
- westmereEX_PMunits[CBOX1].ctrlRegister = MSR_C1_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX1].statusRegister = MSR_C1_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX1].ovflRegister = MSR_C1_PMON_BOX_OVF_CTRL;
- westmereEX_PMunits[CBOX2].ctrlRegister = MSR_C2_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX2].statusRegister = MSR_C2_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX2].ovflRegister = MSR_C2_PMON_BOX_OVF_CTRL;
- westmereEX_PMunits[CBOX3].ctrlRegister = MSR_C3_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX3].statusRegister = MSR_C3_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX3].ovflRegister = MSR_C3_PMON_BOX_OVF_CTRL;
- westmereEX_PMunits[CBOX4].ctrlRegister = MSR_C4_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX4].statusRegister = MSR_C4_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX4].ovflRegister = MSR_C4_PMON_BOX_OVF_CTRL;
- westmereEX_PMunits[CBOX5].ctrlRegister = MSR_C5_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX5].statusRegister = MSR_C5_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX5].ovflRegister = MSR_C5_PMON_BOX_OVF_CTRL;
- westmereEX_PMunits[CBOX6].ctrlRegister = MSR_C6_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX6].statusRegister = MSR_C6_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX6].ovflRegister = MSR_C6_PMON_BOX_OVF_CTRL;
- westmereEX_PMunits[CBOX7].ctrlRegister = MSR_C7_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX7].statusRegister = MSR_C7_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX7].ovflRegister = MSR_C7_PMON_BOX_OVF_CTRL;
-
- thread->counters[PMC99].id = 0;
- thread->counters[PMC100].id = 1;
- thread->counters[PMC101].id = 2;
- thread->counters[PMC102].id = 3;
- westmereEX_PMunits[SBOX0].ctrlRegister = MSR_S0_PMON_BOX_CTRL;
- westmereEX_PMunits[SBOX0].statusRegister = MSR_S0_PMON_BOX_STATUS;
- westmereEX_PMunits[SBOX0].ovflRegister = MSR_S0_PMON_BOX_OVF_CTRL;
- thread->counters[PMC103].id = 0;
- thread->counters[PMC104].id = 1;
- thread->counters[PMC105].id = 2;
- thread->counters[PMC106].id = 3;
- westmereEX_PMunits[SBOX1].ctrlRegister = MSR_S1_PMON_BOX_CTRL;
- westmereEX_PMunits[SBOX1].statusRegister = MSR_S1_PMON_BOX_STATUS;
- westmereEX_PMunits[SBOX1].ovflRegister = MSR_S1_PMON_BOX_OVF_CTRL;
-
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
- lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
- {
- msr_write(cpu_id, MSR_W_PMON_BOX_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_W_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_W_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_W_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_W_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_W_PMON_FIXED_CTR, 0x0ULL);
-
- msr_write(cpu_id, MSR_M0_PMON_BOX_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL4, 0x0ULL);
- msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL5, 0x0ULL);
-
- msr_write(cpu_id, MSR_M1_PMON_BOX_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL4, 0x0ULL);
- msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL5, 0x0ULL);
-
- msr_write(cpu_id, MSR_B0_PMON_BOX_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL3, 0x0ULL);
-
- msr_write(cpu_id, MSR_B1_PMON_BOX_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL3, 0x0ULL);
-
- msr_write(cpu_id, MSR_R0_PMON_BOX_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL4, 0x0ULL);
- msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL5, 0x0ULL);
- msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL6, 0x0ULL);
- msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL7, 0x0ULL);
-
- msr_write(cpu_id, MSR_R1_PMON_BOX_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL8, 0x0ULL);
- msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL9, 0x0ULL);
- msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL10, 0x0ULL);
- msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL11, 0x0ULL);
- msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL12, 0x0ULL);
- msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL13, 0x0ULL);
- msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL14, 0x0ULL);
- msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL15, 0x0ULL);
-
- msr_write(cpu_id, MSR_U_PMON_GLOBAL_EVNT_SEL, 0x0ULL);
-
- msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL3, 0x0ULL);
-
- msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL3, 0x0ULL);
-
- flags = 0x0UL;
- flags |= (1<<29); /* reset all */
- msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, flags );
- }
+ return flags;
}
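
The helper above packs one 4-bit control field per fixed counter into MSR_PERF_FIXED_CTR_CTRL: bit 0 of each field enables kernel-mode counting, bit 1 user-mode counting, bit 2 any-thread counting. A minimal stand-alone sketch of that composition, assuming the usual 4-bit stride; fixed_ctrl_field() and the OPT_* flags are illustrative stand-ins for the EVENT_OPTION_* handling, not likwid API:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for the EVENT_OPTION_* cases handled above. */
    #define OPT_COUNT_KERNEL (1 << 0)
    #define OPT_ANYTHREAD    (1 << 1)

    /* Build the 4-bit control field for fixed counter <index>:
     * user-mode counting is always enabled, kernel/any-thread on request. */
    static uint64_t fixed_ctrl_field(int index, unsigned options)
    {
        uint64_t field = 1ULL << (1 + index * 4);      /* user mode */
        if (options & OPT_COUNT_KERNEL)
            field |= 1ULL << (index * 4);              /* kernel mode */
        if (options & OPT_ANYTHREAD)
            field |= 1ULL << (2 + index * 4);          /* any thread of the core */
        return field;
    }

    int main(void)
    {
        /* fixed counter 1 with user+kernel counting -> bits 4 and 5 set */
        printf("0x%llx\n", (unsigned long long)fixed_ctrl_field(1, OPT_COUNT_KERNEL));
        return 0;
    }
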
-/* MBOX macros */
-#define MBOX_GATE_NEHEX(NUM) \
-flags = 0x41ULL; \
-switch (event->cfgBits) \
-{ \
- case 0x00: /* primary Event */ \
- flags |= (event->eventId<<9); \
- break; \
- case 0x01: /* secondary Events */ \
-        /* TODO: fvid index is missing, defaults to 0 */ \
- flags |= (1<<7); /* toggle flag mode */ \
- flags |= (event->eventId<<19); \
- switch (event->eventId) \
- { \
- case 0x00: /* CYCLES_DSP_FILL: DSP */ \
- { \
- uint64_t dsp_flags = 0x0ULL; \
- dsp_flags |= (event->umask<<7); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags); \
- } \
- break; \
- case 0x01: /* CYCLES_SCHED_MODE: ISS */ \
- { \
- uint32_t iss_flags = 0x0UL; \
- iss_flags |= (event->umask<<4); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags); \
- } \
- break; \
- case 0x05: /* CYCLES_PGT_STATE: PGT */ \
- { \
- uint32_t pgt_flags = 0x0UL; \
- pgt_flags |= (event->umask<<6); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags); \
- } \
- break; \
- case 0x06: /* BCMD_SCHEDQ_OCCUPANCY: MAP */ \
- { \
- uint32_t map_flags = 0x0UL; \
- map_flags |= (event->umask<<6); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_MAP, map_flags); \
- } \
- break; \
- } \
- break; \
- case 0x02: /* DRAM_CMD: PLD/ISS */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t pld_flags = 0x0UL; \
- uint32_t iss_flags = 0x0UL; \
- pld_flags |= (event->umask<<8); \
- if (event->cmask != 0) \
- { \
- iss_flags |= (event->cmask<<7); \
- pld_flags |= 1; /* toggle cmd flag */ \
- } \
- msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags); \
- } \
- break; \
- case 0x03: /* DSP_FILL: DSP */ \
- flags |= (event->eventId<<9); \
- { \
- uint64_t dsp_flags = 0x0ULL; \
- dsp_flags |= (event->umask<<7); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags); \
- } \
- break; \
- case 0x04: /* DRAM_MISC: PLD */ \
- flags |= (event->eventId<<9); \
- { \
- uint64_t pld_flags = 0x0ULL; \
- switch (event->cmask) \
- { \
- case 0x0: \
- pld_flags |= (1<<16); \
- pld_flags |= (event->umask<<19); \
- break; \
- case 0x1: \
- pld_flags |= (event->umask<<18); \
- break; \
- case 0x2: \
- pld_flags |= (event->umask<<17); \
- break; \
- case 0x3: \
- pld_flags |= (event->umask<<7); \
- break; \
- } \
- msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags); \
- } \
- break; \
- case 0x05: /* FRM_TYPE: ISS */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t iss_flags = 0x0UL; \
- iss_flags |= event->umask; \
- msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags); \
- } \
- break; \
- case 0x06: /* FVC_EV0: FVC */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t fvc_flags = 0x0UL; \
- fvc_flags |= (event->umask<<11); \
- if (event->umask == 0x5) \
- { \
- fvc_flags |= (event->cmask<<5); \
- } \
- else \
- { \
- fvc_flags |= (event->cmask<<8); \
- } \
- msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags); \
- VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV0) \
- } \
- break; \
- case 0x07: /* FVC_EV1: FVC */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t fvc_flags = 0x0UL; \
- fvc_flags |= (event->umask<<14); \
- if (event->umask == 0x5) \
- { \
- fvc_flags |= (event->cmask<<5); \
- } \
- else \
- { \
- fvc_flags |= (event->cmask<<8); \
- } \
- msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags); \
- VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV1) \
- } \
- break; \
- case 0x08: /* FVC_EV2: FVC */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t fvc_flags = 0x0UL; \
- fvc_flags |= (event->umask<<17); \
- if (event->umask == 0x5) \
- { \
- fvc_flags |= (event->cmask<<5); \
- } \
- else \
- { \
- fvc_flags |= (event->cmask<<8); \
- } \
- msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags); \
- VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV2) \
- } \
- break; \
- case 0x09: /* FVC_EV3: FVC(ZDP) */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t fvc_flags = 0x0UL; \
- fvc_flags |= (event->umask<<20); \
- if (event->umask == 0x5) \
- { \
- fvc_flags |= (event->cmask<<5); \
- } \
- else \
- { \
- fvc_flags |= (event->cmask<<8); \
- } \
- msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags); \
- } \
- break; \
- case 0x0A: /* ISS_SCHED: ISS */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t iss_flags = 0x0UL; \
- iss_flags |= (event->umask<<10); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags); \
- } \
- break; \
- case 0x0B: /* PGT_PAGE_EV: PGT */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t pgt_flags = 0x0UL; \
- pgt_flags |= event->umask; \
- msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags); \
- } \
- break; \
- case 0x0C: /* PGT_PAGE_EV2: PGT */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t pgt_flags = 0x0UL; \
- pgt_flags |= (event->umask<<11); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags); \
- } \
- break; \
- case 0x0D: /* THERM_TRP_DN: THR */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t thr_flags = 0x0UL; \
- thr_flags |= (1<<3); \
- thr_flags |= (event->umask<<9); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, thr_flags); \
- } \
- break; \
-}
-
-
-void perfmon_setupCounterThread_nehalemEX(
- int thread_id,
- PerfmonEvent* event,
- PerfmonCounterIndex index)
+int nex_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
{
+ int j;
uint64_t flags = 0x0ULL;
- int haveLock = 0;
+ uint64_t offcore_flags = 0x0ULL;
uint64_t reg = counter_map[index].configRegister;
- int cpu_id = perfmon_threadData[thread_id].processorId;
- uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
- perfmon_threadData[thread_id].counters[index].init = TRUE;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ flags |= (1ULL<<22)|(1ULL<<16);
+ /* Intel with standard 8 bit event mask: [7:0] */
+ flags |= (event->umask<<8) + event->eventId;
+
+ if (event->cfgBits != 0 &&
+        ((event->eventId != 0xB7) && (event->eventId != 0xBB)))
{
- haveLock = 1;
+ /* set custom cfg and cmask */
+ flags |= ((event->cmask<<8) + event->cfgBits)<<16;
}
- switch (counter_map[index].type)
+ if (event->numberOfOptions > 0)
{
- case PMC:
- flags = (1<<22)|(1<<16);
-
- /* Intel with standard 8 bit event mask: [7:0] */
- flags |= (event->umask<<8) + event->eventId;
-
- if (event->cfgBits != 0) /* set custom cfg and cmask */
+ for (j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
{
- flags &= ~(0xFFFFU<<16); /* clear upper 16bits */
- flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<17);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL)<<24;
+ break;
+ case EVENT_OPTION_MATCH0:
+ offcore_flags |= (event->options[j].value & 0xFFULL);
+ break;
+ case EVENT_OPTION_MATCH1:
+ offcore_flags |= (event->options[j].value & 0xF7ULL)<<8;
+ break;
+ default:
+ break;
}
+ }
+ }
+ if (event->eventId == 0xB7)
+ {
+ if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+ {
+ offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+ }
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+ VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_PMC)
+ currentConfig[cpu_id][index] = flags;
+ }
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, PMC_EV_SEL)
- break;
+ return 0;
+}
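
nex_pmc_setup() composes the config value in the usual IA32_PERFEVTSELx layout: event id in bits [7:0], umask in [15:8], user/kernel enables in bits 16/17, edge detect in bit 18, enable in bit 22, invert in bit 23 and the threshold in [31:24]. A self-contained sketch of that composition under the assumption of the standard layout; the function and parameter names here are illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    /* Compose an IA32_PERFEVTSELx-style value as done in nex_pmc_setup():
     * user-mode counting and the enable bit are always set. */
    static uint64_t perfevtsel(uint8_t eventId, uint8_t umask,
                               int edge, int invert, int kernel, uint8_t threshold)
    {
        uint64_t flags = (1ULL << 22) | (1ULL << 16);   /* enable + user mode */
        flags |= ((uint64_t)umask << 8) | eventId;
        if (edge)    flags |= 1ULL << 18;
        if (invert)  flags |= 1ULL << 23;
        if (kernel)  flags |= 1ULL << 17;
        flags |= (uint64_t)threshold << 24;
        return flags;
    }

    int main(void)
    {
        /* e.g. event 0x3C, umask 0x00, no extra options */
        printf("0x%llx\n", (unsigned long long)perfevtsel(0x3C, 0x00, 0, 0, 0, 0));
        return 0;
    }
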
- case FIXED:
- fixed_flags |= (0x2<<(index*4));
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
- break;
- case MBOX0:
- if (haveLock)
+
+
+
+int nex_mbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint64_t flags = 0x41ULL;
+ uint64_t subflags1 = 0x0ULL;
+ uint64_t subflags2 = 0x0ULL;
+ int number;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+    if (((counter_map[index].configRegister & 0xFF0) == 0xCA0) ||
+        ((counter_map[index].configRegister & 0xFF0) == 0xCB0))
+ number = 0;
+ else
+ number = 1;
+
+ if (event->numberOfOptions > 0 && (event->cfgBits == 0x02 || event->cfgBits == 0x04))
+ {
+ for (int j=0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
{
- MBOX_GATE_NEHEX(0);
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, MBOX0_CTRL)
+ case EVENT_OPTION_MATCH0:
+ subflags2 = (event->options[j].value & 0x3FFFFFFFFULL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ADDR_MATCH], subflags2));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ADDR_MATCH], subflags2, SETUP_MBOX_ADDR_MATCH);
+ break;
+ case EVENT_OPTION_MASK0:
+ subflags2 = ((event->options[j].value & 0x1FFFFFFC0ULL)>>6);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ADDR_MASK], subflags2));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ADDR_MASK], subflags2, SETUP_MBOX_ADDR_MASK);
+ break;
+ default:
+ break;
}
+ }
+ subflags2 = 0x0ULL;
+ }
+ switch (event->cfgBits)
+ {
+ case 0x00:
+ flags |= (event->eventId & 0x1FULL)<<9;
break;
-
- case MBOX1:
- if (haveLock)
+ case 0x01:
+ flags |= (1ULL<<7);
+ flags |= (event->eventId & 0x7ULL)<<19;
+ switch (event->eventId)
{
- MBOX_GATE_NEHEX(1);
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, MBOX1_CTRL)
+ case 0x00:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][DSP], &subflags1));
+ subflags1 |= (event->umask & 0xFULL)<<7;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][DSP], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][DSP], subflags1, SETUP_MBOX_DSP);
+ break;
+ case 0x01:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], &subflags1));
+ subflags1 |= (event->umask & 0x7ULL)<<4;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ISS], subflags1, SETUP_MBOX_ISS);
+ break;
+ case 0x05:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], &subflags1));
+ subflags1 |= (event->umask & 0x1ULL)<<6;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PGT], subflags1, SETUP_MBOX_PGT);
+ break;
+ case 0x06:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][MAP], &subflags1));
+ subflags1 |= (event->umask & 0x7ULL)<<6;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][MAP], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][MAP], subflags1, SETUP_MBOX_MAP);
+ break;
}
break;
-
- case BBOX0:
- case BBOX1:
- if (haveLock)
+ case 0x02:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PLD], &subflags1));
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], &subflags2));
+ subflags1 |= (event->umask & 0x1FULL)<<8;
+ if ((event->cmask & 0xF0ULL) != 0)
{
- flags = 0x1ULL; /* set enable bit */
- flags |= (event->eventId<<1);
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, BBOX_CTRL)
+ subflags1 |= (1ULL<<0);
}
+ if ((event->cmask & 0xFULL) != 0)
+ {
+ subflags2 |= (event->cmask & 0x7ULL)<<7;
+ }
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PLD], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PLD], subflags1, SETUP_MBOX_PLD);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], subflags2));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ISS], subflags2, SETUP_MBOX_ISS);
break;
-
- case RBOX0:
- if (haveLock)
+ case 0x03:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][DSP], &subflags1));
+ subflags1 |= (event->umask & 0xFULL)<<7;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][DSP], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][DSP], subflags1, SETUP_MBOX_DSP);
+ break;
+ case 0x04:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PLD], &subflags1));
+ switch (event->cmask)
{
- RBOX_GATE(0);
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, RBOX0_CTRL)
+ case 0x0:
+ subflags1 |= (1ULL<<16);
+ subflags1 |= (event->umask & 0x1FULL)<<19;
+ break;
+ case 0x1:
+ subflags1 |= (event->umask & 0x1ULL)<<18;
+ break;
+ case 0x2:
+ subflags1 |= (event->umask & 0x1ULL)<<17;
+ break;
+ case 0x3:
+ subflags1 |= (event->umask & 0x1ULL)<<7;
+ break;
}
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PLD], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PLD], subflags1, SETUP_MBOX_PLD);
break;
-
- case RBOX1:
- if (haveLock)
+ case 0x05:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], &subflags1));
+ subflags1 |= (event->umask & 0xFULL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ISS], subflags1, SETUP_MBOX_ISS);
+ break;
+ case 0x06:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], &subflags1));
+ subflags1 |= (event->umask & 0x7ULL)<<12;
+ if (event->umask == 0x5)
{
- RBOX_GATE(1);
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, RBOX1_CTRL)
+ subflags1 |= (event->cmask & 0x7ULL)<<6;
}
+ else
+ {
+ subflags1 |= (event->cmask & 0x7ULL)<<9;
+ }
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ZDP], subflags1, SETUP_MBOX_ZDP);
break;
-
- case WBOX:
- if (haveLock)
+ case 0x07:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], &subflags1));
+ subflags1 |= (event->umask & 0x7ULL)<<15;
+ if (event->umask == 0x5)
{
- if (event->eventId == 0xFF) /* Fixed Counter */
- {
- flags = 0x1ULL; /* set enable bit */
- }
- else
- {
- flags |= (1<<22); /* set enable bit */
- flags |= (event->umask<<8) + event->eventId;
- }
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, WBOX_CTRL)
+ subflags1 |= (event->cmask & 0x7ULL)<<6;
}
+ else
+ {
+ subflags1 |= (event->cmask & 0x7ULL)<<9;
+ }
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ZDP], subflags1, SETUP_MBOX_ZDP);
break;
-
- case UBOX:
- if (haveLock)
+ case 0x08:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], &subflags1));
+ subflags1 |= (event->umask & 0x7ULL)<<18;
+ if (event->umask == 0x5)
{
- flags = 0x0ULL;
- flags |= (1<<22);
- flags |= event->eventId;
- fprintf(stderr, "Setup UBOX with value 0x%llx in register 0x%llx, event 0x%x \n", LLU_CAST flags, LLU_CAST reg,event->eventId);
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, UBOX_CTRL)
+ subflags1 |= (event->cmask & 0x7ULL)<<6;
}
+ else
+ {
+ subflags1 |= (event->cmask & 0x7ULL)<<9;
+ }
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ZDP], subflags1, SETUP_MBOX_ZDP);
break;
-
- case CBOX0:
- case CBOX1:
- case CBOX2:
- case CBOX3:
- case CBOX4:
- case CBOX5:
- case CBOX6:
- case CBOX7:
- if (haveLock)
- {
- flags = 0x0ULL;
- flags |= (1<<22);
- flags |= (event->umask<<8) + event->eventId;
- fprintf(stderr, "Setup CBOX with value 0x%llx in register 0x%llx, event 0x%x umask 0x%x \n", LLU_CAST flags, LLU_CAST reg,event->eventId, event->umask);
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, CBOX_CTRL)
+ case 0x09:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], &subflags1));
+ subflags1 |= (event->umask & 0x7ULL)<<21;
+ if (event->umask == 0x5)
+ {
+ subflags1 |= (event->cmask & 0x7ULL)<<6;
+ }
+ else
+ {
+ subflags1 |= (event->cmask & 0x7ULL)<<9;
}
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ZDP], subflags1, SETUP_MBOX_ZDP);
+ break;
+ case 0x0A:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], &subflags1));
+ subflags1 |= (event->umask & 0x1ULL)<<10;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ISS], subflags1, SETUP_MBOX_ISS);
+ break;
+ case 0x0B:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], &subflags1));
+ subflags1 |= (event->umask & 0x1ULL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PGT], subflags1, SETUP_MBOX_PGT);
+ break;
+ case 0x0C:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], &subflags1));
+ subflags1 |= (event->umask & 0x1ULL)<<11;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PGT], subflags1, SETUP_MBOX_PGT);
break;
- case SBOX0:
- case SBOX1:
- if (haveLock)
+ case 0x0D:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][THR], &subflags1));
+ subflags1 |= (event->umask & 0x3ULL)<<9;
+ if (event->cmask == 0x0)
+ {
+ subflags1 |= (1ULL<<3);
+ }
+ else
+ {
+ subflags1 &= ~(1ULL<<3);
+ subflags1 |= (event->cmask & 0x7ULL)<<4;
+ }
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][THR], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][THR], subflags1, SETUP_MBOX_THR);
+ break;
+ case 0x0E:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][THR], &subflags1));
+ subflags1 |= (event->umask & 0x3ULL)<<7;
+ if (event->cmask == 0x0)
+ {
+ subflags1 |= (1ULL<<3);
+ }
+ else
{
- flags = 0x0ULL;
- flags |= (1<<22);
- flags |= (event->umask<<8);
- flags |= (event->eventId);
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, SBOX_CTRL)
+ subflags1 &= ~(1ULL<<3);
+ subflags1 |= (event->cmask & 0x7ULL)<<4;
}
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][THR], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][THR], subflags1, SETUP_MBOX_THR);
break;
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_MBOX)
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
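
The MBOX instance is picked from the address of the config register (registers in the 0xCA0/0xCB0 range belong to MBOX0, the others handled here to MBOX1), and event->cfgBits then selects which sub-register (DSP, ISS, PGT, MAP, PLD, ZDP or THR) receives the umask/cmask payload. A small sketch of the instance selection only; mbox_number() is a hypothetical helper, not part of likwid:

    #include <stdint.h>
    #include <stdio.h>

    /* Derive the MBOX instance from the config register address, as in
     * nex_mbox_setup(): the 0xCA0/0xCB0 pages map to MBOX0, the rest
     * handled by this setup routine map to MBOX1. */
    static int mbox_number(uint32_t configRegister)
    {
        uint32_t page = configRegister & 0xFF0;
        return (page == 0xCA0 || page == 0xCB0) ? 0 : 1;
    }

    int main(void)
    {
        printf("%d %d\n", mbox_number(0xCA4), mbox_number(0xCC2));
        return 0;
    }
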
+
- default:
- /* should never be reached */
+
+int nex_rbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint64_t flags = 0x01ULL;
+ uint64_t subflags = 0x0ULL;
+    int number = 0;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if ((counter_map[index].configRegister & 0xFF0) == 0xE10)
+ number = 0;
+ else if ((counter_map[index].configRegister & 0xFF0) == 0xE30)
+ number = 1;
+
+ switch (event->eventId) {
+ case 0x00:
+ flags |= (event->umask & 0x1FULL)<<1;
+ subflags |= (event->cfgBits<<event->cmask);
+ switch (event->umask)
+ {
+ case 0x00:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF0][0], subflags));
+ break;
+ case 0x01:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF1][0], subflags));
+ break;
+ case 0x06:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF0][1], subflags));
+ break;
+ case 0x07:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF1][1], subflags));
+ break;
+ case 0x0C:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF0][2], subflags));
+ break;
+ case 0x0D:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF1][2], subflags));
+ break;
+ case 0x12:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF0][3], subflags));
+ break;
+ case 0x13:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF1][3], subflags));
+ break;
+ }
+ break;
+ case 0x01:
+ flags |= (event->umask & 0x1FULL)<<1;
+ subflags |= (event->cfgBits & 0xFULL);
+ if (event->cmask != 0x0)
+ {
+ subflags |= (event->cmask & 0xFULL)<<4;
+ }
+ switch (event->umask)
+ {
+ case 0x02:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][0], subflags));
+ break;
+ case 0x03:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][0], (subflags<<8)));
+ break;
+ case 0x08:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][1], subflags));
+ break;
+ case 0x09:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][1], (subflags<<8)));
+ break;
+ case 0x0E:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][2], subflags));
+ break;
+ case 0x0F:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][2], (subflags<<8)));
+ break;
+ case 0x14:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][3], subflags));
+ break;
+ case 0x15:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][3], (subflags<<8)));
+ break;
+ }
break;
}
+ if (flags != currentConfig[cpu_id][index])
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_RBOX)
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
}
+int nex_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x1ULL; /* set enable bit */
+ uint64_t reg = counter_map[index].configRegister;
+ RegisterType type = counter_map[index].type;
-/* Actions for Performance Monitoring Session:
- *
- * Core Counters (counter is always enabled in the PERFEVTSEL register):
- * 1) Disable counters in global ctrl Register MSR_PERF_GLOBAL_CTRL
- * 2) Zero the corresponding counter registers
- * 3) Set enable bit in global register flag
- * 4) Write global register flag
- *
- * Uncore Counters (only one core per socket):
- * 1) Set reset flag in global U Box control register
- * 2) Zero the corresponding counter registers
- * 3) Set enable bit in Box control register
- * 4) Write the corresponding uncore Box ctrl register
- * 5) Set enable bit in global U Box control register
- * */
-
-void perfmon_startCountersThread_nehalemEX(int thread_id)
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+
+ flags |= (event->eventId<<1);
+ if (event->numberOfOptions > 0)
+ {
+ for (j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_MATCH0:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, event->options[j].value & 0xFFFFFFFFFFFFFFFULL));
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, LLU_CAST event->options[j].value & 0xFFFFFFFFFFFFFFFULL, SETUP_BBOX_MATCH);
+ break;
+ case EVENT_OPTION_MASK0:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, event->options[j].value & 0xFFFFFFFFFFFFFFFULL));
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, LLU_CAST event->options[j].value & 0xFFFFFFFFFFFFFFFULL, SETUP_BBOX_MASK);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+ VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_BBOX);
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int nex_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
{
- int haveLock = 0;
+ int j;
uint64_t flags = 0x0ULL;
- uint32_t uflags[NUM_UNITS];
- int enable_ubox = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ uint64_t reg = counter_map[index].configRegister;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+
+ flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for (j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0x1FULL) << 24);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+ VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_CBOX);
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+int nex_wbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint64_t flags = 0x0ULL;
+ uint64_t reg = counter_map[index].configRegister;
+ int j;
- if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
{
- uint32_t ubflags = 0x0UL;
- ubflags |= (1<<29); /* reset all */
- haveLock = 1;
- // msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
- // VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags, UBOX_GLOBAL_CTRL)
+ return 0;
}
+ flags |= (1ULL<<22); /* set enable bit */
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for (j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0xFFULL) << 24);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+ VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_WBOX);
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int nex_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ int match_mask = 0;
+ uint64_t flags = 0x0ULL;
+ uint64_t reg = counter_map[index].configRegister;
+ RegisterType type = counter_map[index].type;
- for ( int i=0; i<NUM_UNITS; i++ )
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
{
- uflags[i] = 0x0UL;
+ return 0;
}
- for ( int i=0; i<NUM_PMC; i++ )
+ flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE) {
- if (westmereEX_counter_map[i].type == PMC)
+ if (event->eventId == 0x0)
+ {
+ for (j = 0; j < event->numberOfOptions; j++)
{
- msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
- flags |= (1<<(i-OFFSET_PMC)); /* enable counter */
+ if ((event->options[j].type == EVENT_OPTION_MATCH0) ||
+ (event->options[j].type == EVENT_OPTION_MASK0))
+ {
+ match_mask = 1;
+ break;
+ }
+ }
+            if (match_mask)
+            {
+ if (type == SBOX0)
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_S0_PMON_MM_CFG, 0x0ULL, CLEAR_MM_CFG);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S0_PMON_MM_CFG, 0x0ULL));
+ }
+ else
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_S1_PMON_MM_CFG, 0x0ULL, CLEAR_MM_CFG);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S1_PMON_MM_CFG, 0x0ULL));
+ }
}
- else if (westmereEX_counter_map[i].type == FIXED)
+ }
+ for (j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0xFFULL) << 24);
+ break;
+ case EVENT_OPTION_MATCH0:
+ if (event->eventId == 0x0)
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, event->options[j].value));
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, LLU_CAST event->options[j].value, SETUP_SBOX_MATCH);
+ }
+ break;
+ case EVENT_OPTION_MASK0:
+ if (event->eventId == 0x0)
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, event->options[j].value));
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, LLU_CAST event->options[j].value, SETUP_SBOX_MASK);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ if (match_mask)
+ {
+ if (type == SBOX0)
{
- msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
- flags |= (1ULL<<(i+32)); /* enable fixed counter */
+ VERBOSEPRINTREG(cpu_id, MSR_S0_PMON_MM_CFG, (1ULL<<63), SET_MM_CFG);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S0_PMON_MM_CFG, (1ULL<<63)));
}
- else if (westmereEX_counter_map[i].type > UNCORE)
+ else
{
- if(haveLock)
+ VERBOSEPRINTREG(cpu_id, MSR_S1_PMON_MM_CFG, (1ULL<<63), SET_MM_CFG);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S1_PMON_MM_CFG, (1ULL<<63)));
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+ VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_SBOX);
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
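
For SBOX match/mask events the code follows a fixed order: disarm MSR_Sx_PMON_MM_CFG by writing 0, program the match and mask registers, then arm the comparison by setting bit 63 of MM_CFG. A sketch of that sequence with the MSR writes replaced by a printing stub; all names here are illustrative, not likwid API:

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for the HPM register write, recording the access order. */
    static void write_msr_stub(const char *name, uint64_t value)
    {
        printf("%-16s <- 0x%llx\n", name, (unsigned long long)value);
    }

    /* Sketch of the SBOX match/mask programming order used above. */
    static void program_sbox_match_mask(uint64_t match, uint64_t mask)
    {
        write_msr_stub("MM_CFG",   0x0ULL);        /* disarm while reprogramming */
        write_msr_stub("MM_MATCH", match);
        write_msr_stub("MM_MASK",  mask);
        write_msr_stub("MM_CFG",   1ULL << 63);    /* arm the match/mask logic */
    }

    int main(void)
    {
        program_sbox_match_mask(0x123456ULL, 0xFF00ULL);
        return 0;
    }
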
+
+#define NEX_FREEZE_UNCORE \
+ if (haveLock && (eventSet->regTypeMask & ~(0xF))) \
+ { \
+ uint64_t tmp = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &tmp)); \
+ tmp &= ~(1ULL<<28); \
+ VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST tmp, FREEZE_UNCORE) \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, tmp)); \
+ }
+
+
+int perfmon_setupCounterThread_nehalemEX(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ int haveTileLock = 0;
+ uint64_t flags = 0x0ULL;
+ uint64_t fixed_flags = 0x0ULL;
+ uint64_t ubox_flags = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+ if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+ {
+ haveTileLock = 1;
+ }
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+
+ if (haveLock && (eventSet->regTypeMask & ~(0xFULL)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL, FREEZE_UNCORE)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL));
+ }
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX0))))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_TIMESTAMP, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_DSP, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_ISS, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_MAP, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_MSC_THR, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_PGT, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_PLD, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_ZDP, 0x0ULL));
+ }
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX1))))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_TIMESTAMP, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_DSP, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_ISS, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_MAP, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_MSC_THR, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_PGT, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_PLD, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_ZDP, 0x0ULL));
+ }
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(RBOX0))))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P0, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P1, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P2, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P3, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P0, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P1, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P2, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P3, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P0, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P1, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P2, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P3, 0x0ULL));
+ }
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(RBOX1))))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P0, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P1, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P2, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P3, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P0, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P1, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P2, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P3, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P0, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P1, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P2, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P3, 0x0ULL));
+ }
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ flags = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ uint64_t reg = counter_map[index].configRegister;
+ eventSet->events[i].threadCounter[thread_id].init = TRUE;
+ switch (type)
+ {
+ case PMC:
+ nex_pmc_setup(cpu_id, index, event);
+ break;
+
+ case FIXED:
+ fixed_flags |= nex_fixed_setup(cpu_id, index, event);
+ break;
+
+ case MBOX0:
+ case MBOX1:
+ nex_mbox_setup(cpu_id, index, event);
+ break;
+
+ case BBOX0:
+ case BBOX1:
+ nex_bbox_setup(cpu_id, index, event);
+ break;
+
+ case RBOX0:
+ case RBOX1:
+ nex_rbox_setup(cpu_id, index, event);
+ break;
+
+ case CBOX0:
+ case CBOX1:
+ case CBOX2:
+ case CBOX3:
+ case CBOX4:
+ case CBOX5:
+ case CBOX6:
+ case CBOX7:
+ case CBOX8:
+ case CBOX9:
+ nex_cbox_setup(cpu_id, index, event);
+ break;
+
+ case SBOX0:
+ case SBOX1:
+ nex_sbox_setup(cpu_id, index, event);
+ break;
+
+ case WBOX:
+ nex_wbox_setup(cpu_id, index, event);
+ break;
+
+ case WBOX0FIX:
+ if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(WBOX0FIX)))
{
- msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
- uflags[westmereEX_counter_map[i].type] |=
- (1<<(perfmon_threadData[thread_id].counters[i].id)); /* enable uncore counter */
- if (westmereEX_counter_map[i].type == UBOX)
+ flags = 0x1ULL;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+ VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_WBOXFIX)
+ eventSet->regTypeMask |= REG_TYPE_MASK(WBOX);
+ }
+ break;
+
+ case UBOX:
+ if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(UBOX)))
+ {
+ flags |= (1ULL<<22); /* set enable bit */
+ flags |= event->eventId;
+ for (int j=0;j<event->numberOfOptions;j++)
{
- enable_ubox = 1;
+ if (event->options[j].type == EVENT_OPTION_EDGE)
+ {
+ flags |= (1ULL<<18);
+ break;
+ }
}
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+ VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_UBOX);
+ ubox_flags = 0x1ULL;
}
- }
+ break;
+
+ default:
+ break;
}
}
- VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, GLOBAL_CTRL);
+ if (fixed_flags != 0x0ULL)
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
+ }
+ if (ubox_flags != 0x0ULL)
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST ubox_flags, ACTIVATE_UBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, ubox_flags));
+ }
+ return 0;
+}
- if (haveLock)
+#define NEX_RESET_ALL_UNCORE_COUNTERS \
+ if (haveLock && (eventSet->regTypeMask & ~(0xF))) \
+ { \
+ uint64_t tmp = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &tmp)); \
+ tmp |= (1ULL<<29); \
+ VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST tmp, RESET_ALL_UNCORE_COUNTERS); \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, tmp)); \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0U)); \
+ }
+
+#define NEX_UNFREEZE_UNCORE \
+ if (haveLock && (eventSet->regTypeMask & ~(0xF))) \
+ { \
+ uint64_t tmp = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &tmp)); \
+ tmp |= (1ULL<<28); \
+ VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST tmp, UNFREEZE_UNCORE); \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, tmp)); \
+ }
+
+#define NEX_UNFREEZE_BOX(id, flags) \
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) \
+ { \
+ VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, LLU_CAST flags, UNFREEZE_BOX); \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, flags)); \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ovflRegister, flags)); \
+ }
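
The freeze/unfreeze macros (NEX_FREEZE_UNCORE above and NEX_UNFREEZE_UNCORE here) are read-modify-write accesses to MSR_U_PMON_GLOBAL_CTRL: clearing bit 28 freezes all uncore boxes, setting it lets them count again, and bit 29 resets all uncore counters, matching the "enable all"/"reset all" comments in the removed code. A sketch of the pure bit manipulation with the MSR access left out; the macro and function names are illustrative:

    #include <stdint.h>
    #include <stdio.h>

    #define U_PMON_EN_ALL  (1ULL << 28)   /* cleared = frozen, set = counting */
    #define U_PMON_RST_ALL (1ULL << 29)   /* write 1 to reset all uncore counters */

    static uint64_t freeze_uncore(uint64_t ctrl)   { return ctrl & ~U_PMON_EN_ALL; }
    static uint64_t unfreeze_uncore(uint64_t ctrl) { return ctrl |  U_PMON_EN_ALL; }

    int main(void)
    {
        uint64_t ctrl = U_PMON_EN_ALL;   /* stand-in for the current MSR value */
        ctrl = freeze_uncore(ctrl);
        printf("frozen:    0x%llx\n", (unsigned long long)ctrl);
        ctrl = unfreeze_uncore(ctrl);
        printf("unfrozen:  0x%llx\n", (unsigned long long)ctrl);
        printf("reset bit: 0x%llx\n", (unsigned long long)U_PMON_RST_ALL);
        return 0;
    }
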
+
+int perfmon_startCountersThread_nehalemEX(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ uint64_t core_ctrl_flags = 0x0ULL;
+ uint32_t uflags[NUM_UNITS] = { [0 ... NUM_UNITS-1] = 0x0U };
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
- for ( int i=0; i<NUM_UNITS; i++ )
+ haveLock = 1;
+ }
+
+ NEX_RESET_ALL_UNCORE_COUNTERS;
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
-            /* if counters are enabled, write the corresponding box ctrl register */
- if (uflags[i])
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ eventSet->events[i].threadCounter[thread_id].startData = 0;
+ eventSet->events[i].threadCounter[thread_id].counterData = 0;
+ switch (type)
{
- msr_write(cpu_id, westmereEX_PMunits[i].ctrlRegister, uflags[i]);
- VERBOSEPRINTREG(cpu_id, westmereEX_PMunits[i].ctrlRegister, LLU_CAST uflags[i], BOXCTRL);
+ case PMC:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+ core_ctrl_flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+ break;
+ case FIXED:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+ core_ctrl_flags |= (1ULL<<(index+32));
+ break;
+ case WBOX0FIX:
+ if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(WBOX0FIX)))
+ {
+ uflags[WBOX] |= (1ULL<<31);
+ }
+ break;
+ default:
+ if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(counter_map[index].type)))
+ {
+ uflags[counter_map[index].type] |= (1<<getCounterTypeOffset(index));
+ }
+ break;
}
}
+ }
- /* set global enable flag in U BOX ctrl register */
- uint32_t ubflags = 0x0UL;
- ubflags |= (1<<28); /* enable all */
- if (enable_ubox)
+ if (haveLock)
+ {
+ for ( int i=0; i<NUM_UNITS; i++ )
{
- ubflags |= (1<<0);
+ if (uflags[i] != 0x0U)
+ {
+ NEX_UNFREEZE_BOX(i, uflags[i]);
+ }
}
- VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST ubflags, UBOX_GLOBAL_CTRL);
- msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
}
+
+ NEX_UNFREEZE_UNCORE;
+
/* Finally enable counters */
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
- msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST core_ctrl_flags, GLOBAL_CTRL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, core_ctrl_flags));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|core_ctrl_flags));
+ }
+ return 0;
}
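
perfmon_startCountersThread_nehalemEX() accumulates the MSR_PERF_GLOBAL_CTRL enable mask while zeroing the counters: general-purpose counters use the counter-map index minus the number of fixed counters as their bit position, and fixed counters use bit 32 plus their index. A stand-alone sketch of that mask construction, under the assumption that fixed counters occupy the first counter-map slots; all names are illustrative:

    #include <stdint.h>
    #include <stdio.h>

    /* Build the MSR_PERF_GLOBAL_CTRL enable mask as in the start routine:
     * general-purpose counter i -> bit i, fixed counter j -> bit 32+j.
     * num_fixed mirrors cpuid_info.perf_num_fixed_ctr. */
    static uint64_t global_ctrl_mask(const int *pmc_indices, int n_pmc,
                                     const int *fixed_indices, int n_fixed,
                                     int num_fixed)
    {
        uint64_t mask = 0x0ULL;
        for (int i = 0; i < n_pmc; i++)
            mask |= 1ULL << (pmc_indices[i] - num_fixed);
        for (int i = 0; i < n_fixed; i++)
            mask |= 1ULL << (fixed_indices[i] + 32);
        return mask;
    }

    int main(void)
    {
        int pmcs[]  = { 3, 4 };   /* counter-map indices of two PMCs */
        int fixed[] = { 0 };      /* counter-map index of one fixed counter */
        printf("0x%llx\n",
               (unsigned long long)global_ctrl_mask(pmcs, 2, fixed, 1, 3));
        return 0;
    }
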
-void perfmon_stopCountersThread_nehalemEX(int thread_id)
+#define NEX_CHECK_OVERFLOW(id, offset) \
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+ { \
+ uint64_t tmp = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[id].statusRegister, &tmp)); \
+ if (tmp & (1ULL<<offset)) \
+ { \
+ eventSet->events[i].threadCounter[thread_id].overflows++; \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].statusRegister, (tmp & (1ULL<<offset)))); \
+ } \
+ }
+
+#define NEX_CHECK_UNCORE_OVERFLOW(id, offset) \
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+ { \
+ uint64_t tmp = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[id].statusRegister, &tmp)); \
+ if (tmp & (1ULL<<offset)) \
+ { \
+ eventSet->events[i].threadCounter[thread_id].overflows++; \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ovflRegister, (tmp & (1ULL<<offset)))); \
+ } \
+ }
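
The overflow macros rely on the counters being monotonically increasing: if a fresh reading is smaller than the value stored from the previous read, the counter must have wrapped, and the corresponding bit in the box status register is then checked and cleared. A reduced sketch of that comparison; check_overflow() is a hypothetical helper and the status-register handling is omitted:

    #include <stdint.h>
    #include <stdio.h>

    /* Detect a counter wrap the way the NEX_CHECK_*_OVERFLOW macros do:
     * the raw value got smaller than the last stored value, so count one
     * overflow (the real code additionally checks/clears the status bit). */
    static int check_overflow(uint64_t new_raw, uint64_t last_raw, int *overflows)
    {
        if (new_raw < last_raw)
        {
            (*overflows)++;
            return 1;
        }
        return 0;
    }

    int main(void)
    {
        int ovf = 0;
        check_overflow(0x10, 0xFFFFFFFFFFFFULL, &ovf);   /* wrapped */
        printf("overflows = %d\n", ovf);
        return 0;
    }
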
+
+int perfmon_stopCountersThread_nehalemEX(int thread_id, PerfmonEventSet* eventSet)
{
int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
-
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+ uint64_t counter_result = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
- uint32_t ubflags = 0x0UL;
haveLock = 1;
- // ubflags |= (1<<29); /* reset all */
- msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
}
- for ( int i=0; i<NUM_COUNTERS_WESTMEREEX; i++ )
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, FREEZE_PMC_AND_FIXED);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+ NEX_FREEZE_UNCORE;
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- if (westmereEX_counter_map[i].type > UNCORE)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- if(haveLock)
- {
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
-
- VERBOSEPRINTREG(cpu_id, westmereEX_counter_map[i].counterRegister,
- LLU_CAST perfmon_threadData[thread_id].counters[i].counterData, READ_UNCORE);
- }
+ continue;
}
- else
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t reg = counter_map[index].configRegister;
+ switch (type)
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
-
- VERBOSEPRINTREG(cpu_id, westmereEX_counter_map[i].counterRegister,
- LLU_CAST perfmon_threadData[thread_id].counters[i].counterData, READ_CORE);
+ case PMC:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter_map[index].counterRegister, &counter_result));
+ NEX_CHECK_OVERFLOW(PMC, index-cpuid_info.perf_num_fixed_ctr);
+ VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, LLU_CAST counter_result, READ_PMC);
+ break;
+ case FIXED:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter_map[index].counterRegister, &counter_result));
+ NEX_CHECK_OVERFLOW(PMC, index+32);
+ VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, LLU_CAST counter_result, READ_FIXED);
+ break;
+ default:
+ if(haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(counter_map[index].type)))
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter_map[index].counterRegister, &counter_result));
+ NEX_CHECK_UNCORE_OVERFLOW(counter_map[index].type, getCounterTypeOffset(index));
+ VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, LLU_CAST counter_result, READ_UNCORE);
+ }
+ break;
}
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
}
}
-#if 0
- flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
- printf ("Status: 0x%llX \n", LLU_CAST flags);
- if((flags & 0x3) || (flags & (0x3ULL<<32)) )
- {
-    printf ("Overflow occurred \n");
- }
-#endif
+ return 0;
}
-void perfmon_readCountersThread_nehalemEX(int thread_id)
+int perfmon_readCountersThread_nehalemEX(int thread_id, PerfmonEventSet* eventSet)
{
int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ uint64_t counter_result = 0x0ULL;
+ uint64_t core_ctrl_flags = 0x0ULL;
if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
- for ( int i=0; i<NUM_COUNTERS_WESTMEREEX; i++ )
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &core_ctrl_flags));
+ }
+ NEX_FREEZE_UNCORE;
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- if (westmereEX_counter_map[i].type > UNCORE)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- if(haveLock)
- {
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
- }
+ continue;
}
- else
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter = counter_map[index].counterRegister;
+ switch (type)
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
+ case PMC:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+ NEX_CHECK_OVERFLOW(PMC, index-cpuid_info.perf_num_fixed_ctr);
+ VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_PMC);
+ break;
+ case FIXED:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+ NEX_CHECK_OVERFLOW(PMC, index+32);
+ VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_FIXED);
+ break;
+ default:
+ if(haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(counter_map[index].type)))
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+ NEX_CHECK_UNCORE_OVERFLOW(counter_map[index].type, getCounterTypeOffset(index));
+ VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_UNCORE);
+ }
+ break;
}
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
}
}
+
+ NEX_UNFREEZE_UNCORE;
+ if ((eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))) && (core_ctrl_flags != 0x0ULL))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, core_ctrl_flags));
+ }
+ return 0;
}
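
Both the stop and read paths truncate the raw MSR value to the counter's architectural width with field64(counter_result, 0, box_map[type].regWidth) before storing it. A hypothetical equivalent of that masking step (low_bits() is illustrative; the real width comes from box_map):

    #include <stdint.h>
    #include <stdio.h>

    /* Keep only the low <width> bits of a raw counter reading. */
    static uint64_t low_bits(uint64_t value, int width)
    {
        if (width >= 64)
            return value;
        return value & ((1ULL << width) - 1);
    }

    int main(void)
    {
        /* e.g. a 48-bit wide counter register */
        printf("0x%llx\n", (unsigned long long)low_bits(0xFFFF123456789ABCULL, 48));
        return 0;
    }
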
+int perfmon_finalizeCountersThread_nehalemEX(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ int haveTileLock = 0;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62);
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+ if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+ {
+ haveTileLock = 1;
+ }
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ uint64_t reg = counter_map[index].configRegister;
+ PciDeviceIndex dev = counter_map[index].device;
+ switch (type)
+ {
+ case PMC:
+ ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+ if ((haveTileLock) && (event->eventId == 0xB7))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+ }
+ else if ((haveTileLock) && (event->eventId == 0xBB))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+ }
+ break;
+ case FIXED:
+ ovf_values_core |= (1ULL<<(index+32));
+ break;
+ case MBOX0:
+ case MBOX1:
+ if (haveLock && ((event->cfgBits == 0x02) || (event->cfgBits == 0x04)))
+ {
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, 0x0ULL, CLEAR_MATCH0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, 0x0ULL, CLEAR_MASK0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, 0x0ULL));
+ }
+ break;
+ case SBOX0:
+ if (haveLock && (event->eventId == 0x00))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_S0_PMON_MM_CFG, 0x0ULL, CLEAR_MM_CFG);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S0_PMON_MM_CFG, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, 0x0ULL, CLEAR_MATCH0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, 0x0ULL, CLEAR_MASK0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, 0x0ULL));
+ }
+ break;
+ case SBOX1:
+ if (haveLock && (event->eventId == 0x00))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_S1_PMON_MM_CFG, 0x0ULL, CLEAR_MM_CFG);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S1_PMON_MM_CFG, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, 0x0ULL, CLEAR_MATCH0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, 0x0ULL, CLEAR_MASK0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, 0x0ULL));
+ }
+ break;
+ case BBOX0:
+ case BBOX1:
+ if (haveLock && ((event->eventId == 0x01) ||
+ (event->eventId == 0x02) ||
+ (event->eventId == 0x03) ||
+ (event->eventId == 0x04) ||
+ (event->eventId == 0x05) ||
+ (event->eventId == 0x06)))
+ {
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, 0x0ULL, CLEAR_MATCH0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, 0x0ULL, CLEAR_MASK0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, 0x0ULL));
+ }
+ break;
+ default:
+ break;
+ }
+ if ((reg) && (((dev == MSR_DEV) && (type < UNCORE)) || (((haveLock) && (type > UNCORE)))))
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+ }
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
+ }
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core, CLEAR_OVF_CTRL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, CLEAR_PMC_AND_FIXED_CTRL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+
+ if (haveLock && (eventSet->regTypeMask & ~(0xFULL)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_OVF_CTRL, 0x0ULL, CLEAR_UNCORE_OVF);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_OVF_CTRL, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL, CLEAR_UNCORE_CTRL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL));
+ }
+ return 0;
+}
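The read and finalize loops above share one filtering idiom: an event is skipped unless the bit for its register type is set in eventSet->regTypeMask. Below is a minimal, self-contained sketch of that check in plain C; the enum, REG_TYPE_MASK and the event-set layout are simplified stand-ins for illustration, not the exact likwid definitions.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the likwid types used in the loops above. */
typedef enum { FIXED = 0, PMC, MBOX0, SBOX0, NUM_TYPES } RegisterType;
#define REG_TYPE_MASK(type) (1ULL << (type))

typedef struct { RegisterType type; } Event;
typedef struct {
    uint64_t regTypeMask;      /* one bit per enabled register type */
    int      numberOfEvents;
    Event    events[8];
} EventSet;

/* Count the events a read/finalize pass would actually touch. */
static int active_events(const EventSet *set)
{
    int n = 0;
    for (int i = 0; i < set->numberOfEvents; i++)
    {
        if (!(set->regTypeMask & REG_TYPE_MASK(set->events[i].type)))
        {
            continue;          /* same skip as in the loops above */
        }
        n++;
    }
    return n;
}

int main(void)
{
    EventSet set = { REG_TYPE_MASK(PMC) | REG_TYPE_MASK(FIXED), 3,
                     { { FIXED }, { PMC }, { MBOX0 } } };
    printf("%d of %d events active\n", active_events(&set), set.numberOfEvents);
    return 0;
}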
diff --git a/src/includes/perfmon_nehalemEX_counters.h b/src/includes/perfmon_nehalemEX_counters.h
new file mode 100644
index 0000000..d40da5c
--- /dev/null
+++ b/src/includes/perfmon_nehalemEX_counters.h
@@ -0,0 +1,185 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: perfmon_nehalemEX_counters.h
+ *
+ * Description: Counter Header File of perfmon module for Intel Nehalem EX.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_CORE_NEHALEMEX 7
+#define NUM_COUNTERS_UNCORE_NEHALEMEX 105
+#define NUM_COUNTERS_NEHALEMEX 105
+
+#define NEX_VALID_OPTIONS_FIXED EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK
+#define NEX_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_THRESHOLD_MASK
+
+#define NEX_VALID_OPTIONS_MBOX EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+#define NEX_VALID_OPTIONS_BBOX EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+#define NEX_VALID_OPTIONS_CBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK
+#define NEX_VALID_OPTIONS_SBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK
+#define NEX_VALID_OPTIONS_WBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK
+
+static RegisterMap nehalemEX_counter_map[NUM_COUNTERS_NEHALEMEX] = {
+ /* Fixed Counters: instructions retired, cycles unhalted core */
+ {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, NEX_VALID_OPTIONS_FIXED},
+ {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, NEX_VALID_OPTIONS_FIXED},
+ {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, NEX_VALID_OPTIONS_FIXED},
+ /* PMC Counters: 4 48bit wide */
+ {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, NEX_VALID_OPTIONS_PMC},
+ {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, NEX_VALID_OPTIONS_PMC},
+ {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, NEX_VALID_OPTIONS_PMC},
+ {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, NEX_VALID_OPTIONS_PMC},
+ /* MBOX */
+ {"MBOX0C0",PMC7, MBOX0, MSR_M0_PMON_EVNT_SEL0, MSR_M0_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_MBOX},
+ {"MBOX0C1",PMC8, MBOX0, MSR_M0_PMON_EVNT_SEL1, MSR_M0_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_MBOX},
+ {"MBOX0C2",PMC9, MBOX0, MSR_M0_PMON_EVNT_SEL2, MSR_M0_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_MBOX},
+ {"MBOX0C3",PMC10, MBOX0, MSR_M0_PMON_EVNT_SEL3, MSR_M0_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_MBOX},
+ {"MBOX0C4",PMC11, MBOX0, MSR_M0_PMON_EVNT_SEL4, MSR_M0_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_MBOX},
+ {"MBOX0C5",PMC12, MBOX0, MSR_M0_PMON_EVNT_SEL5, MSR_M0_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_MBOX},
+ {"MBOX1C0",PMC13, MBOX1, MSR_M1_PMON_EVNT_SEL0, MSR_M1_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_MBOX},
+ {"MBOX1C1",PMC14, MBOX1, MSR_M1_PMON_EVNT_SEL1, MSR_M1_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_MBOX},
+ {"MBOX1C2",PMC15, MBOX1, MSR_M1_PMON_EVNT_SEL2, MSR_M1_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_MBOX},
+ {"MBOX1C3",PMC16, MBOX1, MSR_M1_PMON_EVNT_SEL3, MSR_M1_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_MBOX},
+ {"MBOX1C4",PMC17, MBOX1, MSR_M1_PMON_EVNT_SEL4, MSR_M1_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_MBOX},
+ {"MBOX1C5",PMC18, MBOX1, MSR_M1_PMON_EVNT_SEL5, MSR_M1_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_MBOX},
+ /* BBOX */
+ {"BBOX0C0",PMC19, BBOX0, MSR_B0_PMON_EVNT_SEL0, MSR_B0_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_BBOX},
+ {"BBOX0C1",PMC20, BBOX0, MSR_B0_PMON_EVNT_SEL1, MSR_B0_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_BBOX},
+ {"BBOX0C2",PMC21, BBOX0, MSR_B0_PMON_EVNT_SEL2, MSR_B0_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_BBOX},
+ {"BBOX0C3",PMC22, BBOX0, MSR_B0_PMON_EVNT_SEL3, MSR_B0_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_BBOX},
+ {"BBOX1C0",PMC23, BBOX1, MSR_B1_PMON_EVNT_SEL0, MSR_B1_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_BBOX},
+ {"BBOX1C1",PMC24, BBOX1, MSR_B1_PMON_EVNT_SEL1, MSR_B1_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_BBOX},
+ {"BBOX1C2",PMC25, BBOX1, MSR_B1_PMON_EVNT_SEL2, MSR_B1_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_BBOX},
+ {"BBOX1C3",PMC26, BBOX1, MSR_B1_PMON_EVNT_SEL3, MSR_B1_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_BBOX},
+ /* RBOX */
+ {"RBOX0C0",PMC27, RBOX0, MSR_R0_PMON_EVNT_SEL0, MSR_R0_PMON_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX0C1",PMC28, RBOX0, MSR_R0_PMON_EVNT_SEL1, MSR_R0_PMON_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX0C2",PMC29, RBOX0, MSR_R0_PMON_EVNT_SEL2, MSR_R0_PMON_CTR2, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX0C3",PMC30, RBOX0, MSR_R0_PMON_EVNT_SEL3, MSR_R0_PMON_CTR3, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX0C4",PMC31, RBOX0, MSR_R0_PMON_EVNT_SEL4, MSR_R0_PMON_CTR4, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX0C5",PMC32, RBOX0, MSR_R0_PMON_EVNT_SEL5, MSR_R0_PMON_CTR5, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX0C6",PMC33, RBOX0, MSR_R0_PMON_EVNT_SEL6, MSR_R0_PMON_CTR6, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX0C7",PMC34, RBOX0, MSR_R0_PMON_EVNT_SEL7, MSR_R0_PMON_CTR7, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX1C0",PMC35, RBOX1, MSR_R1_PMON_EVNT_SEL8, MSR_R1_PMON_CTR8, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX1C1",PMC36, RBOX1, MSR_R1_PMON_EVNT_SEL9, MSR_R1_PMON_CTR9, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX1C2",PMC37, RBOX1, MSR_R1_PMON_EVNT_SEL10, MSR_R1_PMON_CTR10, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX1C3",PMC38, RBOX1, MSR_R1_PMON_EVNT_SEL11, MSR_R1_PMON_CTR11, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX1C4",PMC39, RBOX1, MSR_R1_PMON_EVNT_SEL12, MSR_R1_PMON_CTR12, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX1C5",PMC40, RBOX1, MSR_R1_PMON_EVNT_SEL13, MSR_R1_PMON_CTR13, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX1C6",PMC41, RBOX1, MSR_R1_PMON_EVNT_SEL14, MSR_R1_PMON_CTR14, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX1C7",PMC42, RBOX1, MSR_R1_PMON_EVNT_SEL15, MSR_R1_PMON_CTR15, 0, 0, EVENT_OPTION_NONE_MASK},
+ /* WBOX */
+ {"WBOX0",PMC43, WBOX, MSR_W_PMON_EVNT_SEL0, MSR_W_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_WBOX},
+ {"WBOX1",PMC44, WBOX, MSR_W_PMON_EVNT_SEL1, MSR_W_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_WBOX},
+ {"WBOX2",PMC45, WBOX, MSR_W_PMON_EVNT_SEL2, MSR_W_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_WBOX},
+ {"WBOX3",PMC46, WBOX, MSR_W_PMON_EVNT_SEL3, MSR_W_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_WBOX},
+ {"WBOXFIX",PMC47, WBOX0FIX, MSR_W_PMON_FIXED_CTR_CTL, MSR_W_PMON_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+ /* UBOX */
+ {"UBOX0",PMC48, UBOX, MSR_U_PMON_GLOBAL_EVNT_SEL, MSR_U_PMON_GLOBAL_CTR, 0, 0, EVENT_OPTION_EDGE_MASK},
+ /* CBOXes */
+ {"CBOX0C0",PMC49, CBOX0, MSR_C0_PMON_EVNT_SEL0, MSR_C0_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX0C1",PMC50, CBOX0, MSR_C0_PMON_EVNT_SEL1, MSR_C0_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX0C2",PMC51, CBOX0, MSR_C0_PMON_EVNT_SEL2, MSR_C0_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX0C3",PMC52, CBOX0, MSR_C0_PMON_EVNT_SEL3, MSR_C0_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX0C4",PMC53, CBOX0, MSR_C0_PMON_EVNT_SEL4, MSR_C0_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX0C5",PMC54, CBOX0, MSR_C0_PMON_EVNT_SEL5, MSR_C0_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX1C0",PMC55, CBOX1, MSR_C1_PMON_EVNT_SEL0, MSR_C1_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX1C1",PMC56, CBOX1, MSR_C1_PMON_EVNT_SEL1, MSR_C1_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX1C2",PMC57, CBOX1, MSR_C1_PMON_EVNT_SEL2, MSR_C1_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX1C3",PMC58, CBOX1, MSR_C1_PMON_EVNT_SEL3, MSR_C1_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX1C4",PMC59, CBOX1, MSR_C1_PMON_EVNT_SEL4, MSR_C1_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX1C5",PMC60, CBOX1, MSR_C1_PMON_EVNT_SEL5, MSR_C1_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX2C0",PMC61, CBOX2, MSR_C2_PMON_EVNT_SEL0, MSR_C2_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX2C1",PMC62, CBOX2, MSR_C2_PMON_EVNT_SEL1, MSR_C2_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX2C2",PMC63, CBOX2, MSR_C2_PMON_EVNT_SEL2, MSR_C2_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX2C3",PMC64, CBOX2, MSR_C2_PMON_EVNT_SEL3, MSR_C2_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX2C4",PMC65, CBOX2, MSR_C2_PMON_EVNT_SEL4, MSR_C2_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX2C5",PMC66, CBOX2, MSR_C2_PMON_EVNT_SEL5, MSR_C2_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX3C0",PMC67, CBOX3, MSR_C3_PMON_EVNT_SEL0, MSR_C3_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX3C1",PMC68, CBOX3, MSR_C3_PMON_EVNT_SEL1, MSR_C3_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX3C2",PMC69, CBOX3, MSR_C3_PMON_EVNT_SEL2, MSR_C3_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX3C3",PMC70, CBOX3, MSR_C3_PMON_EVNT_SEL3, MSR_C3_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX3C4",PMC71, CBOX3, MSR_C3_PMON_EVNT_SEL4, MSR_C3_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX3C5",PMC72, CBOX3, MSR_C3_PMON_EVNT_SEL5, MSR_C3_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX4C0",PMC73, CBOX4, MSR_C4_PMON_EVNT_SEL0, MSR_C4_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX4C1",PMC74, CBOX4, MSR_C4_PMON_EVNT_SEL1, MSR_C4_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX4C2",PMC75, CBOX4, MSR_C4_PMON_EVNT_SEL2, MSR_C4_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX4C3",PMC76, CBOX4, MSR_C4_PMON_EVNT_SEL3, MSR_C4_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX4C4",PMC77, CBOX4, MSR_C4_PMON_EVNT_SEL4, MSR_C4_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX4C5",PMC78, CBOX4, MSR_C4_PMON_EVNT_SEL5, MSR_C4_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX5C0",PMC79, CBOX5, MSR_C5_PMON_EVNT_SEL0, MSR_C5_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX5C1",PMC80, CBOX5, MSR_C5_PMON_EVNT_SEL1, MSR_C5_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX5C2",PMC81, CBOX5, MSR_C5_PMON_EVNT_SEL2, MSR_C5_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX5C3",PMC82, CBOX5, MSR_C5_PMON_EVNT_SEL3, MSR_C5_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX5C4",PMC83, CBOX5, MSR_C5_PMON_EVNT_SEL4, MSR_C5_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX5C5",PMC84, CBOX5, MSR_C5_PMON_EVNT_SEL5, MSR_C5_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX6C0",PMC85, CBOX6, MSR_C6_PMON_EVNT_SEL0, MSR_C6_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX6C1",PMC86, CBOX6, MSR_C6_PMON_EVNT_SEL1, MSR_C6_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX6C2",PMC87, CBOX6, MSR_C6_PMON_EVNT_SEL2, MSR_C6_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX6C3",PMC88, CBOX6, MSR_C6_PMON_EVNT_SEL3, MSR_C6_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX6C4",PMC89, CBOX6, MSR_C6_PMON_EVNT_SEL4, MSR_C6_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX6C4",PMC90, CBOX6, MSR_C6_PMON_EVNT_SEL5, MSR_C6_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX7C0",PMC91, CBOX7, MSR_C7_PMON_EVNT_SEL0, MSR_C7_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX7C1",PMC92, CBOX7, MSR_C7_PMON_EVNT_SEL1, MSR_C7_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX7C2",PMC93, CBOX7, MSR_C7_PMON_EVNT_SEL2, MSR_C7_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX7C3",PMC94, CBOX7, MSR_C7_PMON_EVNT_SEL3, MSR_C7_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX7C4",PMC95, CBOX7, MSR_C7_PMON_EVNT_SEL4, MSR_C7_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ {"CBOX7C5",PMC96, CBOX7, MSR_C7_PMON_EVNT_SEL5, MSR_C7_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+ /* SBOXes */
+ {"SBOX0C0",PMC97, SBOX0, MSR_S0_PMON_EVNT_SEL0, MSR_S0_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_SBOX},
+ {"SBOX0C1",PMC98, SBOX0, MSR_S0_PMON_EVNT_SEL1, MSR_S0_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_SBOX},
+ {"SBOX0C2",PMC99, SBOX0, MSR_S0_PMON_EVNT_SEL2, MSR_S0_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_SBOX},
+ {"SBOX0C3",PMC100, SBOX0, MSR_S0_PMON_EVNT_SEL3, MSR_S0_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_SBOX},
+ {"SBOX1C0",PMC101, SBOX1, MSR_S1_PMON_EVNT_SEL0, MSR_S1_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_SBOX},
+ {"SBOX1C1",PMC102, SBOX1, MSR_S1_PMON_EVNT_SEL1, MSR_S1_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_SBOX},
+ {"SBOX1C2",PMC103, SBOX1, MSR_S1_PMON_EVNT_SEL2, MSR_S1_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_SBOX},
+ {"SBOX1C3",PMC104, SBOX1, MSR_S1_PMON_EVNT_SEL3, MSR_S1_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_SBOX}
+};
+
+
+static BoxMap nehalemEX_box_map[NUM_UNITS] = {
+ [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+ [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+ [MBOX0] = {MSR_M0_PMON_BOX_CTRL, MSR_M0_PMON_BOX_STATUS, MSR_M0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_M0_PMON_ADDR_MATCH, MSR_M0_PMON_ADDR_MASK},
+ [MBOX1] = {MSR_M1_PMON_BOX_CTRL, MSR_M1_PMON_BOX_STATUS, MSR_M1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_M1_PMON_ADDR_MATCH, MSR_M1_PMON_ADDR_MASK},
+ [BBOX0] = {MSR_B0_PMON_BOX_CTRL, MSR_B0_PMON_BOX_STATUS, MSR_B0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_B0_PMON_MATCH,MSR_B0_PMON_MASK},
+ [BBOX1] = {MSR_B1_PMON_BOX_CTRL, MSR_B1_PMON_BOX_STATUS, MSR_B1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_B1_PMON_MATCH,MSR_B1_PMON_MASK},
+ [RBOX0] = {MSR_R0_PMON_BOX_CTRL, MSR_R0_PMON_BOX_STATUS, MSR_R0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [RBOX1] = {MSR_R1_PMON_BOX_CTRL, MSR_R1_PMON_BOX_STATUS, MSR_R1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [SBOX0] = {MSR_S0_PMON_BOX_CTRL, MSR_S0_PMON_BOX_STATUS, MSR_S0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_S0_PMON_MATCH, MSR_S0_PMON_MASK},
+ [SBOX1] = {MSR_S1_PMON_BOX_CTRL, MSR_S1_PMON_BOX_STATUS, MSR_S1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_S1_PMON_MATCH, MSR_S1_PMON_MASK},
+ [CBOX0] = {MSR_C0_PMON_BOX_CTRL, MSR_C0_PMON_BOX_STATUS, MSR_C0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [CBOX1] = {MSR_C1_PMON_BOX_CTRL, MSR_C1_PMON_BOX_STATUS, MSR_C1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [CBOX2] = {MSR_C2_PMON_BOX_CTRL, MSR_C2_PMON_BOX_STATUS, MSR_C2_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [CBOX3] = {MSR_C3_PMON_BOX_CTRL, MSR_C3_PMON_BOX_STATUS, MSR_C3_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [CBOX4] = {MSR_C4_PMON_BOX_CTRL, MSR_C4_PMON_BOX_STATUS, MSR_C4_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [CBOX5] = {MSR_C5_PMON_BOX_CTRL, MSR_C5_PMON_BOX_STATUS, MSR_C5_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [CBOX6] = {MSR_C6_PMON_BOX_CTRL, MSR_C6_PMON_BOX_STATUS, MSR_C6_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [CBOX7] = {MSR_C7_PMON_BOX_CTRL, MSR_C7_PMON_BOX_STATUS, MSR_C7_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [WBOX] = {MSR_W_PMON_BOX_CTRL, MSR_W_PMON_BOX_STATUS, MSR_W_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [WBOX0FIX] = {MSR_W_PMON_BOX_CTRL, MSR_W_PMON_BOX_STATUS, MSR_W_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [UBOX] = {MSR_U_PMON_GLOBAL_CTRL, MSR_U_PMON_GLOBAL_STATUS, MSR_U_PMON_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+};
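Every BoxMap entry above ends with a register width of 48 bits, and the read path truncates each raw MSR value to that width via field64(counter_result, 0, box_map[type].regWidth). The following short sketch shows what such an extraction amounts to; extract_bits is a generic stand-in written for illustration, not likwid's actual field64 helper.

#include <stdint.h>
#include <stdio.h>

/* Keep `length` bits of `value` starting at bit `start` (0 < length < 64). */
static inline uint64_t extract_bits(uint64_t value, int start, int length)
{
    return (value >> start) & (~0ULL >> (64 - length));
}

int main(void)
{
    uint64_t raw = 0xFFFF123456789ABCULL;   /* pretend raw counter read */
    printf("0x%llx\n", (unsigned long long)extract_bits(raw, 0, 48));
    /* prints 0x123456789abc: only the lower 48 counter bits survive */
    return 0;
}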
diff --git a/src/includes/perfmon_nehalemEX_events.txt b/src/includes/perfmon_nehalemEX_events.txt
index 565f5ca..1c4cf31 100644
--- a/src/includes/perfmon_nehalemEX_events.txt
+++ b/src/includes/perfmon_nehalemEX_events.txt
@@ -1,16 +1,17 @@
# =======================================================================================
-#
+#
# Filename: perfmon_nehalemEX_events.txt
-#
-# Description: Event list for Intel NehalemEX
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
+#
+# Description: Event list for Intel Nehalem EX
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -69,12 +70,6 @@ UMASK_DTLB_LOAD_MISSES_PDE_MISS 0x20
UMASK_DTLB_LOAD_MISSES_PDP_MISS 0x40
UMASK_DTLB_LOAD_MISSES_LARGE_WALK_COMPLETED 0x80
-EVENT_MEMORY_DISAMBIGURATION 0x09 PMC
-UMASK_MEMORY_DISAMBIGURATION_RESET 0x01
-UMASK_MEMORY_DISAMBIGURATION_SUCCESS 0x01
-UMASK_MEMORY_DISAMBIGURATION_WATCHDOG 0x01
-UMASK_MEMORY_DISAMBIGURATION_WATCH_CYCLES 0x01
-
EVENT_MEM_INST_RETIRED 0x0B PMC
UMASK_MEM_INST_RETIRED_LOADS 0x01
UMASK_MEM_INST_RETIRED_STORES 0x02
@@ -84,8 +79,8 @@ EVENT_MEM_STORE_RETIRED_DTLB 0x0C PMC
UMASK_MEM_STORE_RETIRED_DTLB_MISS 0x01
EVENT_UOPS_ISSUED 0x0E PMC
-UMASK_UOPS_ISSUED_ANY 0x01
-UMASK_UOPS_ISSUED_FUSED 0x02
+UMASK_UOPS_ISSUED_ANY 0x01
+UMASK_UOPS_ISSUED_FUSED 0x02
EVENT_MEM_UNCORE_RETIRED 0x0F PMC
UMASK_MEM_UNCORE_RETIRED_OTHER_CORE_L2_HITM 0x02
@@ -519,8 +514,12 @@ UMASK_SIMD_INT_64_PACKED_LOGICAL 0x10
UMASK_SIMD_INT_64_PACKED_ARITH 0x20
UMASK_SIMD_INT_64_SHUFFLE_MOVE 0x40
-EVENT_UNCORE_CYCLES 0xFF WBOX4
-UMASK_UNCORE_CYCLES 0x00
+EVENT_OFFCORE_RESPONSE_0 0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS 0x01 0xFF 0xFF
+
+EVENT_UNCORE_CLOCK 0xFF WBOXFIX
+UMASK_UNCORE_CLOCK 0x00
EVENT_C_CYCLES_TURBO 0x04 WBOX
UMASK_C_CYCLES_TURBO_C0 0x01
@@ -592,40 +591,32 @@ UMASK_TM1_ON_C7 0x80
UMASK_TM1_ON_C_ALL 0xFF
EVENT_BBOX_CMDS_ALL 0x1A MBOX
-UMASK_BBOX_CMDS_ALL 0xFF
+UMASK_BBOX_CMDS_ALL 0x00 0x00 0x00
-EVENT_BCMD_SCHEDQ_OCCUPANCY 0x06 MBOX
-UMASK_BCMD_SCHEDQ_OCCUPANCY_READS 0x00 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_WRITES 0x01 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_MERGE 0x02 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_V2F 0x03 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_V2V 0x04 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_F2V 0x05 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_F2B 0x06 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_SPRWR 0x07 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_ALL 0x08 0x01 0x00
+EVENT_REFRESH 0x06 MBOX
+UMASK_REFRESH 0x00 0x00 0x00
-EVENT_BBOX_CYCLES 0x1B MBOX
-UMASK_BBOX_CYCLES 0xFF
+EVENT_MBOX_CLOCKTICKS 0x1B MBOX
+UMASK_MBOX_CLOCKTICKS 0x00 0x00 0x00
-EVENT_CYCLES_DSP_FILL 0x00 MBOX
-UMASK_CYCLES_DSP_FILL_RDQ_FULL 0x01 0x01 0x00
-UMASK_CYCLES_DSP_FILL_WRQ_FULL 0x02 0x01 0x00
-UMASK_CYCLES_DSP_FILL_RDQ_EMPTY 0x04 0x01 0x00
-UMASK_CYCLES_DSP_FILL_WRQ_EMPTY 0x08 0x01 0x00
+EVENT_CYCLES_DSP_FILL 0x00 MBOX
+UMASK_CYCLES_DSP_FILL_RDQ_FULL 0x01 0x01 0x00
+UMASK_CYCLES_DSP_FILL_WRQ_FULL 0x02 0x01 0x00
+UMASK_CYCLES_DSP_FILL_RDQ_EMPTY 0x04 0x01 0x00
+UMASK_CYCLES_DSP_FILL_WRQ_EMPTY 0x08 0x01 0x00
-EVENT_CYCLES_MFULL 0x01 MBOX
-UMASK_CYCLES_MFULL 0x00 0x01 0x00
+EVENT_CYCLES_MFULL 0x01 MBOX
+UMASK_CYCLES_MFULL 0x00 0x00 0x00
-EVENT_CYCLES_PGT_STATE 0x05 MBOX
-UMASK_CYCLES_PGT_STATE_CLOSED 0x00 0x01 0x00
-UMASK_CYCLES_PGT_STATE_OPEN 0x01 0x01 0x00
+EVENT_CYCLES_PGT_STATE 0x05 MBOX
+UMASK_CYCLES_PGT_STATE_CLOSED 0x00 0x01 0x00
+UMASK_CYCLES_PGT_STATE_OPEN 0x01 0x01 0x00
-EVENT_CYCLES_RETRYQ_STARVED 0x04 MBOX
-UMASK_CYCLES_RETRYQ_STARVED 0x00 0x01 0x00
+EVENT_CYCLES_RETRYQ_STARVED 0x04 MBOX
+UMASK_CYCLES_RETRYQ_STARVED 0x00 0x01 0x00
-EVENT_CYCLES_RETRYQ_MFULL 0x03 MBOX
-UMASK_CYCLES_RETRYQ_MFULL 0x00 0x01 0x00
+EVENT_CYCLES_RETRYQ_MFULL 0x03 MBOX
+UMASK_CYCLES_RETRYQ_MFULL 0x00 0x01 0x00
EVENT_CYCLES_SCHED_MODE 0x01 MBOX
UMASK_CYCLES_SCHED_MODE_TRADEOFF 0x00 0x01 0x00
@@ -634,34 +625,35 @@ UMASK_CYCLES_SCHED_MODE_WRPRIO 0x02 0x01 0x00
UMASK_CYCLES_SCHED_MODE_ADAPTIVE 0x03 0x01 0x00
EVENT_DRAM_CMD 0x0A MBOX
+OPTIONS_DRAM_CMD EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
UMASK_DRAM_CMD_ALL 0x00 0x02 0x00
-UMASK_DRAM_CMD_ILLEGAL 0x01 0x02 0x00
+UMASK_DRAM_CMD_ILLEGAL 0x00 0x02 0x00
UMASK_DRAM_CMD_PREALL 0x01 0x02 0x00
-UMASK_DRAM_CMD_PREALL_TRDOFF 0x01 0x02 0x00
-UMASK_DRAM_CMD_PREALL_RDPRIO 0x01 0x02 0x01
-UMASK_DRAM_CMD_PREALL_WRPRIO 0x01 0x02 0x02
-UMASK_DRAM_CMD_PREALL_ADAPTIVE 0x01 0x02 0x02
+UMASK_DRAM_CMD_PREALL_TRDOFF 0x01 0x02 0x10
+UMASK_DRAM_CMD_PREALL_RDPRIO 0x01 0x02 0x11
+UMASK_DRAM_CMD_PREALL_WRPRIO 0x01 0x02 0x12
+UMASK_DRAM_CMD_PREALL_ADAPTIVE 0x01 0x02 0x13
UMASK_DRAM_CMD_RAS 0x02 0x02 0x00
-UMASK_DRAM_CMD_RAS_TRDOFF 0x02 0x02 0x00
-UMASK_DRAM_CMD_RAS_RDPRIO 0x02 0x02 0x01
-UMASK_DRAM_CMD_RAS_WRPRIO 0x02 0x02 0x02
-UMASK_DRAM_CMD_RAS_ADAPTIVE 0x02 0x02 0x02
-UMASK_DRAM_CMD_CAS_RD_OPN 0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN 0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN_TRDOFF 0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN_RDPRIO 0x04 0x02 0x01
-UMASK_DRAM_CMD_CAS_WR_OPN_WRPRIO 0x04 0x02 0x02
-UMASK_DRAM_CMD_CAS_WR_OPN_ADAPTIVE 0x04 0x02 0x03
+UMASK_DRAM_CMD_RAS_TRDOFF 0x02 0x02 0x10
+UMASK_DRAM_CMD_RAS_RDPRIO 0x02 0x02 0x11
+UMASK_DRAM_CMD_RAS_WRPRIO 0x02 0x02 0x12
+UMASK_DRAM_CMD_RAS_ADAPTIVE 0x02 0x02 0x13
+UMASK_DRAM_CMD_CAS_RD_OPN 0x03 0x02 0x00
+UMASK_DRAM_CMD_CAS_WR_OPN 0x04 0x02 0x00
+UMASK_DRAM_CMD_CAS_WR_OPN_TRDOFF 0x04 0x02 0x10
+UMASK_DRAM_CMD_CAS_WR_OPN_RDPRIO 0x04 0x02 0x11
+UMASK_DRAM_CMD_CAS_WR_OPN_WRPRIO 0x04 0x02 0x12
+UMASK_DRAM_CMD_CAS_WR_OPN_ADAPTIVE 0x04 0x02 0x13
UMASK_DRAM_CMD_CAS_RD_CLS 0x05 0x02 0x00
-UMASK_DRAM_CMD_CAS_RD_CLS_TRDOFF 0x05 0x02 0x00
-UMASK_DRAM_CMD_CAS_RD_CLS_RDPRIO 0x05 0x02 0x01
-UMASK_DRAM_CMD_CAS_RD_CLS_WRPRIO 0x05 0x02 0x02
-UMASK_DRAM_CMD_CAS_RD_CLS_ADAPTIVE 0x05 0x02 0x03
+UMASK_DRAM_CMD_CAS_RD_CLS_TRDOFF 0x05 0x02 0x10
+UMASK_DRAM_CMD_CAS_RD_CLS_RDPRIO 0x05 0x02 0x11
+UMASK_DRAM_CMD_CAS_RD_CLS_WRPRIO 0x05 0x02 0x12
+UMASK_DRAM_CMD_CAS_RD_CLS_ADAPTIVE 0x05 0x02 0x13
UMASK_DRAM_CMD_CAS_WR_CLS 0x06 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_CLS_TRDOFF 0x06 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_CLS_RDPRIO 0x06 0x02 0x01
-UMASK_DRAM_CMD_CAS_WR_CLS_WRPRIO 0x06 0x02 0x02
-UMASK_DRAM_CMD_CAS_WR_CLS_ADAPTIVE 0x06 0x02 0x03
+UMASK_DRAM_CMD_CAS_WR_CLS_TRDOFF 0x06 0x02 0x10
+UMASK_DRAM_CMD_CAS_WR_CLS_RDPRIO 0x06 0x02 0x11
+UMASK_DRAM_CMD_CAS_WR_CLS_WRPRIO 0x06 0x02 0x12
+UMASK_DRAM_CMD_CAS_WR_CLS_ADAPTIVE 0x06 0x02 0x13
UMASK_DRAM_CMD_MRS 0x07 0x02 0x00
UMASK_DRAM_CMD_RFR 0x09 0x02 0x00
UMASK_DRAM_CMD_ENSR 0x0A 0x02 0x00
@@ -688,33 +680,16 @@ UMASK_DSP_FILL_WRQ_FULL 0x02 0x03 0x00
UMASK_DSP_FILL_RDQ_EMPTY 0x04 0x03 0x00
UMASK_DSP_FILL_WRQ_EMPTY 0x08 0x03 0x00
-EVENT_DRAM_MISC 0x0B MBOX
-UMASK_DRAM_MISC_RETRIES_ALL 0x00 0x04 0x03
-UMASK_DRAM_MISC_RETRIES_FVID 0x01 0x04 0x03
-UMASK_DRAM_MISC_VALID 0x01 0x04 0x02
-UMASK_DRAM_MISC_NON_NOP_TRKL 0x01 0x04 0x01
-
-UMASK_DRAM_MISC_ILLEGAL 0x00 0x04 0x00
-UMASK_DRAM_MISC_PREALL 0x01 0x04 0x00
-UMASK_DRAM_MISC_RAS 0x02 0x04 0x00
-UMASK_DRAM_MISC_CAS_RD_OPN 0x03 0x04 0x00
-UMASK_DRAM_MISC_CAS_WR_OPN 0x04 0x04 0x00
-UMASK_DRAM_MISC_CAS_RD_CLS 0x05 0x04 0x00
-UMASK_DRAM_MISC_CAS_WR_CLS 0x06 0x04 0x00
-UMASK_DRAM_MISC_MRS 0x07 0x04 0x00
-UMASK_DRAM_MISC_RFR 0x09 0x04 0x00
-UMASK_DRAM_MISC_ENSR 0x0A 0x04 0x00
-UMASK_DRAM_MISC_EXSR 0x0B 0x04 0x00
-UMASK_DRAM_MISC_NOP 0x0C 0x04 0x00
-UMASK_DRAM_MISC_TRKL 0x10 0x04 0x00
-UMASK_DRAM_MISC_PRE 0x11 0x04 0x00
-UMASK_DRAM_MISC_SYNC 0x12 0x04 0x00
-UMASK_DRAM_MISC_CKE_HI 0x14 0x04 0x00
-UMASK_DRAM_MISC_CKE_LO 0x15 0x04 0x00
-UMASK_DRAM_MISC_SOFT_RST 0x17 0x04 0x00
-UMASK_DRAM_MISC_WR_CFG 0x1C 0x04 0x00
-UMASK_DRAM_MISC_RD_CFG 0x1D 0x04 0x00
-UMASK_DRAM_MISC_ZQCAL 0x1E 0x04 0x00
+EVENT_BCMD_SCHEDQ_OCCUPANCY 0x06 MBOX
+UMASK_BCMD_SCHEDQ_OCCUPANCY_READS 0x00 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_WRITES 0x01 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_MERGE 0x02 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_V2F 0x03 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_V2V 0x04 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_F2V 0x05 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_F2B 0x06 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_SPRWR 0x07 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_ALL 0x08 0x01 0x00
EVENT_FRM_TYPE 0x09 MBOX
UMASK_FRM_TYPE_3CMD 0x00 0x05 0x00
@@ -750,12 +725,12 @@ UMASK_FVC_EV1_FAST_RESET 0x04 0x07 0x00
UMASK_FVC_EV1_BBOX_CMDS_READS 0x05 0x07 0x00
UMASK_FVC_EV1_BBOX_CMDS_WRITES 0x05 0x07 0x01
UMASK_FVC_EV1_BBOX_RSP_ACK 0x06 0x07 0x00
-UMASK_FVC_EV1_BBOX_RSP_RETRY 0x06 0x07 0x10
-UMASK_FVC_EV1_BBOX_RSP_COR 0x06 0x07 0x20
-UMASK_FVC_EV1_BBOX_RSP_UNCOR 0x06 0x07 0x30
-UMASK_FVC_EV1_BBOX_RSP_SPEC_ACK 0x06 0x07 0x40
-UMASK_FVC_EV1_BBOX_RSP_SPR_ACK 0x06 0x07 0x50
-UMASK_FVC_EV1_BBOX_RSP_SPR_UNCORE 0x06 0x07 0x70
+UMASK_FVC_EV1_BBOX_RSP_RETRY 0x06 0x07 0x01
+UMASK_FVC_EV1_BBOX_RSP_COR 0x06 0x07 0x02
+UMASK_FVC_EV1_BBOX_RSP_UNCOR 0x06 0x07 0x03
+UMASK_FVC_EV1_BBOX_RSP_SPEC_ACK 0x06 0x07 0x04
+UMASK_FVC_EV1_BBOX_RSP_SPR_ACK 0x06 0x07 0x05
+UMASK_FVC_EV1_BBOX_RSP_SPR_UNCORE 0x06 0x07 0x07
UMASK_FVC_EV1_SMI_NB_TRIG 0x07 0x07 0x00
EVENT_FVC_EV2 0x0F MBOX
@@ -767,30 +742,30 @@ UMASK_FVC_EV2_FAST_RESET 0x04 0x08 0x00
UMASK_FVC_EV2_BBOX_CMDS_READS 0x05 0x08 0x00
UMASK_FVC_EV2_BBOX_CMDS_WRITES 0x05 0x08 0x01
UMASK_FVC_EV2_BBOX_RSP_ACK 0x06 0x08 0x00
-UMASK_FVC_EV2_BBOX_RSP_RETRY 0x06 0x08 0x10
-UMASK_FVC_EV2_BBOX_RSP_COR 0x06 0x08 0x20
-UMASK_FVC_EV2_BBOX_RSP_UNCOR 0x06 0x08 0x30
-UMASK_FVC_EV2_BBOX_RSP_SPEC_ACK 0x06 0x08 0x40
-UMASK_FVC_EV2_BBOX_RSP_SPR_ACK 0x06 0x08 0x50
-UMASK_FVC_EV2_BBOX_RSP_SPR_UNCORE 0x06 0x08 0x70
+UMASK_FVC_EV2_BBOX_RSP_RETRY 0x06 0x08 0x01
+UMASK_FVC_EV2_BBOX_RSP_COR 0x06 0x08 0x02
+UMASK_FVC_EV2_BBOX_RSP_UNCOR 0x06 0x08 0x03
+UMASK_FVC_EV2_BBOX_RSP_SPEC_ACK 0x06 0x08 0x04
+UMASK_FVC_EV2_BBOX_RSP_SPR_ACK 0x06 0x08 0x05
+UMASK_FVC_EV2_BBOX_RSP_SPR_UNCORE 0x06 0x08 0x07
UMASK_FVC_EV2_SMI_NB_TRIG 0x07 0x08 0x00
EVENT_FVC_EV3 0x10 MBOX
UMASK_FVC_EV3_SMI_CRC_ERR 0x00 0x09 0x00
-UMASK_FVC_EV3_MEM_ECC_ERR 0x00 0x09 0x00
-UMASK_FVC_EV3_POISON_TXN 0x00 0x09 0x00
-UMASK_FVC_EV3_ALERT_FRAMES 0x00 0x09 0x00
-UMASK_FVC_EV3_FAST_RESET 0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_CMDS_READS 0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_CMDS_WRITES 0x00 0x09 0x01
-UMASK_FVC_EV3_BBOX_RSP_ACK 0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_RSP_RETRY 0x00 0x09 0x10
-UMASK_FVC_EV3_BBOX_RSP_COR 0x00 0x09 0x20
-UMASK_FVC_EV3_BBOX_RSP_UNCOR 0x00 0x09 0x30
-UMASK_FVC_EV3_BBOX_RSP_SPEC_ACK 0x00 0x09 0x40
-UMASK_FVC_EV3_BBOX_RSP_SPR_ACK 0x00 0x09 0x50
-UMASK_FVC_EV3_BBOX_RSP_SPR_UNCORE 0x00 0x09 0x70
-UMASK_FVC_EV3_SMI_NB_TRIG 0x00 0x09 0x00
+UMASK_FVC_EV3_MEM_ECC_ERR 0x01 0x09 0x00
+UMASK_FVC_EV3_POISON_TXN 0x02 0x09 0x00
+UMASK_FVC_EV3_ALERT_FRAMES 0x03 0x09 0x00
+UMASK_FVC_EV3_FAST_RESET 0x04 0x09 0x00
+UMASK_FVC_EV3_BBOX_CMDS_READS 0x05 0x09 0x00
+UMASK_FVC_EV3_BBOX_CMDS_WRITES 0x05 0x09 0x01
+UMASK_FVC_EV3_BBOX_RSP_ACK 0x06 0x09 0x00
+UMASK_FVC_EV3_BBOX_RSP_RETRY 0x06 0x09 0x01
+UMASK_FVC_EV3_BBOX_RSP_COR 0x06 0x09 0x02
+UMASK_FVC_EV3_BBOX_RSP_UNCOR 0x06 0x09 0x03
+UMASK_FVC_EV3_BBOX_RSP_SPEC_ACK 0x06 0x09 0x04
+UMASK_FVC_EV3_BBOX_RSP_SPR_ACK 0x06 0x09 0x05
+UMASK_FVC_EV3_BBOX_RSP_SPR_UNCORE 0x06 0x09 0x07
+UMASK_FVC_EV3_SMI_NB_TRIG 0x07 0x09 0x00
EVENT_FVID_RACE 0x18 MBOX
UMASK_FVID_RACE 0x00 0x00 0x00
@@ -798,9 +773,8 @@ UMASK_FVID_RACE 0x00 0x00 0x00
EVENT_INFLIGHT_CMDS 0x1D MBOX
UMASK_INFLIGHT_CMDS 0x00 0x00 0x00
-EVENT_ISS_SCHED 0x08 MBOX
-UMASK_ISS_SCHED_CHANGES 0x00 0x0A 0x00
-UMASK_ISS_SCHED_FRAME_BEAT 0x01 0x0A 0x00
+EVENT_SCHED_MODE_CHANGES 0x08 MBOX
+UMASK_SCHED_MODE_CHANGES 0x00 0x00 0x00
EVENT_MA_PAR_ERR 0x0C MBOX
UMASK_MA_PAR_ERR 0x00 0x00 0x00
@@ -808,6 +782,9 @@ UMASK_MA_PAR_ERR 0x00 0x00 0x00
EVENT_MULTICAS 0x17 MBOX
UMASK_MULTICAS 0x00 0x00 0x00
+EVENT_PAGE_EMPTY 0x15 MBOX
+UMASK_PAGE_EMPTY 0x00 0x00 0x00
+
EVENT_PAGE_HIT 0x14 MBOX
UMASK_PAGE_HIT 0x00 0x00 0x00
@@ -821,9 +798,8 @@ EVENT_PGT_PAGE_EV 0x16 MBOX
UMASK_PGT_PAGE_EV_OPN2CLS 0x00 0x0B 0x00
UMASK_PGT_PAGE_EV_CLS2OPN 0x01 0x0B 0x00
-EVENT_PGT_PAGE_EV2 0x15 MBOX
-UMASK_PGT_PAGE_EV2_AUTO_CLS 0x00 0x0C 0x00
-UMASK_PGT_PAGE_EV2_PAGE_EMPTY 0x01 0x0C 0x00
+EVENT_RETRIES 0x0B MBOX
+UMASK_RETRIES_ALL 0x00 0x00 0x00
EVENT_REFRESH 0x06 MBOX
UMASK_REFRESH 0x00 0x00 0x00
@@ -845,12 +821,44 @@ UMASK_THERM_TRP_DN_ALL_GT_MID_RISE 0x03 0x0D 0x00
UMASK_THERM_TRP_DN_ALL_GT_MID_FALL 0x02 0x0D 0x00
UMASK_THERM_TRP_DN_ALL_GT_LO 0x01 0x0D 0x00
UMASK_THERM_TRP_DN_ALL_LT_LO 0x00 0x0D 0x00
+UMASK_THERM_TRP_DN_DIMM0_GT_MID_RISE 0x03 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_MID_RISE 0x03 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_MID_RISE 0x03 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_MID_RISE 0x03 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_GT_MID_FALL 0x02 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_MID_FALL 0x02 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_MID_FALL 0x02 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_MID_FALL 0x02 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_GT_LO 0x01 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_LO 0x01 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_LO 0x01 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_LO 0x01 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_LT_LO 0x00 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_LT_LO 0x00 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_LT_LO 0x00 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_LT_LO 0x00 0x0D 0x04
EVENT_THERM_TRP_UP 0x04 MBOX
UMASK_THERM_TRP_UP_ALL_GT_MID_RISE 0x03 0x0E 0x00
UMASK_THERM_TRP_UP_ALL_GT_MID_FALL 0x02 0x0E 0x00
UMASK_THERM_TRP_UP_ALL_GT_LO 0x01 0x0E 0x00
UMASK_THERM_TRP_UP_ALL_LT_LO 0x00 0x0E 0x00
+UMASK_THERM_TRP_UP_DIMM0_GT_MID_RISE 0x03 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_MID_RISE 0x03 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_MID_RISE 0x03 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_MID_RISE 0x03 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_GT_MID_FALL 0x02 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_MID_FALL 0x02 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_MID_FALL 0x02 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_MID_FALL 0x02 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_GT_LO 0x01 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_LO 0x01 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_LO 0x01 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_LO 0x01 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_LT_LO 0x00 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_LT_LO 0x00 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_LT_LO 0x00 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_LT_LO 0x00 0x0E 0x04
EVENT_TRANS_CMDS 0x12 MBOX
UMASK_TRANS_CMDS 0x00 0x00 0x00
@@ -859,112 +867,164 @@ EVENT_TT_CMD_CONFLICT 0x19 MBOX
UMASK_TT_CMD_CONFLICT 0x00 0x00 0x00
EVENT_ACK_BEFORE_LAST_SNP 0x19 BBOX0C3|BBOX1C3
-UMASK_ACK_BEFORE_LAST_SNP 0x03
+UMASK_ACK_BEFORE_LAST_SNP 0x00
EVENT_ADDR_IN_MATCH 0x04 BBOX0C2|BBOX1C2
-UMASK_ADDR_IN_MATCH 0x02
+OPTIONS_ADDR_IN_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_ADDR_IN_MATCH 0x00
EVENT_CONFLICTS 0x17 BBOX0C3|BBOX1C3
-UMASK_CONFLICTS 0x03
+UMASK_CONFLICTS 0x00
EVENT_COHQ_BYPASS 0x0E BBOX0C3|BBOX1C3
-UMASK_COHQ_BYPASS 0x03
+UMASK_COHQ_BYPASS 0x00
-EVENT_COHQ_IMT_ALLOC_WAIT 0x0E BBOX0C3|BBOX1C3
-UMASK_COHQ_IMT_ALLOC_WAIT 0x03
+EVENT_COHQ_IMT_ALLOC_WAIT 0x13 BBOX0C3|BBOX1C3
+UMASK_COHQ_IMT_ALLOC_WAIT 0x00
-EVENT_DIRQ_INSERTS 0x17 BBOX0C1|BBOX1C1
-UMASK_DIRQ_INSERTS 0x01
+EVENT_DIRQ_INSERTS 0x17 BBOX0C1|BBOX1C1
+UMASK_DIRQ_INSERTS 0x00
EVENT_DIRQ_OCCUPANCY 0x17 BBOX0C0|BBOX1C0
UMASK_DIRQ_OCCUPANCY 0x00
EVENT_DEMAND_FETCH 0x0F BBOX0C3|BBOX1C3
-UMASK_DEMAND_FETCH 0x03
+UMASK_DEMAND_FETCH 0x00
EVENT_DRSQ_INSERTS 0x09 BBOX0C1|BBOX1C1
-UMASK_DRSQ_INSERTS 0x01
+UMASK_DRSQ_INSERTS 0x00
EVENT_DRSQ_OCCUPANCY 0x09 BBOX0C0|BBOX1C0
UMASK_DRSQ_OCCUPANCY 0x00
EVENT_EARLY_ACK 0x02 BBOX0C3|BBOX1C3
-UMASK_EARLY_ACK 0x03
+UMASK_EARLY_ACK 0x00
EVENT_IMPLICIT_WBS 0x12 BBOX0C3|BBOX1C3
-UMASK_IMPLICIT_WBS 0x03
+UMASK_IMPLICIT_WBS 0x00
-EVENT_IMT_FULL 0x12 BBOX0C3|BBOX1C3
-UMASK_IMT_FULL 0x03
+EVENT_IMT_FULL 0x16 BBOX0C3|BBOX1C3
+UMASK_IMT_FULL 0x00
EVENT_IMT_INSERTS_ALL 0x07 BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_ALL 0x01
+UMASK_IMT_INSERTS_ALL 0x00
EVENT_IMT_INSERTS_INVITOE 0x0F BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_INVITOE 0x01
+UMASK_IMT_INSERTS_INVITOE 0x00
EVENT_IMT_INSERTS_IOH 0x0A BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH 0x01
+UMASK_IMT_INSERTS_IOH 0x00
EVENT_IMT_INSERTS_IOH_INVITOE 0x10 BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH_INVITOE 0x01
+UMASK_IMT_INSERTS_IOH_INVITOE 0x00
EVENT_IMT_INSERTS_IOH_WR 0x0D BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH_WR 0x01
+UMASK_IMT_INSERTS_IOH_WR 0x00
EVENT_IMT_INSERTS_NON_IOH 0x0B BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH 0x01
+UMASK_IMT_INSERTS_NON_IOH 0x00
EVENT_IMT_INSERTS_NON_IOH_INVITOE 0x1C BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH_INVITOE 0x01
+UMASK_IMT_INSERTS_NON_IOH_INVITOE 0x00
-EVENT_INSERTS_NON_IOH_RD 0x1F BBOX0C1|BBOX1C1
-UMASK_INSERTS_NON_IOH_RD 0x01
+EVENT_IMT_INSERTS_NON_IOH_RD 0x1F BBOX0C1|BBOX1C1
+UMASK_IMT_INSERTS_NON_IOH_RD 0x00
EVENT_IMT_INSERTS_NON_IOH_WR 0x0E BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH_WR 0x01
+UMASK_IMT_INSERTS_NON_IOH_WR 0x00
EVENT_IMT_INSERTS_RD 0x1D BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_RD 0x01
+UMASK_IMT_INSERTS_RD 0x00
EVENT_IMT_INSERTS_WR 0x0C BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_WR 0x01
+UMASK_IMT_INSERTS_WR 0x00
EVENT_IMT_NE_CYCLES 0x07 BBOX0C2|BBOX1C2
-UMASK_IMT_NE_CYCLES 0x02
+UMASK_IMT_NE_CYCLES 0x00
EVENT_IMT_PREALLOC 0x06 BBOX0C3|BBOX1C3
-UMASK_IMT_PREALLOC 0x03
+UMASK_IMT_PREALLOC 0x00
EVENT_IMT_VALID_OCCUPANCY 0x07 BBOX0C0|BBOX1C0
UMASK_IMT_VALID_OCCUPANCY 0x00
EVENT_MSG_ADDR_IN_MATCH 0x01 BBOX0C0|BBOX1C0
+OPTIONS_MSG_ADDR_IN_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
UMASK_MSG_ADDR_IN_MATCH 0x00
-EVENT_MSGS_B_TO_S 0x03 BBOX0C1|BBOX1C1
-UMASK_MSGS_B_TO_S 0x01
-
EVENT_MSGS_B_TO_S 0x03 BBOX0C2|BBOX1C2
-UMASK_MSGS_B_TO_S 0x02
+UMASK_MSGS_B_TO_S 0x00
+
+EVENT_MSGS_S_TO_B 0x02 BBOX0C2|BBOX1C2
+UMASK_MSGS_S_TO_B 0x00
EVENT_MSG_IN_MATCH 0x01 BBOX0C1|BBOX1C1
-UMASK_MSG_IN_MATCH 0x01
+OPTIONS_MSG_IN_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_IN_MATCH 0x00
EVENT_MSGS_IN_NON_SNP 0x01 BBOX0C2|BBOX1C2
-UMASK_MSGS_IN_NON_SNP 0x02
+UMASK_MSGS_IN_NON_SNP 0x00
EVENT_MSG_OPCODE_ADDR_IN_MATCH 0x03 BBOX0C0|BBOX1C0
+OPTIONS_MSG_OPCODE_ADDR_IN_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
UMASK_MSG_OPCODE_ADDR_IN_MATCH 0x00
EVENT_MSG_OPCODE_IN_MATCH 0x05 BBOX0C1|BBOX1C1
-UMASK_MSG_OPCODE_IN_MATCH 0x01
+OPTIONS_MSG_OPCODE_IN_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OPCODE_IN_MATCH 0x00
EVENT_MSG_OPCODE_OUT_MATCH 0x06 BBOX0C1|BBOX1C1
-UMASK_MSG_OPCODE_OUT_MATCH 0x01
+OPTIONS_MSG_OPCODE_OUT_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OPCODE_OUT_MATCH 0x00
EVENT_MSG_OUT_MATCH 0x02 BBOX0C1|BBOX1C1
-UMASK_MSG_OUT_MATCH 0x01
+OPTIONS_MSG_OUT_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OUT_MATCH 0x00
+
+EVENT_OPCODE_ADDR_IN_MATCH 0x02 BBOX0C0|BBOX1C0
+OPTIONS_OPCODE_ADDR_IN_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_ADDR_IN_MATCH 0x00
+
+EVENT_OPCODE_IN_MATCH 0x03 BBOX0C1|BBOX1C1
+OPTIONS_OPCODE_IN_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_IN_MATCH 0x00
+
+EVENT_OPCODE_OUT_MATCH 0x04 BBOX0C1|BBOX1C1
+OPTIONS_OPCODE_OUT_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_OUT_MATCH 0x00
+
+EVENT_RBOX_VNA_UNAVAIL 0x15 BBOX0C3|BBOX1C3
+UMASK_RBOX_VNA_UNAVAIL 0x00
+
+EVENT_SBOX_VN0_UNAVAIL 0x14 BBOX0C3|BBOX1C3
+UMASK_SBOX_VN0_UNAVAIL 0x00
+
+EVENT_SNPOQ_INSERTS 0x12 BBOX0C1|BBOX1C1
+UMASK_SNPOQ_INSERTS 0x00
+
+EVENT_SNPOQ_OCCUPANCY 0x12 BBOX0C0|BBOX1C0
+UMASK_SNPOQ_OCCUPANCY 0x00
+
+EVENT_TF_ALL 0x04 BBOX0C0|BBOX1C0
+UMASK_TF_ALL 0x00
+
+EVENT_TF_INVITOE 0x06 BBOX0C0|BBOX1C0
+UMASK_TF_INVITOE 0x00
+
+EVENT_TF_IOH 0x0B BBOX0C0|BBOX1C0
+UMASK_TF_IOH 0x00
+
+EVENT_TF_IOH_INVITOE 0x0F BBOX0C0|BBOX1C0
+UMASK_TF_IOH_INVITOE 0x00
+
+EVENT_TF_IOH_NON_INVITOE_RD 0x1C BBOX0C0|BBOX1C0
+UMASK_TF_IOH_NON_INVITOE_RD 0x00
+
+EVENT_TF_IOH_WR 0x0D BBOX0C0|BBOX1C0
+UMASK_TF_IOH_WR 0x00
+
+EVENT_TF_WR 0x05 BBOX0C0|BBOX1C0
+UMASK_TF_WR 0x00
EVENT_ALLOC_TO_ARB 0x00 RBOX0
UMASK_ALLOC_TO_ARB_PORT0_IPERF0_NCB 0x00 0x01 0x09
@@ -974,7 +1034,7 @@ UMASK_ALLOC_TO_ARB_PORT0_IPERF0_NDR 0x00 0x08 0x09
UMASK_ALLOC_TO_ARB_PORT0_IPERF0_SNP 0x00 0x10 0x09
UMASK_ALLOC_TO_ARB_PORT0_IPERF0_HOM_VN0 0x00 0x20 0x09
UMASK_ALLOC_TO_ARB_PORT0_IPERF0_HOM_VN1 0x00 0x40 0x09
-UMASK_ALLOC_TO_ARB_PORT0_IPERF0_ALL 0x00 0xFF 0x09
+UMASK_ALLOC_TO_ARB_PORT0_IPERF0_ALL 0x00 0x7F 0x09
UMASK_ALLOC_TO_ARB_PORT0_IPERF1_NCB 0x01 0x01 0x09
UMASK_ALLOC_TO_ARB_PORT0_IPERF1_NCS 0x01 0x02 0x09
UMASK_ALLOC_TO_ARB_PORT0_IPERF1_DRS_VN01 0x01 0x04 0x09
@@ -982,7 +1042,7 @@ UMASK_ALLOC_TO_ARB_PORT0_IPERF1_NDR 0x01 0x08 0x09
UMASK_ALLOC_TO_ARB_PORT0_IPERF1_SNP 0x01 0x10 0x09
UMASK_ALLOC_TO_ARB_PORT0_IPERF1_HOM_VN0 0x01 0x20 0x09
UMASK_ALLOC_TO_ARB_PORT0_IPERF1_HOM_VN1 0x01 0x40 0x09
-UMASK_ALLOC_TO_ARB_PORT0_IPERF1_ALL 0x01 0xFF 0x09
+UMASK_ALLOC_TO_ARB_PORT0_IPERF1_ALL 0x01 0x7F 0x09
UMASK_ALLOC_TO_ARB_PORT1_IPERF0_NCB 0x06 0x01 0x09
UMASK_ALLOC_TO_ARB_PORT1_IPERF0_NCS 0x06 0x02 0x09
UMASK_ALLOC_TO_ARB_PORT1_IPERF0_DRS_VN01 0x06 0x04 0x09
@@ -990,7 +1050,7 @@ UMASK_ALLOC_TO_ARB_PORT1_IPERF0_NDR 0x06 0x08 0x09
UMASK_ALLOC_TO_ARB_PORT1_IPERF0_SNP 0x06 0x10 0x09
UMASK_ALLOC_TO_ARB_PORT1_IPERF0_HOM_VN0 0x06 0x20 0x09
UMASK_ALLOC_TO_ARB_PORT1_IPERF0_HOM_VN1 0x06 0x40 0x09
-UMASK_ALLOC_TO_ARB_PORT1_IPERF0_ALL 0x06 0xFF 0x09
+UMASK_ALLOC_TO_ARB_PORT1_IPERF0_ALL 0x06 0x7F 0x09
UMASK_ALLOC_TO_ARB_PORT1_IPERF1_NCB 0x07 0x01 0x09
UMASK_ALLOC_TO_ARB_PORT1_IPERF1_NCS 0x07 0x02 0x09
UMASK_ALLOC_TO_ARB_PORT1_IPERF1_DRS_VN01 0x07 0x04 0x09
@@ -998,7 +1058,7 @@ UMASK_ALLOC_TO_ARB_PORT1_IPERF1_NDR 0x07 0x08 0x09
UMASK_ALLOC_TO_ARB_PORT1_IPERF1_SNP 0x07 0x10 0x09
UMASK_ALLOC_TO_ARB_PORT1_IPERF1_HOM_VN0 0x07 0x20 0x09
UMASK_ALLOC_TO_ARB_PORT1_IPERF1_HOM_VN1 0x07 0x40 0x09
-UMASK_ALLOC_TO_ARB_PORT1_IPERF1_ALL 0x07 0xFF 0x09
+UMASK_ALLOC_TO_ARB_PORT1_IPERF1_ALL 0x07 0x7F 0x09
UMASK_ALLOC_TO_ARB_PORT2_IPERF0_NCB 0x0C 0x01 0x09
UMASK_ALLOC_TO_ARB_PORT2_IPERF0_NCS 0x0C 0x02 0x09
UMASK_ALLOC_TO_ARB_PORT2_IPERF0_DRS_VN01 0x0C 0x04 0x09
@@ -1006,7 +1066,7 @@ UMASK_ALLOC_TO_ARB_PORT2_IPERF0_NDR 0x0C 0x08 0x09
UMASK_ALLOC_TO_ARB_PORT2_IPERF0_SNP 0x0C 0x10 0x09
UMASK_ALLOC_TO_ARB_PORT2_IPERF0_HOM_VN0 0x0C 0x20 0x09
UMASK_ALLOC_TO_ARB_PORT2_IPERF0_HOM_VN1 0x0C 0x40 0x09
-UMASK_ALLOC_TO_ARB_PORT2_IPERF0_ALL 0x0C 0xFF 0x09
+UMASK_ALLOC_TO_ARB_PORT2_IPERF0_ALL 0x0C 0x7F 0x09
UMASK_ALLOC_TO_ARB_PORT2_IPERF1_NCB 0x0D 0x01 0x09
UMASK_ALLOC_TO_ARB_PORT2_IPERF1_NCS 0x0D 0x02 0x09
UMASK_ALLOC_TO_ARB_PORT2_IPERF1_DRS_VN01 0x0D 0x04 0x09
@@ -1014,7 +1074,7 @@ UMASK_ALLOC_TO_ARB_PORT2_IPERF1_NDR 0x0D 0x08 0x09
UMASK_ALLOC_TO_ARB_PORT2_IPERF1_SNP 0x0D 0x10 0x09
UMASK_ALLOC_TO_ARB_PORT2_IPERF1_HOM_VN0 0x0D 0x20 0x09
UMASK_ALLOC_TO_ARB_PORT2_IPERF1_HOM_VN1 0x0D 0x40 0x09
-UMASK_ALLOC_TO_ARB_PORT2_IPERF1_ALL 0x0D 0xFF 0x09
+UMASK_ALLOC_TO_ARB_PORT2_IPERF1_ALL 0x0D 0x7F 0x09
UMASK_ALLOC_TO_ARB_PORT3_IPERF0_NCB 0x12 0x01 0x09
UMASK_ALLOC_TO_ARB_PORT3_IPERF0_NCS 0x12 0x02 0x09
UMASK_ALLOC_TO_ARB_PORT3_IPERF0_DRS_VN01 0x12 0x04 0x09
@@ -1022,7 +1082,7 @@ UMASK_ALLOC_TO_ARB_PORT3_IPERF0_NDR 0x12 0x08 0x09
UMASK_ALLOC_TO_ARB_PORT3_IPERF0_SNP 0x12 0x10 0x09
UMASK_ALLOC_TO_ARB_PORT3_IPERF0_HOM_VN0 0x12 0x20 0x09
UMASK_ALLOC_TO_ARB_PORT3_IPERF0_HOM_VN1 0x12 0x40 0x09
-UMASK_ALLOC_TO_ARB_PORT3_IPERF0_ALL 0x12 0xFF 0x09
+UMASK_ALLOC_TO_ARB_PORT3_IPERF0_ALL 0x12 0x7F 0x09
UMASK_ALLOC_TO_ARB_PORT3_IPERF1_NCB 0x13 0x01 0x09
UMASK_ALLOC_TO_ARB_PORT3_IPERF1_NCS 0x13 0x02 0x09
UMASK_ALLOC_TO_ARB_PORT3_IPERF1_DRS_VN01 0x13 0x04 0x09
@@ -1030,7 +1090,7 @@ UMASK_ALLOC_TO_ARB_PORT3_IPERF1_NDR 0x13 0x08 0x09
UMASK_ALLOC_TO_ARB_PORT3_IPERF1_SNP 0x13 0x10 0x09
UMASK_ALLOC_TO_ARB_PORT3_IPERF1_HOM_VN0 0x13 0x20 0x09
UMASK_ALLOC_TO_ARB_PORT3_IPERF1_HOM_VN1 0x13 0x40 0x09
-UMASK_ALLOC_TO_ARB_PORT3_IPERF1_ALL 0x13 0xFF 0x09
+UMASK_ALLOC_TO_ARB_PORT3_IPERF1_ALL 0x13 0x7F 0x09
EVENT_ALLOC_TO_ARB 0x00 RBOX1
UMASK_ALLOC_TO_ARB_PORT4_IPERF0_NCB 0x00 0x01 0x09
@@ -1040,7 +1100,7 @@ UMASK_ALLOC_TO_ARB_PORT4_IPERF0_NDR 0x00 0x08 0x09
UMASK_ALLOC_TO_ARB_PORT4_IPERF0_SNP 0x00 0x10 0x09
UMASK_ALLOC_TO_ARB_PORT4_IPERF0_HOM_VN0 0x00 0x20 0x09
UMASK_ALLOC_TO_ARB_PORT4_IPERF0_HOM_VN1 0x00 0x40 0x09
-UMASK_ALLOC_TO_ARB_PORT4_IPERF0_ALL 0x00 0xFF 0x09
+UMASK_ALLOC_TO_ARB_PORT4_IPERF0_ALL 0x00 0x7F 0x09
UMASK_ALLOC_TO_ARB_PORT4_IPERF1_NCB 0x01 0x01 0x09
UMASK_ALLOC_TO_ARB_PORT4_IPERF1_NCS 0x01 0x02 0x09
UMASK_ALLOC_TO_ARB_PORT4_IPERF1_DRS_VN01 0x01 0x04 0x09
@@ -1048,7 +1108,7 @@ UMASK_ALLOC_TO_ARB_PORT4_IPERF1_NDR 0x01 0x08 0x09
UMASK_ALLOC_TO_ARB_PORT4_IPERF1_SNP 0x01 0x10 0x09
UMASK_ALLOC_TO_ARB_PORT4_IPERF1_HOM_VN0 0x01 0x20 0x09
UMASK_ALLOC_TO_ARB_PORT4_IPERF1_HOM_VN1 0x01 0x40 0x09
-UMASK_ALLOC_TO_ARB_PORT4_IPERF1_ALL 0x01 0xFF 0x09
+UMASK_ALLOC_TO_ARB_PORT4_IPERF1_ALL 0x01 0x7F 0x09
UMASK_ALLOC_TO_ARB_PORT5_IPERF0_NCB 0x06 0x01 0x09
UMASK_ALLOC_TO_ARB_PORT5_IPERF0_NCS 0x06 0x02 0x09
UMASK_ALLOC_TO_ARB_PORT5_IPERF0_DRS_VN01 0x06 0x04 0x09
@@ -1056,7 +1116,7 @@ UMASK_ALLOC_TO_ARB_PORT5_IPERF0_NDR 0x06 0x08 0x09
UMASK_ALLOC_TO_ARB_PORT5_IPERF0_SNP 0x06 0x10 0x09
UMASK_ALLOC_TO_ARB_PORT5_IPERF0_HOM_VN0 0x06 0x20 0x09
UMASK_ALLOC_TO_ARB_PORT5_IPERF0_HOM_VN1 0x06 0x40 0x09
-UMASK_ALLOC_TO_ARB_PORT5_IPERF0_ALL 0x06 0xFF 0x09
+UMASK_ALLOC_TO_ARB_PORT5_IPERF0_ALL 0x06 0x7F 0x09
UMASK_ALLOC_TO_ARB_PORT5_IPERF1_NCB 0x07 0x01 0x09
UMASK_ALLOC_TO_ARB_PORT5_IPERF1_NCS 0x07 0x02 0x09
UMASK_ALLOC_TO_ARB_PORT5_IPERF1_DRS_VN01 0x07 0x04 0x09
@@ -1064,7 +1124,7 @@ UMASK_ALLOC_TO_ARB_PORT5_IPERF1_NDR 0x07 0x08 0x09
UMASK_ALLOC_TO_ARB_PORT5_IPERF1_SNP 0x07 0x10 0x09
UMASK_ALLOC_TO_ARB_PORT5_IPERF1_HOM_VN0 0x07 0x20 0x09
UMASK_ALLOC_TO_ARB_PORT5_IPERF1_HOM_VN1 0x07 0x40 0x09
-UMASK_ALLOC_TO_ARB_PORT5_IPERF1_ALL 0x07 0xFF 0x09
+UMASK_ALLOC_TO_ARB_PORT5_IPERF1_ALL 0x07 0x7F 0x09
UMASK_ALLOC_TO_ARB_PORT6_IPERF0_NCB 0x0C 0x01 0x09
UMASK_ALLOC_TO_ARB_PORT6_IPERF0_NCS 0x0C 0x02 0x09
UMASK_ALLOC_TO_ARB_PORT6_IPERF0_DRS_VN01 0x0C 0x04 0x09
@@ -1072,7 +1132,7 @@ UMASK_ALLOC_TO_ARB_PORT6_IPERF0_NDR 0x0C 0x08 0x09
UMASK_ALLOC_TO_ARB_PORT6_IPERF0_SNP 0x0C 0x10 0x09
UMASK_ALLOC_TO_ARB_PORT6_IPERF0_HOM_VN0 0x0C 0x20 0x09
UMASK_ALLOC_TO_ARB_PORT6_IPERF0_HOM_VN1 0x0C 0x40 0x09
-UMASK_ALLOC_TO_ARB_PORT6_IPERF0_ALL 0x0C 0xFF 0x09
+UMASK_ALLOC_TO_ARB_PORT6_IPERF0_ALL 0x0C 0x7F 0x09
UMASK_ALLOC_TO_ARB_PORT6_IPERF1_NCB 0x0D 0x01 0x09
UMASK_ALLOC_TO_ARB_PORT6_IPERF1_NCS 0x0D 0x02 0x09
UMASK_ALLOC_TO_ARB_PORT6_IPERF1_DRS_VN01 0x0D 0x04 0x09
@@ -1080,7 +1140,7 @@ UMASK_ALLOC_TO_ARB_PORT6_IPERF1_NDR 0x0D 0x08 0x09
UMASK_ALLOC_TO_ARB_PORT6_IPERF1_SNP 0x0D 0x10 0x09
UMASK_ALLOC_TO_ARB_PORT6_IPERF1_HOM_VN0 0x0D 0x20 0x09
UMASK_ALLOC_TO_ARB_PORT6_IPERF1_HOM_VN1 0x0D 0x40 0x09
-UMASK_ALLOC_TO_ARB_PORT6_IPERF1_ALL 0x0D 0xFF 0x09
+UMASK_ALLOC_TO_ARB_PORT6_IPERF1_ALL 0x0D 0x7F 0x09
UMASK_ALLOC_TO_ARB_PORT7_IPERF0_NCB 0x12 0x01 0x09
UMASK_ALLOC_TO_ARB_PORT7_IPERF0_NCS 0x12 0x02 0x09
UMASK_ALLOC_TO_ARB_PORT7_IPERF0_DRS_VN01 0x12 0x04 0x09
@@ -1088,7 +1148,7 @@ UMASK_ALLOC_TO_ARB_PORT7_IPERF0_NDR 0x12 0x08 0x09
UMASK_ALLOC_TO_ARB_PORT7_IPERF0_SNP 0x12 0x10 0x09
UMASK_ALLOC_TO_ARB_PORT7_IPERF0_HOM_VN0 0x12 0x20 0x09
UMASK_ALLOC_TO_ARB_PORT7_IPERF0_HOM_VN1 0x12 0x40 0x09
-UMASK_ALLOC_TO_ARB_PORT7_IPERF0_ALL 0x12 0xFF 0x09
+UMASK_ALLOC_TO_ARB_PORT7_IPERF0_ALL 0x12 0x7F 0x09
UMASK_ALLOC_TO_ARB_PORT7_IPERF1_NCB 0x13 0x01 0x09
UMASK_ALLOC_TO_ARB_PORT7_IPERF1_NCS 0x13 0x02 0x09
UMASK_ALLOC_TO_ARB_PORT7_IPERF1_DRS_VN01 0x13 0x04 0x09
@@ -1096,7 +1156,7 @@ UMASK_ALLOC_TO_ARB_PORT7_IPERF1_NDR 0x13 0x08 0x09
UMASK_ALLOC_TO_ARB_PORT7_IPERF1_SNP 0x13 0x10 0x09
UMASK_ALLOC_TO_ARB_PORT7_IPERF1_HOM_VN0 0x13 0x20 0x09
UMASK_ALLOC_TO_ARB_PORT7_IPERF1_HOM_VN1 0x13 0x40 0x09
-UMASK_ALLOC_TO_ARB_PORT7_IPERF1_ALL 0x13 0xFF 0x09
+UMASK_ALLOC_TO_ARB_PORT7_IPERF1_ALL 0x13 0x7F 0x09
EVENT_EOT_INSERTS 0x00 RBOX0
@@ -2236,7 +2296,7 @@ UMASK_QUE_ARB_BID_PORT0_QLX0_HOM 0x02 0x00 0x00
UMASK_QUE_ARB_BID_PORT0_QLX0_SNP 0x02 0x00 0x01
UMASK_QUE_ARB_BID_PORT0_QLX0_NDR 0x02 0x00 0x02
UMASK_QUE_ARB_BID_PORT0_QLX0_NCS 0x02 0x00 0x03
-UMASK_QUE_ARB_BID_PORT0_QLX0_DRS 0x02 0x00 0x02
+UMASK_QUE_ARB_BID_PORT0_QLX0_DRS 0x02 0x00 0x04
UMASK_QUE_ARB_BID_PORT0_QLX0_NCB 0x02 0x00 0x05
UMASK_QUE_ARB_BID_PORT0_QLX1_HOM 0x03 0x00 0x00
UMASK_QUE_ARB_BID_PORT0_QLX1_SNP 0x03 0x00 0x01
@@ -3313,6 +3373,7 @@ EVENT_TO_R_NDR_MSGQ_OCCUPANCY 0x0D SBOX
UMASK_TO_R_NDR_MSGQ_OCCUPANCY 0x00
EVENT_TO_R_PROG_EV 0x00 SBOX
+OPTIONS_TO_R_PROG_EV EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
UMASK_TO_R_PROG_EV 0x00
EVENT_TO_R_B_REQUESTS 0x6C SBOX
diff --git a/src/includes/perfmon_nehalemEX_westmereEX_common.h b/src/includes/perfmon_nehalemEX_westmereEX_common.h
new file mode 100644
index 0000000..655d5c0
--- /dev/null
+++ b/src/includes/perfmon_nehalemEX_westmereEX_common.h
@@ -0,0 +1,94 @@
+#ifndef PERFMON_NEX_WEX_COMMON
+#define PERFMON_NEX_WEX_COMMON
+
+#include <registers.h>
+
+enum nex_wex_mbox_reg_ids {
+ ADDR_MATCH = 0,
+ ADDR_MASK,
+ ZDP,
+ DSP,
+ ISS,
+ PGT,
+ MAP,
+ PLD,
+ THR,
+ NUM_MBOX_IDS
+};
+
+static uint64_t nex_wex_mbox_regs[2][NUM_MBOX_IDS] = {
+ [0] = {
+ [ADDR_MATCH] = MSR_M0_PMON_ADDR_MATCH,
+ [ADDR_MASK] = MSR_M0_PMON_ADDR_MASK,
+ [ZDP] = MSR_M0_PMON_ZDP,
+ [DSP] = MSR_M0_PMON_DSP,
+ [ISS] = MSR_M0_PMON_ISS,
+ [PGT] = MSR_M0_PMON_PGT,
+ [MAP] = MSR_M0_PMON_MAP,
+ [PLD] = MSR_M0_PMON_PLD,
+ [THR] = MSR_M0_PMON_MSC_THR,
+ },
+ [1] = {
+ [ADDR_MATCH] = MSR_M1_PMON_ADDR_MATCH,
+ [ADDR_MASK] = MSR_M1_PMON_ADDR_MASK,
+ [ZDP] = MSR_M1_PMON_ZDP,
+ [DSP] = MSR_M1_PMON_DSP,
+ [ISS] = MSR_M1_PMON_ISS,
+ [PGT] = MSR_M1_PMON_PGT,
+ [MAP] = MSR_M1_PMON_MAP,
+ [PLD] = MSR_M1_PMON_PLD,
+ [THR] = MSR_M1_PMON_MSC_THR,
+ },
+};
+
+enum nex_wex_rbox_reg_type {
+ IPERF0 = 0,
+ IPERF1,
+ QLX,
+ NUM_RBOX_REG_TYPES
+};
+
+static uint64_t nex_wex_rbox_regs[2][NUM_RBOX_REG_TYPES][4] = {
+ [0] = {
+ [IPERF0] = {
+ [0] = MSR_R0_PMON_IPERF0_P0,
+ [1] = MSR_R0_PMON_IPERF0_P1,
+ [2] = MSR_R0_PMON_IPERF0_P2,
+ [3] = MSR_R0_PMON_IPERF0_P3,
+ },
+ [IPERF1] = {
+ [0] = MSR_R0_PMON_IPERF1_P0,
+ [1] = MSR_R0_PMON_IPERF1_P1,
+ [2] = MSR_R0_PMON_IPERF1_P2,
+ [3] = MSR_R0_PMON_IPERF1_P3,
+ },
+ [QLX] = {
+ [0] = MSR_R0_PMON_QLX_P0,
+ [1] = MSR_R0_PMON_QLX_P1,
+ [2] = MSR_R0_PMON_QLX_P2,
+ [3] = MSR_R0_PMON_QLX_P3,
+ },
+ },
+ [1] = {
+ [IPERF0] = {
+ [0] = MSR_R1_PMON_IPERF0_P0,
+ [1] = MSR_R1_PMON_IPERF0_P1,
+ [2] = MSR_R1_PMON_IPERF0_P2,
+ [3] = MSR_R1_PMON_IPERF0_P3,
+ },
+ [IPERF1] = {
+ [0] = MSR_R1_PMON_IPERF1_P0,
+ [1] = MSR_R1_PMON_IPERF1_P1,
+ [2] = MSR_R1_PMON_IPERF1_P2,
+ [3] = MSR_R1_PMON_IPERF1_P3,
+ },
+ [QLX] = {
+ [0] = MSR_R1_PMON_QLX_P0,
+ [1] = MSR_R1_PMON_QLX_P1,
+ [2] = MSR_R1_PMON_QLX_P2,
+ [3] = MSR_R1_PMON_QLX_P3,
+ },
+ },
+};
+
+#endif
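The two tables above are plain two-dimensional arrays: the first index selects the box instance (0 or 1), the second the register named by the accompanying enum. A small usage sketch follows; the register values below are made-up placeholders, since the real MSR_* constants live in registers.h and are not reproduced here.

#include <stdint.h>
#include <stdio.h>

enum mbox_reg_ids { ADDR_MATCH = 0, ADDR_MASK, NUM_MBOX_IDS };

/* Placeholder addresses, not the real MSR_Mx_PMON_* values. */
static uint64_t mbox_regs[2][NUM_MBOX_IDS] = {
    [0] = { [ADDR_MATCH] = 0x111ULL, [ADDR_MASK] = 0x112ULL },
    [1] = { [ADDR_MATCH] = 0x211ULL, [ADDR_MASK] = 0x212ULL },
};

int main(void)
{
    int mbox = 1;   /* second memory controller box */
    printf("MBOX%d match register: 0x%llx\n", mbox,
           (unsigned long long)mbox_regs[mbox][ADDR_MATCH]);
    return 0;
}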
diff --git a/src/includes/perfmon_nehalem_counters.h b/src/includes/perfmon_nehalem_counters.h
index d3831c1..55d0d88 100644
--- a/src/includes/perfmon_nehalem_counters.h
+++ b/src/includes/perfmon_nehalem_counters.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_nehalem_counters.h
*
- * Description: Counter Header File of perfmon module for Nehalem.
+ * Description: Counter Header File of perfmon module for Intel Nehalem.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -29,27 +30,38 @@
*/
#define NUM_COUNTERS_CORE_NEHALEM 7
-#define NUM_COUNTERS_UNCORE_NEHALEM 15
-#define NUM_COUNTERS_NEHALEM 15
+#define NUM_COUNTERS_UNCORE_NEHALEM 16
+#define NUM_COUNTERS_NEHALEM 16
-static PerfmonCounterMap nehalem_counter_map[NUM_COUNTERS_NEHALEM] = {
+#define NEH_VALID_OPTIONS_FIXED EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK
+#define NEH_VALID_OPTIONS_PMC EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define NEH_VALID_OPTIONS_UNCORE EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+
+static RegisterMap nehalem_counter_map[NUM_COUNTERS_NEHALEM] = {
/* Fixed Counters: instructions retired, cycles unhalted core */
- {"FIXC0",PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
- {"FIXC1",PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
- {"FIXC2",PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+ {"FIXC0",PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, NEH_VALID_OPTIONS_FIXED},
+ {"FIXC1",PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, NEH_VALID_OPTIONS_FIXED},
+ {"FIXC2",PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, NEH_VALID_OPTIONS_FIXED},
/* PMC Counters: 4 48bit wide */
- {"PMC0",PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
- {"PMC1",PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
- {"PMC2",PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0},
- {"PMC3",PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0},
+ {"PMC0",PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, NEH_VALID_OPTIONS_PMC},
+ {"PMC1",PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, NEH_VALID_OPTIONS_PMC},
+ {"PMC2",PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, NEH_VALID_OPTIONS_PMC},
+ {"PMC3",PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, NEH_VALID_OPTIONS_PMC},
/* Uncore PMC Counters: 8 48bit wide */
- {"UPMC0",PMC7, UNCORE, MSR_UNCORE_PERFEVTSEL0, MSR_UNCORE_PMC0, 0, 0},
- {"UPMC1",PMC8, UNCORE, MSR_UNCORE_PERFEVTSEL1, MSR_UNCORE_PMC1, 0, 0},
- {"UPMC2",PMC9, UNCORE, MSR_UNCORE_PERFEVTSEL2, MSR_UNCORE_PMC2, 0, 0},
- {"UPMC3",PMC10, UNCORE, MSR_UNCORE_PERFEVTSEL3, MSR_UNCORE_PMC3, 0, 0},
- {"UPMC4",PMC11, UNCORE, MSR_UNCORE_PERFEVTSEL4, MSR_UNCORE_PMC4, 0, 0},
- {"UPMC5",PMC12, UNCORE, MSR_UNCORE_PERFEVTSEL5, MSR_UNCORE_PMC5, 0, 0},
- {"UPMC6",PMC13, UNCORE, MSR_UNCORE_PERFEVTSEL6, MSR_UNCORE_PMC6, 0, 0},
- {"UPMC7",PMC14, UNCORE, MSR_UNCORE_PERFEVTSEL7, MSR_UNCORE_PMC7, 0, 0}
+ {"UPMC0",PMC7, UNCORE, MSR_UNCORE_PERFEVTSEL0, MSR_UNCORE_PMC0, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+ {"UPMC1",PMC8, UNCORE, MSR_UNCORE_PERFEVTSEL1, MSR_UNCORE_PMC1, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+ {"UPMC2",PMC9, UNCORE, MSR_UNCORE_PERFEVTSEL2, MSR_UNCORE_PMC2, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+ {"UPMC3",PMC10, UNCORE, MSR_UNCORE_PERFEVTSEL3, MSR_UNCORE_PMC3, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+ {"UPMC4",PMC11, UNCORE, MSR_UNCORE_PERFEVTSEL4, MSR_UNCORE_PMC4, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+ {"UPMC5",PMC12, UNCORE, MSR_UNCORE_PERFEVTSEL5, MSR_UNCORE_PMC5, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+ {"UPMC6",PMC13, UNCORE, MSR_UNCORE_PERFEVTSEL6, MSR_UNCORE_PMC6, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+ {"UPMC7",PMC14, UNCORE, MSR_UNCORE_PERFEVTSEL7, MSR_UNCORE_PMC7, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+ {"UPMCFIX",PMC15, UNCORE, MSR_UNCORE_FIXED_CTR_CTRL, MSR_UNCORE_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK}
+};
+
+static BoxMap nehalem_box_map[NUM_UNITS] = {
+ [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+ [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+ [UNCORE] = {MSR_UNCORE_PERF_GLOBAL_CTRL, MSR_UNCORE_PERF_GLOBAL_STATUS, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48}
};
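The RegisterMap entries above are, roughly speaking, looked up by counter name when an event string such as UNCORE_CLOCK:UPMCFIX is resolved to its config and counter registers. Here is a toy lookup over a cut-down map; the struct and the uncore addresses are illustrative placeholders, not likwid's RegisterMap or the real MSR values.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Cut-down illustration of a counter-map entry (not likwid's RegisterMap). */
typedef struct {
    const char *name;
    uint64_t    configRegister;
    uint64_t    counterRegister;
} MapEntry;

static const MapEntry demo_map[] = {
    { "PMC0",    0x186ULL, 0x0C1ULL },  /* MSR_PERFEVTSEL0 / MSR_PMC0 */
    { "UPMC0",   0x301ULL, 0x302ULL },  /* placeholder uncore addresses */
    { "UPMCFIX", 0x303ULL, 0x304ULL },  /* placeholder fixed uncore addresses */
};

static const MapEntry *find_counter(const char *name)
{
    for (size_t i = 0; i < sizeof(demo_map)/sizeof(demo_map[0]); i++)
    {
        if (strcmp(demo_map[i].name, name) == 0)
            return &demo_map[i];
    }
    return NULL;
}

int main(void)
{
    const MapEntry *e = find_counter("UPMCFIX");
    if (e)
        printf("%s: config 0x%llx, counter 0x%llx\n", e->name,
               (unsigned long long)e->configRegister,
               (unsigned long long)e->counterRegister);
    return 0;
}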
diff --git a/src/includes/perfmon_nehalem_events.txt b/src/includes/perfmon_nehalem_events.txt
index 0eeed50..a17b55e 100644
--- a/src/includes/perfmon_nehalem_events.txt
+++ b/src/includes/perfmon_nehalem_events.txt
@@ -1,16 +1,17 @@
# =======================================================================================
-#
+#
# Filename: perfmon_nehalem_events.txt
-#
+#
# Description: Event list for Intel Nehalem
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -61,7 +62,7 @@ UMASK_STORE_BLOCK_ANY 0x0F
EVENT_PARTIAL_ADDRESS_ALIAS 0x07 PMC
UMASK_PARTIAL_ADDRESS_ALIAS 0x01
-EVENT_DTLB_LOAD_MISSES 0x08 PMC
+EVENT_DTLB_LOAD_MISSES 0x08 PMC
UMASK_DTLB_LOAD_MISSES_ANY 0x01
UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED 0x02
UMASK_DTLB_LOAD_MISSES_STLB_HIT 0x10
@@ -531,6 +532,13 @@ UMASK_SIMD_INT_64_PACKED_LOGICAL 0x10
UMASK_SIMD_INT_64_PACKED_ARITH 0x20
UMASK_SIMD_INT_64_SHUFFLE_MOVE 0x40
+EVENT_OFFCORE_RESPONSE_0 0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS 0x01 0xFF 0xFF
+
+EVENT_UNCORE_CLOCK 0x00 UPMCFIX
+UMASK_UNCORE_CLOCK 0x00
+
EVENT_UNC_GQ_CYCLES_FULL 0x00 UPMC
UMASK_UNC_GQ_CYCLES_FULL_READ_TRACKER 0x01
UMASK_UNC_GQ_CYCLES_FULL_WRITE_TRACKER 0x02
@@ -720,6 +728,12 @@ UMASK_UNC_QMC_PRIORITY_UPDATES_ANY 0x07
EVENT_UNC_QHL_FRC_ACK_CNFLTS_LOCAL 0x33 UPMC
UMASK_UNC_QHL_FRC_ACK_CNFLTS_LOCAL 0x04
+EVENT_UNC_ADDR_OPCODE_MATCH 0x35 UPMC
+OPTIONS_UNC_ADDR_OPCODE_MATCH_OR EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_OR 0x00 0x06
+OPTIONS_UNC_ADDR_OPCODE_MATCH_AND EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_AND 0x01 0x01
+
EVENT_UNC_QPI_TX_STALLED_SINGLE_FLIT 0x40 UPMC
UMASK_UNC_QPI_TX_STALLED_SINGLE_FLIT_HOME_LINK_0 0x01
UMASK_UNC_QPI_TX_STALLED_SINGLE_FLIT_SNOOP_LINK_0 0x02
@@ -788,4 +802,3 @@ EVENT_UNC_DRAM_PRE_ALL 0x66 UPMC
UMASK_UNC_DRAM_PRE_ALL_CH0 0x01
UMASK_UNC_DRAM_PRE_ALL_CH1 0x02
UMASK_UNC_DRAM_PRE_ALL_CH2 0x04
-
diff --git a/src/includes/perfmon_p6_events.txt b/src/includes/perfmon_p6_events.txt
index 0db8338..9ad1cbc 100644
--- a/src/includes/perfmon_p6_events.txt
+++ b/src/includes/perfmon_p6_events.txt
@@ -1,16 +1,17 @@
# =======================================================================================
-#
+#
# Filename: perfmon_p6_events.txt
-#
-# Description: Event list for Pentium 3
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
+#
+# Description: Event list for Intel Pentium 3
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/perfmon_perf.h b/src/includes/perfmon_perf.h
new file mode 100644
index 0000000..8927d51
--- /dev/null
+++ b/src/includes/perfmon_perf.h
@@ -0,0 +1,60 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: perfmon_perf.h
+ *
+ * Description: Header file of example perfmon module for software events using
+ * the perf_event interface
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef PERFMON_PERF_H
+#define PERFMON_PERF_H
+
+#include <perfmon_types.h>
+
+#define MAX_SW_EVENTS 9
+
+
+extern int init_perf_event(int cpu_id);
+
+extern int setup_perf_event(int cpu_id, PerfmonEvent *event);
+
+extern int read_perf_event(int cpu_id, uint64_t eventID, uint64_t *data);
+
+extern int stop_perf_event(int cpu_id, uint64_t eventID);
+extern int stop_all_perf_event(int cpu_id);
+
+extern int clear_perf_event(int cpu_id, uint64_t eventID);
+extern int clear_all_perf_event(int cpu_id);
+
+extern int start_perf_event(int cpu_id, uint64_t eventID);
+extern int start_all_perf_event(int cpu_id);
+
+extern int close_perf_event(int cpu_id, uint64_t eventID);
+
+extern int finalize_perf_event(int cpu_id);
+
+#endif
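
The header above declares LIKWID's wrapper around the Linux perf_event interface for software events. As a point of reference only (this is not the implementation behind these prototypes), a self-contained sketch of that kernel interface: open a software counter with perf_event_open(), enable it, run some work, and read the count back.

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <linux/perf_event.h>

    /* glibc has no wrapper for perf_event_open, so call the syscall directly */
    static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
    {
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    int main(void)
    {
        struct perf_event_attr attr;
        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_SOFTWARE;
        attr.size = sizeof(attr);
        attr.config = PERF_COUNT_SW_PAGE_FAULTS;   /* example software event */
        attr.disabled = 1;
        attr.exclude_kernel = 1;

        int fd = perf_event_open(&attr, 0, -1, -1, 0);  /* this process, any CPU */
        if (fd < 0) { perror("perf_event_open"); return 1; }

        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

        /* touch some fresh pages to generate page faults */
        size_t size = 16UL * 1024 * 1024;
        volatile char *buf = malloc(size);
        if (!buf) return 1;
        for (size_t i = 0; i < size; i += 4096) buf[i] = 1;

        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
        uint64_t count = 0;
        if (read(fd, &count, sizeof(count)) == sizeof(count))
            printf("page faults: %llu\n", (unsigned long long)count);
        close(fd);
        return 0;
    }
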
diff --git a/src/includes/perfmon_phi.h b/src/includes/perfmon_phi.h
index 0f5dd54..ecf31bb 100644
--- a/src/includes/perfmon_phi.h
+++ b/src/includes/perfmon_phi.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_phi.h
*
- * Description: Header File of perfmon module for Xeon Phi.
+ * Description: Header File of perfmon module for Intel Xeon Phi.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -29,127 +30,207 @@
*/
#include <perfmon_phi_events.h>
-#include <perfmon_phi_groups.h>
#include <perfmon_phi_counters.h>
+#include <error.h>
+#include <affinity.h>
static int perfmon_numCountersPhi = NUM_COUNTERS_PHI;
-static int perfmon_numGroupsPhi = NUM_GROUPS_PHI;
static int perfmon_numArchEventsPhi = NUM_ARCH_EVENTS_PHI;
-void perfmon_init_phi(PerfmonThread *thread)
+int perfmon_init_phi(int cpu_id)
{
- uint32_t flags = 0x0UL;
- int cpu_id = thread->processorId;
-
- msr_write(cpu_id, MSR_MIC_PERFEVTSEL0, 0x0UL);
- msr_write(cpu_id, MSR_MIC_PERFEVTSEL1, 0x0UL);
- msr_write(cpu_id, MSR_MIC_PMC0, 0x0ULL);
- msr_write(cpu_id, MSR_MIC_PMC1, 0x0ULL);
- msr_write(cpu_id, MSR_MIC_SPFLT_CONTROL, 0x0ULL);
- msr_write(cpu_id, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_MIC_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-
- flags |= (1<<16); /* user mode flag */
- flags |= (1<<22); /* enable flag */
-
- msr_write(cpu_id, MSR_MIC_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_MIC_PERFEVTSEL1, flags);
+ return 0;
}
-void perfmon_setupCounterThread_phi(
- int thread_id,
- PerfmonEvent* event,
- PerfmonCounterIndex index)
+int phi_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
{
uint64_t flags = 0x0ULL;
- uint64_t reg = phi_counter_map[index].configRegister;
- int cpu_id = perfmon_threadData[thread_id].processorId;
- perfmon_threadData[thread_id].counters[index].init = TRUE;
- if (phi_counter_map[index].type == PMC)
+ flags |= (1ULL<<16)|(1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
{
- flags = (1<<22)|(1<<16);
-
- /* Intel with standard 8 bit event mask: [7:0] */
- flags |= (event->umask<<8) + event->eventId;
-
- msr_write(cpu_id, reg , flags);
-
- if (perfmon_verbose)
+ for(int j=0;j<event->numberOfOptions;j++)
{
- printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
- cpu_id,
- LLU_CAST reg,
- LLU_CAST flags);
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<17);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+ flags |= (1ULL<<21);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) <<24;
+ break;
+ default:
+ break;
+ }
}
}
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
}
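
phi_pmc_setup programs the standard Intel PERFEVTSEL layout: event code in bits 7:0, unit mask in 15:8, USR in bit 16, OS in bit 17, edge detect in 18, any-thread in 21, enable in 22, invert in 23 and the counter-mask/threshold in 31:24. A standalone sketch of that encoding (the option enum and helper name are made up for illustration, they are not LIKWID types):

    #include <stdint.h>

    enum opt { OPT_EDGE, OPT_KERNEL, OPT_INVERT, OPT_ANYTHREAD, OPT_THRESHOLD };
    struct opt_val { enum opt type; uint64_t value; };

    /* build a PERFEVTSEL value the way phi_pmc_setup does */
    static uint64_t encode_perfevtsel(uint8_t eventId, uint8_t umask,
                                      const struct opt_val *opts, int n)
    {
        uint64_t f = (1ULL << 16) | (1ULL << 22);      /* USR + enable */
        f |= ((uint64_t)umask << 8) | eventId;         /* event select + unit mask */
        for (int j = 0; j < n; j++) {
            switch (opts[j].type) {
            case OPT_EDGE:      f |= (1ULL << 18); break;
            case OPT_KERNEL:    f |= (1ULL << 17); break;  /* count in kernel mode too */
            case OPT_INVERT:    f |= (1ULL << 23); break;
            case OPT_ANYTHREAD: f |= (1ULL << 21); break;
            case OPT_THRESHOLD: f |= (opts[j].value & 0xFFULL) << 24; break;
            }
        }
        return f;
    }
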
-void perfmon_startCountersThread_phi(int thread_id)
+int perfmon_setupCounterThread_phi(int thread_id, PerfmonEventSet* eventSet)
{
- uint64_t flags = 0ULL;
- int processorId = perfmon_threadData[thread_id].processorId;
-
- msr_write(processorId, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL);
+ int cpu_id = groupSet->threads[thread_id].processorId;
- for ( int i=0; i<NUM_COUNTERS_PHI; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- msr_write(processorId, phi_counter_map[i].counterRegister , 0x0ULL);
- flags |= (1<<(i)); /* enable counter */
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ if (type == PMC)
+ {
+ phi_pmc_setup(cpu_id, index, event);
+ eventSet->events[i].threadCounter[thread_id].init = TRUE;
}
}
+ return 0;
+}
+
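
The loop above shows the pattern used by all rewritten modules: every event carries a RegisterType, and types that are not selected in eventSet->regTypeMask are skipped. Assuming REG_TYPE_MASK(type) is simply 1ULL<<type, the test reduces to a single AND, sketched here with illustrative type names:

    #include <stdint.h>

    enum reg_type { TYPE_PMC = 0, TYPE_FIXED, TYPE_UNCORE, NUM_TYPES };
    #define TYPE_MASK(t) (1ULL << (t))

    /* non-zero if counters of this type take part in the current measurement */
    static int type_enabled(uint64_t regTypeMask, enum reg_type t)
    {
        return (regTypeMask & TYPE_MASK(t)) != 0;
    }

    /* usage inside the event loop: if (!type_enabled(mask, TYPE_PMC)) continue; */
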
+int perfmon_startCountersThread_phi(int thread_id, PerfmonEventSet* eventSet)
+{
+ uint64_t flags = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if (perfmon_verbose)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL));
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- printf("perfmon_start_counters: Write Register 0x%X , \
- Flags: 0x%llX \n",MSR_MIC_PERF_GLOBAL_CTRL, LLU_CAST flags);
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ eventSet->events[i].threadCounter[thread_id].startData = 0;
+ eventSet->events[i].threadCounter[thread_id].counterData = 0;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].counterRegister , 0x0ULL));
+ flags |= (1ULL<<(index)); /* enable counter */
+ }
}
- msr_write(processorId, MSR_MIC_PERF_GLOBAL_CTRL, flags);
- flags |= (1ULL<<63);
- msr_write(processorId, MSR_MIC_SPFLT_CONTROL, flags);
- msr_write(processorId, MSR_MIC_PERF_GLOBAL_OVF_CTRL, 0x000000003ULL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, flags));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_SPFLT_CONTROL, flags|(1ULL<<63)));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_OVF_CTRL, flags));
+ return 0;
}
-void perfmon_stopCountersThread_phi(int thread_id)
+int perfmon_stopCountersThread_phi(int thread_id, PerfmonEventSet* eventSet)
{
- uint64_t flags;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ uint64_t counter_result = 0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- msr_write(cpu_id, MSR_MIC_SPFLT_CONTROL, 0x0ULL);
- msr_write(cpu_id, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_SPFLT_CONTROL, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL));
- for ( int i=0; i<NUM_COUNTERS_PHI; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, phi_counter_map[i].counterRegister);
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, phi_counter_map[index].counterRegister, &counter_result));
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ uint64_t ovf_values = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_STATUS, &ovf_values));
+ if (ovf_values & (1ULL<<index))
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_OVF_CTRL, (1ULL<<index)));
+ }
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
}
}
+ return 0;
+}
+
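
The stop and read paths detect counter wrap-around the same way: a raw value lower than the previous reading is checked against the global status register and, if the overflow bit is set, the per-event overflow count is incremented and the bit is cleared. A compact sketch of that logic (the callback stands in for the HPMwrite to the OVF_CTRL register, it is not a LIKWID function):

    #include <stdint.h>

    struct thread_counter {
        uint64_t counterData;   /* last raw counter value */
        int      overflows;
    };

    static void update_counter(struct thread_counter *c, uint64_t raw,
                               uint64_t global_status, int index,
                               void (*clear_ovf_bit)(int index))
    {
        if (raw < c->counterData && (global_status & (1ULL << index))) {
            c->overflows++;          /* counter wrapped since the last reading */
            clear_ovf_bit(index);    /* acknowledge it in the OVF_CTRL register */
        }
        c->counterData = raw;
    }
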
+int perfmon_readCountersThread_phi(int thread_id, PerfmonEventSet* eventSet)
+{
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ uint64_t counter_result = 0x0ULL;
+ uint64_t core_flags = 0x0ULL;
- flags = msr_read(cpu_id,MSR_MIC_PERF_GLOBAL_STATUS);
-// printf ("Status: 0x%llX \n", LLU_CAST flags);
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, &core_flags));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_SPFLT_CONTROL, 0x0ULL));
- if((flags & 0x3))
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- printf ("Overflow occured \n");
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter_map[index].counterRegister, &counter_result));
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ uint64_t ovf_values = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_STATUS, &ovf_values));
+ if (ovf_values & (1ULL<<index))
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_OVF_CTRL, (1ULL<<index)));
+ }
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+ }
}
+
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_SPFLT_CONTROL, core_flags|(1ULL<<63)));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, core_flags));
+ return 0;
}
-void perfmon_readCountersThread_phi(int thread_id)
+
+int perfmon_finalizeCountersThread_phi(int thread_id, PerfmonEventSet* eventSet)
{
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ uint64_t ovf_values_core = 0x0ULL;
- for ( int i=0; i<NUM_COUNTERS_PHI; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, phi_counter_map[i].counterRegister);
+ continue;
}
+ RegisterIndex index = eventSet->events[i].index;
+ ovf_values_core |= (1ULL<<(index));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, 0x0ULL));
}
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_SPFLT_CONTROL, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+ return 0;
}
-
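
Throughout these modules the raw MSR value is passed through field64(counter_result, 0, box_map[type].regWidth) before it is stored; the register width comes from the new BoxMap tables (40 bits for the Phi core counters, 48 bits elsewhere in this patch). Assuming field64 is a plain bit-field extraction and that the final count adds one full counter range per recorded overflow, the arithmetic looks like this:

    #include <stdint.h>
    #include <stdio.h>

    /* assumed semantics of field64: extract 'length' bits starting at 'start' */
    static uint64_t field64(uint64_t value, int start, int length)
    {
        return (value >> start) & ((1ULL << length) - 1ULL);
    }

    int main(void)
    {
        int width = 40;                       /* regWidth from the Phi box map */
        uint64_t start = 0xFFFFFFFF00ULL;     /* reading at measurement start */
        uint64_t stop  = 0x00000000FFULL;     /* reading after one wrap-around */
        int overflows  = 1;
        uint64_t counts = (field64(stop, 0, width) - field64(start, 0, width))
                        + (uint64_t)overflows * (1ULL << width);
        printf("counts: %llu\n", (unsigned long long)counts);   /* prints 511 */
        return 0;
    }
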
diff --git a/src/includes/perfmon_phi_counters.h b/src/includes/perfmon_phi_counters.h
index edf0658..5bd8010 100644
--- a/src/includes/perfmon_phi_counters.h
+++ b/src/includes/perfmon_phi_counters.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_phi_counters.h
*
- * Description: Counter Header File of perfmon module.
+ * Description: Counter Header File of perfmon module for Intel Xeon Phi.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -31,8 +32,14 @@
#define NUM_COUNTERS_PHI 2
#define NUM_COUNTERS_CORE_PHI 2
-static PerfmonCounterMap phi_counter_map[NUM_COUNTERS_PHI] = {
- {"PMC0", PMC0, PMC, MSR_MIC_PERFEVTSEL0, MSR_MIC_PMC0, 0, 0},
- {"PMC1", PMC1, PMC, MSR_MIC_PERFEVTSEL1, MSR_MIC_PMC1, 0, 0}
+#define PHI_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+ EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_THRESHOLD_MASK
+
+static RegisterMap phi_counter_map[NUM_COUNTERS_PHI] = {
+ {"PMC0", PMC0, PMC, MSR_MIC_PERFEVTSEL0, MSR_MIC_PMC0, 0, 0, PHI_VALID_OPTIONS_PMC},
+ {"PMC1", PMC1, PMC, MSR_MIC_PERFEVTSEL1, MSR_MIC_PMC1, 0, 0, PHI_VALID_OPTIONS_PMC}
};
+static BoxMap phi_box_map[NUM_UNITS] = {
+ [PMC] = {MSR_MIC_PERF_GLOBAL_CTRL, MSR_MIC_PERF_GLOBAL_STATUS, MSR_MIC_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 40}
+};
diff --git a/src/includes/perfmon_phi_events.txt b/src/includes/perfmon_phi_events.txt
index d6393ba..1c5434e 100644
--- a/src/includes/perfmon_phi_events.txt
+++ b/src/includes/perfmon_phi_events.txt
@@ -1,16 +1,17 @@
# =======================================================================================
-#
+#
# Filename: perfmon_phi_events.txt
-#
+#
# Description: Event list for Intel Xeon Phi
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/perfmon_pm.h b/src/includes/perfmon_pm.h
index 88346d1..73beaf2 100644
--- a/src/includes/perfmon_pm.h
+++ b/src/includes/perfmon_pm.h
@@ -5,13 +5,14 @@
*
* Description: Header File of perfmon module Pentium M.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -30,136 +31,202 @@
#include <perfmon_pm_events.h>
#include <perfmon_pm_counters.h>
+#include <error.h>
+#include <affinity.h>
-#define NUM_GROUPS_PM 5
static int perfmon_numCounters_pm = NUM_COUNTERS_PM;
-static int perfmon_numGroups_pm = NUM_GROUPS_PM;
static int perfmon_numArchEvents_pm = NUM_ARCH_EVENTS_PM;
-static PerfmonGroupMap pm_group_map[NUM_GROUPS_PM] = {
- {"FLOPS_DP",FLOPS_DP,0,"Double Precision MFlops/s",
- "EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP:PMC0,EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP:PMC1"},
- {"FLOPS_SP",FLOPS_SP,0,"Single Precision MFlops/s",
- "EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP:PMC0,EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP:PMC1"},
- {"L2",L2,0,"L2 cache bandwidth in MBytes/s",
- "L2_LINES_IN_ALL_ALL:PMC0,L2_LINES_OUT_ALL_ALL:PMC1"},
- {"BRANCH",BRANCH,0,"Branch prediction miss rate",
- "BR_INST_EXEC:PMC0,BR_INST_MISSP_EXEC:PMC1"},
- {"CPI",CPI,0,"Cycles per instruction","UOPS_RETIRED:PMC0"}
-};
-
-void perfmon_init_pm(PerfmonThread *thread)
-{
- uint64_t flags = 0x0ULL;
- int cpu_id = thread->processorId;
-
- msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
- /* Preinit of two PMC counters */
- //flags |= (1<<16); /* user mode flag */
- //flags |= (1<<19); /* pin control flag */
- // flags |= (1<<22); /* enable flag */
-
- /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL1, flags);*/
+int perfmon_init_pm(int cpu_id)
+{
+ return 0;
}
-void perfmon_setupCounterThread_pm(
- int thread_id,
- PerfmonEvent* event,
- PerfmonCounterIndex index)
+int pm_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
{
- uint64_t flags;
- uint64_t reg = pm_counter_map[index].configRegister;
- int cpu_id = perfmon_threadData[thread_id].processorId;
-
- perfmon_threadData[thread_id].counters[index].init = TRUE;
- flags = (1<<16)|(1<<19);
+ uint64_t flags = 0x0ULL;
- /* Intel with standard 8 bit event mask: [7:0] */
+ flags = (1ULL<<16)|(1ULL<<19);
flags |= (event->umask<<8) + event->eventId;
- msr_write(cpu_id, reg , flags);
+ if (event->numberOfOptions > 0)
+ {
+ for(int j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<17);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL)<<24;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int perfmon_setupCounterThread_pm(int thread_id, PerfmonEventSet* eventSet)
+{
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if (perfmon_verbose)
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
- cpu_id,
- LLU_CAST reg,
- LLU_CAST flags);
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ eventSet->events[i].threadCounter[thread_id].init = TRUE;
+ pm_pmc_setup(cpu_id, index, event);
}
+ return 0;
}
-void perfmon_startCountersThread_pm(int thread_id)
+int perfmon_startCountersThread_pm(int thread_id, PerfmonEventSet* eventSet)
{
uint64_t flags = 0ULL;
- int processorId = perfmon_threadData[thread_id].processorId;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if (perfmon_threadData[thread_id].counters[0].init == TRUE)
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ eventSet->events[i].threadCounter[thread_id].startData = 0;
+ eventSet->events[i].threadCounter[thread_id].counterData = 0;
+ VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, LLU_CAST 0x0ULL, SETUP_PMC_CTR);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].counterRegister , 0x0ULL));
+ }
+ }
+ if (eventSet->numberOfEvents > 0)
{
- msr_write(processorId, pm_counter_map[0].counterRegister , 0x0ULL);
- msr_write(processorId, pm_counter_map[1].counterRegister , 0x0ULL);
-
/* on p6 only MSR_PERFEVTSEL0 has the enable bit
* it enables both counters as long MSR_PERFEVTSEL1
* has a valid configuration */
- flags = msr_read(processorId, MSR_PERFEVTSEL0);
- flags |= (1<<22); /* enable flag */
-
- if (perfmon_verbose)
- {
- printf("perfmon_start_counters: Write Register 0x%X , \
- Flags: 0x%llX \n",MSR_PERFEVTSEL0, LLU_CAST flags);
- }
-
- msr_write(processorId, MSR_PERFEVTSEL0, flags);
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, &flags));
+ flags |= (1<<22);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, flags));
+ VERBOSEPRINTREG(cpu_id, MSR_PERFEVTSEL0, LLU_CAST flags, UNFREEZE_PMC);
}
-
+ return 0;
}
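
As the comment in the start routine notes, on the P6 family only MSR_PERFEVTSEL0 carries the enable bit, and setting it starts both counters as long as MSR_PERFEVTSEL1 holds a valid configuration. Start and stop therefore reduce to a read-modify-write of bit 22 in that one register, sketched here with generic read/write callbacks in place of HPMread/HPMwrite:

    #include <stdint.h>

    #define P6_ENABLE_BIT (1ULL << 22)

    /* set the shared enable bit in PERFEVTSEL0: starts PMC0 and PMC1 together */
    static void pm_start(int cpu, uint32_t evtsel0,
                         uint64_t (*rd)(int, uint32_t),
                         void (*wr)(int, uint32_t, uint64_t))
    {
        wr(cpu, evtsel0, rd(cpu, evtsel0) | P6_ENABLE_BIT);
    }

    /* clear it again: freezes both counters without touching their configuration */
    static void pm_stop(int cpu, uint32_t evtsel0,
                        uint64_t (*rd)(int, uint32_t),
                        void (*wr)(int, uint32_t, uint64_t))
    {
        wr(cpu, evtsel0, rd(cpu, evtsel0) & ~P6_ENABLE_BIT);
    }
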
-void perfmon_stopCountersThread_pm(int thread_id)
+int perfmon_stopCountersThread_pm(int thread_id, PerfmonEventSet* eventSet)
{
- int i;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ uint64_t counter_result = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, &counter_result));
+ counter_result &= ~(1<<22);
+ VERBOSEPRINTREG(cpu_id, MSR_PERFEVTSEL0, counter_result, FREEZE_PMC);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, counter_result));
- for (i=0;i<NUM_COUNTERS_PM;i++)
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, pm_counter_map[i].counterRegister);
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter_map[index].counterRegister, &counter_result));
+ VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, counter_result, READ_PMC);
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
}
}
+ return 0;
}
-void perfmon_printDerivedMetrics_pm(PerfmonGroup group)
+int perfmon_readCountersThread_pm(int thread_id, PerfmonEventSet* eventSet)
{
+ uint64_t counter_result = 0x0ULL;
+ uint64_t pmc_flags = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- switch ( group )
- {
- case FLOPS_DP:
-
- case FLOPS_SP:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, &pmc_flags));
+ VERBOSEPRINTREG(cpu_id, MSR_PERFEVTSEL0, pmc_flags & ~(1<<22), FREEZE_PMC);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, pmc_flags & ~(1<<22)));
- case L2:
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint32_t counter = counter_map[index].counterRegister;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+ }
+ }
- case BRANCH:
+ VERBOSEPRINTREG(cpu_id, MSR_PERFEVTSEL0, pmc_flags, UNFREEZE_PMC);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, pmc_flags));
+ return 0;
+}
- case _NOGROUP:
- fprintf (stderr, "The Pentium M supports only two counters. Therefore derived metrics are not computed due to missing runtime!\n" );
- break;
+int perfmon_finalizeCountersThread_pm(int thread_id, PerfmonEventSet* eventSet)
+{
+ int cpu_id = groupSet->threads[thread_id].processorId;
- default:
- fprintf (stderr, "perfmon_printDerivedMetricsCore2: Unknown group! Exiting!\n" );
- exit (EXIT_FAILURE);
- break;
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ uint32_t reg = counter_map[index].configRegister;
+ if ((reg) && ((type == PMC)||(type == FIXED)))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+ VERBOSEPRINTPCIREG(cpu_id, MSR_DEV, reg, 0x0ULL, CLEAR_CTL);
+ }
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
}
+ return 0;
}
-
diff --git a/src/includes/perfmon_pm_counters.h b/src/includes/perfmon_pm_counters.h
index 9119096..7e0d6da 100644
--- a/src/includes/perfmon_pm_counters.h
+++ b/src/includes/perfmon_pm_counters.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_pm_counters.h
*
- * Description: Counter Header File of perfmon module.
+ * Description: Counter Header File of perfmon module for Intel Pentium M.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -31,8 +32,13 @@
#define NUM_COUNTERS_PM 2
#define NUM_COUNTERS_CORE_PM 2
-static PerfmonCounterMap pm_counter_map[NUM_COUNTERS_PM] = {
- {"PMC0",PMC0, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
- {"PMC1",PMC1, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0}
+#define PM_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK
+
+static RegisterMap pm_counter_map[NUM_COUNTERS_PM] = {
+ {"PMC0", PMC0, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, PM_VALID_OPTIONS_PMC},
+ {"PMC1", PMC1, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, PM_VALID_OPTIONS_PMC}
};
+static BoxMap pm_box_map[NUM_UNITS] = {
+ [PMC] = {0, 0, 0, 0, 0, 0, 48}
+};
diff --git a/src/includes/perfmon_pm_events.txt b/src/includes/perfmon_pm_events.txt
index 9ed83a8..45fd7f4 100644
--- a/src/includes/perfmon_pm_events.txt
+++ b/src/includes/perfmon_pm_events.txt
@@ -1,16 +1,16 @@
# =======================================================================================
-#
+#
# Filename: perfmon_pm_events.txt
-#
+#
# Description: Event list for Intel Pentium M
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -262,22 +262,22 @@ UMASK_BUS_HITM_DRV_SELF 0x00
EVENT_BUS_SNOOP_STALL 0x7E PMC
UMASK_BUS_SNOOP_STALL_SELF 0x00
-EVENT_FLOPS 0xC1 PMC
+EVENT_FLOPS 0xC1 PMC0
UMASK_FLOPS 0x00
-EVENT_FP_COMP_OPS_EXE 0x10 PMC
+EVENT_FP_COMP_OPS_EXE 0x10 PMC0
UMASK_FP_COMP_OPS_EXE 0x00
-EVENT_FP_ASSIST 0x11 PMC
+EVENT_FP_ASSIST 0x11 PMC1
UMASK_FP_ASSIST 0x00
-EVENT_MUL 0x12 PMC
+EVENT_MUL 0x12 PMC1
UMASK_MUL 0x00
-EVENT_DIV 0x13 PMC
+EVENT_DIV 0x13 PMC1
UMASK_DIV 0x00
-EVENT_CYCLES_DIV_BUSY 0x14 PMC
+EVENT_CYCLES_DIV_BUSY 0x14 PMC0
UMASK_CYCLES_DIV_BUSY 0x00
EVENT_LD_BLOCKS 0x03 PMC
@@ -289,13 +289,13 @@ UMASK_SB_DRAINS 0x00
EVENT_MISALIGN_MEM_REF 0x05 PMC
UMASK_MISALIGN_MEM_REF 0x00
-EVENT_EMON_KNI_PREF_DISPATCHED 0x07 PMC
+EVENT_EMON_KNI_PREF_DISPATCHED 0x07 PMC0|PMC1
UMASK_EMON_KNI_PREF_DISPATCHED_NTA 0x00
UMASK_EMON_KNI_PREF_DISPATCHED_T1 0x01
UMASK_EMON_KNI_PREF_DISPATCHED_T2 0x02
UMASK_EMON_KNI_PREF_DISPATCHED_WEAK 0x03
-EVENT_EMON_KNI_PREF_MISS 0x4B PMC
+EVENT_EMON_KNI_PREF_MISS 0x4B PMC0|PMC1
UMASK_EMON_KNI_PREF_MISS_NTA 0x00
UMASK_EMON_KNI_PREF_MISS_T1 0x01
UMASK_EMON_KNI_PREF_MISS_T2 0x02
@@ -310,13 +310,13 @@ UMASK_UOPS_RETIRED 0x00
EVENT_INST_DECODED 0xD0 PMC
UMASK_INST_DECODED 0x00
-EVENT_EMON_SSE_SSE2_INST_RETIRED 0xD8 PMC
+EVENT_EMON_SSE_SSE2_INST_RETIRED 0xD8 PMC0|PMC1
UMASK_EMON_SSE_SSE2_INST_RETIRED_ALL_SP 0x00
UMASK_EMON_SSE_SSE2_INST_RETIRED_SCALAR_SP 0x01
UMASK_EMON_SSE_SSE2_INST_RETIRED_PACKED_DP 0x02
UMASK_EMON_SSE_SSE2_INST_RETIRED_SCALAR_DP 0x03
-EVENT_EMON_SSE_SSE2_COMP_INST_RETIRED 0xD9 PMC
+EVENT_EMON_SSE_SSE2_COMP_INST_RETIRED 0xD9 PMC0|PMC1
UMASK_EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP 0x00
UMASK_EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP 0x01
UMASK_EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP 0x02
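
The hunks above tighten the counter field of several Pentium M events: the third column of an EVENT_ line names the counters the event may run on (PMC for either, PMC0 or PMC1 for a fixed assignment, PMC0|PMC1 spelled out explicitly), and the UMASK_ lines that follow supply the unit masks. A toy parser for that line format, purely to illustrate the syntax (it is not the parser LIKWID ships):

    #include <stdio.h>

    int main(void)
    {
        const char *lines[] = {
            "EVENT_FLOPS 0xC1 PMC0",
            "UMASK_FLOPS 0x00",
            "EVENT_EMON_KNI_PREF_MISS 0x4B PMC0|PMC1",
        };
        for (unsigned i = 0; i < sizeof(lines) / sizeof(lines[0]); i++) {
            char name[64], limit[32];
            unsigned code;
            if (sscanf(lines[i], "EVENT_%63s 0x%x %31s", name, &code, limit) == 3)
                printf("event %-24s code 0x%02X allowed on %s\n", name, code, limit);
            else if (sscanf(lines[i], "UMASK_%63s 0x%x", name, &code) == 2)
                printf("  umask %-22s 0x%02X\n", name, code);
        }
        return 0;
    }
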
diff --git a/src/includes/perfmon_sandybridge.h b/src/includes/perfmon_sandybridge.h
index f11714a..f6f9665 100644
--- a/src/includes/perfmon_sandybridge.h
+++ b/src/includes/perfmon_sandybridge.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_sandybridge.h
*
- * Description: Header File of perfmon module for Sandy Bridge.
+ * Description: Header File of perfmon module for Intel Sandy Bridge.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -29,666 +30,1858 @@
*/
#include <perfmon_sandybridge_events.h>
-#include <perfmon_sandybridge_groups.h>
#include <perfmon_sandybridge_counters.h>
-
+#include <perfmon_sandybridgeEP_events.h>
+#include <perfmon_sandybridgeEP_counters.h>
+#include <error.h>
+#include <affinity.h>
+
+static int perfmon_numCountersSandybridgeEP = NUM_COUNTERS_SANDYBRIDGEEP;
+static int perfmon_numCoreCountersSandybridgeEP = NUM_COUNTERS_CORE_SANDYBRIDGEEP;
+static int perfmon_numArchEventsSandybridgeEP = NUM_ARCH_EVENTS_SANDYBRIDGEEP;
static int perfmon_numCountersSandybridge = NUM_COUNTERS_SANDYBRIDGE;
-static int perfmon_numGroupsSandybridge = NUM_GROUPS_SANDYBRIDGE;
+static int perfmon_numCoreCountersSandybridge = NUM_COUNTERS_CORE_SANDYBRIDGE;
static int perfmon_numArchEventsSandybridge = NUM_ARCH_EVENTS_SANDYBRIDGE;
-#define OFFSET_PMC 3
+int snb_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
+int snbep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
+int (*sandy_cbox_setup)(int, RegisterIndex, PerfmonEvent*);
-void perfmon_init_sandybridge(PerfmonThread *thread)
+int perfmon_init_sandybridge(int cpu_id)
{
- uint64_t flags = 0x0ULL;
- int cpu_id = thread->processorId;
-
- /* Initialize registers */
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
- msr_write(cpu_id, MSR_PMC0, 0x0ULL);
- msr_write(cpu_id, MSR_PMC1, 0x0ULL);
- msr_write(cpu_id, MSR_PMC2, 0x0ULL);
- msr_write(cpu_id, MSR_PMC3, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
-
- /* initialize fixed counters
- * FIXED 0: Instructions retired
- * FIXED 1: Clocks unhalted core
- * FIXED 2: Clocks unhalted ref */
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
- /* Preinit of PERFEVSEL registers */
- //flags |= (1<<22); /* enable flag */
- //flags |= (1<<16); /* user mode flag */
-
- /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
-
- /* TODO Robust implementation which also works if stuff is not there */
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
- lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
- {
- if ( cpuid_info.model == SANDYBRIDGE_EP )
+ int ret;
+ uint64_t data = 0x0ULL;
+ lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+ lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+ ret = HPMwrite(cpu_id, MSR_DEV, MSR_UNC_CBO_0_PERFEVTSEL0, 0x0ULL);
+ ret += HPMread(cpu_id, MSR_DEV, MSR_UNC_PERF_GLOBAL_CTRL, &data);
+ ret += HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PERF_GLOBAL_CTRL, 0x0ULL);
+ ret += HPMread(cpu_id, MSR_DEV, MSR_UNC_CBO_0_PERFEVTSEL0, &data);
+ if ((cpuid_info.model == SANDYBRIDGE_EP))
+ {
+ sandy_cbox_setup = snbep_cbox_setup;
+ }
+ else if ((ret == 0) && (data == 0x0ULL))
+ {
+ sandy_cbox_setup = snb_cbox_setup;
+ }
+
+ return 0;
+}
+
+uint32_t snb_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint32_t flags = (1ULL<<(1+(index*4)));
+ for(j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<(index*4));
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+ flags |= (1ULL<<(2+(index*4)));
+ break;
+ default:
+ break;
+ }
+ }
+ return flags;
+}
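
snb_fixed_setup encodes the per-counter slots of MSR_PERF_FIXED_CTR_CTRL: every fixed counter owns four bits, with bit 0 enabling OS counting, bit 1 user counting and bit 2 any-thread counting. A small helper that reproduces that layout (the function is illustrative, not part of LIKWID):

    #include <stdint.h>

    /* control bits for fixed counter 'index' in MSR_PERF_FIXED_CTR_CTRL */
    static uint64_t fixed_ctrl_bits(int index, int count_user, int count_kernel,
                                    int any_thread)
    {
        uint64_t f = 0;
        if (count_kernel) f |= 1ULL << (0 + index * 4);
        if (count_user)   f |= 1ULL << (1 + index * 4);
        if (any_thread)   f |= 1ULL << (2 + index * 4);
        return f;
    }

    /* fixed_ctrl_bits(0,1,0,0) | fixed_ctrl_bits(1,1,0,0) | fixed_ctrl_bits(2,1,0,0)
     * gives 0x222, the user-only setup the removed init code wrote directly. */
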
+
+int snb_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint32_t flags = 0x0U;
+ uint64_t offcore_flags = 0x0ULL;
+
+ flags |= (1ULL<<22); /* enable flag */
+ flags |= (1ULL<<16); /* user mode flag */
+
+ /* Intel with standard 8 bit event mask: [7:0] */
+ flags |= (event->umask<<8) + event->eventId;
+
+ /* set custom cfg and cmask */
+ if ((event->cfgBits != 0) &&
+ (event->eventId != 0xB7) &&
+ (event->eventId != 0xBB))
+ {
+ flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+ }
+ if (event->numberOfOptions > 0)
+ {
+ for(j=0;j<event->numberOfOptions;j++)
{
- /* Only root can access pci address space in direct mode */
- if (accessClient_mode != DAEMON_AM_DIRECT)
+ switch (event->options[j].type)
{
- uint32_t uflags = 0x10100U; /* enable freeze (bit 16), freeze (bit 8) */
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_BOX_CTL, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_BOX_CTL, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_BOX_CTL, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_BOX_CTL, uflags);
-
- uflags = 0x0U;
- uflags |= (1<<22); /* enable flag */
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTL_0, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTL_1, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTL_2, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTL_3, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTL_0, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTL_1, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTL_2, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTL_3, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTL_0, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTL_1, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTL_2, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTL_3, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTL_0, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTL_1, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTL_2, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTL_3, uflags);
-
- uflags |= (1<<19); /* reset fixed counter */
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_FIXED_CTL, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_FIXED_CTL, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_FIXED_CTL, uflags);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-
- /* iMC counters need to be manually reset to zero */
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-
- /* FIXME: Not yet tested/ working due to BIOS issues on test
- * machines */
-#if 0
- /* QPI registers can be zeroed with single write */
- uflags = 0x0113UL; /*enable freeze (bit 16), freeze (bit 8), reset */
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0, PCI_UNC_QPI_PMON_BOX_CTL, uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1, PCI_UNC_QPI_PMON_BOX_CTL, uflags);
- uflags = 0x0UL;
- uflags |= (1UL<<22); /* enable flag */
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0, PCI_UNC_QPI_PMON_CTL_0, uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0, PCI_UNC_QPI_PMON_CTL_1, uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0, PCI_UNC_QPI_PMON_CTL_2, uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0, PCI_UNC_QPI_PMON_CTL_3, uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1, PCI_UNC_QPI_PMON_CTL_0, uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1, PCI_UNC_QPI_PMON_CTL_1, uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1, PCI_UNC_QPI_PMON_CTL_2, uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1, PCI_UNC_QPI_PMON_CTL_3, uflags);
-#endif
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<17);
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+ flags |= (1ULL<<21);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0xFFULL)<<24);
+ break;
+ case EVENT_OPTION_MATCH0:
+ offcore_flags |= (event->options[j].value & 0x8FFFULL);
+ break;
+ case EVENT_OPTION_MATCH1:
+ offcore_flags |= (event->options[j].value<<16);
+ break;
+ default:
+ break;
}
}
}
-// lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+ if (event->eventId == 0xB7)
+ {
+ if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+ {
+ offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+ }
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+ }
+ else if (event->eventId == 0xBB)
+ {
+ if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+ {
+ offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+ }
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
}
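
For the OFFCORE_RESPONSE events (IDs 0xB7 and 0xBB) snb_pmc_setup assembles a separate value for MSR_OFFCORE_RESP0/1: the MATCH0 option supplies the request-type bits in the low word (masked with 0x8FFF), MATCH1 the response-type bits shifted by 16, and if the event's cfgBits and cmask fields are set (not 0xFF) they override the options with single request/response bits. A sketch of that precedence, detached from the LIKWID data structures:

    #include <stdint.h>

    /* value written to MSR_OFFCORE_RESP0 or MSR_OFFCORE_RESP1 */
    static uint64_t offcore_response(uint64_t match0, uint64_t match1,
                                     int cfgBits, int cmask)
    {
        uint64_t flags = (match0 & 0x8FFFULL) | (match1 << 16);
        if (cfgBits != 0xFF && cmask != 0xFF)       /* event defines its own bits */
            flags = (1ULL << cfgBits) | (1ULL << cmask);
        return flags;
    }
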
-#define BOX_GATE_SNB(channel,label) \
- if (perfmon_verbose) { \
- printf("[%d] perfmon_setup_counter (label): Write Register 0x%llX , Flags: 0x%llX \n", \
- cpu_id, \
- LLU_CAST reg, \
- LLU_CAST flags); \
- } \
- if(haveLock) { \
- uflags = (1<<22); \
- uflags |= (event->umask<<8) + event->eventId; \
- pci_write(cpu_id, channel, reg, uflags); \
+int snb_mbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint32_t flags = 0x0U;
+ PciDeviceIndex dev = counter_map[index].device;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(dev, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags = (1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
+ for(j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0xFFULL)<<24);
+ break;
+ default:
+ break;
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, LLU_CAST flags, SETUP_MBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
}
+ return 0;
+}
-void perfmon_setupCounterThread_sandybridge(
- int thread_id,
- PerfmonEvent* event,
- PerfmonCounterIndex index)
+uint32_t snb_cbox_filter(PerfmonEvent *event)
{
- int haveLock = 0;
- uint64_t flags;
- uint32_t uflags;
- uint64_t reg = sandybridge_counter_map[index].configRegister;
- int cpu_id = perfmon_threadData[thread_id].processorId;
- uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
- uint64_t orig_fixed_flags = fixed_flags;
- perfmon_threadData[thread_id].counters[index].init = TRUE;
+ int j;
+ uint32_t ret = 0x0;
+ uint64_t mask = 0x0ULL;
+ int set_state = 0;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ for(j=0;j<event->numberOfOptions;j++)
{
- haveLock = 1;
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_OPCODE:
+ if ((event->options[j].value == 0x180) ||
+ (event->options[j].value == 0x181) ||
+ (event->options[j].value == 0x182) ||
+ (event->options[j].value == 0x187) ||
+ (event->options[j].value == 0x18C) ||
+ (event->options[j].value == 0x18D) ||
+ (event->options[j].value == 0x190) ||
+ (event->options[j].value == 0x191) ||
+ (event->options[j].value == 0x192) ||
+ (event->options[j].value == 0x194) ||
+ (event->options[j].value == 0x195) ||
+ (event->options[j].value == 0x19C) ||
+ (event->options[j].value == 0x19E) ||
+ (event->options[j].value == 0x1C4) ||
+ (event->options[j].value == 0x1C5) ||
+ (event->options[j].value == 0x1C8) ||
+ (event->options[j].value == 0x1E4) ||
+ (event->options[j].value == 0x1E5) ||
+ (event->options[j].value == 0x1E6))
+ {
+ ret |= ((event->options[j].value & 0x1FFULL) << 23);
+ }
+ else
+ {
+ ERROR_PRINT(Invalid value 0x%llx for opcode option, LLU_CAST event->options[j].value);
+ }
+ break;
+ case EVENT_OPTION_STATE:
+ if (event->options[j].value & 0x3F)
+ {
+ ret |= ((event->options[j].value & 0x3FULL) << 17);
+ set_state = 1;
+ }
+ else
+ {
+ ERROR_PRINT(Invalid value 0x%llx for state option, LLU_CAST event->options[j].value);
+ }
+ break;
+ case EVENT_OPTION_NID:
+ mask = 0x0ULL;
+ for (int i=0; i<affinityDomains.numberOfNumaDomains;i++)
+ mask |= (1ULL<<i);
+ if (event->options[j].value & mask)
+ {
+ ret |= ((event->options[j].value & 0xFFULL) << 10);
+ }
+ else
+ {
+ ERROR_PRINT(Invalid value 0x%llx for node id option, LLU_CAST event->options[j].value);
+ }
+ break;
+ case EVENT_OPTION_TID:
+ if (event->options[j].value <= 0xF)
+ {
+ ret |= (event->options[j].value & 0x1FULL);
+ }
+ else
+ {
+ ERROR_PRINT(Invalid value 0x%llx for thread id option, LLU_CAST event->options[j].value);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ if ((event->eventId == 0x34) && (set_state == 0))
+ {
+ ret |= (0x1FULL << 18);
}
+ return ret;
+}
+
+int snb_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint32_t flags = 0x0U;
- switch (sandybridge_counter_map[index].type)
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
{
- case PMC:
+ return 0;
+ }
- //flags = msr_read(cpu_id,reg);
- //flags &= ~(0xFFFFU); /* clear lower 16bits */
- flags = (1<<22)|(1<<16);
+ flags |= (1ULL<<22)|(1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ for(j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0x1FULL)<<24;
+ break;
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_CBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
- /* Intel with standard 8 bit event mask: [7:0] */
- flags |= (event->umask<<8) + event->eventId;
+int snbep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint32_t flags = 0x0U;
- if (event->cfgBits != 0) /* set custom cfg and cmask */
- {
- flags &= ~(0xFFFFU<<16); /* clear upper 16bits */
- flags |= ((event->cmask<<8) + event->cfgBits)<<16;
- }
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
- if (perfmon_verbose)
- {
- printf("[%d] perfmon_setup_counter PMC: Write Register 0x%llX , Flags: 0x%llX \n",
- cpu_id,
- LLU_CAST reg,
- LLU_CAST flags);
- }
+ flags |= (1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
- msr_write(cpu_id, reg , flags);
- break;
+ if (event->numberOfOptions > 0)
+ {
+ uint32_t optflags = snb_cbox_filter(event);
+ uint32_t filter_reg = box_map[counter_map[index].type].filterRegister1;
+ if (optflags != 0x0U)
+ {
+ VERBOSEPRINTREG(cpu_id, filter_reg, LLU_CAST optflags, SETUP_CBOX_FILTER);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter_reg, optflags));
+ }
+ }
- case FIXED:
- fixed_flags |= (0x2 << (index*4));
- break;
+ for(j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_TID:
+ flags |= (1ULL<<19);
+ break;
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL)<<24;
+ break;
+ default:
+ break;
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_CBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
- case POWER:
- break;
- case MBOX0:
- BOX_GATE_SNB(PCI_IMC_DEVICE_CH_0,MBOX0);
- break;
+int snb_ubox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint32_t flags = 0x0U;
- case MBOX1:
- BOX_GATE_SNB(PCI_IMC_DEVICE_CH_1,MBOX1);
- break;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
- case MBOX2:
- BOX_GATE_SNB(PCI_IMC_DEVICE_CH_2,MBOX2);
- break;
+ flags |= (1ULL<<17);
+ flags |= (event->umask<<8) + event->eventId;
+ for(j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0x1FULL) << 24);
+ break;
+ default:
+ break;
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_UBOX)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
- case MBOX3:
- BOX_GATE_SNB(PCI_IMC_DEVICE_CH_3,MBOX3);
- break;
+int snb_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint32_t flags = 0x0U;
+ uint64_t match = 0x0ULL;
+ PciDeviceIndex dev = counter_map[index].device;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(dev, cpu_id))
+ {
+ return -ENODEV;
+ }
- case SBOX0:
+ flags = (1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
+ for(j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0xFFULL) << 24);
+ break;
+ case EVENT_OPTION_OPCODE:
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH,
+ LLU_CAST (event->options[j].value & 0x3FULL), SETUP_BBOX_OPCODE);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH,
+ (event->options[j].value & 0x3FULL)));
+ break;
+ case EVENT_OPTION_MATCH0:
+ match = event->options[j].value & 0xFFFFFFC0ULL;
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, LLU_CAST match, SETUP_BBOX_MATCH0);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, match));
+ match = (event->options[j].value >> 32) & 0x3FFFULL;
+ VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, LLU_CAST match, SETUP_BBOX_MATCH1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, match));
+ break;
+ default:
+ break;
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, LLU_CAST flags, SETUP_BBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
- /* CTO_COUNT event requires programming of MATCH/MASK registers */
- if (event->eventId == 0x38)
- {
- if(haveLock)
+
+int snb_wbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint32_t flags = 0x0U;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+
+ flags = (1ULL<<22);
+ flags |= event->eventId & 0xFF;
+ for(j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0x1FULL) << 24);
+ break;
+ case EVENT_OPTION_OCCUPANCY:
+ flags |= ((event->options[j].value & 0x3ULL) << 14);
+ break;
+ case EVENT_OPTION_OCCUPANCY_EDGE:
+ flags |= (1ULL<<31);
+ break;
+ case EVENT_OPTION_OCCUPANCY_INVERT:
+ flags |= (1ULL<<30);
+ break;
+ case EVENT_OPTION_OCCUPANCY_FILTER:
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_PCU_PMON_BOX_FILTER, LLU_CAST event->options[j].value & 0xFFFFFFFFULL, SETUP_WBOX_FILTER);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PCU_PMON_BOX_FILTER, event->options[j].value & 0xFFFFFFFFULL));
+ default:
+ break;
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_WBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int snb_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event, PciDeviceIndex filterdev)
+{
+ int j;
+ uint32_t flags = 0x0U;
+ PciDeviceIndex dev = counter_map[index].device;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(dev, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags = (1ULL<<22);
+ flags |= event->cfgBits;
+ flags |= (event->umask<<8) + event->eventId;
+ for(j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0xFFULL) << 24);
+ break;
+ case EVENT_OPTION_MATCH0:
+ if (HPMcheck(filterdev, cpu_id))
{
- //uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_0, reg);
- //uflags &= ~(0xFFFFU);
- uflags = (1<<22);
- uflags |= (1UL<<21) + event->eventId; /* Set extension bit */
- printf("UFLAGS 0x%x \n",uflags);
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0, reg, uflags);
-
- /* program MATCH0 */
- uflags = 0x0UL;
- uflags = (event->cmask<<13) + (event->umask<<8);
- printf("MATCH UFLAGS 0x%x \n",uflags);
- pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_0, PCI_UNC_QPI_PMON_MATCH_0, uflags);
-
- /* program MASK0 */
- uflags = 0x0UL;
- uflags = (0x3F<<12) + (event->cfgBits<<4);
- printf("MASK UFLAGS 0x%x \n",uflags);
- pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_0, PCI_UNC_QPI_PMON_MASK_0, uflags);
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, PCI_UNC_QPI_PMON_MATCH_0,
+ event->options[j].value & 0x8003FFF8ULL, SETUP_SBOX_MATCH0);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, PCI_UNC_QPI_PMON_MATCH_0,
+ event->options[j].value & 0x8003FFF8ULL));
}
- }
- else
- {
- BOX_GATE_SNB(PCI_QPI_DEVICE_PORT_0,SBOX0);
- }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MATCH1:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, PCI_UNC_QPI_PMON_MATCH_1,
+ event->options[j].value & 0x000F000FULL, SETUP_SBOX_MATCH1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, PCI_UNC_QPI_PMON_MATCH_1,
+ event->options[j].value & 0x000F000FULL));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MASK0:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, PCI_UNC_QPI_PMON_MASK_0,
+ event->options[j].value & 0x8003FFF8ULL, SETUP_SBOX_MASK0);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, PCI_UNC_QPI_PMON_MASK_0,
+ event->options[j].value & 0x8003FFF8ULL));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ case EVENT_OPTION_MASK1:
+ if (HPMcheck(filterdev, cpu_id))
+ {
+ VERBOSEPRINTPCIREG(cpu_id, filterdev, PCI_UNC_QPI_PMON_MASK_1,
+ event->options[j].value & 0x000F000FULL, SETUP_SBOX_MASK1);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, PCI_UNC_QPI_PMON_MASK_1,
+ event->options[j].value & 0x000F000FULL));
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, LLU_CAST flags, SETUP_SBOX);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
- break;
- case SBOX1:
- /* CTO_COUNT event requires programming of MATCH/MASK registers */
- if (event->eventId == 0x38)
- {
- if(haveLock)
+int snb_rbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint32_t flags = 0x0U;
+ PciDeviceIndex dev = counter_map[index].device;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(dev, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags = (1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
+ for(j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0xFFULL) << 24);
+ break;
+ default:
+ break;
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, LLU_CAST flags, SETUP_RBOX)
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int snb_pbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint32_t flags = 0x0U;
+ PciDeviceIndex dev = counter_map[index].device;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (!HPMcheck(dev, cpu_id))
+ {
+ return -ENODEV;
+ }
+
+ flags = (1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
+ for(j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0xFFULL) << 24);
+ break;
+ default:
+ break;
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, LLU_CAST flags, SETUP_PBOX)
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+// Macros to stop counting and reset control registers
+// FREEZE(_AND_RESET_CTL) uses the central box register to freeze the box (bits 8 and 16) and bit 0 to reset the control registers
+#define SNB_FREEZE_AND_RESET_CTL_BOX(id) \
+ if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(id))) \
+ { \
+ VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, 0x10101U, FREEZE_AND_RESET_CTL_BOX_##id) \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, 0x10101ULL)); \
+ }
+
+#define SNB_FREEZE_BOX(id) \
+ if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(id))) \
+ { \
+ VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, 0x10100U, FREEZE_BOX_##id) \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, 0x10100ULL)); \
+ }
+
+// FREEZE(_AND_RESET_CTL)_PCI uses the central box register to freeze the box (bits 8 and 16) and bit 0 to reset the control registers
+// Checks whether PCI device exists, because this is the first operation we do on the devices
+#define SNB_FREEZE_AND_RESET_CTL_PCI_BOX(id) \
+ if (haveLock && \
+ (eventSet->regTypeMask & (REG_TYPE_MASK(id))) && \
+ (HPMcheck(box_map[id].device, cpu_id) == 0)) \
+ { \
+ VERBOSEPRINTPCIREG(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x10101ULL, FREEZE_AND_RESET_CTL_PCI_BOX_##id); \
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x10101ULL)); \
+ }
+
+#define SNB_FREEZE_PCI_BOX(id) \
+ if (haveLock && \
+ (eventSet->regTypeMask & (REG_TYPE_MASK(id))) && \
+ (HPMcheck(box_map[id].device, cpu_id) == 0)) \
+ { \
+ VERBOSEPRINTPCIREG(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x10100ULL, FREEZE_PCI_BOX_##id) \
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x10100ULL)); \
+ }
+
+// The MBOX*FIX counters use a slightly different scheme: setting the whole register to 0 freezes the counter
+#define SNB_FREEZE_MBOXFIX(number) \
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX##number##FIX))) && \
+ (HPMcheck(PCI_IMC_DEVICE_0_CH_##number, cpu_id))) \
+ { \
+ VERBOSEPRINTPCIREG(cpu_id, PCI_IMC_DEVICE_0_CH_##number, PCI_UNC_MC_PMON_FIXED_CTL, 0x0ULL, FREEZE_MBOXFIX##number) \
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, PCI_IMC_DEVICE_0_CH_##number, PCI_UNC_MC_PMON_FIXED_CTL, 0x0ULL)); \
+ }
+
+
+
+int perfmon_setupCounterThread_sandybridge(
+ int thread_id,
+ PerfmonEventSet* eventSet)
+{
+ int i;
+ int haveLock = 0;
+ uint64_t flags = 0x0ULL;
+ uint64_t fixed_flags = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+
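+ /* Disable the core-local PMC/FIXED counters and clear pending overflow bits before programming */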
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL));
+ }
+ if (cpuid_info.model == SANDYBRIDGE_EP)
+ {
+ SNB_FREEZE_BOX(CBOX0);
+ SNB_FREEZE_BOX(CBOX1);
+ SNB_FREEZE_BOX(CBOX2);
+ SNB_FREEZE_BOX(CBOX3);
+ SNB_FREEZE_BOX(CBOX4);
+ SNB_FREEZE_BOX(CBOX5);
+ SNB_FREEZE_BOX(CBOX6);
+ SNB_FREEZE_BOX(CBOX7);
+
+ SNB_FREEZE_PCI_BOX(MBOX0);
+ SNB_FREEZE_PCI_BOX(MBOX1);
+ SNB_FREEZE_PCI_BOX(MBOX2);
+ SNB_FREEZE_PCI_BOX(MBOX3);
+
+ SNB_FREEZE_MBOXFIX(0);
+ SNB_FREEZE_MBOXFIX(1);
+ SNB_FREEZE_MBOXFIX(2);
+ SNB_FREEZE_MBOXFIX(3);
+
+ SNB_FREEZE_PCI_BOX(SBOX0);
+ SNB_FREEZE_PCI_BOX(SBOX1);
+
+ SNB_FREEZE_PCI_BOX(RBOX0);
+ SNB_FREEZE_PCI_BOX(RBOX1);
+
+ SNB_FREEZE_PCI_BOX(PBOX);
+
+ SNB_FREEZE_PCI_BOX(BBOX0);
+ SNB_FREEZE_BOX(WBOX);
+ }
+ else
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, LLU_CAST (1ULL<<31), FREEZE_UNCORE)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PERF_GLOBAL_CTRL, (1ULL<<31)));
+ }
+
+ for (i=0;i < eventSet->numberOfEvents;i++)
+ {
+ flags = 0x0ULL;
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ RegisterIndex index = eventSet->events[i].index;
+ PciDeviceIndex dev = counter_map[index].device;
+ eventSet->events[i].threadCounter[thread_id].init = TRUE;
+ switch (type)
+ {
+ case PMC:
+ snb_pmc_setup(cpu_id, index, event);
+ break;
+
+ case FIXED:
+ /* initialize fixed counters
+ * FIXED 0: Instructions retired
+ * FIXED 1: Clocks unhalted core
+ * FIXED 2: Clocks unhalted ref */
+ fixed_flags |= snb_fixed_setup(cpu_id, index,event);
+ /* Written in the end of function for all fixed purpose registers */
+ break;
+
+ case POWER:
+ break;
+
+ case MBOX0:
+ case MBOX1:
+ case MBOX2:
+ case MBOX3:
+ snb_mbox_setup(cpu_id, index, event);
+ break;
+
+ case MBOX0FIX:
+ break;
+ case MBOX1FIX:
+ break;
+ case MBOX2FIX:
+ break;
+ case MBOX3FIX:
+ break;
+
+ case CBOX0:
+ case CBOX1:
+ case CBOX2:
+ case CBOX3:
+ case CBOX4:
+ case CBOX5:
+ case CBOX6:
+ case CBOX7:
+ sandy_cbox_setup(cpu_id, index, event);
+ break;
+
+ case UBOX:
+ snb_ubox_setup(cpu_id, index, event);
+ break;
+
+ case UBOXFIX:
+ if (cpuid_info.model == SANDYBRIDGE_EP)
{
- //uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_1, reg);
- //uflags &= ~(0xFFFFU);
- uflags = (1<<22);
- uflags |= (1UL<<21) + event->eventId; /* Set extension bit */
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1, reg, uflags);
-
- /* program MATCH0 */
- uflags = 0x0UL;
- uflags = (event->cmask<<13) + (event->umask<<8);
- pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_1, PCI_UNC_QPI_PMON_MATCH_0, uflags);
-
- /* program MASK0 */
- uflags = 0x0UL;
- uflags = (0x3F<<12) + (event->cfgBits<<4);
- pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_1, PCI_UNC_QPI_PMON_MASK_0, uflags);
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST (1ULL<<22), SETUP_UBOXFIX)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, (1ULL<<22)));
}
- }
- else
- {
- BOX_GATE_SNB(PCI_QPI_DEVICE_PORT_0,SBOX0);
- }
- break;
+ else
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST (1ULL<<20)|(1ULL<<22), SETUP_UBOXFIX)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, (1ULL<<20)|(1ULL<<22)));
+ }
+ break;
+
+ case SBOX0:
+ snb_sbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_0);
+ break;
+ case SBOX1:
+ snb_sbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_1);
+ break;
+
+ case SBOX0FIX:
+ case SBOX1FIX:
+ break;
- default:
- /* should never be reached */
- break;
+ case BBOX0:
+ snb_bbox_setup(cpu_id, index, event);
+ break;
+
+ case WBOX:
+ snb_wbox_setup(cpu_id, index, event);
+ break;
+
+ case RBOX0:
+ case RBOX1:
+ snb_rbox_setup(cpu_id, index, event);
+ break;
+
+ case PBOX:
+ snb_pbox_setup(cpu_id, index, event);
+ break;
+
+
+ default:
+ break;
+ }
}
- if (fixed_flags != orig_fixed_flags)
+
+ if (fixed_flags > 0x0)
{
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
}
+ return 0;
}
-void perfmon_startCountersThread_sandybridge(int thread_id)
+
+// Macros for MSR HPM counters
+// UNFREEZE(_AND_RESET_CTR) uses the central box registers to unfreeze and reset the counter registers
+#define SNB_UNFREEZE_BOX(id) \
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) { \
+ VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, LLU_CAST 0x0ULL, UNFREEZE_BOX_##id) \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, 0x0ULL)); \
+ }
+
+#define SNB_UNFREEZE_AND_RESET_CTR_BOX(id) \
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) { \
+ VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, LLU_CAST 0x2ULL, UNFREEZE_BOX_##id) \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, 0x2ULL)); \
+ }
+
+// ENABLE(_AND_RESET_CTR) uses the control registers to enable (bit 22) and reset the counter registers (bit 17)
+#define SNB_ENABLE_BOX(id, reg) \
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) { \
+ uint64_t tmp = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &tmp)); \
+ tmp |= (1ULL<<22); \
+ VERBOSEPRINTREG(cpu_id, reg, LLU_CAST tmp, ENABLE_BOX_##id) \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, tmp)); \
+ }
+
+#define SNB_ENABLE_AND_RESET_CTR_BOX(id) \
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) { \
+ uint64_t tmp = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[id].ctrlRegister, &tmp)); \
+ tmp |= (1ULL<<22)|(1ULL<<17); \
+ VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, LLU_CAST tmp, ENABLE_AND_RESET_CTR_BOX_##id) \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, tmp)); \
+ }
+
+// UNFREEZE(_AND_RESET_CTR)_PCI is similar to MSR UNFREEZE but for PCI devices
+#define SNB_UNFREEZE_PCI_BOX(id) \
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id))) \
+ && (HPMcheck(box_map[id].device, cpu_id))) \
+ { \
+ VERBOSEPRINTPCIREG(cpu_id, box_map[id].device, box_map[id].ctrlRegister, LLU_CAST 0x0ULL, UNFREEZE_PCI_BOX_##id) \
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x0ULL)); \
+ }
+#define SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(id) \
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id))) \
+ && (HPMcheck(box_map[id].device, cpu_id))) \
+ { \
+ VERBOSEPRINTPCIREG(cpu_id, box_map[id].device, box_map[id].ctrlRegister, LLU_CAST 0x2ULL, UNFREEZE_AND_RESET_CTR_PCI_BOX_##id) \
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x2ULL)); \
+ }
+
+// UNFREEZE(_AND_RESET_CTR)_MBOXFIX behaves like ENABLE for the PCI fixed counters but uses bit 19 for the reset
+#define SNB_UNFREEZE_AND_RESET_CTR_MBOXFIX(number) \
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX##number##FIX))) && \
+ (HPMcheck(PCI_IMC_DEVICE_0_CH_##number, cpu_id))) \
+ { \
+ VERBOSEPRINTPCIREG(cpu_id, PCI_IMC_DEVICE_0_CH_##number, \
+ PCI_UNC_MC_PMON_FIXED_CTL, LLU_CAST (1ULL<<22)|(1ULL<<19), UNFREEZE_AND_RESET_CTR_MBOX##number##FIX) \
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, PCI_IMC_DEVICE_0_CH_##number, PCI_UNC_MC_PMON_FIXED_CTL, (1ULL<<22)|(1ULL<<19))); \
+ }
+#define SNB_UNFREEZE_MBOXFIX(number) \
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX##number##FIX))) && \
+ (HPMcheck(PCI_IMC_DEVICE_0_CH_##number, cpu_id))) \
+ { \
+ VERBOSEPRINTPCIREG(cpu_id, PCI_IMC_DEVICE_0_CH_##number, \
+ PCI_UNC_MC_PMON_FIXED_CTL, LLU_CAST (1ULL<<22), UNFREEZE_MBOXFIX##number) \
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, PCI_IMC_DEVICE_0_CH_##number, PCI_UNC_MC_PMON_FIXED_CTL, (1ULL<<22))); \
+ }
+
+int perfmon_startCountersThread_sandybridge(int thread_id, PerfmonEventSet* eventSet)
{
int haveLock = 0;
+ uint64_t tmp = 0x0ULL;
uint64_t flags = 0x0ULL;
- uint32_t uflags = 0x10000UL; /* Clear freeze bit */
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-
- for ( int i=0; i<perfmon_numCountersSandybridge; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- switch (sandybridge_counter_map[i].type)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ tmp = 0x0ULL;
+ eventSet->events[i].threadCounter[thread_id].startData = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t reg = counter_map[index].configRegister;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ uint64_t counter2 = counter_map[index].counterRegister2;
+ PciDeviceIndex dev = counter_map[index].device;
+ eventSet->events[i].threadCounter[thread_id].startData = 0;
+ eventSet->events[i].threadCounter[thread_id].counterData = 0;
+ switch (type)
{
case PMC:
- msr_write(cpu_id, sandybridge_counter_map[i].counterRegister, 0x0ULL);
- flags |= (1<<(i-OFFSET_PMC)); /* enable counter */
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+ flags |= (1<<(index-cpuid_info.perf_num_fixed_ctr)); /* enable counter */
break;
case FIXED:
- msr_write(cpu_id, sandybridge_counter_map[i].counterRegister, 0x0ULL);
- flags |= (1ULL<<(i+32)); /* enable fixed counter */
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+ flags |= (1ULL<<(index+32)); /* enable fixed counter */
break;
case POWER:
if(haveLock)
{
- perfmon_threadData[thread_id].counters[i].counterData =
- power_read(cpu_id, sandybridge_counter_map[i].counterRegister);
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&tmp));
+ eventSet->events[i].threadCounter[thread_id].startData = tmp;
}
-
break;
case MBOX0:
- if(haveLock)
+ case MBOX1:
+ case MBOX2:
+ case MBOX3:
+ if (haveLock && HPMcheck(dev, cpu_id))
{
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_BOX_CTL, uflags);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter2, 0x0ULL));
}
break;
- case MBOX1:
- if(haveLock)
+ case MBOX0FIX:
+ case MBOX1FIX:
+ case MBOX2FIX:
+ case MBOX3FIX:
+ /*if (haveLock && HPMcheck(dev, cpu_id))
{
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_BOX_CTL, uflags);
- }
+ tmp = 0x0ULL;
+ CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &tmp));
+ eventSet->events[i].threadCounter[thread_id].startData = tmp;
+ }*/
break;
- case MBOX2:
- if(haveLock)
+
+ case SBOX0:
+ case SBOX1:
+ case SBOX0FIX:
+ case SBOX1FIX:
+ case CBOX0:
+ case CBOX1:
+ case CBOX2:
+ case CBOX3:
+ case CBOX4:
+ case CBOX5:
+ case CBOX6:
+ case CBOX7:
+ if ((haveLock) && (cpuid_info.model == SANDYBRIDGE))
{
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_BOX_CTL, uflags);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
}
break;
- case MBOX3:
- if(haveLock)
+ case UBOX:
+ //SNB_ENABLE_AND_RESET_CTR_BOX(UBOX);
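+ /* Enable (bit 22) and reset the counters (bit 17) directly via the UBOX control register */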
+ if (haveLock)
{
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_BOX_CTL, uflags);
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &tmp));
+ tmp |= (1ULL<<22)|(1ULL<<17);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, tmp));
}
break;
+ case UBOXFIX:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+ SNB_ENABLE_BOX(UBOXFIX, reg);
+ break;
- case MBOXFIX:
- if(haveLock)
+ case BBOX0:
+ if (haveLock && HPMcheck(dev, cpu_id))
{
- pci_write(cpu_id, counter_map[i].device, PCI_UNC_MC_PMON_FIXED_CTL, 0x48000UL);
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+ CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter2, 0x0ULL));
}
break;
- case SBOX0:
- if(haveLock)
+ case WBOX:
+ if (haveLock)
{
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0, PCI_UNC_QPI_PMON_BOX_CTL, uflags);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PCU_PMON_BOX_FILTER, 0x0U));
}
break;
-
- case SBOX1:
+ case WBOX0FIX:
+ case WBOX1FIX:
if(haveLock)
{
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1, PCI_UNC_QPI_PMON_BOX_CTL, uflags);
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &tmp));
+ eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[WBOX0FIX].regWidth);
}
break;
-
default:
- /* should never be reached */
break;
}
+ eventSet->events[i].threadCounter[thread_id].counterData = eventSet->events[i].threadCounter[thread_id].startData;
}
}
- if (perfmon_verbose)
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
{
- printf("perfmon_start_counters: Write Register 0x%X , \
- Flags: 0x%llX \n",MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
- printf("perfmon_start_counters: Write Register 0x%X , \
- Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_OR_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
}
-
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
- msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+ if (cpuid_info.model == SANDYBRIDGE_EP)
+ {
+ SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX0);
+ SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX1);
+ SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX2);
+ SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX3);
+ SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX4);
+ SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX5);
+ SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX6);
+ SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX7);
+ SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(SBOX0);
+ SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(SBOX1);
+ SNB_UNFREEZE_PCI_BOX(MBOX0);
+ SNB_UNFREEZE_PCI_BOX(MBOX1);
+ SNB_UNFREEZE_PCI_BOX(MBOX2);
+ SNB_UNFREEZE_PCI_BOX(MBOX3);
+ SNB_UNFREEZE_AND_RESET_CTR_MBOXFIX(0);
+ SNB_UNFREEZE_AND_RESET_CTR_MBOXFIX(1);
+ SNB_UNFREEZE_AND_RESET_CTR_MBOXFIX(2);
+ SNB_UNFREEZE_AND_RESET_CTR_MBOXFIX(3);
+ SNB_UNFREEZE_PCI_BOX(BBOX0);
+ SNB_UNFREEZE_AND_RESET_CTR_BOX(WBOX);
+ SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(RBOX0);
+ SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(RBOX1);
+ SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(PBOX);
+ }
+ else
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<29), UNFREEZE_UNCORE)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_U_PMON_GLOBAL_CTL, (1ULL<<29)));
+ }
+ return 0;
}
-void perfmon_stopCountersThread_sandybridge(int thread_id)
+// Read MSR counter register
+#define SNB_READ_BOX(id, reg1) \
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) \
+ { \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg1, &counter_result)); \
+ VERBOSEPRINTREG(cpu_id, reg1, LLU_CAST counter_result, READ_BOX_##id) \
+ }
+
+// Read PCI counter registers and combine them to a single value
+#define SNB_READ_PCI_BOX(id, dev, reg1, reg2) \
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id))) && HPMcheck(dev, cpu_id)) \
+ { \
+ uint64_t tmp = 0x0ULL; \
+ CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, reg1, &tmp)); \
+ counter_result = (tmp<<32); \
+ CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, reg2, &tmp)); \
+ counter_result += tmp; \
+ VERBOSEPRINTPCIREG(cpu_id, dev, reg1, LLU_CAST counter_result, READ_PCI_BOX_##id) \
+ }
+
+// Check the counter result for overflows. We do not handle overflows directly; that is done in the getResults function in perfmon.c
+// The Sandy Bridge uncore counters have no bits indicating that an overflow occurred, therefore we use this simple check
+#define SNB_CHECK_OVERFLOW \
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+ { \
+ eventSet->events[i].threadCounter[thread_id].overflows++; \
+ }
+
+
+int perfmon_stopCountersThread_sandybridge(int thread_id, PerfmonEventSet* eventSet)
{
- uint64_t flags;
- uint32_t uflags = 0x10100UL; /* Set freeze bit */
uint64_t counter_result = 0x0ULL;
int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+ if (cpuid_info.model == SANDYBRIDGE_EP)
+ {
+ SNB_FREEZE_BOX(CBOX0);
+ SNB_FREEZE_BOX(CBOX1);
+ SNB_FREEZE_BOX(CBOX2);
+ SNB_FREEZE_BOX(CBOX3);
+ SNB_FREEZE_BOX(CBOX4);
+ SNB_FREEZE_BOX(CBOX5);
+ SNB_FREEZE_BOX(CBOX6);
+ SNB_FREEZE_BOX(CBOX7);
+
+ SNB_FREEZE_PCI_BOX(MBOX0);
+ SNB_FREEZE_PCI_BOX(MBOX1);
+ SNB_FREEZE_PCI_BOX(MBOX2);
+ SNB_FREEZE_PCI_BOX(MBOX3);
+
+ SNB_FREEZE_AND_RESET_CTL_PCI_BOX(SBOX0);
+ SNB_FREEZE_AND_RESET_CTL_PCI_BOX(SBOX1);
+
+ SNB_FREEZE_AND_RESET_CTL_PCI_BOX(RBOX0);
+ SNB_FREEZE_AND_RESET_CTL_PCI_BOX(RBOX1);
+
+ SNB_FREEZE_AND_RESET_CTL_PCI_BOX(PBOX);
+
+ SNB_FREEZE_PCI_BOX(BBOX0);
+ SNB_FREEZE_AND_RESET_CTL_BOX(WBOX);
+ }
+ else
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, LLU_CAST (1ULL<<31), FREEZE_UNCORE)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PERF_GLOBAL_CTRL, (1ULL<<31)));
+ }
- for ( int i=0; i < perfmon_numCountersSandybridge; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- switch (sandybridge_counter_map[i].type)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t reg = counter_map[index].configRegister;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ uint64_t counter2 = counter_map[index].counterRegister2;
+ switch (type)
{
case PMC:
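+ /* If the counter value decreased, an overflow occurred: check MSR_PERF_GLOBAL_STATUS and clear the bit via MSR_PERF_GLOBAL_OVF_CTRL */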
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ uint64_t ovf_values = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+ if (ovf_values & (1ULL<<(index - cpuid_info.perf_num_fixed_ctr)))
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL,
+ (1ULL<<(index - cpuid_info.perf_num_fixed_ctr))));
+ }
+ }
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, STOP_PMC);
+ break;
case FIXED:
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, sandybridge_counter_map[i].counterRegister);
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ uint64_t ovf_values = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+ if (ovf_values & (1ULL<<(index+32)))
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(index+32))));
+ }
+ }
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, STOP_FIXED);
break;
case POWER:
- if(haveLock)
+ if (haveLock)
{
- perfmon_threadData[thread_id].counters[i].counterData =
- power_info.energyUnit *
- ( power_read(cpu_id, sandybridge_counter_map[i].counterRegister) -
- perfmon_threadData[thread_id].counters[i].counterData);
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, STOP_POWER);
+ SNB_CHECK_OVERFLOW;
}
break;
case THERMAL:
- perfmon_threadData[thread_id].counters[i].counterData =
- thermal_read(cpu_id);
+ CHECK_MSR_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
break;
case MBOX0:
- if(haveLock)
- {
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_BOX_CTL, uflags);
- counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
- sandybridge_counter_map[i].counterRegister);
-
- counter_result = (counter_result<<32) +
- pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
- sandybridge_counter_map[i].counterRegister2);
-
- perfmon_threadData[thread_id].counters[i].counterData = counter_result;
- }
+ SNB_READ_PCI_BOX(MBOX0, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
break;
case MBOX1:
- if(haveLock)
- {
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_1, PCI_UNC_MC_PMON_BOX_CTL, uflags);
- counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
- sandybridge_counter_map[i].counterRegister);
-
- counter_result = (counter_result<<32) +
- pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
- sandybridge_counter_map[i].counterRegister2);
-
- perfmon_threadData[thread_id].counters[i].counterData = counter_result;
- }
+ SNB_READ_PCI_BOX(MBOX1, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
break;
case MBOX2:
- if(haveLock)
- {
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_2, PCI_UNC_MC_PMON_BOX_CTL, uflags);
-
- counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
- sandybridge_counter_map[i].counterRegister);
-
- counter_result = (counter_result<<32) +
- pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
- sandybridge_counter_map[i].counterRegister2);
-
- perfmon_threadData[thread_id].counters[i].counterData = counter_result;
- }
+ SNB_READ_PCI_BOX(MBOX2, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
break;
case MBOX3:
- if(haveLock)
- {
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_3, PCI_UNC_MC_PMON_BOX_CTL, uflags);
+ SNB_READ_PCI_BOX(MBOX3, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
- counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
- sandybridge_counter_map[i].counterRegister);
+ case MBOX0FIX:
+ SNB_READ_PCI_BOX(MBOX0FIX, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case MBOX1FIX:
+ SNB_READ_PCI_BOX(MBOX1FIX, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case MBOX2FIX:
+ SNB_READ_PCI_BOX(MBOX2FIX, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case MBOX3FIX:
+ SNB_READ_PCI_BOX(MBOX3FIX, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
- counter_result = (counter_result<<32) +
- pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
- sandybridge_counter_map[i].counterRegister2);
+ case SBOX0:
+ SNB_READ_PCI_BOX(SBOX0, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
- perfmon_threadData[thread_id].counters[i].counterData = counter_result;
- }
+ case SBOX1:
+ SNB_READ_PCI_BOX(SBOX1, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
break;
- case MBOXFIX:
- if(haveLock)
+ case SBOX0FIX:
+ case SBOX1FIX:
+ if (haveLock && HPMcheck(dev, cpu_id))
{
- pci_write(cpu_id, PCI_IMC_DEVICE_CH_0, PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-
- counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
- sandybridge_counter_map[i].counterRegister);
-
- counter_result = (counter_result<<32) +
- pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
- sandybridge_counter_map[i].counterRegister2);
-
- perfmon_threadData[thread_id].counters[i].counterData = counter_result;
+ HPMread(cpu_id, dev, counter1, &counter_result);
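+ /* SBOXFIX holds the QPI rate status: for event 0x00 the 3-bit field at offset 0 is mapped to a fixed rate value, for event 0x01 the single bit at offset 4 is extracted */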
+ if (eventSet->events[i].event.eventId == 0x00)
+ {
+ switch(extractBitField(counter_result, 3, 0))
+ {
+ case 0x2:
+ counter_result = 5.6E9;
+ break;
+ case 0x3:
+ counter_result = 6.4E9;
+ break;
+ case 0x4:
+ counter_result = 7.2E9;
+ break;
+ case 0x5:
+ counter_result = 8.0E9;
+ break;
+ case 0x6:
+ counter_result = 8.8E9;
+ break;
+ case 0x7:
+ counter_result = 9.6E9;
+ break;
+ default:
+ counter_result = 0;
+ break;
+ }
+ }
+ else if (eventSet->events[i].event.eventId == 0x01)
+ {
+ counter_result = extractBitField(counter_result, 1, 4);
+ }
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, STOP_SBOXFIX);
}
break;
- case SBOX0:
- if(haveLock)
- {
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0, PCI_UNC_QPI_PMON_BOX_CTL, uflags);
-
- counter_result = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_0,
- sandybridge_counter_map[i].counterRegister);
-
- counter_result = (counter_result<<32) +
- pci_read(cpu_id, PCI_QPI_DEVICE_PORT_0,
- sandybridge_counter_map[i].counterRegister2);
+ case CBOX0:
+ SNB_READ_BOX(CBOX0, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case CBOX1:
+ SNB_READ_BOX(CBOX1, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case CBOX2:
+ SNB_READ_BOX(CBOX2, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case CBOX3:
+ SNB_READ_BOX(CBOX3, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case CBOX4:
+ SNB_READ_BOX(CBOX4, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case CBOX5:
+ SNB_READ_BOX(CBOX5, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case CBOX6:
+ SNB_READ_BOX(CBOX6, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case CBOX7:
+ SNB_READ_BOX(CBOX7, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
- perfmon_threadData[thread_id].counters[i].counterData = counter_result;
- }
+ case UBOX:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, STOP_UBOX);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case UBOXFIX:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, STOP_UBOXFIX);
+ SNB_CHECK_OVERFLOW;
break;
- case SBOX1:
- if(haveLock)
- {
- pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1, PCI_UNC_QPI_PMON_BOX_CTL, uflags);
- counter_result = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_1,
- sandybridge_counter_map[i].counterRegister);
+ case BBOX0:
+ SNB_READ_PCI_BOX(BBOX0, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
- counter_result = (counter_result<<32) +
- pci_read(cpu_id, PCI_QPI_DEVICE_PORT_1,
- sandybridge_counter_map[i].counterRegister2);
+ case WBOX:
+ SNB_READ_BOX(WBOX, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case WBOX0FIX:
+ SNB_READ_BOX(WBOX0FIX, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case WBOX1FIX:
+ SNB_READ_BOX(WBOX1FIX, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
- perfmon_threadData[thread_id].counters[i].counterData = counter_result;
- }
+ case RBOX0:
+ SNB_READ_PCI_BOX(RBOX0, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case RBOX1:
+ SNB_READ_PCI_BOX(RBOX1, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
break;
+ case PBOX:
+ SNB_READ_PCI_BOX(PBOX, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
default:
- /* should never be reached */
break;
}
+ eventSet->events[i].threadCounter[thread_id].counterData =
+ field64(counter_result, 0, box_map[type].regWidth);
}
}
- flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
- // printf ("Status: 0x%llX \n", LLU_CAST flags);
- if ( (flags & 0x3) || (flags & (0x3ULL<<32)) )
- {
- printf ("Overflow occured \n");
- }
+ return 0;
}
-void perfmon_readCountersThread_sandybridge(int thread_id)
+int perfmon_readCountersThread_sandybridge(int thread_id, PerfmonEventSet* eventSet)
{
uint64_t counter_result = 0x0ULL;
int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ uint64_t pmc_flags = 0x0ULL;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
- for ( int i=0; i<perfmon_numCountersSandybridge; i++ )
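+ /* Save the current core-local enable mask and freeze the PMC/FIXED counters while reading; the mask is restored at the end of the function */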
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &pmc_flags));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+ if (cpuid_info.model == SANDYBRIDGE_EP)
+ {
+ SNB_FREEZE_BOX(CBOX0);
+ SNB_FREEZE_BOX(CBOX1);
+ SNB_FREEZE_BOX(CBOX2);
+ SNB_FREEZE_BOX(CBOX3);
+ SNB_FREEZE_BOX(CBOX4);
+ SNB_FREEZE_BOX(CBOX5);
+ SNB_FREEZE_BOX(CBOX6);
+ SNB_FREEZE_BOX(CBOX7);
+
+ SNB_FREEZE_PCI_BOX(MBOX0);
+ SNB_FREEZE_PCI_BOX(MBOX1);
+ SNB_FREEZE_PCI_BOX(MBOX2);
+ SNB_FREEZE_PCI_BOX(MBOX3);
+
+ SNB_FREEZE_MBOXFIX(0);
+ SNB_FREEZE_MBOXFIX(1);
+ SNB_FREEZE_MBOXFIX(2);
+ SNB_FREEZE_MBOXFIX(3);
+
+ SNB_FREEZE_PCI_BOX(SBOX0);
+ SNB_FREEZE_PCI_BOX(SBOX1);
+
+ SNB_FREEZE_PCI_BOX(RBOX0);
+ SNB_FREEZE_PCI_BOX(RBOX1);
+
+ SNB_FREEZE_PCI_BOX(PBOX);
+
+ SNB_FREEZE_PCI_BOX(BBOX0);
+ SNB_FREEZE_BOX(WBOX);
+ }
+ else
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, LLU_CAST (1ULL<<31), FREEZE_UNCORE)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PERF_GLOBAL_CTRL, (1ULL<<31)));
+ }
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- if ((sandybridge_counter_map[i].type == PMC) ||
- (sandybridge_counter_map[i].type == FIXED))
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, sandybridge_counter_map[i].counterRegister);
+ continue;
}
- else
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t reg = counter_map[index].configRegister;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ uint64_t counter2 = counter_map[index].counterRegister2;
+ switch (type)
{
- if(haveLock)
- {
- switch (sandybridge_counter_map[i].type)
+ case PMC:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_PMC);
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
{
- case POWER:
- perfmon_threadData[thread_id].counters[i].counterData =
- power_info.energyUnit *
- power_read(cpu_id, sandybridge_counter_map[i].counterRegister);
- break;
+ uint64_t ovf_values = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+ if (ovf_values & (1ULL<<(index - cpuid_info.perf_num_fixed_ctr)))
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL,
+ (1ULL<<(index - cpuid_info.perf_num_fixed_ctr))));
+ }
+ }
+ break;
- case MBOX0:
- counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
- sandybridge_counter_map[i].counterRegister);
+ case FIXED:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_FIXED);
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ uint64_t ovf_values = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+ if (ovf_values & (1ULL<<(index+32)))
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(index+32))));
+ }
+ }
+ break;
- counter_result = (counter_result<<32) +
- pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
- sandybridge_counter_map[i].counterRegister2);
+ case THERMAL:
+ CHECK_MSR_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
+ break;
+
+ case POWER:
+ if (haveLock)
+ {
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_POWER);
+ SNB_CHECK_OVERFLOW;
+ }
+ break;
- perfmon_threadData[thread_id].counters[i].counterData = counter_result;
- break;
+ case MBOX0:
+ SNB_READ_PCI_BOX(MBOX0, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
- case MBOX1:
- counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
- sandybridge_counter_map[i].counterRegister);
+ case MBOX1:
+ SNB_READ_PCI_BOX(MBOX1, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
- counter_result = (counter_result<<32) +
- pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
- sandybridge_counter_map[i].counterRegister2);
+ case MBOX2:
+ SNB_READ_PCI_BOX(MBOX2, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
- perfmon_threadData[thread_id].counters[i].counterData = counter_result;
- break;
+ case MBOX3:
+ SNB_READ_PCI_BOX(MBOX3, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
- case MBOX2:
- counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
- sandybridge_counter_map[i].counterRegister);
+ case MBOX0FIX:
+ SNB_READ_PCI_BOX(MBOX0FIX, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case MBOX1FIX:
+ SNB_READ_PCI_BOX(MBOX1FIX, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case MBOX2FIX:
+ SNB_READ_PCI_BOX(MBOX2FIX, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case MBOX3FIX:
+ SNB_READ_PCI_BOX(MBOX3FIX, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
- counter_result = (counter_result<<32) +
- pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
- sandybridge_counter_map[i].counterRegister2);
+ case UBOX:
+ case UBOXFIX:
+ if (haveLock)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_UBOX);
+ SNB_CHECK_OVERFLOW;
+ }
+ break;
- perfmon_threadData[thread_id].counters[i].counterData = counter_result;
- break;
+ case CBOX0:
+ SNB_READ_BOX(CBOX0, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case CBOX1:
+ SNB_READ_BOX(CBOX1, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case CBOX2:
+ SNB_READ_BOX(CBOX2, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case CBOX3:
+ SNB_READ_BOX(CBOX3, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case CBOX4:
+ SNB_READ_BOX(CBOX4, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case CBOX5:
+ SNB_READ_BOX(CBOX5, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case CBOX6:
+ SNB_READ_BOX(CBOX6, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case CBOX7:
+ SNB_READ_BOX(CBOX7, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
- case MBOX3:
- counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
- sandybridge_counter_map[i].counterRegister);
+ case BBOX0:
+ SNB_READ_PCI_BOX(BBOX0, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
- counter_result = (counter_result<<32) +
- pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
- sandybridge_counter_map[i].counterRegister2);
+ case SBOX0:
+ SNB_READ_PCI_BOX(SBOX0, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
- perfmon_threadData[thread_id].counters[i].counterData = counter_result;
- break;
+ case SBOX1:
+ SNB_READ_PCI_BOX(SBOX1, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
- default:
- /* should never be reached */
- break;
+ case SBOX0FIX:
+ case SBOX1FIX:
+
+ HPMread(cpu_id, dev, counter1, &counter_result);
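+ /* Decode the QPI rate status in the same way as in the stop function */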
+ if (eventSet->events[i].event.eventId == 0x00)
+ {
+ switch(extractBitField(counter_result, 3, 0))
+ {
+ case 0x2:
+ counter_result = 5.6E9;
+ break;
+ case 0x3:
+ counter_result = 6.4E9;
+ break;
+ case 0x4:
+ counter_result = 7.2E9;
+ break;
+ case 0x5:
+ counter_result = 8.0E9;
+ break;
+ case 0x6:
+ counter_result = 8.8E9;
+ break;
+ case 0x7:
+ counter_result = 9.6E9;
+ break;
+ default:
+ counter_result = 0;
+ break;
+ }
}
- }
+ else if (eventSet->events[i].event.eventId == 0x01)
+ {
+ counter_result = extractBitField(counter_result, 1, 4);
+ }
+ eventSet->events[i].threadCounter[thread_id].startData = 0x0ULL;
+ VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_SBOXFIX);
+ break;
+
+ case WBOX:
+ SNB_READ_BOX(WBOX, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case WBOX0FIX:
+ SNB_READ_BOX(WBOX0FIX, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case WBOX1FIX:
+ SNB_READ_BOX(WBOX1FIX, counter1);
+ SNB_CHECK_OVERFLOW;
+ break;
+
+ case RBOX0:
+ SNB_READ_PCI_BOX(RBOX0, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
+ case RBOX1:
+ SNB_READ_PCI_BOX(RBOX1, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
+
+ case PBOX:
+ SNB_READ_PCI_BOX(PBOX, dev, counter1, counter2);
+ SNB_CHECK_OVERFLOW;
+ break;
+
+ default:
+ break;
}
+ eventSet->events[i].threadCounter[thread_id].counterData =
+ field64(counter_result, 0, box_map[type].regWidth);
}
}
+ if (cpuid_info.model == SANDYBRIDGE_EP)
+ {
+ SNB_UNFREEZE_BOX(CBOX0);
+ SNB_UNFREEZE_BOX(CBOX1);
+ SNB_UNFREEZE_BOX(CBOX2);
+ SNB_UNFREEZE_BOX(CBOX3);
+ SNB_UNFREEZE_BOX(CBOX4);
+ SNB_UNFREEZE_BOX(CBOX5);
+ SNB_UNFREEZE_BOX(CBOX6);
+ SNB_UNFREEZE_BOX(CBOX7);
+
+ SNB_UNFREEZE_PCI_BOX(MBOX0);
+ SNB_UNFREEZE_PCI_BOX(MBOX1);
+ SNB_UNFREEZE_PCI_BOX(MBOX2);
+ SNB_UNFREEZE_PCI_BOX(MBOX3);
+
+ SNB_UNFREEZE_MBOXFIX(0);
+ SNB_UNFREEZE_MBOXFIX(1);
+ SNB_UNFREEZE_MBOXFIX(2);
+ SNB_UNFREEZE_MBOXFIX(3);
+
+ SNB_UNFREEZE_PCI_BOX(SBOX0);
+ SNB_UNFREEZE_PCI_BOX(SBOX1);
+
+ SNB_UNFREEZE_PCI_BOX(RBOX0);
+ SNB_UNFREEZE_PCI_BOX(RBOX1);
+
+ SNB_UNFREEZE_PCI_BOX(PBOX);
+
+ SNB_UNFREEZE_PCI_BOX(BBOX0);
+ SNB_UNFREEZE_BOX(WBOX);
+ }
+ else
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, LLU_CAST (1ULL<<29), UNFREEZE_UNCORE)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PERF_GLOBAL_CTRL, (1ULL<<29)));
+ }
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, pmc_flags));
+ }
+
+ return 0;
}
+int perfmon_finalizeCountersThread_sandybridge(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ int haveTileLock = 0;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62);
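+ /* Clear the two topmost status bits in addition to the per-counter overflow bits collected in the loop below */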
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+ if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+ {
+ haveTileLock = 1;
+ }
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t reg = counter_map[index].configRegister;
+ switch(type)
+ {
+ case PMC:
+ ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+ if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+ }
+ else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+ }
+ break;
+ case FIXED:
+ ovf_values_core |= (1ULL<<(index+32));
+ break;
+ default:
+ break;
+ }
+ if ((reg) &&
+ (((type == PMC)||(type == FIXED)) || ((type >= UNCORE) && (haveLock) && (HPMcheck(dev, cpu_id)))))
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+ }
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
+ }
+ }
+
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+ return 0;
+}
diff --git a/src/includes/perfmon_sandybridgeEP_counters.h b/src/includes/perfmon_sandybridgeEP_counters.h
new file mode 100644
index 0000000..befef53
--- /dev/null
+++ b/src/includes/perfmon_sandybridgeEP_counters.h
@@ -0,0 +1,214 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: perfmon_sandybridgeEP_counters.h
+ *
+ * Description: Counter header file of perfmon module for Intel Sandy Bridge EP.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_SANDYBRIDGEEP 97
+#define NUM_COUNTERS_UNCORE_SANDYBRIDGEEP 53
+#define NUM_COUNTERS_CORE_SANDYBRIDGEEP 8
+
+#define SNBEP_VALID_OPTIONS_FIXED EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK
+#define SNBEP_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK| \
+ EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define SNBEP_VALID_OPTIONS_CBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_TID_MASK| \
+ EVENT_OPTION_INVERT_MASK|EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_NID_MASK|EVENT_OPTION_STATE_MASK
+#define SNBEP_VALID_OPTIONS_WBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK| \
+ EVENT_OPTION_OCCUPANCY_MASK|EVENT_OPTION_OCCUPANCY_EDGE_MASK| \
+ EVENT_OPTION_OCCUPANCY_INVERT_MASK|EVENT_OPTION_MATCH0_MASK
+#define SNBEP_VALID_OPTIONS_UBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
+#define SNBEP_VALID_OPTIONS_BBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK| \
+ EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_MATCH0_MASK
+#define SNBEP_VALID_OPTIONS_MBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
+#define SNBEP_VALID_OPTIONS_SBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK| \
+ EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK|EVENT_OPTION_MASK0_MASK| \
+ EVENT_OPTION_MASK1_MASK
+#define SNBEP_VALID_OPTIONS_RBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
+#define SNBEP_VALID_OPTIONS_PBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
+
+static RegisterMap sandybridgeEP_counter_map[NUM_COUNTERS_SANDYBRIDGEEP] = {
+ /* Fixed Counters: instructions retired, cycles unhalted core */
+ {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, SNBEP_VALID_OPTIONS_FIXED},
+ {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, SNBEP_VALID_OPTIONS_FIXED},
+ {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, SNBEP_VALID_OPTIONS_FIXED},
+ /* PMC Counters: 4 48bit wide */
+ {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, SNBEP_VALID_OPTIONS_PMC},
+ {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, SNBEP_VALID_OPTIONS_PMC},
+ {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, SNBEP_VALID_OPTIONS_PMC},
+ {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, SNBEP_VALID_OPTIONS_PMC},
+ /* Temperature Sensor*/
+ {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ /* RAPL counters */
+ {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ /* CBOX counters */
+ {"CBOX0C0", PMC12, CBOX0, MSR_UNC_C0_PMON_CTL0, MSR_UNC_C0_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX0C1", PMC13, CBOX0, MSR_UNC_C0_PMON_CTL1, MSR_UNC_C0_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX0C2", PMC14, CBOX0, MSR_UNC_C0_PMON_CTL2, MSR_UNC_C0_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX0C3", PMC15, CBOX0, MSR_UNC_C0_PMON_CTL3, MSR_UNC_C0_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX1C0", PMC16, CBOX1, MSR_UNC_C1_PMON_CTL0, MSR_UNC_C1_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX1C1", PMC17, CBOX1, MSR_UNC_C1_PMON_CTL1, MSR_UNC_C1_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX1C2", PMC18, CBOX1, MSR_UNC_C1_PMON_CTL2, MSR_UNC_C1_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX1C3", PMC19, CBOX1, MSR_UNC_C1_PMON_CTL3, MSR_UNC_C1_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX2C0", PMC20, CBOX2, MSR_UNC_C2_PMON_CTL0, MSR_UNC_C2_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX2C1", PMC21, CBOX2, MSR_UNC_C2_PMON_CTL1, MSR_UNC_C2_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX2C2", PMC22, CBOX2, MSR_UNC_C2_PMON_CTL2, MSR_UNC_C2_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX2C3", PMC23, CBOX2, MSR_UNC_C2_PMON_CTL3, MSR_UNC_C2_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX3C0", PMC24, CBOX3, MSR_UNC_C3_PMON_CTL0, MSR_UNC_C3_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX3C1", PMC25, CBOX3, MSR_UNC_C3_PMON_CTL1, MSR_UNC_C3_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX3C2", PMC26, CBOX3, MSR_UNC_C3_PMON_CTL2, MSR_UNC_C3_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX3C3", PMC27, CBOX3, MSR_UNC_C3_PMON_CTL3, MSR_UNC_C3_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX4C0", PMC28, CBOX4, MSR_UNC_C4_PMON_CTL0, MSR_UNC_C4_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX4C1", PMC29, CBOX4, MSR_UNC_C4_PMON_CTL1, MSR_UNC_C4_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX4C2", PMC30, CBOX4, MSR_UNC_C4_PMON_CTL2, MSR_UNC_C4_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX4C3", PMC31, CBOX4, MSR_UNC_C4_PMON_CTL3, MSR_UNC_C4_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX5C0", PMC32, CBOX5, MSR_UNC_C5_PMON_CTL0, MSR_UNC_C5_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX5C1", PMC33, CBOX5, MSR_UNC_C5_PMON_CTL1, MSR_UNC_C5_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX5C2", PMC34, CBOX5, MSR_UNC_C5_PMON_CTL2, MSR_UNC_C5_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX5C3", PMC35, CBOX5, MSR_UNC_C5_PMON_CTL3, MSR_UNC_C5_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX6C0", PMC36, CBOX6, MSR_UNC_C6_PMON_CTL0, MSR_UNC_C6_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX6C1", PMC37, CBOX6, MSR_UNC_C6_PMON_CTL1, MSR_UNC_C6_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX6C2", PMC38, CBOX6, MSR_UNC_C6_PMON_CTL2, MSR_UNC_C6_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX6C3", PMC39, CBOX6, MSR_UNC_C6_PMON_CTL3, MSR_UNC_C6_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX7C0", PMC40, CBOX7, MSR_UNC_C7_PMON_CTL0, MSR_UNC_C7_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX7C1", PMC41, CBOX7, MSR_UNC_C7_PMON_CTL1, MSR_UNC_C7_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX7C2", PMC42, CBOX7, MSR_UNC_C7_PMON_CTL2, MSR_UNC_C7_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ {"CBOX7C3", PMC43, CBOX7, MSR_UNC_C7_PMON_CTL3, MSR_UNC_C7_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+ /* UBOX counters */
+ {"UBOX0", PMC44, UBOX, MSR_UNC_U_PMON_CTL0, MSR_UNC_U_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_UBOX},
+ {"UBOX1", PMC45, UBOX, MSR_UNC_U_PMON_CTL1, MSR_UNC_U_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_UBOX},
+ {"UBOXFIX", PMC46, UBOXFIX, MSR_UNC_U_UCLK_FIXED_CTL, MSR_UNC_U_UCLK_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"WBOX0",PMC47, WBOX, MSR_UNC_PCU_PMON_CTL0, MSR_UNC_PCU_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_WBOX},
+ {"WBOX1",PMC48, WBOX, MSR_UNC_PCU_PMON_CTL1, MSR_UNC_PCU_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_WBOX},
+ {"WBOX2",PMC49, WBOX, MSR_UNC_PCU_PMON_CTL2, MSR_UNC_PCU_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_WBOX},
+ {"WBOX3",PMC50, WBOX, MSR_UNC_PCU_PMON_CTL3, MSR_UNC_PCU_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_WBOX},
+ {"WBOXFIX0", PMC51, WBOX0FIX, 0, MSR_UNC_PCU_PMON_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"WBOXFIX1", PMC52, WBOX0FIX, 0, MSR_UNC_PCU_PMON_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+ /* IMC Counters: 4 48bit wide per memory channel, split in two reads */
+ {"MBOX0C0",PMC53, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_0, SNBEP_VALID_OPTIONS_MBOX},
+ {"MBOX0C1",PMC54, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_0, SNBEP_VALID_OPTIONS_MBOX},
+ {"MBOX0C2",PMC55, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_0, SNBEP_VALID_OPTIONS_MBOX},
+ {"MBOX0C3",PMC56, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_0, SNBEP_VALID_OPTIONS_MBOX},
+ {"MBOX0FIX", PMC57, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+ {"MBOX1C0",PMC58, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_1, SNBEP_VALID_OPTIONS_MBOX},
+ {"MBOX1C1",PMC59, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_1, SNBEP_VALID_OPTIONS_MBOX},
+ {"MBOX1C2",PMC60, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_1, SNBEP_VALID_OPTIONS_MBOX},
+ {"MBOX1C3",PMC61, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_1, SNBEP_VALID_OPTIONS_MBOX},
+ {"MBOX1FIX", PMC62, MBOX1FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_1, EVENT_OPTION_NONE_MASK},
+ {"MBOX2C0",PMC63, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_2, SNBEP_VALID_OPTIONS_MBOX},
+ {"MBOX2C1",PMC64, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_2, SNBEP_VALID_OPTIONS_MBOX},
+ {"MBOX2C2",PMC65, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_2, SNBEP_VALID_OPTIONS_MBOX},
+ {"MBOX2C3",PMC66, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_2, SNBEP_VALID_OPTIONS_MBOX},
+ {"MBOX2FIX", PMC67, MBOX2FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_2, EVENT_OPTION_NONE_MASK},
+ {"MBOX3C0",PMC68, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_3, SNBEP_VALID_OPTIONS_MBOX},
+ {"MBOX3C1",PMC69, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_3, SNBEP_VALID_OPTIONS_MBOX},
+ {"MBOX3C2",PMC70, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_3, SNBEP_VALID_OPTIONS_MBOX},
+ {"MBOX3C3",PMC71, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_3, SNBEP_VALID_OPTIONS_MBOX},
+ {"MBOX3FIX", PMC72, MBOX3FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_3, EVENT_OPTION_NONE_MASK},
+ /* QPI counters: 4 48bit wide per port, split in two reads */
+ {"SBOX0C0",PMC73, SBOX0, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_0, SNBEP_VALID_OPTIONS_SBOX},
+ {"SBOX0C1",PMC74, SBOX0, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_0, SNBEP_VALID_OPTIONS_SBOX},
+ {"SBOX0C2",PMC75, SBOX0, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_0, SNBEP_VALID_OPTIONS_SBOX},
+ {"SBOX0C3",PMC76, SBOX0, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_0, SNBEP_VALID_OPTIONS_SBOX},
+ {"SBOX0FIX", PMC77, SBOX0FIX, 0, PCI_UNC_QPI_RATE_STATUS, 0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+ {"SBOX1C0",PMC78, SBOX1, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_1, SNBEP_VALID_OPTIONS_SBOX},
+ {"SBOX1C1",PMC79, SBOX1, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_1, SNBEP_VALID_OPTIONS_SBOX},
+ {"SBOX1C2",PMC80, SBOX1, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_1, SNBEP_VALID_OPTIONS_SBOX},
+ {"SBOX1C3",PMC81, SBOX1, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_1, SNBEP_VALID_OPTIONS_SBOX},
+ {"SBOX1FIX", PMC82, SBOX1FIX, 0, PCI_UNC_QPI_RATE_STATUS, 0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+ /* BBOX, better known as the Home Agent (HA) */
+ {"BBOX0",PMC83, BBOX0, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_0, SNBEP_VALID_OPTIONS_BBOX},
+ {"BBOX1",PMC84, BBOX0, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_0, SNBEP_VALID_OPTIONS_BBOX},
+ {"BBOX2",PMC85, BBOX0, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_0, SNBEP_VALID_OPTIONS_BBOX},
+ {"BBOX3",PMC86, BBOX0, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_0, SNBEP_VALID_OPTIONS_BBOX},
+ {"RBOX0C0", PMC87, RBOX0, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_0, SNBEP_VALID_OPTIONS_RBOX},
+ {"RBOX0C1", PMC88, RBOX0, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_0, SNBEP_VALID_OPTIONS_RBOX},
+ {"RBOX0C2", PMC89, RBOX0, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_0, SNBEP_VALID_OPTIONS_RBOX},
+ {"RBOX1C0", PMC90, RBOX1, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_1, SNBEP_VALID_OPTIONS_RBOX},
+ {"RBOX1C1", PMC91, RBOX1, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_1, SNBEP_VALID_OPTIONS_RBOX},
+ {"RBOX1C2", PMC92, RBOX1, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_1, SNBEP_VALID_OPTIONS_RBOX},
+ {"PBOX0", PMC93, PBOX, PCI_UNC_R2PCIE_PMON_CTL_0, PCI_UNC_R2PCIE_PMON_CTR_0_A, PCI_UNC_R2PCIE_PMON_CTR_0_B, PCI_R2PCIE_DEVICE, SNBEP_VALID_OPTIONS_PBOX},
+ {"PBOX1", PMC94, PBOX, PCI_UNC_R2PCIE_PMON_CTL_1, PCI_UNC_R2PCIE_PMON_CTR_1_A, PCI_UNC_R2PCIE_PMON_CTR_1_B, PCI_R2PCIE_DEVICE, SNBEP_VALID_OPTIONS_PBOX},
+ {"PBOX2", PMC95, PBOX, PCI_UNC_R2PCIE_PMON_CTL_2, PCI_UNC_R2PCIE_PMON_CTR_2_A, PCI_UNC_R2PCIE_PMON_CTR_2_B, PCI_R2PCIE_DEVICE, SNBEP_VALID_OPTIONS_PBOX},
+ {"PBOX3", PMC96, PBOX, PCI_UNC_R2PCIE_PMON_CTL_3, PCI_UNC_R2PCIE_PMON_CTR_3_A, PCI_UNC_R2PCIE_PMON_CTR_3_B, PCI_R2PCIE_DEVICE, SNBEP_VALID_OPTIONS_PBOX},
+};
+
+static BoxMap sandybridgeEP_box_map[NUM_UNITS] = {
+ [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 48},
+ [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 48},
+ [THERMAL] = {0, 0, 0, 0, 0, MSR_DEV, 8},
+ [POWER] = {0, 0, 0, 0, 0, MSR_DEV, 32},
+ [WBOX] = {MSR_UNC_PCU_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 48},
+ [WBOX0FIX] = {0, 0, 0, 0, 0, MSR_DEV, 48},
+ [UBOX] = {0, MSR_UNC_U_PMON_BOX_STATUS, 0, 0, 0, MSR_DEV, 44},
+ [UBOXFIX] = {0, 0, 0, 0, 0, MSR_DEV, 44},
+ [CBOX0] = {MSR_UNC_C0_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C0_PMON_BOX_FILTER},
+ [CBOX1] = {MSR_UNC_C1_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C1_PMON_BOX_FILTER},
+ [CBOX2] = {MSR_UNC_C2_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C2_PMON_BOX_FILTER},
+ [CBOX3] = {MSR_UNC_C3_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C3_PMON_BOX_FILTER},
+ [CBOX4] = {MSR_UNC_C4_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C4_PMON_BOX_FILTER},
+ [CBOX5] = {MSR_UNC_C5_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C5_PMON_BOX_FILTER},
+ [CBOX6] = {MSR_UNC_C6_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C6_PMON_BOX_FILTER},
+ [CBOX7] = {MSR_UNC_C7_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C7_PMON_BOX_FILTER},
+ [MBOX0] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+ [MBOX1] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+ [MBOX2] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+ [MBOX3] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+ [MBOX0FIX] = {0, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+ [MBOX1FIX] = {0, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+ [MBOX2FIX] = {0, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+ [MBOX3FIX] = {0, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+ [BBOX0] = {PCI_UNC_HA_PMON_BOX_CTL, 0, 0, 0, 1, PCI_HA_DEVICE_0, 48},
+ [SBOX0] = {PCI_UNC_QPI_PMON_BOX_CTL, 0, 0, 0, 1, PCI_QPI_DEVICE_PORT_0, 48},
+ [SBOX1] = {PCI_UNC_QPI_PMON_BOX_CTL, 0, 0, 0, 1, PCI_QPI_DEVICE_PORT_1, 48},
+ [SBOX0FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_0, 32},
+ [SBOX1FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_1, 32},
+ [RBOX0] = {PCI_UNC_R3QPI_PMON_BOX_CTL, 0, 0, 0, 1, PCI_R3QPI_DEVICE_LINK_0, 44},
+ [RBOX1] = {PCI_UNC_R3QPI_PMON_BOX_CTL, 0, 0, 0, 1, PCI_R3QPI_DEVICE_LINK_1, 44},
+ [PBOX] = {PCI_UNC_R2PCIE_PMON_BOX_CTL, 0, 0, 0, 1, PCI_R2PCIE_DEVICE, 44},
+};
+
+static PciDevice sandybridgeEP_pci_devices[MAX_NUM_PCI_DEVICES] = {
+ [MSR_DEV] = {NODEVTYPE, "", "", ""},
+ [PCI_R3QPI_DEVICE_LINK_0] = {R3QPI, "13.5", "PCI_R3QPI_DEVICE_LINK_0", "RBOX0", 0x3c44},
+ [PCI_R3QPI_DEVICE_LINK_1] = {R3QPI, "13.6", "PCI_R3QPI_DEVICE_LINK_1", "RBOX1", 0x3c45},
+ [PCI_R2PCIE_DEVICE] = {R2PCIE, "13.1", "PCI_R2PCIE_DEVICE", "PBOX0", 0x3c43},
+ [PCI_IMC_DEVICE_0_CH_0] = {IMC, "10.0", "PCI_IMC_DEVICE_CH_0", "MBOX0", 0x3cb0},
+ [PCI_IMC_DEVICE_0_CH_1] = {IMC, "10.1", "PCI_IMC_DEVICE_CH_1", "MBOX1", 0x3cb1},
+ [PCI_IMC_DEVICE_0_CH_2] = {IMC, "10.4", "PCI_IMC_DEVICE_CH_2", "MBOX2", 0x3cb4},
+ [PCI_IMC_DEVICE_0_CH_3] = {IMC, "10.5", "PCI_IMC_DEVICE_CH_3", "MBOX3", 0x3cb5},
+ [PCI_HA_DEVICE_0] = {HA, "0e.1", "PCI_HA_DEVICE", "BBOX", 0x3c46},
+ [PCI_QPI_DEVICE_PORT_0] = {QPI, "08.2", "PCI_QPI_DEVICE_PORT_0", "SBOX0", 0x3c41},
+ [PCI_QPI_DEVICE_PORT_1] = {QPI, "09.2", "PCI_QPI_DEVICE_PORT_1", "SBOX1", 0x3c42},
+ [PCI_QPI_MASK_DEVICE_PORT_0] = {QPI, "08.6", "PCI_QPI_MASK_DEVICE_PORT_0", NULL, 0x3c86},
+ [PCI_QPI_MASK_DEVICE_PORT_1] = {QPI, "09.6", "PCI_QPI_MASK_DEVICE_PORT_1", NULL, 0x3c96},
+ [PCI_QPI_MISC_DEVICE_PORT_0] = {QPI, "08.0", "PCI_QPI_MISC_DEVICE_PORT_0", "SBOX0FIX",0x3c80},
+ [PCI_QPI_MISC_DEVICE_PORT_1] = {QPI, "09.0", "PCI_QPI_MISC_DEVICE_PORT_1", "SBOX1FIX", 0x3c91},
+};
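The IMC, QPI, HA and R3QPI entries above reference two counter registers per
event (the *_CTR_*_A and *_CTR_*_B fields) because each of these uncore
counters is wider than a single 32-bit PCI config-space read; the box map
records the usable bit width per unit (48 for IMC/QPI/HA, 44 for
R3QPI/R2PCIe). A minimal sketch of what combining such a split read looks
like follows; the helper is purely illustrative, not part of this patch or
of the likwid API, and which register holds the upper half is an assumption
made only for this example.

#include <stdint.h>

/* Illustrative only: merge the two 32-bit halves of a split uncore counter
 * read and mask the result to the bit width recorded in the box map. */
static uint64_t combine_split_counter(uint32_t upper, uint32_t lower, int width)
{
    uint64_t raw  = ((uint64_t)upper << 32) | (uint64_t)lower;
    uint64_t mask = (width >= 64) ? ~0ULL : ((1ULL << width) - 1ULL);
    return raw & mask;               /* e.g. width = 48 for the MBOX counters */
}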
diff --git a/src/includes/perfmon_sandybridgeEP_events.txt b/src/includes/perfmon_sandybridgeEP_events.txt
new file mode 100644
index 0000000..63198a9
--- /dev/null
+++ b/src/includes/perfmon_sandybridgeEP_events.txt
@@ -0,0 +1,1342 @@
+# =======================================================================================
+#
+# Filename: perfmon_sandybridgeEP_events.txt
+#
+# Description: Event list for Intel SandyBridge EP
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
+# Project: likwid
+#
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, either version 3 of the License, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE 0x00 TMP0
+UMASK_TEMP_CORE 0x00
+
+EVENT_PWR_PKG_ENERGY 0x00 PWR0
+UMASK_PWR_PKG_ENERGY 0x00
+
+EVENT_PWR_PP0_ENERGY 0x00 PWR1
+UMASK_PWR_PP0_ENERGY 0x00
+
+EVENT_PWR_PP1_ENERGY 0x00 PWR2
+UMASK_PWR_PP1_ENERGY 0x00
+
+EVENT_PWR_DRAM_ENERGY 0x00 PWR3
+UMASK_PWR_DRAM_ENERGY 0x00
+
+EVENT_INSTR_RETIRED 0x00 FIXC0
+UMASK_INSTR_RETIRED_ANY 0x00
+
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE 0x00
+
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC2
+UMASK_CPU_CLK_UNHALTED_REF 0x00
+
+EVENT_LOAD_BLOCKS 0x03 PMC
+UMASK_LOAD_BLOCKS_DATA_UNKNOWN 0x01
+UMASK_LOAD_BLOCKS_STORE_FORWARD 0x02
+UMASK_LOAD_BLOCKS_NO_SR 0x08
+UMASK_LOAD_BLOCKS_ALL_BLOCK 0x10
+
+EVENT_MISALIGN_MEM_REF 0x05 PMC
+UMASK_MISALIGN_MEM_REF_LOAD 0x01
+UMASK_MISALIGN_MEM_REF_STORE 0x02
+UMASK_MISALIGN_MEM_REF_ANY 0x03
+
+EVENT_LD_BLOCKS_PARTIAL 0x07 PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS 0x01
+UMASK_LD_BLOCKS_PARTIAL_ALL_STA_BLOCK 0x08
+
+EVENT_DTLB_LOAD_MISSES 0x08 PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK 0x01
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED 0x02
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION 0x04
+
+EVENT_INT_MISC 0x0D PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_INT_MISC_RECOVERY_CYCLES 0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_RECOVERY_COUNT 0x03
+UMASK_INT_MISC_RAT_STALL_CYCLES 0x40
+DEFAULT_OPTIONS_INT_MISC_RAT_STALL_COUNT EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_RAT_STALL_COUNT 0x40
+
+EVENT_UOPS_ISSUED 0x0E PMC
+UMASK_UOPS_ISSUED_ANY 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_ACTIVE_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ANY 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ACTIVE_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_FP_COMP_OPS_EXE 0x10 PMC
+UMASK_FP_COMP_OPS_EXE_X87 0x01
+UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE 0x10
+UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE 0x20
+UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE 0x40
+UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE 0x80
+
+EVENT_SIMD_FP_256_PACKED 0x11 PMC
+UMASK_SIMD_FP_256_PACKED_SINGLE 0x01
+UMASK_SIMD_FP_256_PACKED_DOUBLE 0x02
+
+EVENT_ARITH 0x14 PMC
+UMASK_ARITH_FPU_DIV_ACTIVE 0x01
+DEFAULT_OPTIONS_ARITH_NUM_DIV EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_ARITH_NUM_DIV 0x01
+
+EVENT_INSTS_WRITTEN_TO_IQ 0x17 PMC
+UMASK_INSTS_WRITTEN_TO_IQ_INSTS 0x01
+
+EVENT_L2_RQSTS 0x24 PMC
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_HIT 0x01
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_MISS 0x02
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD 0x03
+UMASK_L2_RQSTS_RFO_HITS 0x04
+UMASK_L2_RQSTS_RFO_MISS 0x08
+UMASK_L2_RQSTS_RFO_ANY 0x0C
+UMASK_L2_RQSTS_CODE_RD_HITS 0x10
+UMASK_L2_RQSTS_CODE_RD_MISS 0x20
+UMASK_L2_RQSTS_ALL_CODE_CODE_RD 0x30
+UMASK_L2_RQSTS_PF_HIT 0x40
+UMASK_L2_RQSTS_PF_MISS 0x80
+UMASK_L2_RQSTS_ALL_PF 0xC0
+UMASK_L2_RQSTS_MISS 0xAA
+
+EVENT_L2_STORE_LOCK_RQSTS 0x27 PMC
+UMASK_L2_STORE_LOCK_RQSTS_MISS 0x01
+UMASK_L2_STORE_LOCK_RQSTS_HIT_E 0x04
+UMASK_L2_STORE_LOCK_RQSTS_HIT_M 0x08
+UMASK_L2_STORE_LOCK_RQSTS_ALL 0x0F
+
+EVENT_L1D_WB_RQST 0x28 PMC
+UMASK_L1D_WB_RQST_HIT_E 0x04
+UMASK_L1D_WB_RQST_HIT_M 0x08
+
+EVENT_L3_LAT_CACHE 0x2E PMC
+UMASK_L3_LAT_CACHE_REFERENCE 0x4F
+UMASK_L3_LAT_CACHE_MISS 0x41
+
+EVENT_CPU_CLOCK_UNHALTED 0x3C PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P 0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_P 0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES 0x00
+
+EVENT_L1D_PEND_MISS 0x48 PMC1
+UMASK_L1D_PEND_MISS_PENDING 0x01
+
+EVENT_DTLB_STORE_MISSES 0x49 PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK 0x01
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED 0x02
+UMASK_DTLB_STORE_MISSES_WALK_DURATION 0x04
+UMASK_DTLB_STORE_MISSES_STLB_HIT 0x10
+
+EVENT_LOAD_HIT_PRE 0x4C PMC
+UMASK_LOAD_HIT_PRE_SW_PF 0x01
+UMASK_LOAD_HIT_PRE_HW_PF 0x02
+
+EVENT_HW_PRE_REQ 0x4E PMC
+UMASK_HW_PRE_REQ_DL1_MISS 0x02
+
+EVENT_L1D 0x51 PMC
+UMASK_L1D_REPLACEMENT 0x01
+UMASK_L1D_ALLOCATED_IN_M 0x02
+UMASK_L1D_M_EVICT 0x04
+UMASK_L1D_ALL_M_REPLACEMENT 0x08
+
+EVENT_PARTIAL_RAT_STALLS 0x59 PMC
+UMASK_PARTIAL_RAT_STALLS_FLAGS_MERGE_UOP 0x20
+UMASK_PARTIAL_RAT_STALLS_SLOW_LEA_WINDOW 0x40
+UMASK_PARTIAL_RAT_STALLS_MUL_SINGLE_UOP 0x80
+
+EVENT_RESOURCE_STALLS2 0x5B PMC
+UMASK_RESOURCE_STALLS2_ALL_FL_EMPTY 0x0C
+UMASK_RESOURCE_STALLS2_ALL_PRF_CONTROL 0x0F
+UMASK_RESOURCE_STALLS2_BOB_FULL 0x40
+UMASK_RESOURCE_STALLS2_OOO_RSRC 0x4F
+
+EVENT_CPL_CYCLES 0x5C PMC
+UMASK_CPL_CYCLES_RING0 0x01
+UMASK_CPL_CYCLES_RING123 0x02
+
+EVENT_RS_EVENTS 0x5E PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES 0x01
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING 0x60 PMC
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD 0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD 0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO 0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD 0x08
+
+EVENT_CACHE_LOCK_CYCLES 0x63 PMC
+UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
+UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION 0x02
+
+EVENT_IDQ 0x79 PMC
+UMASK_IDQ_EMPTY 0x02
+UMASK_IDQ_MITE_UOPS 0x04
+UMASK_IDQ_DSB_UOPS 0x08
+UMASK_IDQ_MS_DSB_UOPS 0x10
+UMASK_IDQ_MS_MITE_UOPS 0x20
+UMASK_IDQ_MS_UOPS 0x30
+
+EVENT_ICACHE 0x80 PMC
+UMASK_ICACHE_HITS 0x01
+UMASK_ICACHE_MISSES 0x02
+UMASK_ICACHE_ACCESSES 0x03
+UMASK_ICACHE_IFETCH_STALL 0x04
+
+EVENT_ITLB_MISSES 0x85 PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK 0x01
+UMASK_ITLB_MISSES_WALK_COMPLETED 0x02
+UMASK_ITLB_MISSES_WALK_DURATION 0x04
+UMASK_ITLB_MISSES_STLB_HIT 0x10
+
+EVENT_ILD_STALL 0x87 PMC
+UMASK_ILD_STALL_LCP 0x01
+UMASK_ILD_STALL_IQ_FULL 0x04
+
+EVENT_BR_INST_EXEC 0x88 PMC
+UMASK_BR_INST_EXEC_COND_TAKEN 0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN 0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN 0x82
+UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN 0x42
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN 0x44
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN 0x88
+UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN 0x48
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN 0x50
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
+UMASK_BR_INST_EXEC_ALL_BRANCHES 0xFF
+
+EVENT_BR_MISP_EXEC 0x89 PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN 0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN 0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN 0x44
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN 0x88
+UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN 0x48
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN 0x50
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
+UMASK_BR_MISP_EXEC_ALL_BRANCHES 0xFF
+
+EVENT_IDQ_UOPS_NOT_DELIVERED 0x9C PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE 0x01
+
+EVENT_UOPS_DISPATCHED_PORT 0xA1 PMC
+UMASK_UOPS_DISPATCHED_PORT_PORT_0 0x01
+UMASK_UOPS_DISPATCHED_PORT_PORT_1 0x02
+UMASK_UOPS_DISPATCHED_PORT_PORT_2_LD 0x04
+UMASK_UOPS_DISPATCHED_PORT_PORT_2_STA 0x08
+UMASK_UOPS_DISPATCHED_PORT_PORT_2 0x0C
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_LD 0x10
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_STA 0x20
+UMASK_UOPS_DISPATCHED_PORT_PORT_3 0x30
+UMASK_UOPS_DISPATCHED_PORT_PORT_4 0x40
+UMASK_UOPS_DISPATCHED_PORT_PORT_5 0x80
+DEFAULT_OPTIONS_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE 0x83
+UMASK_UOPS_DISPATCHED_PORT_DATA_PORTS 0x7C
+UMASK_UOPS_DISPATCHED_PORT_ALL_PORTS 0xFF
+
+EVENT_RESOURCE_STALLS 0xA2 PMC
+UMASK_RESOURCE_STALLS_ANY 0x01
+UMASK_RESOURCE_STALLS_LB 0x02
+UMASK_RESOURCE_STALLS_RS 0x04
+UMASK_RESOURCE_STALLS_B 0x08
+UMASK_RESOURCE_STALLS_ROB 0x10
+UMASK_RESOURCE_STALLS_FCSW 0x20
+UMASK_RESOURCE_STALLS_MXCSR 0x40
+UMASK_RESOURCE_STALLS_OTHER 0x80
+
+EVENT_DSB2MITE_SWITCHES 0xAB PMC
+UMASK_DSB2MITE_SWITCHES_COUNT 0x01
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
+
+EVENT_DSB_FILL 0xAC PMC
+UMASK_DSB_FILL_OTHER_CANCEL 0x02
+UMASK_DSB_FILL_EXCEED_DSB_LINES 0x08
+UMASK_DSB_FILL_ALL_CANCEL 0x0A
+
+EVENT_ITLB 0xAE PMC
+UMASK_ITLB_ITLB_FLUSH 0x01
+
+EVENT_OFFCORE_REQUESTS 0xB0 PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD 0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD 0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO 0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD 0x08
+
+EVENT_UOPS_EXECUTED 0xB1 PMC
+UMASK_UOPS_EXECUTED_THREAD 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_OFFCORE_REQUESTS_BUFFER 0xB2 PMC
+UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0x01
+
+EVENT_AGU_BYPASS_CANCEL 0xB6 PMC
+UMASK_AGU_BYPASS_CANCEL_COUNT 0x01
+
+EVENT_TLB_FLUSH 0xBD PMC
+UMASK_TLB_FLUSH_DTLB_THREAD 0x01
+UMASK_TLB_FLUSH_STLB_ANY 0x20
+
+EVENT_L1D_BLOCKS 0xBF PMC
+DEFAULT_OPTIONS_L1D_BLOCKS_BANK_CONFLICT_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_BLOCKS_BANK_CONFLICT_CYCLES 0x05
+DEFAULT_OPTIONS_L1D_BLOCKS_BANK_CONFLICT_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_L1D_BLOCKS_BANK_CONFLICT_COUNT 0x05
+
+EVENT_INST_RETIRED 0xC0 PMC0
+UMASK_INST_RETIRED_ANY_P 0x00
+UMASK_INST_RETIRED_PREC_DIST 0x01
+
+EVENT_OTHER_ASSISTS 0xC1 PMC
+UMASK_OTHER_ASSISTS_ITLB_MISS_RETIRED 0x02
+UMASK_OTHER_ASSISTS_AVX_TO_SSE 0x10
+UMASK_OTHER_ASSISTS_SSE_TO_AVX 0x20
+
+EVENT_UOPS_RETIRED 0xC2 PMC
+UMASK_UOPS_RETIRED_ALL 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL 0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_MACHINE_CLEARS 0xC3 PMC
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING 0x02
+UMASK_MACHINE_CLEARS_SMC 0x04
+UMASK_MACHINE_CLEARS_MASKMOV 0x20
+
+EVENT_BR_INST_RETIRED 0xC4 PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL 0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL 0x02
+UMASK_BR_INST_RETIRED_NEAR_RETURN 0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN 0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN 0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH 0x40
+
+EVENT_BR_MISP_RETIRED 0xC5 PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL 0x01
+UMASK_BR_MISP_RETIRED_NEAR_CALL 0x02
+UMASK_BR_MISP_RETIRED_NOT_TAKEN 0x10
+UMASK_BR_MISP_RETIRED_TAKEN 0x20
+
+EVENT_FP_ASSIST 0xCA PMC
+UMASK_FP_ASSIST_X87_OUTPUT 0x02
+UMASK_FP_ASSIST_X87_INPUT 0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT 0x08
+UMASK_FP_ASSIST_SIMD_INPUT 0x10
+UMASK_FP_ASSIST_ANY 0x1E
+
+EVENT_HW_INTERRUPTS_RECEIVED 0xCB PMC
+UMASK_HW_INTERRUPTS_RECEIVED 0x01
+
+EVENT_ROB_MISC_EVENT_LBR_INSERTS 0xCC PMC
+UMASK_ROB_MISC_EVENT_LBR_INSERTS 0x20
+
+EVENT_MEM_UOPS_RETIRED 0xD0 PMC
+UMASK_MEM_UOPS_RETIRED_LOADS 0x81
+UMASK_MEM_UOPS_RETIRED_STORES 0x82
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS 0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK 0x21
+UMASK_MEM_UOPS_RETIRED_STORES_LOCK 0x22
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT 0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT 0x42
+
+EVENT_MEM_LOAD_UOPS_RETIRED 0xD1 PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT 0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS 0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL 0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT 0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS 0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL 0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT 0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS 0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL 0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB 0x40
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL 0x7F
+
+EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED 0xD2 PMC
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS 0x01
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT 0x02
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM 0x04
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE 0x08
+
+EVENT_MEM_LOAD_UOPS_MISC_RETIRED 0xD4 PMC
+UMASK_MEM_LOAD_UOPS_MISC_RETIRED_LLC_MISS 0x02
+
+EVENT_L2_TRANS 0xF0 PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO 0x02
+UMASK_L2_TRANS_CODE_RD 0x04
+UMASK_L2_TRANS_ALL_PREF 0x08
+UMASK_L2_TRANS_L1D_WB 0x10
+UMASK_L2_TRANS_L2_FILL 0x20
+UMASK_L2_TRANS_L2_WB 0x40
+UMASK_L2_TRANS_ALL_REQUESTS 0x80
+
+EVENT_L2_LINES_IN 0xF1 PMC
+UMASK_L2_LINES_IN_I 0x01
+UMASK_L2_LINES_IN_S 0x02
+UMASK_L2_LINES_IN_E 0x04
+UMASK_L2_LINES_IN_ALL 0x07
+
+EVENT_L2_LINES_OUT 0xF2 PMC
+UMASK_L2_LINES_OUT_DEMAND_CLEAN 0x01
+UMASK_L2_LINES_OUT_DEMAND_DIRTY 0x02
+UMASK_L2_LINES_OUT_PF_CLEAN 0x04
+UMASK_L2_LINES_OUT_PF_DIRTY 0x08
+UMASK_L2_LINES_OUT_DIRTY_ALL 0x0A
+UMASK_L2_LINES_OUT_CLEAN_ALL 0x05
+UMASK_L2_LINES_OUT_ALL 0x0F
+
+EVENT_SQ_MISC 0xF4 PMC
+UMASK_SQ_MISC_SPLIT_LOCK 0x10
+
+EVENT_OFFCORE_RESPONSE_0 0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x78040
+UMASK_OFFCORE_RESPONSE_0_LOCAL_DRAM 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x7FF80
+UMASK_OFFCORE_RESPONSE_0_REMOTE_DRAM 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1 0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x78040
+UMASK_OFFCORE_RESPONSE_1_LOCAL_DRAM 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x7FF80
+UMASK_OFFCORE_RESPONSE_1_REMOTE_DRAM 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_CBOX_CLOCKTICKS 0x00 CBOX
+UMASK_CBOX_CLOCKTICKS 0x00
+
+EVENT_COUNTER0_OCCUPANCY 0x1F CBOX0C1|CBOX0C2|CBOX0C3|CBOX1C1|CBOX1C2|CBOX1C3|CBOX2C1|CBOX2C2|CBOX2C3|CBOX3C1|CBOX3C2|CBOX3C3|CBOX4C1|CBOX4C2|CBOX4C3|CBOX5C1|CBOX5C2|CBOX5C3|CBOX6C1|CBOX6C2|CBOX6C3|CBOX7C1|CBOX7C2|CBOX7C3
+UMASK_COUNTER0_OCCUPANCY 0x00
+
+EVENT_ISMQ_DRD_MISS_OCC 0x21 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_ISMQ_DRD_MISS_OCC 0x00
+
+EVENT_LLC_LOOKUP 0x34 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+DEFAULT_OPTIONS_LLC_LOOKUP_ANY EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_ANY 0x11
+DEFAULT_OPTIONS_LLC_LOOKUP_DATA_READ EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_DATA_READ 0x03
+DEFAULT_OPTIONS_LLC_LOOKUP_WRITE EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_WRITE 0x05
+DEFAULT_OPTIONS_LLC_LOOKUP_DATA_READ_AND_ALL EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_DATA_READ_AND_ALL_WRITE 0x07
+DEFAULT_OPTIONS_LLC_LOOKUP_REMOTE_SNOOP EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_REMOTE_SNOOP 0x09
+DEFAULT_OPTIONS_LLC_LOOKUP_NID EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_NID 0x41
+
+EVENT_LLC_VICTIMS 0x37 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_LLC_VICTIMS_M_STATE 0x01
+UMASK_LLC_VICTIMS_E_STATE 0x02
+UMASK_LLC_VICTIMS_S_STATE 0x04
+UMASK_LLC_VICTIMS_MISS 0x08
+UMASK_LLC_VICTIMS_ALL_STATES 0x0F
+OPTIONS_LLC_VICTIMS_NID EVENT_OPTION_NID_MASK
+UMASK_LLC_VICTIMS_NID 0x40
+OPTIONS_LLC_VICTIMS_NID_MISSES EVENT_OPTION_NID_MASK
+UMASK_LLC_VICTIMS_NID_MISSES 0x41
+
+EVENT_CBOX_MISC 0x39 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_CBOX_MISC_RSPI_WAS_FSE 0x01
+UMASK_CBOX_MISC_WC_ALIASING 0x02
+UMASK_CBOX_MISC_STARTED 0x04
+UMASK_CBOX_MISC_RFO_HIT_S 0x08
+
+EVENT_RING_AD_USED 0x1B CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3
+UMASK_RING_AD_USED_UP_EVEN 0x01
+UMASK_RING_AD_USED_UP_ODD 0x02
+UMASK_RING_AD_USED_DOWN_EVEN 0x04
+UMASK_RING_AD_USED_DOWN_ODD 0x08
+
+EVENT_RING_AK_USED 0x1C CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3
+UMASK_RING_AK_USED_UP_EVEN 0x01
+UMASK_RING_AK_USED_UP_ODD 0x02
+UMASK_RING_AK_USED_DOWN_EVEN 0x04
+UMASK_RING_AK_USED_DOWN_ODD 0x08
+
+EVENT_RING_BL_USED 0x1D CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3
+UMASK_RING_BL_USED_UP_EVEN 0x01
+UMASK_RING_BL_USED_UP_ODD 0x02
+UMASK_RING_BL_USED_DOWN_EVEN 0x04
+UMASK_RING_BL_USED_DOWN_ODD 0x08
+
+EVENT_RING_BOUNCES 0x05 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RING_BOUNCES_AK_CORE 0x02
+UMASK_RING_BOUNCES_BL_CORE 0x04
+UMASK_RING_BOUNCES_IV_CORE 0x08
+
+EVENT_RING_IV_USED 0x1E CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3
+UMASK_RING_IV_USED_ANY 0x0F
+
+EVENT_RING_SRC_THRTL 0x05 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RING_SRC_THRTL 0x07
+
+EVENT_RXR_EXT_STARVED 0x12 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RXR_EXT_STARVED_IRQ 0x01
+UMASK_RXR_EXT_STARVED_IPQ 0x02
+UMASK_RXR_EXT_STARVED_ISMQ 0x04
+UMASK_RXR_EXT_STARVED_ISMQ_BIDS 0x08
+
+EVENT_RXR_INSERTS 0x13 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RXR_INSERTS_IRQ 0x01
+UMASK_RXR_INSERTS_IRQ_REJECTED 0x02
+UMASK_RXR_INSERTS_IPQ 0x04
+UMASK_RXR_INSERTS_VFIFO 0x10
+
+EVENT_RXR_IPQ_RETRY 0x31 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RXR_IPQ_RETRY_ANY 0x01
+UMASK_RXR_IPQ_RETRY_FULL 0x02
+UMASK_RXR_IPQ_RETRY_ADDR_CONFLICT 0x04
+UMASK_RXR_IPQ_RETRY_QPI_CREDITS 0x10
+
+EVENT_RXR_IRQ_RETRY 0x32 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RXR_IRQ_RETRY_ANY 0x01
+UMASK_RXR_IRQ_RETRY_FULL 0x02
+UMASK_RXR_IRQ_RETRY_ADDR_CONFLICT 0x04
+UMASK_RXR_IRQ_RETRY_RTID 0x08
+UMASK_RXR_IRQ_RETRY_QPI_CREDITS 0x10
+
+EVENT_RXR_ISMQ_RETRY 0x33 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RXR_ISMQ_RETRY_ANY 0x01
+UMASK_RXR_ISMQ_RETRY_FULL 0x02
+UMASK_RXR_ISMQ_RETRY_ADDR_CONFLICT 0x04
+UMASK_RXR_ISMQ_RETRY_RTID 0x08
+UMASK_RXR_ISMQ_RETRY_QPI_CREDITS 0x10
+
+EVENT_RXR_OCCUPANCY 0x11 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0
+UMASK_RXR_OCCUPANCY_IRQ 0x01
+UMASK_RXR_OCCUPANCY_IRQ_REJECTED 0x02
+UMASK_RXR_OCCUPANCY_IPQ 0x04
+UMASK_RXR_OCCUPANCY_VFIFO 0x10
+
+EVENT_TOR_INSERTS 0x35 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+OPTIONS_TOR_INSERTS_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_OPCODE 0x01
+UMASK_TOR_INSERTS_EVICTION 0x04
+UMASK_TOR_INSERTS_WB 0x10
+OPTIONS_TOR_INSERTS_MISS_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_OPCODE 0x03
+UMASK_TOR_INSERTS_MISS_ALL 0x0A
+OPTIONS_TOR_INSERTS_NID_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_OPCODE 0x41
+OPTIONS_TOR_INSERTS_NID_EVICTION EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_EVICTION 0x44
+OPTIONS_TOR_INSERTS_NID_ALL EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_ALL 0x48
+OPTIONS_TOR_INSERTS_NID_WB EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_WB 0x50
+OPTIONS_TOR_INSERTS_NID_MISS_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_MISS_OPCODE 0x43
+OPTIONS_TOR_INSERTS_NID_MISS_ALL EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_MISS_ALL 0x4A
+
+EVENT_TOR_OCCUPANCY 0x36 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0
+OPTIONS_TOR_OCCUPANCY_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_OPCODE 0x01
+UMASK_TOR_OCCUPANCY_EVICTION 0x04
+UMASK_TOR_OCCUPANCY_ALL 0x08
+OPTIONS_TOR_OCCUPANCY_MISS_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_OPCODE 0x03
+UMASK_TOR_OCCUPANCY_MISS_ALL 0x0A
+OPTIONS_TOR_OCCUPANCY_NID_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_OPCODE 0x41
+OPTIONS_TOR_OCCUPANCY_NID_EVICTION EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_EVICTION 0x44
+OPTIONS_TOR_OCCUPANCY_NID_ALL EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_ALL 0x48
+OPTIONS_TOR_OCCUPANCY_NID_MISS_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_OPCODE 0x43
+OPTIONS_TOR_OCCUPANCY_NID_MISS_ALL EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_ALL 0x4A
+
+EVENT_TXT_ADS_USED 0x04 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_TXT_ADS_USED 0x00
+
+EVENT_TXT_INSERTS 0x02 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_TXT_INSERTS_AD_CACHE 0x01
+UMASK_TXT_INSERTS_AK_CACHE 0x02
+UMASK_TXT_INSERTS_BL_CACHE 0x04
+UMASK_TXT_INSERTS_IV_CACHE 0x08
+UMASK_TXT_INSERTS_AD_CORE 0x10
+UMASK_TXT_INSERTS_AK_CORE 0x20
+UMASK_TXT_INSERTS_BL_CORE 0x40
+
+EVENT_BBOX_CLOCKTICKS 0x00 BBOX
+UMASK_BBOX_CLOCKTICKS 0x00
+
+EVENT_CONFLICT_CYCLES 0x0B BBOX
+UMASK_CONFLICT_CYCLES_NO_CONFLICT 0x01
+UMASK_CONFLICT_CYCLES_CONFLICT 0x02
+
+EVENT_DIRECT2CORE_COUNT 0x11 BBOX
+UMASK_DIRECT2CORE_COUNT 0x00
+
+EVENT_DIRECT2CORE_CYCLES_DISABLED 0x12 BBOX
+UMASK_DIRECT2CORE_CYCLES_DISABLED 0x00
+
+EVENT_DIRECT2CORE_TXN_OVERRIDE 0x13 BBOX
+UMASK_DIRECT2CORE_TXN_OVERRIDE 0x00
+
+EVENT_DIRECTORY_LOOKUP 0x0C BBOX
+UMASK_DIRECTORY_LOOKUP_SNP 0x01
+UMASK_DIRECTORY_LOOKUP_NO_SNP 0x02
+
+EVENT_DIRECTORY_UPDATE 0x0D BBOX
+UMASK_DIRECTORY_UPDATE_SET 0x01
+UMASK_DIRECTORY_UPDATE_CLEAR 0x02
+UMASK_DIRECTORY_UPDATE_ANY 0x03
+
+EVENT_IGR_NO_CREDIT_CYCLES 0x22 BBOX
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI0 0x01
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI1 0x02
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI0 0x04
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI1 0x08
+
+EVENT_IMC_RETRY 0x1E BBOX
+UMASK_IMC_RETRY 0x00
+
+EVENT_IMC_WRITES 0x1A BBOX
+UMASK_IMC_WRITES_FULL 0x01
+UMASK_IMC_WRITES_PARTIAL 0x02
+UMASK_IMC_WRITES_FULL_ISOCH 0x04
+UMASK_IMC_WRITES_PARTIAL_ISOCH 0x08
+UMASK_IMC_WRITES_ALL 0x0F
+
+EVENT_REQUESTS 0x01 BBOX
+UMASK_REQUESTS_READS 0x03
+UMASK_REQUESTS_WRITES 0x0C
+
+EVENT_RPQ_CYCLES_NO_REG_CREDITS 0x15 BBOX
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN0 0x01
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1 0x02
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2 0x04
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3 0x08
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_ALL 0x0F
+
+EVENT_TAD_REQUESTS_G0 0x1B BBOX
+UMASK_TAD_REQUESTS_G0_REGION_0 0x01
+UMASK_TAD_REQUESTS_G0_REGION_1 0x02
+UMASK_TAD_REQUESTS_G0_REGION_2 0x04
+UMASK_TAD_REQUESTS_G0_REGION_3 0x08
+UMASK_TAD_REQUESTS_G0_REGION_4 0x10
+UMASK_TAD_REQUESTS_G0_REGION_5 0x20
+UMASK_TAD_REQUESTS_G0_REGION_6 0x40
+UMASK_TAD_REQUESTS_G0_REGION_7 0x80
+
+EVENT_TAD_REQUESTS_G1 0x1C BBOX
+UMASK_TAD_REQUESTS_G1_REGION_8 0x01
+UMASK_TAD_REQUESTS_G1_REGION_9 0x02
+UMASK_TAD_REQUESTS_G1_REGION_10 0x04
+UMASK_TAD_REQUESTS_G1_REGION_11 0x08
+
+EVENT_TRACKER_INSERTS 0x06 BBOX
+UMASK_TRACKER_INSERTS_ALL 0x03
+
+EVENT_TXR_AD 0x0F BBOX
+UMASK_TXR_AD_NDR 0x01
+UMASK_TXR_AD_SNP 0x02
+
+EVENT_TXR_AD_CYCLES_FULL 0x2A BBOX
+UMASK_TXR_AD_CYCLES_FULL_SCHED0 0x01
+UMASK_TXR_AD_CYCLES_FULL_SCHED1 0x02
+UMASK_TXR_AD_CYCLES_FULL_ALL 0x03
+
+EVENT_TXR_AK_CYCLES_FULL 0x32 BBOX
+UMASK_TXR_AK_CYCLES_FULL_SCHED0 0x01
+UMASK_TXR_AK_CYCLES_FULL_SCHED1 0x02
+UMASK_TXR_AK_CYCLES_FULL_ALL 0x03
+
+EVENT_TXR_AK_NDR 0x0E BBOX
+UMASK_TXR_AK_NDR 0x00
+
+EVENT_TXR_BL 0x10 BBOX
+UMASK_TXR_BL_DRS_CACHE 0x01
+UMASK_TXR_BL_DRS_CORE 0x02
+UMASK_TXR_BL_DRS_QPI 0x04
+
+EVENT_TXR_BL_CYCLES_FULL 0x36 BBOX
+UMASK_TXR_BL_CYCLES_FULL_SCHED0 0x01
+UMASK_TXR_BL_CYCLES_FULL_SCHED1 0x02
+UMASK_TXR_BL_CYCLES_FULL_ALL 0x03
+
+EVENT_WPQ_CYCLES_NO_REG_CREDITS 0x18 BBOX
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0 0x01
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1 0x02
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2 0x04
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3 0x08
+
+EVENT_DRAM_CLOCKTICKS 0x00 MBOX0FIX|MBOX1FIX|MBOX2FIX|MBOX3FIX
+UMASK_DRAM_CLOCKTICKS 0x00
+
+EVENT_ACT_COUNT 0x01 MBOX
+UMASK_ACT_COUNT 0x00
+
+EVENT_CAS_COUNT 0x04 MBOX
+UMASK_CAS_COUNT_RD_REG 0x01
+UMASK_CAS_COUNT_RD_UNDERFILL 0x02
+UMASK_CAS_COUNT_RD 0x03
+UMASK_CAS_COUNT_WR_WMM 0x04
+UMASK_CAS_COUNT_WR_RMM 0x08
+UMASK_CAS_COUNT_WR 0x0C
+UMASK_CAS_COUNT_ALL 0x0F
+
+EVENT_DRAM_PRE_ALL 0x06 MBOX
+UMASK_DRAM_PRE_ALL 0x00
+
+EVENT_DRAM_REFRESH 0x05 MBOX
+UMASK_DRAM_REFRESH_PANIC 0x02
+UMASK_DRAM_REFRESH_HIGH 0x04
+
+EVENT_ECC_CORRECTABLE_ERRORS 0x09 MBOX
+UMASK_ECC_CORRECTABLE_ERRORS 0x00
+
+EVENT_MAJOR_MODES 0x07 MBOX
+UMASK_MAJOR_MODES_READ 0x01
+UMASK_MAJOR_MODES_WRITE 0x02
+UMASK_MAJOR_MODES_PARTIAL 0x04
+UMASK_MAJOR_MODES_ISOCH 0x08
+
+EVENT_POWER_CHANNEL_DLLOFF 0x84 MBOX
+UMASK_POWER_CHANNEL_DLLOFF 0x00
+
+EVENT_POWER_CHANNEL_PPD 0x85 MBOX
+UMASK_POWER_CHANNEL_PPD 0x00
+
+EVENT_POWER_CKE_CYCLES 0x83 MBOX
+UMASK_POWER_CKE_CYCLES_RANK0 0x01
+UMASK_POWER_CKE_CYCLES_RANK1 0x02
+UMASK_POWER_CKE_CYCLES_RANK2 0x04
+UMASK_POWER_CKE_CYCLES_RANK3 0x08
+UMASK_POWER_CKE_CYCLES_RANK4 0x10
+UMASK_POWER_CKE_CYCLES_RANK5 0x20
+UMASK_POWER_CKE_CYCLES_RANK6 0x40
+UMASK_POWER_CKE_CYCLES_RANK7 0x80
+
+EVENT_POWER_CRITICAL_THROTTLE_CYCLES 0x86 MBOX
+UMASK_POWER_CRITICAL_THROTTLE_CYCLES 0x00
+
+EVENT_POWER_SELF_REFRESH 0x43 MBOX
+UMASK_POWER_SELF_REFRESH 0x00
+
+EVENT_POWER_THROTTLE_CYCLES 0x41 MBOX
+UMASK_POWER_THROTTLE_CYCLES_RANK0 0x01
+UMASK_POWER_THROTTLE_CYCLES_RANK1 0x02
+UMASK_POWER_THROTTLE_CYCLES_RANK2 0x04
+UMASK_POWER_THROTTLE_CYCLES_RANK3 0x08
+UMASK_POWER_THROTTLE_CYCLES_RANK4 0x10
+UMASK_POWER_THROTTLE_CYCLES_RANK5 0x20
+UMASK_POWER_THROTTLE_CYCLES_RANK6 0x40
+UMASK_POWER_THROTTLE_CYCLES_RANK7 0x80
+
+EVENT_PREEMPTION 0x08 MBOX
+UMASK_PREEMPTION_RD_PREEMPT_RD 0x01
+UMASK_PREEMPTION_RD_PREEMPT_WR 0x02
+
+EVENT_PRE_COUNT 0x02 MBOX
+UMASK_PRE_COUNT_PAGE_MISS 0x01
+UMASK_PRE_COUNT_PAGE_CLOSE 0x02
+
+EVENT_RPQ_CYCLES_FULL 0x12 MBOX
+UMASK_RPQ_CYCLES_FULL 0x00
+
+EVENT_RPQ_CYCLES_NE 0x11 MBOX
+UMASK_RPQ_CYCLES_NE 0x00
+
+EVENT_RPQ_INSERTS 0x10 MBOX
+UMASK_RPQ_INSERTS 0x00
+
+EVENT_RPQ_OCCUPANCY 0x80 MBOX
+UMASK_RPQ_OCCUPANCY 0x00
+
+EVENT_WPQ_CYCLES_FULL 0x22 MBOX
+UMASK_WPQ_CYCLES_FULL 0x00
+
+EVENT_WPQ_CYCLES_NE 0x21 MBOX
+UMASK_WPQ_CYCLES_NE 0x00
+
+EVENT_WPQ_INSERTS 0x20 MBOX
+UMASK_WPQ_INSERTS 0x00
+
+EVENT_WPQ_OCCUPANCY 0x81 MBOX
+UMASK_WPQ_OCCUPANCY 0x00
+
+EVENT_WPQ_READ_HIT 0x23 MBOX
+UMASK_WPQ_READ_HIT 0x00
+
+EVENT_WPQ_WRITE_HIT 0x24 MBOX
+UMASK_WPQ_WRITE_HIT 0x00
+
+EVENT_WBOX_CLOCKTICKS 0x00 WBOX
+UMASK_WBOX_CLOCKTICKS 0x00
+
+EVENT_CORE0_TRANSITION_CYCLES 0x03 WBOX
+UMASK_CORE0_TRANSITION_CYCLES 0x00
+
+EVENT_CORE1_TRANSITION_CYCLES 0x04 WBOX
+UMASK_CORE1_TRANSITION_CYCLES 0x00
+
+EVENT_CORE2_TRANSITION_CYCLES 0x05 WBOX
+UMASK_CORE2_TRANSITION_CYCLES 0x00
+
+EVENT_CORE3_TRANSITION_CYCLES 0x06 WBOX
+UMASK_CORE3_TRANSITION_CYCLES 0x00
+
+EVENT_CORE4_TRANSITION_CYCLES 0x07 WBOX
+UMASK_CORE4_TRANSITION_CYCLES 0x00
+
+EVENT_CORE5_TRANSITION_CYCLES 0x08 WBOX
+UMASK_CORE5_TRANSITION_CYCLES 0x00
+
+EVENT_CORE6_TRANSITION_CYCLES 0x09 WBOX
+UMASK_CORE6_TRANSITION_CYCLES 0x00
+
+EVENT_CORE7_TRANSITION_CYCLES 0x0A WBOX
+UMASK_CORE7_TRANSITION_CYCLES 0x00
+
+EVENT_DEMOTIONS_CORE0 0x1E WBOX
+OPTIONS_DEMOTIONS_CORE0 EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE0 0x00
+
+EVENT_DEMOTIONS_CORE1 0x1F WBOX
+OPTIONS_DEMOTIONS_CORE1 EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE1 0x00
+
+EVENT_DEMOTIONS_CORE2 0x20 WBOX
+OPTIONS_DEMOTIONS_CORE2 EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE2 0x00
+
+EVENT_DEMOTIONS_CORE3 0x21 WBOX
+OPTIONS_DEMOTIONS_CORE3 EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE3 0x00
+
+EVENT_DEMOTIONS_CORE4 0x22 WBOX
+OPTIONS_DEMOTIONS_CORE4 EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE4 0x00
+
+EVENT_DEMOTIONS_CORE5 0x23 WBOX
+OPTIONS_DEMOTIONS_CORE5 EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE5 0x00
+
+EVENT_DEMOTIONS_CORE6 0x24 WBOX
+OPTIONS_DEMOTIONS_CORE6 EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE6 0x00
+
+EVENT_DEMOTIONS_CORE7 0x25 WBOX
+OPTIONS_DEMOTIONS_CORE7 EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE7 0x00
+
+EVENT_FREQ_BAND0_CYCLES 0x0B WBOX
+OPTIONS_FREQ_BAND0_CYCLES EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND0_CYCLES 0x00
+
+EVENT_FREQ_BAND1_CYCLES 0x0C WBOX
+OPTIONS_FREQ_BAND1_CYCLES EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND1_CYCLES 0x00
+
+EVENT_FREQ_BAND2_CYCLES 0x0D WBOX
+OPTIONS_FREQ_BAND2_CYCLES EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND2_CYCLES 0x00
+
+EVENT_FREQ_BAND3_CYCLES 0x0E WBOX
+OPTIONS_FREQ_BAND3_CYCLES EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND3_CYCLES 0x00
+
+EVENT_FREQ_MAX_CURRENT_CYCLES 0x07 WBOX
+UMASK_FREQ_MAX_CURRENT_CYCLES 0x00
+
+EVENT_FREQ_MAX_LIMIT_THERMAL_CYCLES 0x04 WBOX
+UMASK_FREQ_MAX_LIMIT_THERMAL_CYCLES 0x00
+
+EVENT_FREQ_MAX_POWER_CYCLES 0x05 WBOX
+UMASK_FREQ_MAX_POWER_CYCLES 0x00
+
+EVENT_FREQ_MAX_OS_CYCLES 0x06 WBOX
+UMASK_FREQ_MAX_OS_CYCLES 0x00
+
+EVENT_FREQ_MIN_IO_P_CYCLES 0x01 WBOX
+UMASK_FREQ_MIN_IO_P_CYCLES 0x00
+
+EVENT_FREQ_MIN_PERF_P_CYCLES 0x02 WBOX
+UMASK_FREQ_MIN_PERF_P_CYCLES 0x00
+
+EVENT_FREQ_TRANS_CYCLES 0x00 WBOX
+UMASK_FREQ_TRANS_CYCLES 0x00
+
+EVENT_MEMORY_PHASE_SHEDDING_CYCLES 0x2F WBOX
+UMASK_MEMORY_PHASE_SHEDDING_CYCLES 0x00
+
+EVENT_POWER_STATE_OCCUPANCY 0x80 WBOX
+UMASK_POWER_STATE_OCCUPANCY_CORES_C0 0x40
+UMASK_POWER_STATE_OCCUPANCY_CORES_C3 0x80
+UMASK_POWER_STATE_OCCUPANCY_CORES_C6 0xC0
+
+EVENT_PROCHOT_EXTERNAL_CYCLES 0x0A WBOX
+UMASK_PROCHOT_EXTERNAL_CYCLES 0x00
+
+EVENT_PROCHOT_INTERNAL_CYCLES 0x09 WBOX
+UMASK_PROCHOT_INTERNAL_CYCLES 0x00
+
+EVENT_TOTAL_TRANSITION_CYCLES 0x0B WBOX
+UMASK_TOTAL_TRANSITION_CYCLES 0x00
+
+EVENT_VOLT_TRANS_CYCLES_CHANGE 0x03 WBOX
+UMASK_VOLT_TRANS_CYCLES_CHANGE 0x00
+
+EVENT_VOLT_TRANS_CYCLES_DECREASE 0x02 WBOX
+UMASK_VOLT_TRANS_CYCLES_DECREASE 0x00
+
+EVENT_VOLT_TRANS_CYCLES_INCREASE 0x01 WBOX
+UMASK_VOLT_TRANS_CYCLES_INCREASE 0x00
+
+EVENT_VR_HOT_CYCLES 0x32 WBOX
+UMASK_VR_HOT_CYCLES 0x00
+
+EVENT_CORES_IN_C3 0x00 WBOXFIX0
+UMASK_CORES_IN_C3 0x00
+
+EVENT_CORES_IN_C6 0x00 WBOXFIX1
+UMASK_CORES_IN_C6 0x00
+
+EVENT_SBOX_CLOCKTICKS 0x14 SBOX
+UMASK_SBOX_CLOCKTICKS 0x00
+
+EVENT_CTO_COUNT 0x38 SBOX
+UMASK_CTO_COUNT 0x00 0x200000
+
+EVENT_DIRECT2CORE 0x13 SBOX
+UMASK_DIRECT2CORE_SUCCESS 0x01
+UMASK_DIRECT2CORE_FAILURE_CREDITS 0x02
+UMASK_DIRECT2CORE_FAILURE_RBT 0x04
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT 0x08
+
+EVENT_L1_POWER_CYCLES 0x12 SBOX
+UMASK_L1_POWER_CYCLES 0x00
+
+EVENT_RXL0P_POWER_CYCLES 0x10 SBOX
+UMASK_RXL0P_POWER_CYCLES 0x00
+
+EVENT_RXL0_POWER_CYCLES 0x0F SBOX
+UMASK_RXL0_POWER_CYCLES 0x00
+
+EVENT_RXL_BYPASSED 0x09 SBOX
+UMASK_RXL_BYPASSED 0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VN0 0x1E SBOX
+UMASK_RXL_CREDITS_CONSUMED_VN0_DRS 0x01 0x200000
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCB 0x02 0x200000
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCS 0x04 0x200000
+UMASK_RXL_CREDITS_CONSUMED_VN0_HOM 0x08 0x200000
+UMASK_RXL_CREDITS_CONSUMED_VN0_SNP 0x10 0x200000
+UMASK_RXL_CREDITS_CONSUMED_VN0_NDR 0x20 0x200000
+
+EVENT_RXL_CREDITS_CONSUMED_VNA 0x1D SBOX
+UMASK_RXL_CREDITS_CONSUMED_VNA 0x00 0x200000
+
+EVENT_RXL_FLITS_G0 0x01 SBOX
+UMASK_RXL_FLITS_G0_IDLE 0x01
+UMASK_RXL_FLITS_G0_DATA 0x02
+UMASK_RXL_FLITS_G0_NON_DATA 0x04
+
+EVENT_RXL_FLITS_G1 0x02 SBOX
+UMASK_RXL_FLITS_G1_SNP 0x01 0x200000
+UMASK_RXL_FLITS_G1_HOM_REQ 0x02 0x200000
+UMASK_RXL_FLITS_G1_HOM_NONREQ 0x04 0x200000
+UMASK_RXL_FLITS_G1_HOM 0x06 0x200000
+UMASK_RXL_FLITS_G1_DRS_DATA 0x08 0x200000
+UMASK_RXL_FLITS_G1_DRS_NONDATA 0x10 0x200000
+UMASK_RXL_FLITS_G1_DRS 0x18 0x200000
+
+EVENT_RXL_FLITS_G2 0x03 SBOX
+UMASK_RXL_FLITS_G2_NDR_AD 0x01 0x200000
+UMASK_RXL_FLITS_G2_NDR_AK 0x02 0x200000
+UMASK_RXL_FLITS_G2_NCB_DATA 0x04 0x200000
+UMASK_RXL_FLITS_G2_NCB_NODATA 0x08 0x200000
+UMASK_RXL_FLITS_G2_NCB 0x0C 0x200000
+UMASK_RXL_FLITS_G2_NCS 0x10 0x200000
+
+EVENT_RXL_INSERTS 0x08 SBOX
+UMASK_RXL_INSERTS 0x00
+
+EVENT_RXL_INSERTS_DRS 0x09 SBOX
+UMASK_RXL_INSERTS_DRS 0x00 0x200000
+
+EVENT_RXL_INSERTS_HOM 0x0C SBOX
+UMASK_RXL_INSERTS_HOM 0x00 0x200000
+
+EVENT_RXL_INSERTS_NCB 0x0A SBOX
+UMASK_RXL_INSERTS_NCB 0x00 0x200000
+
+EVENT_RXL_INSERTS_NCS 0x0B SBOX
+UMASK_RXL_INSERTS_NCS 0x00 0x200000
+
+EVENT_RXL_INSERTS_NDR 0x0E SBOX
+UMASK_RXL_INSERTS_NDR 0x00 0x200000
+
+EVENT_RXL_INSERTS_SNP 0x0D SBOX
+UMASK_RXL_INSERTS_SNP 0x00 0x200000
+
+EVENT_RXL_OCCUPANCY 0x0B SBOX
+UMASK_RXL_OCCUPANCY 0x00
+
+EVENT_RXL_OCCUPANCY_DRS 0x15 SBOX
+UMASK_RXL_OCCUPANCY_DRS 0x00 0x200000
+
+EVENT_RXL_OCCUPANCY_HOM 0x18 SBOX
+UMASK_RXL_OCCUPANCY_HOM 0x00 0x200000
+
+EVENT_RXL_OCCUPANCY_NCB 0x16 SBOX
+UMASK_RXL_OCCUPANCY_NCB 0x00 0x200000
+
+EVENT_RXL_OCCUPANCY_NCS 0x17 SBOX
+UMASK_RXL_OCCUPANCY_NCS 0x00 0x200000
+
+EVENT_RXL_OCCUPANCY_NDR 0x1A SBOX
+UMASK_RXL_OCCUPANCY_NDR 0x00 0x200000
+
+EVENT_RXL_OCCUPANCY_SNP 0x19 SBOX
+UMASK_RXL_OCCUPANCY_SNP 0x00 0x200000
+
+EVENT_TXL0P_POWER_CYCLES 0x0D SBOX
+UMASK_TXL0P_POWER_CYCLES 0x00
+
+EVENT_TXL0_POWER_CYCLES 0x0C SBOX
+UMASK_TXL0_POWER_CYCLES 0x00
+
+EVENT_TXL_BYPASSED 0x05 SBOX
+UMASK_TXL_BYPASSED 0x00
+
+EVENT_TXL_CYCLES_NE 0x06 SBOX
+UMASK_TXL_CYCLES_NE 0x00
+
+EVENT_TXL_FLITS_G0 0x00 SBOX
+UMASK_TXL_FLITS_G0_IDLE 0x01
+UMASK_TXL_FLITS_G0_DATA 0x02
+UMASK_TXL_FLITS_G0_NON_DATA 0x04
+
+EVENT_TXL_FLITS_G1 0x00 SBOX
+UMASK_TXL_FLITS_G1_SNP 0x01 0x200000
+UMASK_TXL_FLITS_G1_HOM_REQ 0x02 0x200000
+UMASK_TXL_FLITS_G1_HOM_NONREQ 0x04 0x200000
+UMASK_TXL_FLITS_G1_HOM 0x06 0x200000
+UMASK_TXL_FLITS_G1_DRS_DATA 0x08 0x200000
+UMASK_TXL_FLITS_G1_DRS_NONDATA 0x10 0x200000
+UMASK_TXL_FLITS_G1_DRS 0x18 0x200000
+
+EVENT_TXL_FLITS_G2 0x01 SBOX
+UMASK_TXL_FLITS_G2_NDR_AD 0x01 0x200000
+UMASK_TXL_FLITS_G2_NDR_AK 0x02 0x200000
+UMASK_TXL_FLITS_G2_NCB_DATA 0x04 0x200000
+UMASK_TXL_FLITS_G2_NCB_NODATA 0x08 0x200000
+UMASK_TXL_FLITS_G2_NCB 0x0C 0x200000
+UMASK_TXL_FLITS_G2_NCS 0x10 0x200000
+
+EVENT_TXL_INSERTS 0x04 SBOX
+UMASK_TXL_INSERTS 0x00
+
+EVENT_TXL_OCCUPANCY 0x07 SBOX
+UMASK_TXL_OCCUPANCY 0x00
+
+EVENT_VNA_CREDIT_RETURNS 0x1C SBOX
+UMASK_VNA_CREDIT_RETURNS 0x00 0x200000
+
+EVENT_VNA_CREDIT_RETURN_OCCUPANCY 0x1B SBOX
+UMASK_VNA_CREDIT_RETURN_OCCUPANCY 0x00 0x200000
+
+EVENT_QPI_RATE 0x00 SBOX0FIX|SBOX1FIX
+UMASK_QPI_RATE 0x00
+
+EVENT_QPI_SLOW_MODE 0x01 SBOX0FIX|SBOX1FIX
+UMASK_QPI_SLOW_MODE 0x00
+
+EVENT_PBOX_CLOCKTICKS 0x01 PBOX
+UMASK_PBOX_CLOCKTICKS 0x00
+
+EVENT_RING_AD_USED 0x07 PBOX
+UMASK_RING_AD_USED_CW_EVEN 0x01
+UMASK_RING_AD_USED_CW_ODD 0x02
+UMASK_RING_AD_USED_CCW_EVEN 0x04
+UMASK_RING_AD_USED_CCW_ODD 0x08
+
+EVENT_RING_AK_USED 0x08 PBOX
+UMASK_RING_AK_USED_CW_EVEN 0x01
+UMASK_RING_AK_USED_CW_ODD 0x02
+UMASK_RING_AK_USED_CCW_EVEN 0x04
+UMASK_RING_AK_USED_CCW_ODD 0x08
+
+EVENT_RING_BL_USED 0x09 PBOX
+UMASK_RING_BL_USED_CW_EVEN 0x01
+UMASK_RING_BL_USED_CW_ODD 0x02
+UMASK_RING_BL_USED_CCW_EVEN 0x04
+UMASK_RING_BL_USED_CCW_ODD 0x08
+
+EVENT_RING_IV_USED 0x0A PBOX
+UMASK_RING_IV_USED_ANY 0x0F
+
+EVENT_RXR_AK_BOUNCES 0x12 PBOX0
+UMASK_RXR_AK_BOUNCES 0x00
+
+EVENT_RXR_CYCLES_NE 0x10 PBOX0|PBOX1
+UMASK_RXR_CYCLES_NE_DRS 0x08
+UMASK_RXR_CYCLES_NE_NCB 0x10
+UMASK_RXR_CYCLES_NE_NCS 0x20
+
+EVENT_TXR_CYCLES_FULL 0x25 PBOX0
+UMASK_TXR_CYCLES_FULL_AD 0x01
+UMASK_TXR_CYCLES_FULL_AK 0x02
+UMASK_TXR_CYCLES_FULL_BL 0x04
+
+EVENT_TXR_CYCLES_NE 0x23 PBOX0
+UMASK_TXR_CYCLES_NE_AD 0x01
+UMASK_TXR_CYCLES_NE_AK 0x02
+UMASK_TXR_CYCLES_NE_BL 0x04
+
+EVENT_TXR_INSERTS 0x24 PBOX0
+UMASK_TXR_INSERTS 0x00
+
+EVENT_RBOX_CLOCKTICKS 0x01 RBOX
+UMASK_RBOX_CLOCKTICKS 0x00
+
+EVENT_IIO_CREDITS_ACQUIRED 0x20 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_IIO_CREDITS_ACQUIRED_DRS 0x08
+UMASK_IIO_CREDITS_ACQUIRED_NCB 0x10
+UMASK_IIO_CREDITS_ACQUIRED_NCS 0x20
+
+EVENT_IIO_CREDITS_REJECT 0x21 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_IIO_CREDITS_REJECT_DRS 0x08
+UMASK_IIO_CREDITS_REJECT_NCB 0x10
+UMASK_IIO_CREDITS_REJECT_NCS 0x20
+
+EVENT_IIO_CREDITS_USED 0x22 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_IIO_CREDITS_USED_DRS 0x08
+UMASK_IIO_CREDITS_USED_NCB 0x10
+UMASK_IIO_CREDITS_USED_NCS 0x20
+
+EVENT_RING_AD_USED 0x07 RBOX
+UMASK_RING_AD_USED_CW_EVEN 0x01
+UMASK_RING_AD_USED_CW_ODD 0x02
+UMASK_RING_AD_USED_CCW_EVEN 0x04
+UMASK_RING_AD_USED_CCW_ODD 0x08
+
+EVENT_RING_AK_USED 0x08 RBOX
+UMASK_RING_AK_USED_CW_EVEN 0x01
+UMASK_RING_AK_USED_CW_ODD 0x02
+UMASK_RING_AK_USED_CCW_EVEN 0x04
+UMASK_RING_AK_USED_CCW_ODD 0x08
+
+EVENT_RING_BL_USED 0x09 RBOX
+UMASK_RING_BL_USED_CW_EVEN 0x01
+UMASK_RING_BL_USED_CW_ODD 0x02
+UMASK_RING_BL_USED_CCW_EVEN 0x04
+UMASK_RING_BL_USED_CCW_ODD 0x08
+
+EVENT_RING_IV_USED 0x0A RBOX
+UMASK_RING_IV_USED_ANY 0x0F
+
+EVENT_RXR_BYPASSED 0x12 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_BYPASSED 0x00
+
+EVENT_RXR_CYCLES_NE 0x10 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_CYCLES_NE_HOM 0x01
+UMASK_RXR_CYCLES_NE_SNP 0x02
+UMASK_RXR_CYCLES_NE_NDR 0x04
+UMASK_RXR_CYCLES_NE_DRS 0x08
+UMASK_RXR_CYCLES_NE_NCB 0x10
+UMASK_RXR_CYCLES_NE_NCS 0x20
+
+EVENT_RXR_INSERTS 0x11 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_INSERTS_HOM 0x01
+UMASK_RXR_INSERTS_SNP 0x02
+UMASK_RXR_INSERTS_NDR 0x04
+UMASK_RXR_INSERTS_DRS 0x08
+UMASK_RXR_INSERTS_NCB 0x10
+UMASK_RXR_INSERTS_NCS 0x20
+
+EVENT_RXR_OCCUPANCY 0x13 RBOX0C0|RBOX1C0
+UMASK_RXR_OCCUPANCY_HOM 0x01
+UMASK_RXR_OCCUPANCY_SNP 0x02
+UMASK_RXR_OCCUPANCY_NDR 0x04
+UMASK_RXR_OCCUPANCY_DRS 0x08
+UMASK_RXR_OCCUPANCY_NCB 0x10
+UMASK_RXR_OCCUPANCY_NCS 0x20
+
+EVENT_TXR_CYCLES_FULL 0x25 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_CYCLES_FULL 0x00
+
+EVENT_TXR_CYCLES_NE 0x23 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_CYCLES_NE 0x00
+
+EVENT_TXR_INSERTS 0x24 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1|PBOX0
+UMASK_TXR_INSERTS_HOM 0x01
+UMASK_TXR_INSERTS_SNP 0x02
+UMASK_TXR_INSERTS_NDR 0x04
+UMASK_TXR_INSERTS_DRS 0x08
+UMASK_TXR_INSERTS_NCB 0x10
+UMASK_TXR_INSERTS_NCS 0x20
+
+EVENT_TXR_NACK 0x26 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_NACK 0x00
+
+EVENT_VN0_CREDITS_REJECT 0x37 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_REJECT_HOM 0x01
+UMASK_VN0_CREDITS_REJECT_SNP 0x02
+UMASK_VN0_CREDITS_REJECT_NDR 0x04
+UMASK_VN0_CREDITS_REJECT_DRS 0x08
+UMASK_VN0_CREDITS_REJECT_NCB 0x10
+UMASK_VN0_CREDITS_REJECT_NCS 0x20
+
+EVENT_VN0_CREDITS_USED 0x36 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_USED_HOM 0x01
+UMASK_VN0_CREDITS_USED_SNP 0x02
+UMASK_VN0_CREDITS_USED_NDR 0x04
+UMASK_VN0_CREDITS_USED_DRS 0x08
+UMASK_VN0_CREDITS_USED_NCB 0x10
+UMASK_VN0_CREDITS_USED_NCS 0x20
+
+EVENT_VNA_CREDITS_ACQUIRED 0x33 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_ACQUIRED 0x00
+
+EVENT_VNA_CREDITS_REJECT 0x34 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_REJECT_HOM 0x01
+UMASK_VNA_CREDITS_REJECT_SNP 0x02
+UMASK_VNA_CREDITS_REJECT_NDR 0x04
+UMASK_VNA_CREDITS_REJECT_DRS 0x08
+UMASK_VNA_CREDITS_REJECT_NCB 0x10
+UMASK_VNA_CREDITS_REJECT_NCS 0x20
+
+EVENT_VNA_CREDITS_CYCLES_OUT 0x31 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_CYCLES_OUT 0x00
+
+EVENT_VNA_CREDITS_CYCLES_USED 0x32 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_CYCLES_USED 0x00
+
+EVENT_EVENT_MSG 0x42 UBOX
+UMASK_EVENT_MSG_VLW_RCVD 0x01
+UMASK_EVENT_MSG_MSI_RCVD 0x02
+UMASK_EVENT_MSG_IPI_RCVD 0x04
+UMASK_EVENT_MSG_DOORBELL_RCVD 0x08
+UMASK_EVENT_MSG_INT_PRIO 0x10
+
+EVENT_LOCK_CYCLES 0x44 UBOX
+UMASK_LOCK_CYCLES 0x00
+
+EVENT_UNCORE_CLOCK 0x0 UBOXFIX
+UMASK_UNCORE_CLOCK 0x0
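In the event list above, each EVENT_* line carries the raw event code and the
counter types it may be programmed on, the following UMASK_* lines carry the
unit mask (the SBOX and OFFCORE_RESPONSE entries add extra match columns),
and DEFAULT_OPTIONS_* lines preset fields such as edge detect, invert and
threshold. As a rough illustration of how one such pair is folded into a
control-register value, the sketch below uses the generic Intel core
IA32_PERFEVTSELx layout; it is not likwid's encoder, and the uncore PMON
control registers use a similar but not identical layout.

#include <stdint.h>
#include <stdio.h>

/* Generic IA32_PERFEVTSELx layout: event select in bits 7:0, unit mask in
 * bits 15:8, USR bit 16, OS bit 17, EDGE bit 18, EN bit 22, INV bit 23 and
 * the counter mask (threshold) in bits 31:24. Sketch only. */
static uint64_t encode_perfevtsel(uint8_t event, uint8_t umask, int user,
                                  int kernel, int edge, int invert, uint8_t cmask)
{
    uint64_t v = (uint64_t)event | ((uint64_t)umask << 8);
    if (user)   v |= 1ULL << 16;
    if (kernel) v |= 1ULL << 17;
    if (edge)   v |= 1ULL << 18;
    v |= 1ULL << 22;                          /* enable the counter        */
    if (invert) v |= 1ULL << 23;
    v |= (uint64_t)cmask << 24;               /* threshold / counter mask  */
    return v;
}

int main(void)
{
    /* EVENT_MEM_UOPS_RETIRED 0xD0 with UMASK_MEM_UOPS_RETIRED_LOADS 0x81 */
    printf("PERFEVTSEL = 0x%llx\n",
           (unsigned long long)encode_perfevtsel(0xD0, 0x81, 1, 0, 0, 0, 0));
    return 0;
}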
diff --git a/src/includes/perfmon_sandybridge_counters.h b/src/includes/perfmon_sandybridge_counters.h
index afe9c04..e8dca5b 100644
--- a/src/includes/perfmon_sandybridge_counters.h
+++ b/src/includes/perfmon_sandybridge_counters.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_sandybridge_counters.h
*
- * Description: Counter header file of perfmon module for Sandy Bridge.
+ * Description: Counter header file of perfmon module for Intel Sandy Bridge.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -28,48 +29,60 @@
* =======================================================================================
*/
-#define NUM_COUNTERS_SANDYBRIDGE 32
-#define NUM_COUNTERS_UNCORE_SANDYBRIDGE 12
+
#define NUM_COUNTERS_CORE_SANDYBRIDGE 8
+#define NUM_COUNTERS_UNCORE_SANDYBRIDGE 15
+#define NUM_COUNTERS_SANDYBRIDGE 23
+
+#define SNB_VALID_OPTIONS_FIXED EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK
+#define SNB_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK| \
+ EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define SNB_VALID_OPTIONS_CBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
+#define SNB_VALID_OPTIONS_UBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
+
-static PerfmonCounterMap sandybridge_counter_map[NUM_COUNTERS_SANDYBRIDGE] = {
+static RegisterMap sandybridge_counter_map[NUM_COUNTERS_SANDYBRIDGE] = {
/* Fixed Counters: instructions retired, cycles unhalted core */
- {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
- {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
- {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+ {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, SNB_VALID_OPTIONS_FIXED},
+ {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, SNB_VALID_OPTIONS_FIXED},
+ {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, SNB_VALID_OPTIONS_FIXED},
/* PMC Counters: 4 48bit wide */
- {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
- {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
- {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0},
- {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0},
+ {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, SNB_VALID_OPTIONS_PMC},
+ {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, SNB_VALID_OPTIONS_PMC},
+ {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, SNB_VALID_OPTIONS_PMC},
+ {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, SNB_VALID_OPTIONS_PMC},
/* Temperature Sensor*/
- {"TMP0", PMC7, THERMAL, 0, 0, 0, 0},
+ {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
/* RAPL counters */
- {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
- {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0},
- {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0},
- {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0},
- /* IMC Counters: 4 48bit wide per memory channel, split in two reads */
- {"MBOX0C0",PMC12, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_0},
- {"MBOX0C1",PMC13, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_0},
- {"MBOX0C2",PMC14, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_0},
- {"MBOX0C3",PMC15, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_0},
- {"MBOX1C0",PMC16, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_1},
- {"MBOX1C1",PMC17, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_1},
- {"MBOX1C2",PMC18, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_1},
- {"MBOX1C3",PMC19, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_1},
- {"MBOX2C0",PMC20, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_2},
- {"MBOX2C1",PMC21, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_2},
- {"MBOX2C2",PMC22, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_2},
- {"MBOX2C3",PMC23, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_2},
- {"MBOX3C0",PMC24, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_3},
- {"MBOX3C1",PMC25, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_3},
- {"MBOX3C2",PMC26, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_3},
- {"MBOX3C3",PMC27, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_3},
- {"MBOX0FIX",PMC28, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_0},
- {"MBOX1FIX",PMC29, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_1},
- {"MBOX2FIX",PMC30, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_2},
- {"MBOX3FIX",PMC31, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_3},
+ {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"CBOX0C0", PMC12, CBOX0, MSR_UNC_CBO_0_PERFEVTSEL0, MSR_UNC_CBO_0_CTR0, 0, 0, SNB_VALID_OPTIONS_CBOX},
+ {"CBOX0C1", PMC13, CBOX0, MSR_UNC_CBO_0_PERFEVTSEL1, MSR_UNC_CBO_0_CTR1, 0, 0, SNB_VALID_OPTIONS_CBOX},
+ {"CBOX1C0", PMC14, CBOX1, MSR_UNC_CBO_1_PERFEVTSEL0, MSR_UNC_CBO_1_CTR0, 0, 0, SNB_VALID_OPTIONS_CBOX},
+ {"CBOX1C1", PMC15, CBOX1, MSR_UNC_CBO_1_PERFEVTSEL1, MSR_UNC_CBO_1_CTR1, 0, 0, SNB_VALID_OPTIONS_CBOX},
+ {"CBOX2C0", PMC16, CBOX2, MSR_UNC_CBO_2_PERFEVTSEL0, MSR_UNC_CBO_2_CTR0, 0, 0, SNB_VALID_OPTIONS_CBOX},
+ {"CBOX2C1", PMC17, CBOX2, MSR_UNC_CBO_2_PERFEVTSEL1, MSR_UNC_CBO_2_CTR1, 0, 0, SNB_VALID_OPTIONS_CBOX},
+ {"CBOX3C0", PMC18, CBOX3, MSR_UNC_CBO_3_PERFEVTSEL0, MSR_UNC_CBO_3_CTR0, 0, 0, SNB_VALID_OPTIONS_CBOX},
+ {"CBOX3C1", PMC19, CBOX3, MSR_UNC_CBO_3_PERFEVTSEL1, MSR_UNC_CBO_3_CTR1, 0, 0, SNB_VALID_OPTIONS_CBOX},
+ {"UBOX0", PMC20, UBOX, MSR_UNC_ARB_PERFEVTSEL0, MSR_UNC_ARB_CTR0, 0, 0, SNB_VALID_OPTIONS_UBOX},
+ {"UBOX1", PMC21, UBOX, MSR_UNC_ARB_PERFEVTSEL1, MSR_UNC_ARB_CTR1, 0, 0, SNB_VALID_OPTIONS_UBOX},
+ {"UBOXFIX", PMC22, UBOXFIX, MSR_UNC_PERF_FIXED_CTRL, MSR_UNC_PERF_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
};
+static BoxMap sandybridge_box_map[NUM_UNITS] = {
+ [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 48},
+ [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 48},
+ [THERMAL] = {0, 0, 0, 0, 0, MSR_DEV, 8},
+ [POWER] = {0, 0, 0, 0, 0, MSR_DEV, 32},
+ [CBOX0] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+ [CBOX1] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+ [CBOX2] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+ [CBOX3] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+ [UBOX] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 1, 0, MSR_DEV, 44},
+ [UBOXFIX] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 44},
+};
+
+
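As a reading aid for the widened counter table above: each sandybridge_counter_map entry now carries eight values ending in a per-counter option mask, and the new BoxMap table adds the global control/status/overflow registers plus a counter width. A minimal sketch of how one entry decomposes; the struct and field names below are assumptions chosen to mirror the initializer order visible in the diff, not copied from the likwid headers.

#include <stdint.h>

/* Sketch only: field names are illustrative, following the order of
 * {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0,
 *  SNB_VALID_OPTIONS_PMC} in the table above. */
typedef struct {
    const char *key;              /* counter name, e.g. "PMC0"             */
    int         index;            /* RegisterIndex, e.g. PMC3              */
    int         type;             /* RegisterType, e.g. PMC, CBOX0, UBOX   */
    uint64_t    configRegister;   /* e.g. MSR_PERFEVTSEL0                  */
    uint64_t    counterRegister;  /* e.g. MSR_PMC0                         */
    uint64_t    counterRegister2; /* second read register, 0 if unused     */
    int         device;           /* device index, 0 for core MSR counters */
    uint64_t    optionMask;       /* allowed options, e.g. the PMC mask    */
} CounterMapSketch;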
diff --git a/src/includes/perfmon_sandybridge_events.txt b/src/includes/perfmon_sandybridge_events.txt
index ec4d397..8bab52b 100644
--- a/src/includes/perfmon_sandybridge_events.txt
+++ b/src/includes/perfmon_sandybridge_events.txt
@@ -1,16 +1,17 @@
# =======================================================================================
-#
+#
# Filename: perfmon_sandybridge_events.txt
-#
+#
# Description: Event list for Intel SandyBridge
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -26,8 +27,8 @@
#
# =======================================================================================
-EVENT_TEMP_CORE 0x00 TMP0
-UMASK_TEMP_CORE 0x00
+EVENT_TEMP_CORE 0x00 TMP0
+UMASK_TEMP_CORE 0x00
EVENT_PWR_PKG_ENERGY 0x00 PWR0
UMASK_PWR_PKG_ENERGY 0x00
@@ -35,17 +36,20 @@ UMASK_PWR_PKG_ENERGY 0x00
EVENT_PWR_PP0_ENERGY 0x00 PWR1
UMASK_PWR_PP0_ENERGY 0x00
-EVENT_PWR_DRAM_ENERGY 0x00 PWR3
-UMASK_PWR_DRAM_ENERGY 0x00
+EVENT_PWR_PP1_ENERGY 0x00 PWR2
+UMASK_PWR_PP1_ENERGY 0x00
-EVENT_INSTR_RETIRED 0x00 FIXC0
-UMASK_INSTR_RETIRED_ANY 0x00
+EVENT_PWR_DRAM_ENERGY 0x00 PWR3
+UMASK_PWR_DRAM_ENERGY 0x00
-EVENT_CPU_CLK_UNHALTED 0x00 FIXC1
-UMASK_CPU_CLK_UNHALTED_CORE 0x00
+EVENT_INSTR_RETIRED 0x00 FIXC0
+UMASK_INSTR_RETIRED_ANY 0x00
-EVENT_CPU_CLK_UNHALTED 0x00 FIXC2
-UMASK_CPU_CLK_UNHALTED_REF 0x00
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE 0x00
+
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC2
+UMASK_CPU_CLK_UNHALTED_REF 0x00
EVENT_LOAD_BLOCKS 0x03 PMC
UMASK_LOAD_BLOCKS_DATA_UNKNOWN 0x01
@@ -58,121 +62,144 @@ UMASK_MISALIGN_MEM_REF_LOAD 0x01
UMASK_MISALIGN_MEM_REF_STORE 0x02
UMASK_MISALIGN_MEM_REF_ANY 0x03
-EVENT_LD_BLOCKS_PARTIAL 0x07 PMC
-UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS 0x01 PMC
-UMASK_LD_BLOCKS_PARTIAL_ALL_STA_BLOCK 0x08 PMC
+EVENT_LD_BLOCKS_PARTIAL 0x07 PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS 0x01
+UMASK_LD_BLOCKS_PARTIAL_ALL_STA_BLOCK 0x08
-EVENT_DTLB_LOAD_MISSES 0x08 PMC
+EVENT_DTLB_LOAD_MISSES 0x08 PMC
UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK 0x01
UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED 0x02
-UMASK_DTLB_LOAD_MISSES_WALK_DURATION 0x04
-
-EVENT_INT_MISC 0x0D PMC
-UMASK_INT_MISC_RECOVERY_CYCLES 0x03 0x41 0x01
-UMASK_INT_MISC_STALL_CYCLES 0x40
-
-EVENT_UOPS_ISSUED 0x0E PMC
-UMASK_UOPS_ISSUED_ANY 0x01
-
-EVENT_FP_COMP_OPS_EXE 0x10 PMC
-UMASK_FP_COMP_OPS_EXE_X87 0x01
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION 0x04
+
+EVENT_INT_MISC 0x0D PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_INT_MISC_RECOVERY_CYCLES 0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_RECOVERY_COUNT 0x03
+UMASK_INT_MISC_RAT_STALL_CYCLES 0x40
+DEFAULT_OPTIONS_INT_MISC_RAT_STALL_COUNT EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_RAT_STALL_COUNT 0x40
+
+EVENT_UOPS_ISSUED 0x0E PMC
+UMASK_UOPS_ISSUED_ANY 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_ACTIVE_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ANY 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ACTIVE_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_FP_COMP_OPS_EXE 0x10 PMC
+UMASK_FP_COMP_OPS_EXE_X87 0x01
UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE 0x10
UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE 0x20
UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE 0x40
UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE 0x80
-EVENT_SIMD_FP_256_PACKED 0x11 PMC
+EVENT_SIMD_FP_256_PACKED 0x11 PMC
UMASK_SIMD_FP_256_PACKED_SINGLE 0x01
UMASK_SIMD_FP_256_PACKED_DOUBLE 0x02
EVENT_ARITH 0x14 PMC
UMASK_ARITH_FPU_DIV_ACTIVE 0x01
-UMASK_ARITH_NUM_DIV 0x01 0xC5 0x01
+DEFAULT_OPTIONS_ARITH_NUM_DIV EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_ARITH_NUM_DIV 0x01
EVENT_INSTS_WRITTEN_TO_IQ 0x17 PMC
-UMASK_INSTS_WRITTEN_TO_IQ_INSTS 0x01
-
-EVENT_L2_RQSTS 0x24 PMC
-UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_HIT 0x01
-UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD 0x03
-UMASK_L2_RQSTS_RFO_HITS 0x04
-UMASK_L2_RQSTS_RFO_MISS 0x08
-UMASK_L2_RQSTS_RFO_ANY 0x0C
-UMASK_L2_RQSTS_CODE_RD_HITS 0x10
-UMASK_L2_RQSTS_CODE_RD_MISS 0x20
-UMASK_L2_RQSTS_ALL_CODE_CODE_RD 0x30
-UMASK_L2_RQSTS_PF_HIT 0x40
-UMASK_L2_RQSTS_PF_MISS 0x80
-UMASK_L2_RQSTS_ALL_PF 0xC0
-UMASK_L2_RQSTS_MISS 0xAA
+UMASK_INSTS_WRITTEN_TO_IQ_INSTS 0x01
+
+EVENT_L2_RQSTS 0x24 PMC
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_HIT 0x01
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_MISS 0x02
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD 0x03
+UMASK_L2_RQSTS_RFO_HITS 0x04
+UMASK_L2_RQSTS_RFO_MISS 0x08
+UMASK_L2_RQSTS_RFO_ANY 0x0C
+UMASK_L2_RQSTS_CODE_RD_HITS 0x10
+UMASK_L2_RQSTS_CODE_RD_MISS 0x20
+UMASK_L2_RQSTS_ALL_CODE_CODE_RD 0x30
+UMASK_L2_RQSTS_PF_HIT 0x40
+UMASK_L2_RQSTS_PF_MISS 0x80
+UMASK_L2_RQSTS_ALL_PF 0xC0
+UMASK_L2_RQSTS_MISS 0xAA
EVENT_L2_STORE_LOCK_RQSTS 0x27 PMC
UMASK_L2_STORE_LOCK_RQSTS_MISS 0x01
-UMASK_L2_STORE_LOCK_RQSTS_HIT_E 0x04
-UMASK_L2_STORE_LOCK_RQSTS_HIT_M 0x08
+UMASK_L2_STORE_LOCK_RQSTS_HIT_E 0x04
+UMASK_L2_STORE_LOCK_RQSTS_HIT_M 0x08
UMASK_L2_STORE_LOCK_RQSTS_ALL 0x0F
EVENT_L1D_WB_RQST 0x28 PMC
-UMASK_L1D_WB_RQST_HIT_E 0x04
-UMASK_L1D_WB_RQST_HIT_M 0x08
+UMASK_L1D_WB_RQST_HIT_E 0x04
+UMASK_L1D_WB_RQST_HIT_M 0x08
EVENT_L3_LAT_CACHE 0x2E PMC
UMASK_L3_LAT_CACHE_REFERENCE 0x4F
UMASK_L3_LAT_CACHE_MISS 0x41
-EVENT_CPU_CLOCK_UNHALTED 0x3C PMC
+EVENT_CPU_CLOCK_UNHALTED 0x3C PMC
UMASK_CPU_CLOCK_UNHALTED_THREAD_P 0x00
UMASK_CPU_CLOCK_UNHALTED_REF_P 0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES 0x00
EVENT_L1D_PEND_MISS 0x48 PMC1
UMASK_L1D_PEND_MISS_PENDING 0x01
-EVENT_DTLB_STORE_MISSES 0x49 PMC
-UMASK_DTLB_STORE_MISSES_MISS_CAUSES_A_WALK 0x01
-UMASK_DTLB_STORE_MISSES_WALK_COMPLETED 0x02
-UMASK_DTLB_STORE_MISSES_WALK_DURATION 0x04
-UMASK_DTLB_STORE_MISSES_STLB_HIT 0x10
+EVENT_DTLB_STORE_MISSES 0x49 PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK 0x01
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED 0x02
+UMASK_DTLB_STORE_MISSES_WALK_DURATION 0x04
+UMASK_DTLB_STORE_MISSES_STLB_HIT 0x10
-EVENT_LOAD_HIT_PRE 0x4C PMC
+EVENT_LOAD_HIT_PRE 0x4C PMC
UMASK_LOAD_HIT_PRE_SW_PF 0x01
UMASK_LOAD_HIT_PRE_HW_PF 0x02
EVENT_HW_PRE_REQ 0x4E PMC
UMASK_HW_PRE_REQ_DL1_MISS 0x02
-EVENT_L1D 0x51 PMC
+EVENT_L1D 0x51 PMC
UMASK_L1D_REPLACEMENT 0x01
UMASK_L1D_ALLOCATED_IN_M 0x02
UMASK_L1D_M_EVICT 0x04
UMASK_L1D_ALL_M_REPLACEMENT 0x08
-EVENT_PARTIAL_RAT_STALLS 0x59 PMC
+EVENT_PARTIAL_RAT_STALLS 0x59 PMC
UMASK_PARTIAL_RAT_STALLS_FLAGS_MERGE_UOP 0x20
UMASK_PARTIAL_RAT_STALLS_SLOW_LEA_WINDOW 0x40
-UMASK_PARTIAL_RAT_STALLS_MUL_SINGLE_UOP 0x80
+UMASK_PARTIAL_RAT_STALLS_MUL_SINGLE_UOP 0x80
-EVENT_RESOURCE_STALLS2 0x5B PMC
+EVENT_RESOURCE_STALLS2 0x5B PMC
UMASK_RESOURCE_STALLS2_ALL_FL_EMPTY 0x0C
-UMASK_RESOURCE_STALLS2_ALL_PRF_CONTROL 0x0F
-UMASK_RESOURCE_STALLS2_BOB_FULL 0x40
-UMASK_RESOURCE_STALLS2_OOO_RSRC 0x4F
+UMASK_RESOURCE_STALLS2_ALL_PRF_CONTROL 0x0F
+UMASK_RESOURCE_STALLS2_BOB_FULL 0x40
+UMASK_RESOURCE_STALLS2_OOO_RSRC 0x4F
EVENT_CPL_CYCLES 0x5C PMC
-UMASK_CPL_CYCLES_RING0 0x01
-UMASK_CPL_CYCLES_RING123 0x02
+UMASK_CPL_CYCLES_RING0 0x01
+UMASK_CPL_CYCLES_RING123 0x02
-EVENT_RS_EVENTS 0x5E PMC
+EVENT_RS_EVENTS 0x5E PMC
UMASK_RS_EVENTS_EMPTY_CYCLES 0x01
-EVENT_OFFCORE_REQUESTS_OUTSTANDING 0x60 PMC
+EVENT_OFFCORE_REQUESTS_OUTSTANDING 0x60 PMC
UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD 0x01
UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD 0x02
-UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO 0x04
-UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD 0x08
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO 0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD 0x08
-EVENT_CACHE_LOCK_CYCLES 0x63 PMC
-UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
-UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION 0x02
+EVENT_CACHE_LOCK_CYCLES 0x63 PMC
+UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
+UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION 0x02
EVENT_IDQ 0x79 PMC
UMASK_IDQ_EMPTY 0x02
@@ -182,8 +209,8 @@ UMASK_IDQ_MS_DSB_UOPS 0x10
UMASK_IDQ_MS_MITE_UOPS 0x20
UMASK_IDQ_MS_UOPS 0x30
-EVENT_ICACHE 0x80 PMC
-UMASK_ICACHE_HITS 0x01
+EVENT_ICACHE 0x80 PMC
+UMASK_ICACHE_HITS 0x01
UMASK_ICACHE_MISSES 0x02
UMASK_ICACHE_ACCESSES 0x03
UMASK_ICACHE_IFETCH_STALL 0x04
@@ -192,54 +219,58 @@ EVENT_ITLB_MISSES 0x85 PMC
UMASK_ITLB_MISSES_CAUSES_A_WALK 0x01
UMASK_ITLB_MISSES_WALK_COMPLETED 0x02
UMASK_ITLB_MISSES_WALK_DURATION 0x04
-UMASK_ITLB_MISSES_STLB_HIT 0x10
+UMASK_ITLB_MISSES_STLB_HIT 0x10
EVENT_ILD_STALL 0x87 PMC
UMASK_ILD_STALL_LCP 0x01
UMASK_ILD_STALL_IQ_FULL 0x04
-EVENT_BR_INST_EXEC 0x88 PMC
-UMASK_BR_INST_EXEC_COND_TAKEN 0x81
-UMASK_BR_INST_EXEC_COND_NON_TAKEN 0x41
-UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN 0x82
-UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN 0x42
+EVENT_BR_INST_EXEC 0x88 PMC
+UMASK_BR_INST_EXEC_COND_TAKEN 0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN 0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN 0x82
+UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN 0x42
UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
-UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN 0x44
-UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN 0x88
-UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN 0x48
-UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
-UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN 0x50
-UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
-UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
-UMASK_BR_INST_EXEC_ALL_BRANCHES 0xFF
-
-EVENT_BR_MISP_EXEC 0x89 PMC
-UMASK_BR_MISP_EXEC_COND_TAKEN 0x81
-UMASK_BR_MISP_EXEC_COND_NON_TAKEN 0x41
-UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
-UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN 0x44
-UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN 0x88
-UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN 0x48
-UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
-UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN 0x50
-UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
-UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
-UMASK_BR_MISP_EXEC_ALL_BRANCHES 0xFF
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN 0x44
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN 0x88
+UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN 0x48
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN 0x50
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
+UMASK_BR_INST_EXEC_ALL_BRANCHES 0xFF
+
+EVENT_BR_MISP_EXEC 0x89 PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN 0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN 0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN 0x84
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN 0x44
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN 0x88
+UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN 0x48
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN 0x90
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN 0x50
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
+UMASK_BR_MISP_EXEC_ALL_BRANCHES 0xFF
EVENT_IDQ_UOPS_NOT_DELIVERED 0x9C PMC
UMASK_IDQ_UOPS_NOT_DELIVERED_CORE 0x01
-EVENT_UOPS_DISPATCHED_PORT 0xA1 PMC
+EVENT_UOPS_DISPATCHED_PORT 0xA1 PMC
UMASK_UOPS_DISPATCHED_PORT_PORT_0 0x01
UMASK_UOPS_DISPATCHED_PORT_PORT_1 0x02
UMASK_UOPS_DISPATCHED_PORT_PORT_2_LD 0x04
UMASK_UOPS_DISPATCHED_PORT_PORT_2_STA 0x08
UMASK_UOPS_DISPATCHED_PORT_PORT_2 0x0C
-UMASK_UOPS_DISPATCHED_PORT_PORT_3_LD 0x10
-UMASK_UOPS_DISPATCHED_PORT_PORT_3_STA 0x20
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_LD 0x10
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_STA 0x20
UMASK_UOPS_DISPATCHED_PORT_PORT_3 0x30
UMASK_UOPS_DISPATCHED_PORT_PORT_4 0x40
UMASK_UOPS_DISPATCHED_PORT_PORT_5 0x80
+DEFAULT_OPTIONS_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE 0x83
+UMASK_UOPS_DISPATCHED_PORT_DATA_PORTS 0x7C
+UMASK_UOPS_DISPATCHED_PORT_ALL_PORTS 0xFF
EVENT_RESOURCE_STALLS 0xA2 PMC
UMASK_RESOURCE_STALLS_ANY 0x01
@@ -258,48 +289,99 @@ UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
EVENT_DSB_FILL 0xAC PMC
UMASK_DSB_FILL_OTHER_CANCEL 0x02
UMASK_DSB_FILL_EXCEED_DSB_LINES 0x08
-UMASK_DSB_FILL_ALL_CANCEL 0x0A
+UMASK_DSB_FILL_ALL_CANCEL 0x0A
EVENT_ITLB 0xAE PMC
-UMASK_ITLB_ITLB_FLUSH 0x01
+UMASK_ITLB_ITLB_FLUSH 0x01
-EVENT_OFFCORE_REQUESTS 0xB0 PMC
+EVENT_OFFCORE_REQUESTS 0xB0 PMC
UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD 0x01
UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD 0x02
UMASK_OFFCORE_REQUESTS_DEMAND_RFO 0x04
UMASK_OFFCORE_REQUESTS_ALL_DATA_RD 0x08
-EVENT_UOPS_DISPATCHED 0xB1 PMC
-UMASK_UOPS_DISPATCHED_THREAD 0x01
-UMASK_UOPS_DISPATCHED_CORE 0x02
-
-EVENT_OFFCORE_REQUESTS_BUFFER 0xB2 PMC
+EVENT_UOPS_EXECUTED 0xB1 PMC
+UMASK_UOPS_EXECUTED_THREAD 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_OFFCORE_REQUESTS_BUFFER 0xB2 PMC
UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0x01
-EVENT_AGU_BYPASS_CANCEL 0xB6 PMC
+EVENT_AGU_BYPASS_CANCEL 0xB6 PMC
UMASK_AGU_BYPASS_CANCEL_COUNT 0x01
-EVENT_TLB_FLUSH 0xBD PMC
+EVENT_TLB_FLUSH 0xBD PMC
UMASK_TLB_FLUSH_DTLB_THREAD 0x01
UMASK_TLB_FLUSH_STLB_ANY 0x20
-EVENT_L1D_BLOCKS 0xBF PMC
-UMASK_L1D_BLOCKS_BANK_CONFLICT_CYCLES 0x05 0x41 0x01
+EVENT_L1D_BLOCKS 0xBF PMC
+DEFAULT_OPTIONS_L1D_BLOCKS_BANK_CONFLICT_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_BLOCKS_BANK_CONFLICT_CYCLES 0x05
+DEFAULT_OPTIONS_L1D_BLOCKS_BANK_CONFLICT_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_L1D_BLOCKS_BANK_CONFLICT_COUNT 0x05
-EVENT_INST_RETIRED 0xC0 PMC0
-UMASK_INST_RETIRED_ANY_P 0x00
+EVENT_INST_RETIRED 0xC0 PMC0
+UMASK_INST_RETIRED_ANY_P 0x00
UMASK_INST_RETIRED_PREC_DIST 0x01
-EVENT_OTHER_ASSISTS 0xC1 PMC
+EVENT_OTHER_ASSISTS 0xC1 PMC
UMASK_OTHER_ASSISTS_ITLB_MISS_RETIRED 0x02
UMASK_OTHER_ASSISTS_AVX_TO_SSE 0x10
UMASK_OTHER_ASSISTS_SSE_TO_AVX 0x20
-EVENT_UOPS_RETIRED 0xC2 PMC
-UMASK_UOPS_RETIRED_ALL 0x01
-UMASK_UOPS_RETIRED_RETIRE_SLOTS 0x02
-
-EVENT_MACHINE_CLEARS 0xC3 PMC
+EVENT_UOPS_RETIRED 0xC2 PMC
+UMASK_UOPS_RETIRED_ALL 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL 0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS__UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL 0x01
+DEFAULT_OPTIONS__UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_MACHINE_CLEARS 0xC3 PMC
UMASK_MACHINE_CLEARS_MEMORY_ORDERING 0x02
UMASK_MACHINE_CLEARS_SMC 0x04
UMASK_MACHINE_CLEARS_MASKMOV 0x20
@@ -308,7 +390,6 @@ EVENT_BR_INST_RETIRED 0xC4 PMC
UMASK_BR_INST_RETIRED_ALL_BRANCHES 0x00
UMASK_BR_INST_RETIRED_CONDITIONAL 0x01
UMASK_BR_INST_RETIRED_NEAR_CALL 0x02
-UMASK_BR_INST_RETIRED_ALL_BRANCHES 0x04
UMASK_BR_INST_RETIRED_NEAR_RETURN 0x08
UMASK_BR_INST_RETIRED_NOT_TAKEN 0x10
UMASK_BR_INST_RETIRED_NEAR_TAKEN 0x20
@@ -316,18 +397,17 @@ UMASK_BR_INST_RETIRED_FAR_BRANCH 0x40
EVENT_BR_MISP_RETIRED 0xC5 PMC
UMASK_BR_MISP_RETIRED_ALL_BRANCHES 0x00
-UMASK_BR_MISP_RETIRED_CONDITIONAL 0x01
+UMASK_BR_MISP_RETIRED_CONDITIONAL 0x01
UMASK_BR_MISP_RETIRED_NEAR_CALL 0x02
-UMASK_BR_MISP_RETIRED_ALL_BRANCHES 0x04
-UMASK_BR_MISP_RETIRED_NOT_TAKEN 0x10
-UMASK_BR_MISP_RETIRED_TAKEN 0x20
+UMASK_BR_MISP_RETIRED_NOT_TAKEN 0x10
+UMASK_BR_MISP_RETIRED_TAKEN 0x20
EVENT_FP_ASSIST 0xCA PMC
-UMASK_FP_ASSIST_X87_OUTPUT 0x02
-UMASK_FP_ASSIST_X87_INPUT 0x04
-UMASK_FP_ASSIST_SIMD_OUTPUT 0x08
-UMASK_FP_ASSIST_SIMD_INPUT 0x10
-UMASK_FP_ASSIST_ANY 0x1E
+UMASK_FP_ASSIST_X87_OUTPUT 0x02
+UMASK_FP_ASSIST_X87_INPUT 0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT 0x08
+UMASK_FP_ASSIST_SIMD_INPUT 0x10
+UMASK_FP_ASSIST_ANY 0x1E
EVENT_HW_INTERRUPTS_RECEIVED 0xCB PMC
UMASK_HW_INTERRUPTS_RECEIVED 0x01
@@ -335,29 +415,30 @@ UMASK_HW_INTERRUPTS_RECEIVED 0x01
EVENT_ROB_MISC_EVENT_LBR_INSERTS 0xCC PMC
UMASK_ROB_MISC_EVENT_LBR_INSERTS 0x20
-EVENT_MEM_UOP_RETIRED 0xD0 PMC
-UMASK_MEM_UOP_RETIRED_LOADS 0x81
-UMASK_MEM_UOP_RETIRED_STORES 0x82
-UMASK_MEM_UOP_RETIRED_LOADS_STLB_MISS 0x11
-UMASK_MEM_UOP_RETIRED_STORES_STLB_MISS 0x12
-UMASK_MEM_UOP_RETIRED_LOADS_LOCK 0x21
-UMASK_MEM_UOP_RETIRED_STORES_LOCK 0x22
-UMASK_MEM_UOP_RETIRED_LOADS_SPLIT 0x41
-UMASK_MEM_UOP_RETIRED_STORES_SPLIT 0x42
+EVENT_MEM_UOPS_RETIRED 0xD0 PMC
+UMASK_MEM_UOPS_RETIRED_LOADS 0x81
+UMASK_MEM_UOPS_RETIRED_STORES 0x82
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS 0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK 0x21
+UMASK_MEM_UOPS_RETIRED_STORES_LOCK 0x22
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT 0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT 0x42
EVENT_MEM_LOAD_UOPS_RETIRED 0xD1 PMC
-UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT 0x01
-UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS 0x08
-UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL 0x09
-UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT 0x02
-UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS 0x10
-UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL 0x12
-UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT 0x04
-UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS 0x20
-UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL 0x24
-UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB 0x40
-
-EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED 0xD2 PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT 0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS 0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL 0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT 0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS 0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL 0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT 0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS 0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL 0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB 0x40
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL 0x7F
+
+EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED 0xD2 PMC
UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS 0x01
UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT 0x02
UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM 0x04
@@ -366,152 +447,125 @@ UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE 0x08
EVENT_MEM_LOAD_UOPS_MISC_RETIRED 0xD4 PMC
UMASK_MEM_LOAD_UOPS_MISC_RETIRED_LLC_MISS 0x02
-EVENT_L2_TRANS 0xF0 PMC
-UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
-UMASK_L2_TRANS_RFO 0x02
-UMASK_L2_TRANS_CODE_RD 0x04
-UMASK_L2_TRANS_ALL_PREF 0x08
-UMASK_L2_TRANS_L1D_WB 0x10
-UMASK_L2_TRANS_L2_FILL 0x20
-UMASK_L2_TRANS_L2_WB 0x40
-UMASK_L2_TRANS_ALL_REQUESTS 0x80
-
-EVENT_L2_LINES_IN 0xF1 PMC
+EVENT_L2_TRANS 0xF0 PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO 0x02
+UMASK_L2_TRANS_CODE_RD 0x04
+UMASK_L2_TRANS_ALL_PREF 0x08
+UMASK_L2_TRANS_L1D_WB 0x10
+UMASK_L2_TRANS_L2_FILL 0x20
+UMASK_L2_TRANS_L2_WB 0x40
+UMASK_L2_TRANS_ALL_REQUESTS 0x80
+
+EVENT_L2_LINES_IN 0xF1 PMC
UMASK_L2_LINES_IN_I 0x01
-UMASK_L2_LINES_IN_S 0x02
+UMASK_L2_LINES_IN_S 0x02
UMASK_L2_LINES_IN_E 0x04
-UMASK_L2_LINES_IN_ALL 0x07
+UMASK_L2_LINES_IN_ALL 0x07
EVENT_L2_LINES_OUT 0xF2 PMC
UMASK_L2_LINES_OUT_DEMAND_CLEAN 0x01
UMASK_L2_LINES_OUT_DEMAND_DIRTY 0x02
-UMASK_L2_LINES_OUT_PF_CLEAN 0x04
-UMASK_L2_LINES_OUT_PF_DIRTY 0x08
-UMASK_L2_LINES_OUT_DIRTY_ALL 0x0A
+UMASK_L2_LINES_OUT_PF_CLEAN 0x04
+UMASK_L2_LINES_OUT_PF_DIRTY 0x08
+UMASK_L2_LINES_OUT_DIRTY_ALL 0x0A
+UMASK_L2_LINES_OUT_CLEAN_ALL 0x05
+UMASK_L2_LINES_OUT_ALL 0x0F
EVENT_SQ_MISC 0xF4 PMC
UMASK_SQ_MISC_SPLIT_LOCK 0x10
-EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED 0xD2 PMC
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS 0x01
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT 0x02
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM 0x04
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NON 0x08
-
-EVENT_MEM_TRANS_RETIRED_LOAD_LATENCY 0xCD PMC
-UMASK_MEM_TRANS_RETIRED_LOAD_LATENCY 0x01
-
-EVENT_MEM_LOAD_UOPS_RETIRED 0xD1 PMC
-UMASK_MEM_LOAD_UOPS_RETIRED_LLC_HIT 0x04
-UMASK_MEM_LOAD_UOPS_RETIRED_LLC_MISS 0x20
-
-EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED 0xD2 PMC
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS 0x01
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT 0x02
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM 0x04
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE 0x08
-
-EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED 0xD3 PMC
-UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM 0x01
-UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_DRAM 0x04
-
-EVENT_DRAM_CLOCKTICKS 0x00 MBOX0FIX|MBOX1FIX|MBOX2FIX|MBOX3FIX
-UMASK_DRAM_CLOCKTICKS 0x00
-
-EVENT_ACT_COUNT 0x01 MBOX
-UMASK_ACT_COUNT 0x00
-
-EVENT_CAS_COUNT 0x04 MBOX
-UMASK_CAS_COUNT_RD_REF 0x01
-UMASK_CAS_COUNT_RD_UNDERFILL 0x02
-UMASK_CAS_COUNT_RD 0x03
-UMASK_CAS_COUNT_WR_WMM 0x04
-UMASK_CAS_COUNT_WR_RMM 0x08
-UMASK_CAS_COUNT_WR 0x0C
-UMASK_CAS_COUNT_ALL 0x0F
-
-EVENT_DRAM_PRE_ALL 0x06 MBOX
-UMASK_DRAM_PRE_ALL 0x00
-
-EVENT_DRAM_REFRESH 0x05 MBOX
-UMASK_DRAM_REFRESH_PANIC 0x02
-UMASK_DRAM_REFRESH_HIGH 0x04
-
-EVENT_ECC_CORRECTABLE_ERRORS 0x09 MBOX
-UMASK_ECC_CORRECTABLE_ERRORS 0x00
-
-EVENT_MAJOR_MODES 0x07 MBOX
-UMASK_MAJOR_MODES_READ 0x01
-UMASK_MAJOR_MODES_WRITE 0x02
-UMASK_MAJOR_MODES_PARTIAL 0x04
-UMASK_MAJOR_MODES_ISOCH 0x08
-
-EVENT_POWER_CHANNEL_DLLOFF 0x84 MBOX
-UMASK_POWER_CHANNEL_DLLOFF 0x00
-
-EVENT_POWER_CHANNEL_PPD 0x85 MBOX
-UMASK_POWER_CHANNEL_PPD 0x00
-
-EVENT_POWER_CKE_CYCLES 0x83 MBOX
-UMASK_POWER_CKE_CYCLES_RANK0 0x01
-UMASK_POWER_CKE_CYCLES_RANK1 0x02
-UMASK_POWER_CKE_CYCLES_RANK2 0x04
-UMASK_POWER_CKE_CYCLES_RANK3 0x08
-UMASK_POWER_CKE_CYCLES_RANK4 0x10
-UMASK_POWER_CKE_CYCLES_RANK5 0x20
-UMASK_POWER_CKE_CYCLES_RANK6 0x40
-UMASK_POWER_CKE_CYCLES_RANK7 0x80
-
-EVENT_POWER_CRITICAL_THROTTLE_CYCLES 0x86 MBOX
-UMASK_POWER_CRITICAL_THROTTLE_CYCLES 0x00
-
-EVENT_POWER_SELF_REFRESH 0x43 MBOX
-UMASK_POWER_SELF_REFRESH 0x00
-
-EVENT_POWER_THROTTLE_CYCLES 0x41 MBOX
-UMASK_POWER_THROTTLE_CYCLES_RANK0 0x01
-UMASK_POWER_THROTTLE_CYCLES_RANK1 0x02
-UMASK_POWER_THROTTLE_CYCLES_RANK2 0x04
-UMASK_POWER_THROTTLE_CYCLES_RANK3 0x08
-UMASK_POWER_THROTTLE_CYCLES_RANK4 0x10
-UMASK_POWER_THROTTLE_CYCLES_RANK5 0x20
-UMASK_POWER_THROTTLE_CYCLES_RANK6 0x40
-UMASK_POWER_THROTTLE_CYCLES_RANK7 0x80
-
-EVENT_PREEMPTION 0x08 MBOX
-UMASK_PREEMPTION_RD_PREEMPT_RD 0x01
-UMASK_PREEMPTION_RD_PREEMPT_WR 0x02
-
-EVENT_PRE_COUNT 0x02 MBOX
-UMASK_PRE_COUNT_PAGE_MISS 0x01
-UMASK_PRE_COUNT_PAGE_CLOSE 0x02
-
-EVENT_RPQ_CYCLES_FULL 0x12 MBOX
-UMASK_RPQ_CYCLES_FULL 0x00
-
-EVENT_RPQ_CYCLES_NE 0x11 MBOX
-UMASK_RPQ_CYCLES_NE 0x00
-
-EVENT_RPQ_INSERTS 0x10 MBOX
-UMASK_RPQ_INSERTS 0x00
-
-EVENT_RPQ_OCCUPANCY 0x80 MBOX
-UMASK_RPQ_OCCUPANCY 0x00
-
-EVENT_WPQ_CYCLES_FULL 0x22 MBOX
-UMASK_WPQ_CYCLES_FULL 0x00
-
-EVENT_WPQ_CYCLES_NE 0x21 MBOX
-UMASK_WPQ_CYCLES_NE 0x00
-
-EVENT_WPQ_INSERTS 0x20 MBOX
-UMASK_WPQ_INSERTS 0x00
-
-EVENT_WPQ_OCCUPANCY 0x81 MBOX
-UMASK_WPQ_OCCUPANCY 0x00
-
-EVENT_WPQ_READ_HIT 0x23 MBOX
-UMASK_WPQ_READ_HIT 0x00
-
-EVENT_WPQ_WRITE_HIT 0x24 MBOX
-UMASK_WPQ_WRITE_HIT 0x00
+EVENT_OFFCORE_RESPONSE_0 0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1 0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_CACHE_LOOKUP 0x34 CBOX
+UMASK_CACHE_LOOKUP_M 0x01
+UMASK_CACHE_LOOKUP_E 0x02
+UMASK_CACHE_LOOKUP_S 0x04
+UMASK_CACHE_LOOKUP_I 0x08
+UMASK_CACHE_LOOKUP_READ_FILTER 0x10
+UMASK_CACHE_LOOKUP_WRITE_FILTER 0x20
+UMASK_CACHE_LOOKUP_EXTSNP_FILTER 0x40
+UMASK_CACHE_LOOKUP_ANY_REQUEST_FILTER 0x80
+UMASK_CACHE_LOOKUP_READ_M 0x11
+UMASK_CACHE_LOOKUP_WRITE_M 0x21
+UMASK_CACHE_LOOKUP_EXTSNP_M 0x41
+UMASK_CACHE_LOOKUP_ANY_M 0x81
+UMASK_CACHE_LOOKUP_READ_E 0x12
+UMASK_CACHE_LOOKUP_WRITE_E 0x22
+UMASK_CACHE_LOOKUP_EXTSNP_E 0x42
+UMASK_CACHE_LOOKUP_ANY_E 0x82
+UMASK_CACHE_LOOKUP_READ_S 0x14
+UMASK_CACHE_LOOKUP_WRITE_S 0x24
+UMASK_CACHE_LOOKUP_EXTSNP_S 0x44
+UMASK_CACHE_LOOKUP_ANY_S 0x84
+UMASK_CACHE_LOOKUP_READ_ES 0x16
+UMASK_CACHE_LOOKUP_WRITE_ES 0x26
+UMASK_CACHE_LOOKUP_EXTSNP_ES 0x46
+UMASK_CACHE_LOOKUP_ANY_ES 0x86
+UMASK_CACHE_LOOKUP_READ_I 0x18
+UMASK_CACHE_LOOKUP_WRITE_I 0x28
+UMASK_CACHE_LOOKUP_EXTSNP_I 0x48
+UMASK_CACHE_LOOKUP_ANY_I 0x88
+UMASK_CACHE_LOOKUP_READ_MESI 0x1F
+UMASK_CACHE_LOOKUP_WRITE_MESI 0x2F
+UMASK_CACHE_LOOKUP_EXTSNP_MESI 0x4F
+UMASK_CACHE_LOOKUP_ANY_MESI 0x8F
+
+EVENT_XSNP_RESPONSE 0x22 CBOX
+UMASK_XSNP_RESPONSE_MISS_EXTERNAL 0x21
+UMASK_XSNP_RESPONSE_MISS_XCORE 0x41
+UMASK_XSNP_RESPONSE_MISS_EVICTION 0x81
+UMASK_XSNP_RESPONSE_HIT_EXTERNAL 0x24
+UMASK_XSNP_RESPONSE_HIT_XCORE 0x44
+UMASK_XSNP_RESPONSE_HIT_EVICTION 0x84
+UMASK_XSNP_RESPONSE_HITM_EXTERNAL 0x28
+UMASK_XSNP_RESPONSE_HITM_XCORE 0x48
+UMASK_XSNP_RESPONSE_HITM_EVICTION 0x88
+
+EVENT_TRK_OCCUPANCY_ALL 0x80 UBOX0
+UMASK_TRK_OCCUPANCY_ALL 0x01
+
+EVENT_TRK_REQUESTS 0x81 UBOX
+UMASK_TRK_REQUESTS_ALL 0x01
+UMASK_TRK_REQUESTS_WRITES 0x20
+
+EVENT_COH_TRK_OCCUPANCY 0x83 UBOX0
+UMASK_COH_TRK_OCCUPANCY 0x01
+
+EVENT_COH_TRK_REQUESTS 0x84 UBOX
+UMASK_COH_TRK_REQUESTS_ALL 0x01
+
+EVENT_UNCORE_CLOCK 0x00 UBOXFIX
+UMASK_UNCORE_CLOCK 0x01
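The reworked event list above drops the old trailing cfgBits/cmask columns on ordinary UMASK lines in favour of explicit DEFAULT_OPTIONS_* lines, while the OFFCORE_RESPONSE events keep a three-column UMASK form. Going by the comments in the Silvermont setup code further down, the second and third columns are the bit offsets programmed into the MSR_OFFCORE_RESP match registers (0xFF/0xFF meaning "take MATCH0/MATCH1 options instead"); the Sandy Bridge module presumably treats them the same way. A hedged sketch of that offset-to-register translation; the helper name is made up for illustration.

#include <stdint.h>

/* Sketch: combine the three-column UMASK values of an OFFCORE_RESPONSE
 * event (cfgBits = request-type bit offset, cmask = response-type bit
 * offset) into the value written to MSR_OFFCORE_RESP0/1, mirroring the
 * event-0xB7 path in svm_pmc_setup() below. */
static uint64_t offcore_match_sketch(uint8_t cfgBits, uint8_t cmask)
{
    /* e.g. UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY 0x01 0x00 0x10
     * -> request-type bit 0 and response-type bit 16 set. */
    return (1ULL << cfgBits) | (1ULL << cmask);
}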
diff --git a/src/includes/perfmon_silvermont.h b/src/includes/perfmon_silvermont.h
index 9cfd6f1..980d528 100644
--- a/src/includes/perfmon_silvermont.h
+++ b/src/includes/perfmon_silvermont.h
@@ -3,15 +3,15 @@
*
* Filename: perfmon_silvermont.h
*
- * Description: Header file of perfmon module for Intel Atom Silvermont
+ * Description: Header file of perfmon module for Intel Atom (Silvermont)
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -29,264 +29,491 @@
*/
#include <perfmon_silvermont_events.h>
-#include <perfmon_silvermont_groups.h>
#include <perfmon_silvermont_counters.h>
static int perfmon_numCountersSilvermont = NUM_COUNTERS_SILVERMONT;
-static int perfmon_numGroupsSilvermont = NUM_GROUPS_SILVERMONT;
+static int perfmon_numCoreCountersSilvermont = NUM_COUNTERS_SILVERMONT;
static int perfmon_numArchEventsSilvermont = NUM_ARCH_EVENTS_SILVERMONT;
-void perfmon_init_silvermont(PerfmonThread *thread)
+int perfmon_init_silvermont(int cpu_id)
{
- uint64_t flags = 0x0ULL;
- int cpu_id = thread->processorId;
lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
-
- /* Initialize registers */
- msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
- msr_write(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL);
- msr_write(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL);
-
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
+ lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+ return 0;
}
-void perfmon_setupCounterThread_silvermont(
- int thread_id,
- PerfmonEvent* event,
- PerfmonCounterIndex index)
+uint32_t svm_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
{
- int haveLock = 0;
- uint64_t flags = 0x0ULL;
- uint32_t uflags;
- uint64_t reg = silvermont_counter_map[index].configRegister;
- int cpu_id = perfmon_threadData[thread_id].processorId;
- uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
- uint64_t orig_fixed_flags = fixed_flags;
- perfmon_threadData[thread_id].counters[index].init = TRUE;
-
- switch (silvermont_counter_map[index].type)
+ uint32_t flags = (1ULL<<(1+(index*4)));
+ if (event->numberOfOptions > 0)
{
- case PMC:
-
- flags = (1<<16)|(1<<22);
- flags &= ~(0xFFFFU); /* clear lower 16bits */
+ for(int i=0;i<event->numberOfOptions;i++)
+ {
+ switch(event->options[i].type)
+ {
+ case EVENT_OPTION_ANYTHREAD:
+ flags |= (1ULL<<(2+(index*4)));
+ break;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<(index*4));
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ return flags;
+}
- /* Intel with standard 8 bit event mask: [7:0] */
- flags |= (event->umask<<8) + event->eventId;
+int svm_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint64_t flags = 0x0ULL;
+ uint64_t offcore_flags = 0x0ULL;
+ flags |= (1ULL<<16)|(1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
+ /* For event id 0xB7 the cmask must be written in an extra register */
+ if ((event->cmask != 0x00) && (event->eventId != 0xB7))
+ {
+ flags |= (event->cmask << 24);
+ }
+ /* set custom cfgbits */
+ if ((event->cfgBits != 0x00) && (event->eventId != 0xB7))
+ {
+ flags |= (event->cfgBits << 16);
+ }
- if (perfmon_verbose)
+ if (event->numberOfOptions > 0)
+ {
+ for(int i=0;i<event->numberOfOptions;i++)
+ {
+ switch(event->options[i].type)
{
- printf("[%d] perfmon_setup_counter PMC: Write Register 0x%llX , Flags: 0x%llX \n",
- cpu_id,
- LLU_CAST reg,
- LLU_CAST flags);
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+ flags |= (1ULL<<21);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<17);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[i].value & 0xFFULL)<<24;
+ break;
+ case EVENT_OPTION_MATCH0:
+ offcore_flags |= (event->options[i].value & 0xFFFFULL);
+ break;
+ case EVENT_OPTION_MATCH1:
+ if ((event->eventId == 0xB7) && (event->umask == 0x01))
+ {
+ offcore_flags |= (event->options[i].value & 0x768005ULL)<<16;
+ }
+ else if ((event->eventId == 0xB7) && (event->umask == 0x02))
+ {
+ offcore_flags |= (event->options[i].value & 0x368005ULL)<<16;
+ }
+ break;
+ default:
+ break;
}
- msr_write(cpu_id, reg , flags);
-
- // Offcore event with additional configuration register
- // We included the additional register as counterRegister2
- // to avoid creating a new data structure
- // cfgBits contain offset of "request type" bit
- // cmask contain offset of "response type" bit
- if (event->eventId == 0xB7)
+ }
+ }
+
+ // Offcore event with additional configuration register
+ // cfgBits contain offset of "request type" bit
+ // cmask contain offset of "response type" bit
+ if (event->eventId == 0xB7)
+ {
+ uint32_t reg = 0x0;
+ if (event->umask == 0x01)
+ {
+ reg = MSR_OFFCORE_RESP0;
+ }
+ else if (event->umask == 0x02)
+ {
+ reg = MSR_OFFCORE_RESP1;
+ }
+ if (reg)
+ {
+ if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
{
- if (event->umask == 0x01)
- {
- reg = MSR_OFFCORE_RESP0;
- }
- else if (event->umask == 0x02)
- {
- reg = MSR_OFFCORE_RESP1;
- }
- flags = 0x0ULL;
- flags = (1<<event->cfgBits)|(1<<event->cmask);
- msr_write(cpu_id, reg , flags);
+ offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
}
+ VERBOSEPRINTREG(cpu_id, reg, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , offcore_flags));
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int perfmon_setupCountersThread_silvermont(
+ int thread_id,
+ PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ uint64_t flags = 0x0ULL;
+ uint64_t fixed_flags = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL));
+ }
- break;
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ flags = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ eventSet->events[i].threadCounter[thread_id].init = TRUE;
+ switch (type)
+ {
+ case PMC:
+ svm_pmc_setup(cpu_id, index, event);
+ break;
- case FIXED:
- fixed_flags |= (2ULL<<(index*4));
- break;
+ case FIXED:
+ fixed_flags |= svm_fixed_setup(cpu_id, index, event);
+ break;
- case POWER:
- break;
+ case POWER:
+ break;
- default:
- /* should never be reached */
- break;
+ default:
+ break;
+ }
}
- if (fixed_flags != orig_fixed_flags)
+ if (fixed_flags > 0x0)
{
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
}
+ return 0;
}
-void perfmon_startCountersThread_silvermont(int thread_id)
+
+
+int perfmon_startCountersThread_silvermont(int thread_id, PerfmonEventSet* eventSet)
{
int haveLock = 0;
+ uint64_t tmp;
uint64_t flags = 0x0ULL;
- uint32_t uflags = 0x10000UL; /* Clear freeze bit */
- uint64_t fixed_flags = 0x0ULL;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-
- for ( int i=0; i<perfmon_numCountersSilvermont; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- switch (silvermont_counter_map[i].type)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ tmp = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ eventSet->events[i].threadCounter[thread_id].startData = 0;
+ eventSet->events[i].threadCounter[thread_id].counterData = 0;
+ switch (type)
{
case PMC:
- msr_write(cpu_id, silvermont_counter_map[i].counterRegister, 0x0ULL);
- flags |= (1<<(i-OFFSET_PMC)); /* enable counter */
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+ flags |= (1<<(index-cpuid_info.perf_num_fixed_ctr)); /* enable counter */
break;
case FIXED:
- msr_write(cpu_id, silvermont_counter_map[i].counterRegister, 0x0ULL);
- flags |= (1ULL<<(i+32)); /* enable fixed counter */
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+ flags |= (1ULL<<(index+32)); /* enable fixed counter */
break;
case POWER:
if(haveLock)
{
- perfmon_threadData[thread_id].counters[i].counterData =
- power_read(cpu_id, silvermont_counter_map[i].counterRegister);
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&tmp));
+ eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
}
-
break;
default:
- /* should never be reached */
break;
}
+ eventSet->events[i].threadCounter[thread_id].counterData = eventSet->events[i].threadCounter[thread_id].startData;
}
}
- if (perfmon_verbose)
- {
- printf("perfmon_start_counters: Write Register 0x%X , \
- Flags: 0x%llX \n",MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
- printf("perfmon_start_counters: Write Register 0x%X , \
- Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
- }
- if (flags != 0x0ULL)
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
{
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, flags));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_OR_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
}
+
+ return 0;
}
-void perfmon_stopCountersThread_silvermont(int thread_id)
+int perfmon_stopCountersThread_silvermont(int thread_id, PerfmonEventSet* eventSet)
{
- uint64_t flags;
- uint32_t uflags = 0x10100UL; /* Set freeze bit */
uint64_t counter_result = 0x0ULL;
int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_OR_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
- for ( int i=0; i < perfmon_numCountersSilvermont; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- switch (silvermont_counter_map[i].type)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ switch (type)
{
case PMC:
-
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ uint64_t ovf_values = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+ if (ovf_values & (1ULL<<(index - cpuid_info.perf_num_fixed_ctr)))
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL,
+ (1ULL<<(index - cpuid_info.perf_num_fixed_ctr))));
+ }
+ }
+ break;
case FIXED:
- perfmon_threadData[thread_id].counters[i].counterData =
- (double)msr_read(cpu_id, silvermont_counter_map[i].counterRegister);
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ uint64_t ovf_values = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+ if (ovf_values & (1ULL<<(index + 32)))
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(index + 32))));
+ }
+ }
break;
case POWER:
if(haveLock)
{
- perfmon_threadData[thread_id].counters[i].counterData =
- power_info.energyUnit *
- ( power_read(cpu_id, silvermont_counter_map[i].counterRegister) -
- perfmon_threadData[thread_id].counters[i].counterData);
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ }
}
break;
case THERMAL:
- perfmon_threadData[thread_id].counters[i].counterData =
- thermal_read(cpu_id);
+ CHECK_TEMP_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
break;
default:
- /* should never be reached */
break;
}
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
}
}
-
- flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
- // printf ("Status: 0x%llX \n", LLU_CAST flags);
- if ( (flags & 0x3) || (flags & (0x3ULL<<32)) )
- {
- printf ("Overflow occured \n");
- }
+ return 0;
}
-void perfmon_readCountersThread_silvermont(int thread_id)
+int perfmon_readCountersThread_silvermont(int thread_id, PerfmonEventSet* eventSet)
{
uint64_t counter_result = 0x0ULL;
+ uint64_t pmc_flags = 0x0ULL;
int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int cpu_id = groupSet->threads[thread_id].processorId;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
- for ( int i=0; i<perfmon_numCountersSilvermont; i++ )
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &pmc_flags));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_OR_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- if ((silvermont_counter_map[i].type == PMC) ||
- (silvermont_counter_map[i].type == FIXED))
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, silvermont_counter_map[i].counterRegister);
+ continue;
}
- else
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ switch (type)
{
- if(haveLock)
+ case PMC:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ uint64_t ovf_values = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+ if (ovf_values & (1ULL<<(index - cpuid_info.perf_num_fixed_ctr)))
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL,
+ (1ULL<<(index - cpuid_info.perf_num_fixed_ctr))));
+ }
+ }
+ break;
+ case FIXED:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ uint64_t ovf_values = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+ if (ovf_values & (1ULL<<(index + 32)))
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(index + 32))));
+ }
+ }
+ break;
+
+ case POWER:
+ if(haveLock)
+ {
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ }
+ }
+ break;
+
+ case THERMAL:
+ CHECK_TEMP_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
+ break;
+
+ default:
+ break;
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+ }
+ }
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, pmc_flags));
+ }
+ return 0;
+}
+
+
+int perfmon_finalizeCountersThread_silvermont(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ int haveTileLock = 0;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62);
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+ if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+ {
+ haveTileLock = 1;
+ }
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ uint64_t reg = counter_map[index].configRegister;
+ PciDeviceIndex dev = counter_map[index].device;
+ switch (type)
+ {
+ case PMC:
+ ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+ if ((haveTileLock) && (event->eventId == 0xB7))
{
- switch (silvermont_counter_map[i].type)
+ if (event->umask == 0x1)
{
- case POWER:
- perfmon_threadData[thread_id].counters[i].counterData =
- power_info.energyUnit *
- power_read(cpu_id, silvermont_counter_map[i].counterRegister);
- break;
-
- default:
- /* should never be reached */
- break;
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+ }
+ else if (event->umask == 0x2)
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
}
}
- }
+ break;
+ case FIXED:
+ ovf_values_core |= (1ULL<<(index+32));
+ break;
+ default:
+ break;
+ }
+ if ((reg) && ((dev == MSR_DEV) || (haveLock)))
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
}
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
}
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+ return 0;
}
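The rewritten Silvermont stop/read paths above detect counter overflow by comparing the freshly read value with the previously stored one and, on a wrap, checking the matching bit in MSR_PERF_GLOBAL_STATUS before clearing it through MSR_PERF_GLOBAL_OVF_CTRL. A condensed sketch of that pattern; the read_msr/write_msr accessors are hypothetical stand-ins for the HPMread/HPMwrite calls used in the diff.

#include <stdint.h>

/* Hypothetical accessors standing in for HPMread/HPMwrite. */
uint64_t read_msr(int cpu, uint32_t reg);
void     write_msr(int cpu, uint32_t reg, uint64_t value);

/* Sketch of the overflow handling in the stop/read functions above:
 * a PMC that reads lower than last time has wrapped, the status bit
 * confirms it, and the overflow counter is bumped before the bit is
 * cleared. */
static void check_pmc_overflow_sketch(int cpu, uint32_t status_reg,
                                      uint32_t ovf_ctrl_reg, int pmc_bit,
                                      uint64_t new_val, uint64_t old_val,
                                      int *overflows)
{
    if (new_val < old_val)
    {
        uint64_t status = read_msr(cpu, status_reg);
        if (status & (1ULL << pmc_bit))
        {
            (*overflows)++;
            write_msr(cpu, ovf_ctrl_reg, 1ULL << pmc_bit); /* clear bit */
        }
    }
}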
diff --git a/src/includes/perfmon_silvermont_counters.h b/src/includes/perfmon_silvermont_counters.h
index 266ee4b..f04c87b 100644
--- a/src/includes/perfmon_silvermont_counters.h
+++ b/src/includes/perfmon_silvermont_counters.h
@@ -3,15 +3,15 @@
*
* Filename: perfmon_silvermont_counters.h
*
- * Description: Counter header file of perfmon module for Silvermont.
+ * Description: Counter header file of perfmon module for Intel Atom (Silvermont)
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -27,24 +27,33 @@
*
* =======================================================================================
*/
+#include <registers.h>
#define NUM_COUNTERS_CORE_SILVERMONT 6
#define NUM_COUNTERS_UNCORE_SILVERMONT 0
#define NUM_COUNTERS_SILVERMONT 8
-static PerfmonCounterMap silvermont_counter_map[NUM_COUNTERS_SILVERMONT] = {
+#define SVM_VALID_OPTIONS_FIXED EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK
+#define SVM_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_THRESHOLD_MASK
+
+static RegisterMap silvermont_counter_map[NUM_COUNTERS_SILVERMONT] = {
/* Fixed Counters: instructions retired, cycles unhalted core */
- {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
- {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
- {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+ {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, SVM_VALID_OPTIONS_FIXED},
+ {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, SVM_VALID_OPTIONS_FIXED},
+ {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, SVM_VALID_OPTIONS_FIXED},
/* PMC Counters: 4 48bit wide */
- {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
- {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
+ {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, MSR_OFFCORE_RESP0, 0, SVM_VALID_OPTIONS_PMC},
+ {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, MSR_OFFCORE_RESP1, 0, SVM_VALID_OPTIONS_PMC},
/* Temperature Sensor*/
- {"TMP0", PMC5, THERMAL, 0, 0, 0, 0},
+ {"TMP0", PMC5, THERMAL, 0, IA32_THERM_STATUS, 0, 0},
/* RAPL counters */
{"PWR0", PMC6, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
- {"PWR1", PMC7, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0},
+ {"PWR1", PMC7, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0}
};
-
+static BoxMap silvermont_box_map[NUM_UNITS] = {
+ [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+ [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+ [THERMAL] = {0, 0, 0, 0, 0, MSR_DEV, 8},
+ [POWER] = {0, 0, 0, 0, 0, MSR_DEV, 32}
+};
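The regWidth column of the box map above (48 bits for PMC and FIXED, 32 for POWER, 8 for THERMAL) is what the read and stop routines pass to field64() when they store a raw register value. field64() itself is not part of this hunk; the sketch below only assumes it keeps `width` bits starting at bit `start`, which is consistent with how it is called here. The helper and its name are illustrative, not taken from the patch.

#include <stdint.h>

/* Assumed semantics of field64(): keep `width` bits of `value`,
 * starting at bit `start`. */
static uint64_t field64(uint64_t value, int start, int width)
{
    if (width >= 64)
        return value >> start;
    return (value >> start) & ((1ULL << width) - 1ULL);
}

/* Example: a 48-bit PMC/FIXED reading keeps its low 48 bits, a POWER
 * reading its low 32 bits, a THERMAL reading only its low 8 bits. */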
diff --git a/src/includes/perfmon_silvermont_events.txt b/src/includes/perfmon_silvermont_events.txt
index b8a088d..5b2d1a7 100644
--- a/src/includes/perfmon_silvermont_events.txt
+++ b/src/includes/perfmon_silvermont_events.txt
@@ -1,16 +1,16 @@
# =======================================================================================
-#
+#
# Filename: perfmon_silvermont_events.txt
-#
+#
# Description: Event list for Intel Atom (Silvermont)
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -64,12 +64,16 @@ UMASK_MEM_UOPS_RETIRED_HITM 0x20
UMASK_MEM_UOPS_RETIRED_ALL_LOADS 0x40
UMASK_MEM_UOPS_RETIRED_ALL_STORES 0x80
-EVENT_PAGE_WALKS 0x05 PMC
-UMASK_PAGE_WALKS_D_SIDE_CYCLES 0x01
-UMASK_PAGE_WALKS_I_SIDE_CYCLES 0x02
-UMASK_PAGE_WALKS_WALKS 0x03
+EVENT_PAGE_WALKS 0x05 PMC
+UMASK_PAGE_WALKS_DTLB_COUNT 0x01 0x04 0x00
+UMASK_PAGE_WALKS_DTLB_CYCLES 0x01
+UMASK_PAGE_WALKS_ITLB_COUNT 0x02 0x04 0x00
+UMASK_PAGE_WALKS_ITLB_CYCLES 0x02
+UMASK_PAGE_WALKS_COUNT 0x03 0x04 0x00
+UMASK_PAGE_WALKS_CYCLES 0x03
+
-EVENT_LONGEST_LAT_CACHE 0x2E PMC
+EVENT_LONGEST_LAT_CACHE 0x2E PMC
UMASK_LONGEST_LAT_CACHE_MISS 0x41
UMASK_LONGEST_LAT_CACHE_REFERENCE 0x4F
@@ -83,305 +87,15 @@ EVENT_CPU_CLK_UNHALTED 0x3C PMC
UMASK_CPU_CLK_UNHALTED_CORE_P 0x00
UMASK_CPU_CLK_UNHALTED_REF_P 0x01
-EVENT_ICACHE 0x80 PMC
-UMASK_ICACHE_HIT 0x01
-UMASK_ICACHE_MISSES 0x02
-UMASK_ICACHE_ACCESSES 0x03
-UMASK_ICACHE_IFETCH_STALL 0x04
+EVENT_ICACHE 0x80 PMC
+UMASK_ICACHE_HITS 0x01
+UMASK_ICACHE_MISSES 0x02
+UMASK_ICACHE_ACCESSES 0x03
+UMASK_ICACHE_IFETCH_STALL 0x04
EVENT_NIP_STALL 0xB6 PMC
UMASK_NIP_STALL_ICACHE_MISS 0x04
-EVENT_OFFCORE_RESPONSE 0xB7 PMC
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY 0x01 0x00 0x10
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_L2_HIT 0x01 0x00 0x12
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_SNP_NONE 0x01 0x00 0x1F
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_SNOOP_MISS 0x01 0x00 0x21
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_SNOOP_HIT 0x01 0x00 0x22
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_HITM 0x01 0x00 0x24
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_NON_DRAM 0x01 0x00 0x25
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_AVG_LAT 0x01 0x00 0x26
-
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY 0x01 0x01 0x10
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_L2_HIT 0x01 0x01 0x12
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_SNP_NONE 0x01 0x01 0x1F
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_SNOOP_MISS 0x01 0x01 0x21
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_SNOOP_HIT 0x01 0x01 0x22
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_HITM 0x01 0x01 0x24
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_NON_DRAM 0x01 0x01 0x25
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_AVG_LAT 0x01 0x01 0x26
-
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_ANY 0x01 0x02 0x10
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_L2_HIT 0x01 0x02 0x12
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_SNP_NONE 0x01 0x02 0x1F
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_SNOOP_MISS 0x01 0x02 0x21
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_SNOOP_HIT 0x01 0x02 0x22
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_HITM 0x01 0x02 0x24
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_NON_DRAM 0x01 0x02 0x25
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_AVG_LAT 0x01 0x02 0x26
-
-UMASK_OFFCORE_RESPONSE_0_WB_ANY 0x01 0x03 0x10
-UMASK_OFFCORE_RESPONSE_0_WB_L2_HIT 0x01 0x03 0x12
-UMASK_OFFCORE_RESPONSE_0_WB_SNP_NONE 0x01 0x03 0x1F
-UMASK_OFFCORE_RESPONSE_0_WB_SNOOP_MISS 0x01 0x03 0x21
-UMASK_OFFCORE_RESPONSE_0_WB_SNOOP_HIT 0x01 0x03 0x22
-UMASK_OFFCORE_RESPONSE_0_WB_HITM 0x01 0x03 0x24
-UMASK_OFFCORE_RESPONSE_0_WB_NON_DRAM 0x01 0x03 0x25
-UMASK_OFFCORE_RESPONSE_0_WB_AVG_LAT 0x01 0x03 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_ANY 0x01 0x04 0x10
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_L2_HIT 0x01 0x04 0x12
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_SNP_NONE 0x01 0x04 0x1F
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_SNOOP_MISS 0x01 0x04 0x21
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_SNOOP_HIT 0x01 0x04 0x22
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_HITM 0x01 0x04 0x24
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_NON_DRAM 0x01 0x04 0x25
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_AVG_LAT 0x01 0x04 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_ANY 0x01 0x05 0x10
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_L2_HIT 0x01 0x05 0x12
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_SNP_NONE 0x01 0x05 0x1F
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_SNOOP_MISS 0x01 0x05 0x21
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_SNOOP_HIT 0x01 0x05 0x22
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_HITM 0x01 0x05 0x24
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_NON_DRAM 0x01 0x05 0x25
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_AVG_LAT 0x01 0x05 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_ANY 0x01 0x06 0x10
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_L2_HIT 0x01 0x06 0x12
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_SNP_NONE 0x01 0x06 0x1F
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_SNOOP_MISS 0x01 0x06 0x21
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_SNOOP_HIT 0x01 0x06 0x22
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_HITM 0x01 0x06 0x24
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_NON_DRAM 0x01 0x06 0x25
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_AVG_LAT 0x01 0x06 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_ANY 0x01 0x07 0x10
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_L2_HIT 0x01 0x07 0x12
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_SNP_NONE 0x01 0x07 0x1F
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_SNOOP_MISS 0x01 0x07 0x21
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_SNOOP_HIT 0x01 0x07 0x22
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_HITM 0x01 0x07 0x24
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_NON_DRAM 0x01 0x07 0x25
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_AVG_LAT 0x01 0x07 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_ANY 0x01 0x08 0x10
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_L2_HIT 0x01 0x08 0x12
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_SNP_NONE 0x01 0x08 0x1F
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_SNOOP_MISS 0x01 0x08 0x21
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_SNOOP_HIT 0x01 0x08 0x22
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_HITM 0x01 0x08 0x24
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_NON_DRAM 0x01 0x08 0x25
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_AVG_LAT 0x01 0x08 0x26
-
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_ANY 0x01 0x09 0x10
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_L2_HIT 0x01 0x09 0x12
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_SNP_NONE 0x01 0x09 0x1F
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_SNOOP_MISS 0x01 0x09 0x21
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_SNOOP_HIT 0x01 0x09 0x22
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_HITM 0x01 0x09 0x24
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_NON_DRAM 0x01 0x09 0x25
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_AVG_LAT 0x01 0x09 0x26
-
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_ANY 0x01 0x0A 0x10
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_L2_HIT 0x01 0x0A 0x12
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_SNP_NONE 0x01 0x0A 0x1F
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_SNOOP_MISS 0x01 0x0A 0x21
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_SNOOP_HIT 0x01 0x0A 0x22
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_HITM 0x01 0x0A 0x24
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_NON_DRAM 0x01 0x0A 0x25
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_AVG_LAT 0x01 0x0A 0x26
-
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_ANY 0x01 0x0B 0x10
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_L2_HIT 0x01 0x0B 0x12
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_SNP_NONE 0x01 0x0B 0x1F
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_SNOOP_MISS 0x01 0x0B 0x21
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_SNOOP_HIT 0x01 0x0B 0x22
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_HITM 0x01 0x0B 0x24
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_NON_DRAM 0x01 0x0B 0x25
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_AVG_LAT 0x01 0x0B 0x26
-
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_ANY 0x01 0x0C 0x10
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_L2_HIT 0x01 0x0C 0x12
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_SNP_NONE 0x01 0x0C 0x1F
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_SNOOP_MISS 0x01 0x0C 0x21
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_SNOOP_HIT 0x01 0x0C 0x22
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_HITM 0x01 0x0C 0x24
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_NON_DRAM 0x01 0x0C 0x25
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_AVG_LAT 0x01 0x0C 0x26
-
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_ANY 0x01 0x0D 0x10
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_L2_HIT 0x01 0x0D 0x12
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_SNP_NONE 0x01 0x0D 0x1F
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_SNOOP_MISS 0x01 0x0D 0x21
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_SNOOP_HIT 0x01 0x0D 0x22
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_HITM 0x01 0x0D 0x24
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_NON_DRAM 0x01 0x0D 0x25
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_AVG_LAT 0x01 0x0D 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_ANY 0x01 0x0E 0x10
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_L2_HIT 0x01 0x0E 0x12
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_SNP_NONE 0x01 0x0E 0x1F
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_SNOOP_MISS 0x01 0x0E 0x21
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_SNOOP_HIT 0x01 0x0E 0x22
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_HITM 0x01 0x0E 0x24
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_NON_DRAM 0x01 0x0E 0x25
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_AVG_LAT 0x01 0x0E 0x26
-
-UMASK_OFFCORE_RESPONSE_0_ANY_ANY 0x01 0x0F 0x10
-UMASK_OFFCORE_RESPONSE_0_ANY_L2_HIT 0x01 0x0F 0x12
-UMASK_OFFCORE_RESPONSE_0_ANY_SNP_NONE 0x01 0x0F 0x1F
-UMASK_OFFCORE_RESPONSE_0_ANY_SNOOP_MISS 0x01 0x0F 0x21
-UMASK_OFFCORE_RESPONSE_0_ANY_SNOOP_HIT 0x01 0x0F 0x22
-UMASK_OFFCORE_RESPONSE_0_ANY_HITM 0x01 0x0F 0x24
-UMASK_OFFCORE_RESPONSE_0_ANY_NON_DRAM 0x01 0x0F 0x25
-UMASK_OFFCORE_RESPONSE_0_ANY_AVG_LAT 0x01 0x0F 0x26
-
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY 0x02 0x00 0x10
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_L2_HIT 0x02 0x00 0x12
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_SNP_NONE 0x02 0x00 0x1F
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_SNOOP_MISS 0x02 0x00 0x21
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_SNOOP_HIT 0x02 0x00 0x22
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_HITM 0x02 0x00 0x24
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_NON_DRAM 0x02 0x00 0x25
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_AVG_LAT 0x02 0x00 0x26
-
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY 0x02 0x01 0x10
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_L2_HIT 0x02 0x01 0x12
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_SNP_NONE 0x02 0x01 0x1F
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_SNOOP_MISS 0x02 0x01 0x21
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_SNOOP_HIT 0x02 0x01 0x22
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_HITM 0x02 0x01 0x24
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_NON_DRAM 0x02 0x01 0x25
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_AVG_LAT 0x02 0x01 0x26
-
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_ANY 0x02 0x02 0x10
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_L2_HIT 0x02 0x02 0x12
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_SNP_NONE 0x02 0x02 0x1F
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_SNOOP_MISS 0x02 0x02 0x21
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_SNOOP_HIT 0x02 0x02 0x22
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_HITM 0x02 0x02 0x24
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_NON_DRAM 0x02 0x02 0x25
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_AVG_LAT 0x02 0x02 0x26
-
-UMASK_OFFCORE_RESPONSE_1_WB_ANY 0x02 0x03 0x10
-UMASK_OFFCORE_RESPONSE_1_WB_L2_HIT 0x02 0x03 0x12
-UMASK_OFFCORE_RESPONSE_1_WB_SNP_NONE 0x02 0x03 0x1F
-UMASK_OFFCORE_RESPONSE_1_WB_SNOOP_MISS 0x02 0x03 0x21
-UMASK_OFFCORE_RESPONSE_1_WB_SNOOP_HIT 0x02 0x03 0x22
-UMASK_OFFCORE_RESPONSE_1_WB_HITM 0x02 0x03 0x24
-UMASK_OFFCORE_RESPONSE_1_WB_NON_DRAM 0x02 0x03 0x25
-UMASK_OFFCORE_RESPONSE_1_WB_AVG_LAT 0x02 0x03 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_ANY 0x02 0x04 0x10
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_L2_HIT 0x02 0x04 0x12
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_SNP_NONE 0x02 0x04 0x1F
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_SNOOP_MISS 0x02 0x04 0x21
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_SNOOP_HIT 0x02 0x04 0x22
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_HITM 0x02 0x04 0x24
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_NON_DRAM 0x02 0x04 0x25
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_AVG_LAT 0x02 0x04 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_ANY 0x02 0x05 0x10
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_L2_HIT 0x02 0x05 0x12
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_SNP_NONE 0x02 0x05 0x1F
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_SNOOP_MISS 0x02 0x05 0x21
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_SNOOP_HIT 0x02 0x05 0x22
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_HITM 0x02 0x05 0x24
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_NON_DRAM 0x02 0x05 0x25
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_AVG_LAT 0x02 0x05 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_ANY 0x02 0x06 0x10
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_L2_HIT 0x02 0x06 0x12
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_SNP_NONE 0x02 0x06 0x1F
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_SNOOP_MISS 0x02 0x06 0x21
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_SNOOP_HIT 0x02 0x06 0x22
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_HITM 0x02 0x06 0x24
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_NON_DRAM 0x02 0x06 0x25
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_AVG_LAT 0x02 0x06 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_ANY 0x02 0x07 0x10
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_L2_HIT 0x02 0x07 0x12
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_SNP_NONE 0x02 0x07 0x1F
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_SNOOP_MISS 0x02 0x07 0x21
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_SNOOP_HIT 0x02 0x07 0x22
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_HITM 0x02 0x07 0x24
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_NON_DRAM 0x02 0x07 0x25
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_AVG_LAT 0x02 0x07 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_ANY 0x02 0x08 0x10
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_L2_HIT 0x02 0x08 0x12
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_SNP_NONE 0x02 0x08 0x1F
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_SNOOP_MISS 0x02 0x08 0x21
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_SNOOP_HIT 0x02 0x08 0x22
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_HITM 0x02 0x08 0x24
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_NON_DRAM 0x02 0x08 0x25
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_AVG_LAT 0x02 0x08 0x26
-
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_ANY 0x02 0x09 0x10
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_L2_HIT 0x02 0x09 0x12
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_SNP_NONE 0x02 0x09 0x1F
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_SNOOP_MISS 0x02 0x09 0x21
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_SNOOP_HIT 0x02 0x09 0x22
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_HITM 0x02 0x09 0x24
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_NON_DRAM 0x02 0x09 0x25
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_AVG_LAT 0x02 0x09 0x26
-
-UMASK_OFFCORE_RESPONSE_1 BUS_LOCKS_ANY 0x02 0x0A 0x10
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_L2_HIT 0x02 0x0A 0x12
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_SNP_NONE 0x02 0x0A 0x1F
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_SNOOP_MISS 0x02 0x0A 0x21
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_SNOOP_HIT 0x02 0x0A 0x22
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_HITM 0x02 0x0A 0x24
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_NON_DRAM 0x02 0x0A 0x25
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_AVG_LAT 0x02 0x0A 0x26
-
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_ANY 0x02 0x0B 0x10
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_L2_HIT 0x02 0x0B 0x12
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_SNP_NONE 0x02 0x0B 0x1F
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_SNOOP_MISS 0x02 0x0B 0x21
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_SNOOP_HIT 0x02 0x0B 0x22
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_HITM 0x02 0x0B 0x24
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_NON_DRAM 0x02 0x0B 0x25
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_AVG_LAT 0x02 0x0B 0x26
-
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_ANY 0x02 0x0C 0x10
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_L2_HIT 0x02 0x0C 0x12
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_SNP_NONE 0x02 0x0C 0x1F
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_SNOOP_MISS 0x02 0x0C 0x21
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_SNOOP_HIT 0x02 0x0C 0x22
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_HITM 0x02 0x0C 0x24
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_NON_DRAM 0x02 0x0C 0x25
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_AVG_LAT 0x02 0x0C 0x26
-
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_ANY 0x02 0x0D 0x10
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_L2_HIT 0x02 0x0D 0x12
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_SNP_NONE 0x02 0x0D 0x1F
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_SNOOP_MISS 0x02 0x0D 0x21
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_SNOOP_HIT 0x02 0x0D 0x22
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_HITM 0x02 0x0D 0x24
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_NON_DRAM 0x02 0x0D 0x25
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_AVG_LAT 0x02 0x0D 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_ANY 0x02 0x0E 0x10
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_L2_HIT 0x02 0x0E 0x12
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_SNP_NONE 0x02 0x0E 0x1F
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_SNOOP_MISS 0x02 0x0E 0x21
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_SNOOP_HIT 0x02 0x0E 0x22
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_HITM 0x02 0x0E 0x24
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_NON_DRAM 0x02 0x0E 0x25
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_AVG_LAT 0x02 0x0E 0x26
-
-UMASK_OFFCORE_RESPONSE_1_ANY_ANY 0x02 0x0F 0x10
-UMASK_OFFCORE_RESPONSE_1_ANY_L2_HIT 0x02 0x0F 0x12
-UMASK_OFFCORE_RESPONSE_1_ANY_SNP_NONE 0x02 0x0F 0x1F
-UMASK_OFFCORE_RESPONSE_1_ANY_SNOOP_MISS 0x02 0x0F 0x21
-UMASK_OFFCORE_RESPONSE_1_ANY_SNOOP_HIT 0x02 0x0F 0x22
-UMASK_OFFCORE_RESPONSE_1_ANY_HITM 0x02 0x0F 0x24
-UMASK_OFFCORE_RESPONSE_1_ANY_NON_DRAM 0x02 0x0F 0x25
-UMASK_OFFCORE_RESPONSE_1_ANY_AVG_LAT 0x02 0x0F 0x26
-
-
EVENT_INST_RETIRED 0xC0 PMC
UMASK_INST_RETIRED_ANY_P 0x00
@@ -390,32 +104,33 @@ UMASK_UOPS_RETIRED_MS 0x01
UMASK_UOPS_RETIRED_ALL 0x10
EVENT_MACHINE_CLEARS 0xC3 PMC
-UMASK_MACHINE_CLEARS_SMC 0x01
-UMASK_MACHINE_CLEARS_MEMORY_ORDERING 0x02
-UMASK_MACHINE_CLEARS_FP_ASSIST 0x04
-UMASK_MACHINE_CLEARS_ALL 0x08
-
-EVENT_BR_INST_RETIRED 0xC4 PMC
-UMASK_BR_INST_RETIRED_ALL_BRANCHES 0x00
-UMASK_BR_INST_RETIRED_JCC 0x7E
-UMASK_BR_INST_RETIRED_FAR_BRANCH 0xBF
+UMASK_MACHINE_CLEARS_SMC 0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING 0x02
+UMASK_MACHINE_CLEARS_FP_ASSIST 0x04
+UMASK_MACHINE_CLEARS_ALL 0x08
+
+
+EVENT_BR_INST_RETIRED 0xC4 PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_INST_RETIRED_JCC 0x7E
+UMASK_BR_INST_RETIRED_TAKEN_JCC 0xFE
+UMASK_BR_INST_RETIRED_FAR_BRANCH 0xBF
UMASK_BR_INST_RETIRED_NON_RETURN_IND 0xEB
-UMASK_BR_INST_RETIRED_RETURN 0xF7
-UMASK_BR_INST_RETIRED_CALL 0xF9
-UMASK_BR_INST_RETIRED_IND_CALL 0xFB
-UMASK_BR_INST_RETIRED_REL_CALL 0xFD
-UMASK_BR_INST_RETIRED_TAKEN_JCC 0xFE
-
-EVENT_BR_MISP_RETIRED 0xC5 PMC
-UMASK_BR_MISP_RETIRED_ALL_BRANCHES 0x00
-UMASK_BR_MISP_RETIRED_JCC 0x7E
-UMASK_BR_MISP_RETIRED_FAR_BRANCH 0xBF
+UMASK_BR_INST_RETIRED_RETURN 0xF7
+UMASK_BR_INST_RETIRED_CALL 0xF9
+UMASK_BR_INST_RETIRED_IND_CALL 0xFB
+UMASK_BR_INST_RETIRED_REL_CALL 0xFD
+
+EVENT_BR_MISP_RETIRED 0xC5 PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_MISP_RETIRED_JCC 0x7E
+UMASK_BR_MISP_RETIRED_TAKEN_JCC 0xFE
+UMASK_BR_MISP_RETIRED_FAR_BRANCH 0xBF
UMASK_BR_MISP_RETIRED_NON_RETURN_IND 0xEB
-UMASK_BR_MISP_RETIRED_RETURN 0xF7
-UMASK_BR_MISP_RETIRED_CALL 0xF9
-UMASK_BR_MISP_RETIRED_IND_CALL 0xFB
-UMASK_BR_MISP_RETIRED_REL_CALL 0xFD
-UMASK_BR_MISP_RETIRED_TAKEN_JCC 0xFE
+UMASK_BR_MISP_RETIRED_RETURN 0xF7
+UMASK_BR_MISP_RETIRED_CALL 0xF9
+UMASK_BR_MISP_RETIRED_IND_CALL 0xFB
+UMASK_BR_MISP_RETIRED_REL_CALL 0xFD
EVENT_NO_ALLOC_CYCLES 0xCA PMC
UMASK_NO_ALLOC_CYCLES_ROB_FULL 0x01
@@ -430,7 +145,7 @@ UMASK_RS_FULL_STALL_ALL 0x1F
EVENT_CYCLES_DIV_BUSY 0xCD PMC
UMASK_CYCLES_DIV_BUSY_ANY 0x01
-EVENT_BACLEARS 0xE6 PMC
+EVENT_BACLEARS 0xE6 PMC
UMASK_BACLEARS_ALL 0x01
UMASK_BACLEARS_RETURN 0x08
UMASK_BACLEARS_COND 0x10
@@ -438,3 +153,46 @@ UMASK_BACLEARS_COND 0x10
EVENT_MS_DECODED 0xE7 PMC
UMASK_MS_DECODED_MS_ENTRY 0x01
+EVENT_OFFCORE_RESPONSE_0 0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_UC_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_ANY 0x01 0x0C 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L1_DATA_RD_ANY 0x01 0x0D 0x10
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_STREAMING_STORES_ANY 0x01 0x0E 0x10
+UMASK_OFFCORE_RESPONSE_0_ANY_ANY 0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1 0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS 0x02 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY 0x02 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY 0x02 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY 0x02 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY 0x02 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY 0x02 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY 0x02 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY 0x02 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_ANY 0x02 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_ANY 0x02 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_UC_CODE_RD_ANY 0x02 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_ANY 0x02 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY 0x02 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_ANY 0x02 0x0C 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L1_DATA_RD_ANY 0x02 0x0D 0x10
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_STREAMING_STORES_ANY 0x02 0x0E 0x10
+UMASK_OFFCORE_RESPONSE_1_ANY_ANY 0x02 0x0F 0x10
+
+
+
+
diff --git a/src/includes/perfmon_skylake.h b/src/includes/perfmon_skylake.h
new file mode 100644
index 0000000..1a10dc4
--- /dev/null
+++ b/src/includes/perfmon_skylake.h
@@ -0,0 +1,753 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: perfmon_skylake.h
+ *
+ * Description: Header File of perfmon module for Intel Skylake.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <perfmon_skylake_events.h>
+#include <perfmon_skylake_counters.h>
+#include <error.h>
+#include <affinity.h>
+#include <limits.h>
+#include <topology.h>
+#include <access.h>
+
+static int perfmon_numCountersSkylake = NUM_COUNTERS_SKYLAKE;
+static int perfmon_numCoreCountersSkylake = NUM_COUNTERS_CORE_SKYLAKE;
+static int perfmon_numArchEventsSkylake = NUM_ARCH_EVENTS_SKYLAKE;
+
+int perfmon_init_skylake(int cpu_id)
+{
+ lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+ lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+ return 0;
+}
+
+uint32_t skl_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint32_t flags = (1ULL<<(1+(index*4)));
+ for(j=0;j<event->numberOfOptions;j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<(index*4));
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+ flags |= (1ULL<<(2+(index*4)));
+ default:
+ break;
+ }
+ }
+ return flags;
+}
+
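skl_fixed_setup() packs one 4-bit control field per fixed counter into the value later written to MSR_PERF_FIXED_CTR_CTRL: bit 4*i+1 (always set here) enables user-mode counting for counter i, EVENT_OPTION_COUNT_KERNEL adds bit 4*i, and EVENT_OPTION_ANYTHREAD adds bit 4*i+2. The snippet below is only a condensed restatement of that layout with illustrative names; it is not part of the patch.

#include <stdint.h>

/* Bit layout assembled above for fixed counter `index`:
 *   bit 4*index     - count in kernel mode (EVENT_OPTION_COUNT_KERNEL)
 *   bit 4*index + 1 - count in user mode (always enabled)
 *   bit 4*index + 2 - count for any hardware thread (EVENT_OPTION_ANYTHREAD) */
static uint64_t skl_fixed_ctrl_bits(int index, int count_kernel, int anythread)
{
    uint64_t flags = 1ULL << (1 + (index * 4));
    if (count_kernel)
        flags |= 1ULL << (index * 4);
    if (anythread)
        flags |= 1ULL << (2 + (index * 4));
    return flags;
}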
+int skl_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ uint64_t offcore_flags = 0x0ULL;
+ uint64_t latency_flags = 0x0ULL;
+
+ flags = (1ULL<<22)|(1ULL<<16);
+ /* Intel with standard 8 bit event mask: [7:0] */
+ flags |= (event->umask<<8) + event->eventId;
+
+ /* set custom cfg and cmask */
+ if ((event->cfgBits != 0) &&
+ (event->eventId != 0xB7) &&
+ (event->eventId != 0xBB))
+ {
+ flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+ }
+
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<17);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+ flags |= (1ULL<<21);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL) << 24;
+ break;
+ case EVENT_OPTION_IN_TRANS:
+ flags |= (1ULL<<32);
+ break;
+ case EVENT_OPTION_IN_TRANS_ABORT:
+ flags |= (1ULL<<33);
+ break;
+ case EVENT_OPTION_MATCH0:
+ offcore_flags |= (event->options[j].value & 0x8FFFULL);
+ break;
+ case EVENT_OPTION_MATCH1:
+ offcore_flags |= (event->options[j].value<< 16);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ if (event->eventId == 0xB7)
+ {
+ if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+ {
+ offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+ }
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+ }
+ else if (event->eventId == 0xBB)
+ {
+ if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+ {
+ offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+ }
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister , flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
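For OFFCORE_RESPONSE events (eventId 0xB7/0xBB), skl_pmc_setup() does not fold cfgBits and cmask into the event-select register; it treats them as bit positions and ORs the corresponding bits into MSR_OFFCORE_RESP0/1. The example below expands the columns of one umask line added to perfmon_silvermont_events.txt earlier in this patch, UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY 0x01 0x00 0x10; reading bit 0 as "demand data read" and bit 16 as "any response" follows the usual offcore-response MSR layout and is stated here as an assumption, not taken from the patch.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Columns of the umask line: umask 0x01, cfgBits 0x00, cmask 0x10 */
    unsigned cfgBits = 0x00;
    unsigned cmask = 0x10;
    uint64_t offcore_flags = (1ULL << cfgBits) | (1ULL << cmask);
    /* prints 0x10001: bit 0 selects the request type, bit 16 the response type */
    printf("MSR_OFFCORE_RESP0 = 0x%" PRIx64 "\n", offcore_flags);
    return 0;
}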
+int skl_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ flags = (1ULL<<22)|(1ULL<<20);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0x1FULL) << 24;
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_CBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+int perfmon_setupCounterThread_skylake(
+ int thread_id,
+ PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ uint64_t flags;
+ uint64_t fixed_flags = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0xC00000070000000F));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+ }
+ if (haveLock && eventSet->regTypeMask & ~(0xFULL))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_UBOXFIX)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, 0x0ULL));
+ }
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ uint64_t reg = counter_map[index].configRegister;
+ eventSet->events[i].threadCounter[thread_id].init = TRUE;
+ flags = 0x0ULL;
+ switch (type)
+ {
+ case PMC:
+ skl_pmc_setup(cpu_id, index, event);
+ break;
+
+ case FIXED:
+ fixed_flags |= skl_fixed_setup(cpu_id, index, event);
+ break;
+
+ case POWER:
+ break;
+ case UBOXFIX:
+ if (haveLock)
+ {
+ uint64_t uflags = 0x0ULL;
+ uflags |= (1ULL<<20)|(1ULL<<22);
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, uflags, SETUP_UBOXFIX)
+ HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, uflags);
+ }
+ break;
+ case UBOX:
+ if (haveLock)
+ {
+ uint64_t uflags = 0x0ULL;
+ uflags |= (1ULL<<20)|(1ULL<<22);
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, uflags, CLEAR_UBOX)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, uflags));
+ }
+ break;
+ case CBOX0:
+ case CBOX1:
+ case CBOX2:
+ case CBOX3:
+ skl_cbox_setup(cpu_id, index, event);
+ break;
+ default:
+ break;
+ }
+ }
+ if ((fixed_flags > 0x0ULL))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
+ }
+ return 0;
+}
+
+int perfmon_startCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ uint64_t flags = 0x0ULL;
+ uint64_t uflags = 0x0ULL;
+ uint64_t tmp = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ tmp = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter1 = counter_map[index].counterRegister;
+
+ PciDeviceIndex dev = counter_map[index].device;
+ eventSet->events[i].threadCounter[thread_id].startData = 0;
+ eventSet->events[i].threadCounter[thread_id].counterData = 0;
+ switch (type)
+ {
+ case PMC:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+ flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr)); /* enable counter */
+ break;
+
+ case FIXED:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+ flags |= (1ULL<<(index+32)); /* enable fixed counter */
+ break;
+
+ case POWER:
+ if (haveLock)
+ {
+ tmp = 0x0ULL;
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1,(uint32_t*)&tmp));
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST tmp, START_POWER)
+ eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
+ }
+ break;
+ case UBOXFIX:
+ if (haveLock)
+ {
+ VERBOSEPRINTREG(cpu_id, counter1, 0x0ULL, CLEAR_UBOXFIX)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+ }
+ break;
+ case UBOX:
+ if (haveLock)
+ {
+ VERBOSEPRINTREG(cpu_id, counter1, 0x0ULL, CLEAR_UBOX)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+ }
+ break;
+ case CBOX0:
+ case CBOX1:
+ case CBOX2:
+ case CBOX3:
+ if (haveLock)
+ {
+ uflags |= (1ULL<<(type-CBOX0));
+ }
+ break;
+ default:
+ break;
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = eventSet->events[i].threadCounter[thread_id].startData;
+ }
+ }
+
+ if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, uflags|(1ULL<<29), UNFREEZE_UBOXFIX)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, uflags|(1ULL<<29)));
+ }
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST (1ULL<<63)|(1ULL<<62)|flags, CLEAR_PMC_AND_FIXED_OVERFLOW)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|flags));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_AND_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+ }
+
+ return 0;
+}
+
+
+#define SKL_CHECK_CORE_OVERFLOW(offset) \
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+ { \
+ uint64_t ovf_values = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values)); \
+ if (ovf_values & (1ULL<<offset)) \
+ { \
+ eventSet->events[i].threadCounter[thread_id].overflows++; \
+ } \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<offset))); \
+ }
+
+#define SKL_CHECK_UNCORE_OVERFLOW(offset) \
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+ { \
+ uint64_t ovf_values = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, &ovf_values)); \
+ if (ovf_values & (1ULL<<offset)) \
+ { \
+ eventSet->events[i].threadCounter[thread_id].overflows++; \
+ } \
+ }
+
+#define SKL_CHECK_LOCAL_OVERFLOW \
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+ { \
+ uint64_t ovf_values = 0x0ULL; \
+ uint64_t offset = getCounterTypeOffset(eventSet->events[i].index); \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[eventSet->events[i].type].statusRegister, &ovf_values)); \
+ if (ovf_values & (1ULL<<offset)) \
+ { \
+ eventSet->events[i].threadCounter[thread_id].overflows++; \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[eventSet->events[i].type].statusRegister, (1ULL<<offset))); \
+ } \
+ }
+
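All three SKL_CHECK_*_OVERFLOW macros share the same heuristic: a freshly read raw value that is smaller than the previously stored counterData means the hardware counter wrapped (or was reset), and the matching bit of the relevant status register is then consulted before the per-thread overflow count is incremented. A minimal standalone restatement of that check, with illustrative names only and not part of the patch:

#include <stdint.h>

/* Returns 1 if the counter should be treated as overflowed: the raw value
 * went backwards and the unit's status register flags this counter's bit.
 * The caller increments .overflows and, in the core and local variants of
 * the macros above, also clears the status bit afterwards. */
static int skl_overflow_happened(uint64_t raw, uint64_t previous,
                                 uint64_t status, int offset)
{
    return (raw < previous) && ((status >> offset) & 0x1ULL);
}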
+int perfmon_stopCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ uint64_t counter_result = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+ if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_UBOXFIX)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+ int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+ switch (type)
+ {
+ case PMC:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ SKL_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+ *current = field64(counter_result, 0, box_map[type].regWidth);
+ break;
+
+ case FIXED:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ SKL_CHECK_CORE_OVERFLOW(index+32);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
+ *current = field64(counter_result, 0, box_map[type].regWidth);
+ break;
+
+ case POWER:
+ if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
+ {
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ }
+ *current = field64(counter_result, 0, box_map[type].regWidth);
+ }
+ break;
+
+ case THERMAL:
+ CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
+ *current = field64(counter_result, 0, box_map[type].regWidth);
+ break;
+
+ case UBOXFIX:
+ if (haveLock)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
+ *current = field64(counter_result, 0, 44);
+ }
+ break;
+ case UBOX:
+ if (haveLock)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
+ *current = field64(counter_result, 0, 44);
+ }
+ break;
+ case CBOX0:
+ case CBOX1:
+ case CBOX2:
+ case CBOX3:
+ if (haveLock)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
+ *current = field64(counter_result, 0, box_map[type].regWidth);
+ }
+ break;
+
+ default:
+ break;
+ }
+ }
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
+ }
+ if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, &counter_result));
+ if (counter_result != 0x0ULL)
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, counter_result));
+ }
+ }
+
+
+ return 0;
+}
+
+
+int perfmon_readCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
+{
+ uint64_t flags = 0x0ULL;
+ uint64_t uflags = 0x0ULL;
+ int haveLock = 0;
+ uint64_t counter_result = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &flags));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, SAFE_PMC_FLAGS)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, RESET_PMC_FLAGS)
+ }
+
+ if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, &uflags));
+ VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, LLU_CAST uflags, SAFE_UBOXFIX_FLAGS)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL, RESET_UBOXFIX_FLAGS)
+ }
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ counter_result= 0x0ULL;
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+ int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+ switch (type)
+ {
+ case PMC:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ SKL_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+ break;
+
+ case FIXED:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ SKL_CHECK_CORE_OVERFLOW(index+32);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+ break;
+
+ case POWER:
+ if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
+ {
+ CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+ {
+ eventSet->events[i].threadCounter[thread_id].overflows++;
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+ }
+ break;
+
+ case THERMAL:
+ CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+ break;
+
+ case UBOXFIX:
+ if (haveLock)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
+ *current = field64(counter_result, 0, box_map[type].regWidth);
+ }
+ break;
+ case UBOX:
+ if (haveLock)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
+ *current = field64(counter_result, 0, box_map[type].regWidth);
+ }
+ break;
+ case CBOX0:
+ case CBOX1:
+ case CBOX2:
+ case CBOX3:
+ if (haveLock)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
+ *current = field64(counter_result, 0, box_map[type].regWidth);
+ }
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+ if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, &counter_result));
+ if (counter_result != 0x0ULL)
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, counter_result));
+ }
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, uflags));
+ VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, uflags, RESET_UBOXFIX_FLAGS)
+ }
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, RESTORE_PMC_FLAGS)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+ }
+
+ return 0;
+}
+
+int perfmon_finalizeCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ int haveTileLock = 0;
+ int clearPBS = 0;
+ uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62);
+ uint64_t ovf_values_UBOXFIX = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+ if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+ {
+ haveTileLock = 1;
+ }
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PciDeviceIndex dev = counter_map[index].device;
+ uint64_t reg = counter_map[index].configRegister;
+ switch (type)
+ {
+ case PMC:
+ ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+ if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+ }
+ else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+ }
+ break;
+ case FIXED:
+ ovf_values_core |= (1ULL<<(index+32));
+ break;
+ default:
+ /*if (counter_map[index].type > UBOXFIX)
+ {
+ if (box_map[counter_map[index].type].ovflOffset >= 0)
+ {
+ ovf_values_UBOXFIX |= (1ULL<<box_map[counter_map[index].type].ovflOffset);
+ }
+ }*/
+ break;
+ }
+ if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UBOXFIX) && (haveLock))))
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, reg, &ovf_values_UBOXFIX));
+ VERBOSEPRINTPCIREG(cpu_id, dev, reg, ovf_values_UBOXFIX, SHOW_CTL);
+ ovf_values_UBOXFIX = 0x0ULL;
+ VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+ if ((type >= SBOX0) && (type <= SBOX3))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+ }
+ }
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
+ }
+ if (haveLock && eventSet->regTypeMask & ~(0xFULL))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST ovf_values_UBOXFIX, CLEAR_UBOXFIX_OVF)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, ovf_values_UBOXFIX));
+ VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST 0x0ULL, CLEAR_UBOXFIX_CTRL)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, 0x0ULL));
+ }
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+ return 0;
+}
diff --git a/src/includes/perfmon_skylake_counters.h b/src/includes/perfmon_skylake_counters.h
new file mode 100644
index 0000000..9b0e2c7
--- /dev/null
+++ b/src/includes/perfmon_skylake_counters.h
@@ -0,0 +1,84 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: perfmon_skylake_counters.h
+ *
+ * Description: Counter Header File of perfmon module for Intel Skylake.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_SKYLAKE 24
+#define NUM_COUNTERS_CORE_SKYLAKE 8
+#define NUM_COUNTERS_UNCORE_SKYLAKE 24
+
+#define SKL_VALID_OPTIONS_FIXED EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK
+#define SKL_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+ EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK
+
+static RegisterMap skylake_counter_map[NUM_COUNTERS_SKYLAKE] = {
+ /* Fixed Counters: instructions retired, cycles unhalted core */
+ {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, HAS_VALID_OPTIONS_FIXED},
+ {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, HAS_VALID_OPTIONS_FIXED},
+ {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, HAS_VALID_OPTIONS_FIXED},
+ /* PMC Counters: 4 48bit wide */
+ {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, HAS_VALID_OPTIONS_PMC},
+ {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, HAS_VALID_OPTIONS_PMC},
+ {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, HAS_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK},
+ {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, HAS_VALID_OPTIONS_PMC},
+ /* Temperature Sensor*/
+ {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ /* RAPL counters */
+ {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"PWR4", PMC12, POWER, 0, MSR_PLATFORM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+ /* Test */
+ {"UBOXFIX", PMC13, UBOXFIX, MSR_UNC_PERF_FIXED_CTRL, MSR_UNC_PERF_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"UBOX0", PMC14, UBOX, MSR_V4_ARB_PERF_FIXED_CTRL0, MSR_V4_ARB_PERF_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"UBOX1", PMC15, UBOX, MSR_V4_ARB_PERF_FIXED_CTRL1, MSR_V4_ARB_PERF_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"CBOX0C0", PMC16, CBOX0, MSR_V4_C0_PERF_FIXED_CTRL0, MSR_V4_C0_PERF_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"CBOX0C1", PMC17, CBOX0, MSR_V4_C0_PERF_FIXED_CTRL1, MSR_V4_C0_PERF_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"CBOX1C0", PMC18, CBOX1, MSR_V4_C1_PERF_FIXED_CTRL0, MSR_V4_C1_PERF_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"CBOX1C1", PMC19, CBOX1, MSR_V4_C1_PERF_FIXED_CTRL1, MSR_V4_C1_PERF_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"CBOX2C0", PMC20, CBOX2, MSR_V4_C2_PERF_FIXED_CTRL0, MSR_V4_C2_PERF_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"CBOX2C1", PMC21, CBOX2, MSR_V4_C2_PERF_FIXED_CTRL1, MSR_V4_C2_PERF_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"CBOX3C0", PMC22, CBOX3, MSR_V4_C3_PERF_FIXED_CTRL0, MSR_V4_C3_PERF_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"CBOX3C1", PMC23, CBOX3, MSR_V4_C3_PERF_FIXED_CTRL1, MSR_V4_C3_PERF_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+};
+
+
+static BoxMap skylake_box_map[NUM_UNITS] = {
+ [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_V4_PERF_GLOBAL_STATUS, MSR_V4_PERF_GLOBAL_STATUS_RESET, 0, 0, 0, 48},
+ [THERMAL] = {0, 0, 0, 0, 0, 0, 8},
+ [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_V4_PERF_GLOBAL_STATUS, MSR_V4_PERF_GLOBAL_STATUS_RESET, 0, 0, 0, 48},
+ [POWER] = {0, 0, 0, 0, 0, 0, 32},
+ [UBOXFIX] = {MSR_V4_UNC_PERF_GLOBAL_CTRL, MSR_V4_UNC_PERF_GLOBAL_STATUS, MSR_V4_UNC_PERF_GLOBAL_STATUS, 0, 0, 0, 44},
+ [UBOX] = {MSR_V4_UNC_PERF_GLOBAL_CTRL, MSR_V4_UNC_PERF_GLOBAL_STATUS, MSR_V4_UNC_PERF_GLOBAL_STATUS, 1, 0, 0, 44},
+ [CBOX0] = {MSR_V4_UNC_PERF_GLOBAL_CTRL, MSR_V4_UNC_PERF_GLOBAL_STATUS, MSR_V4_UNC_PERF_GLOBAL_STATUS, 3, 0, 0, 44},
+ [CBOX1] = {MSR_V4_UNC_PERF_GLOBAL_CTRL, MSR_V4_UNC_PERF_GLOBAL_STATUS, MSR_V4_UNC_PERF_GLOBAL_STATUS, 3, 0, 0, 44},
+ [CBOX2] = {MSR_V4_UNC_PERF_GLOBAL_CTRL, MSR_V4_UNC_PERF_GLOBAL_STATUS, MSR_V4_UNC_PERF_GLOBAL_STATUS, 3, 0, 0, 44},
+ [CBOX3] = {MSR_V4_UNC_PERF_GLOBAL_CTRL, MSR_V4_UNC_PERF_GLOBAL_STATUS, MSR_V4_UNC_PERF_GLOBAL_STATUS, 3, 0, 0, 44},
+};
diff --git a/src/includes/perfmon_skylake_events.txt b/src/includes/perfmon_skylake_events.txt
new file mode 100644
index 0000000..9ce3b9a
--- /dev/null
+++ b/src/includes/perfmon_skylake_events.txt
@@ -0,0 +1,599 @@
+# =======================================================================================
+#
+# Filename: perfmon_skylake_events.txt
+#
+# Description: Event list for Intel Skylake
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
+# Project: likwid
+#
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, either version 3 of the License, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE 0x00 TMP0
+UMASK_TEMP_CORE 0x00
+
+EVENT_PWR_PKG_ENERGY 0x00 PWR0
+UMASK_PWR_PKG_ENERGY 0x00
+
+EVENT_PWR_PP0_ENERGY 0x00 PWR1
+UMASK_PWR_PP0_ENERGY 0x00
+
+EVENT_PWR_PP1_ENERGY 0x00 PWR2
+UMASK_PWR_PP1_ENERGY 0x00
+
+EVENT_PWR_DRAM_ENERGY 0x00 PWR3
+UMASK_PWR_DRAM_ENERGY 0x00
+
+EVENT_PWR_PLATFORM_ENERGY 0x00 PWR4
+UMASK_PWR_PLATFORM_ENERGY 0x00
+
+EVENT_INSTR_RETIRED 0x00 FIXC0
+UMASK_INSTR_RETIRED_ANY 0x00
+
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE 0x00
+
+EVENT_CPU_CLK_UNHALTED 0x00 FIXC2
+UMASK_CPU_CLK_UNHALTED_REF 0x00
+
+EVENT_ICACHE_16B_IFDATA_STALL 0x80 PMC
+UMASK_ICACHE_16B_IFDATA_STALL 0x04
+
+EVENT_ICACHE_64B_IFTAG 0x83 PMC
+UMASK_ICACHE_64B_IFTAG_HIT 0x01
+UMASK_ICACHE_64B_IFTAG_MISS 0x02
+UMASK_ICACHE_64B_IFTAG_ALL 0x03
+UMASK_ICACHE_64B_IFTAG_STALL 0x04
+
+EVENT_CPU_CLOCK_UNHALTED 0x3C PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P 0x00
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_THREAD_P_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P_ANY 0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK 0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_REF_XCLK_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK_ANY 0x01
+UMASK_CPU_CLOCK_THREAD_UNHALTED_ONE_THREAD_ACTIVE 0x02
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES 0x00
+
+EVENT_BACLEARS 0xE6 PMC
+UMASK_BACLEARS_ANY 0x01
+
+EVENT_ITLB_FLUSH 0xAE PMC
+UMASK_ITLB_FLUSH 0x01
+
+EVENT_LSD_UOPS 0xA8 PMC
+UMASK_LSD_UOPS 0x01
+
+EVENT_ILD_STALL_LCP 0x87 PMC
+UMASK_ILD_STALL_LCP 0x01
+
+EVENT_IDQ 0x79 PMC
+UMASK_IDQ_MITE_UOPS 0x04
+UMASK_IDQ_DSB_UOPS 0x08
+UMASK_IDQ_MS_MITE_UOPS 0x20
+
+EVENT_IDQ 0x79 PMC
+DEFAULT_OPTIONS_IDQ_MS_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MS_CYCLES 0x30
+DEFAULT_OPTIONS_IDQ_MITE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MITE_CYCLES 0x04
+DEFAULT_OPTIONS_IDQ_DSB_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_DSB_CYCLES 0x08
+DEFAULT_OPTIONS_IDQ_MS_DSB_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MS_DSB_CYCLES 0x10
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS 0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS 0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS 0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS 0x24
+
+EVENT_IDQ_ALL_MITE_CYCLES_ANY_UOPS 0x9C PMC
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x3
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x2
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=0x1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK 0x01
+
+EVENT_DSB2MITE_SWITCHES_PENALTY_CYCLES 0xAB PMC
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
+
+EVENT_INT_MISC 0x0D PMC
+UMASK_INT_MISC_RECOVERY_CYCLES 0x01
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_RECOVERY_COUNT 0x01
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES_ANY EVENT_OPTION_ANYTHREAD=0x1
+UMASK_INT_MISC_RECOVERY_CYCLES_ANY 0x01
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT_ANY EVENT_OPTION_ANYTHREAD=0x1,EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_RECOVERY_COUNT_ANY 0x01
+UMASK_INT_MISC_CLEAR_RESTEER_CYCLES 0x80
+DEFAULT_OPTIONS_INT_MISC_CLEAR_RESTEER_COUNT EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_CLEAR_RESTEER_COUNT 0x80
+
+
+EVENT_RESOURCE_STALLS 0xA2 PMC
+UMASK_RESOURCE_STALLS_ANY 0x01
+UMASK_RESOURCE_STALLS_SB 0x08
+
+EVENT_UOPS_ISSUED 0x0E PMC
+UMASK_UOPS_ISSUED_ANY 0x01
+UMASK_UOPS_ISSUED_VECTOR_WIDTH_MISMATCH 0x02
+UMASK_UOPS_ISSUED_SLOW_LEA 0x20
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES 0x01
+
+
+EVENT_TX_EXEC 0x5D PMC
+UMASK_TX_EXEC_MISC1 0x01
+UMASK_TX_EXEC_MISC2 0x02
+UMASK_TX_EXEC_MISC3 0x04
+UMASK_TX_EXEC_MISC4 0x08
+UMASK_TX_EXEC_MISC5 0x10
+
+EVENT_RS_EVENTS_EMPTY 0x5E PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES 0x01
+DEFAULT_OPTIONS_RS_EVENTS_EMPTY_END EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=0x1,EVENT_OPTION_EDGE=0x1
+UMASK_RS_EVENTS_EMPTY_END 0x01
+
+EVENT_HLE_RETIRED 0xC8 PMC
+UMASK_HLE_RETIRED_START 0x01
+UMASK_HLE_RETIRED_COMMIT 0x02
+UMASK_HLE_RETIRED_ABORTED 0x04
+UMASK_HLE_RETIRED_ABORTED_MEM 0x08
+UMASK_HLE_RETIRED_ABORTED_TIMER 0x10
+UMASK_HLE_RETIRED_ABORTED_UNFRIENDLY 0x20
+UMASK_HLE_RETIRED_ABORTED_MEMTYPE 0x40
+UMASK_HLE_RETIRED_ABORTED_EVENTS 0x80
+
+EVENT_RTM_RETIRED 0xC9 PMC
+UMASK_RTM_RETIRED_START 0x01
+UMASK_RTM_RETIRED_COMMIT 0x02
+UMASK_RTM_RETIRED_ABORTED 0x04
+UMASK_RTM_RETIRED_ABORTED_MEM 0x08
+UMASK_RTM_RETIRED_ABORTED_TIMER 0x10
+UMASK_RTM_RETIRED_ABORTED_UNFRIENDLY 0x20
+UMASK_RTM_RETIRED_ABORTED_MEMTYPE 0x40
+UMASK_RTM_RETIRED_ABORTED_EVENTS 0x80
+
+EVENT_MACHINE_CLEARS 0xC3 PMC
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=0x1
+UMASK_MACHINE_CLEARS_COUNT 0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING 0x02
+UMASK_MACHINE_CLEARS_SMC 0x04
+
+EVENT_HW_INTERRUPTS_RECEIVED 0xCB PMC
+UMASK_HW_INTERRUPTS_RECEIVED 0x01
+
+EVENT_INST_RETIRED 0xC0 PMC
+UMASK_INST_RETIRED_ANY 0x00
+
+EVENT_UOPS_RETIRED 0xC2 PMC
+UMASK_UOPS_RETIRED_ALL 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL 0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS 0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES 0x01
+
+EVENT_BR_INST_RETIRED 0xC4 PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL 0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL 0x02
+UMASK_BR_INST_RETIRED_NEAR_RETURN 0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN 0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN 0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH 0x40
+
+EVENT_BR_MISP_RETIRED 0xC5 PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES 0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL 0x01
+UMASK_BR_MISP_RETIRED_NEAR_TAKEN 0x20
+
+EVENT_FP_ARITH_INST_RETIRED 0xC7 PMC
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_DOUBLE 0x01
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_SINGLE 0x02
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE 0x04
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE 0x08
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE 0x10
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE 0x20
+
+EVENT_FP_ASSIST_ANY 0xCA PMC
+DEFAULT_OPTIONS_FP_ASSIST_ANY EVENT_OPTION_THRESHOLD=0x1
+UMASK_FP_ASSIST_ANY 0x1E
+
+EVENT_MEM_INST_RETIRED 0xD0 PMC
+UMASK_MEM_INST_RETIRED_STLB_MISS_LOADS 0x11
+UMASK_MEM_INST_RETIRED_STLB_MISS_STORES 0x12
+UMASK_MEM_INST_RETIRED_LOCK_LOADS 0x21
+UMASK_MEM_INST_RETIRED_SPLIT_LOADS 0x41
+UMASK_MEM_INST_RETIRED_SPLIT_STORES 0x42
+UMASK_MEM_INST_RETIRED_ALL_LOADS 0x81
+UMASK_MEM_INST_RETIRED_ALL_STORES 0x82
+UMASK_MEM_INST_RETIRED_ALL 0x83
+
+EVENT_MEM_LOAD_RETIRED 0xD1 PMC
+UMASK_MEM_LOAD_RETIRED_L1_HIT 0x01
+UMASK_MEM_LOAD_RETIRED_L2_HIT 0x02
+UMASK_MEM_LOAD_RETIRED_L3_HIT 0x04
+UMASK_MEM_LOAD_RETIRED_L1_MISS 0x08
+UMASK_MEM_LOAD_RETIRED_L2_MISS 0x10
+UMASK_MEM_LOAD_RETIRED_L3_MISS 0x20
+UMASK_MEM_LOAD_RETIRED_FB_HIT 0x40
+
+EVENT_MEM_LOAD_L3_HIT_RETIRED 0xD2 PMC
+UMASK_MEM_LOAD_L3_HIT_RETIRED_XSNP_MISS 0x01
+UMASK_MEM_LOAD_L3_HIT_RETIRED_XSNP_HIT 0x02
+UMASK_MEM_LOAD_L3_HIT_RETIRED_XSNP_HITM 0x04
+UMASK_MEM_LOAD_L3_HIT_RETIRED_XSNP_NONE 0x08
+
+EVENT_FRONTEND_RETIRED 0xC6 PMC
+UMASK_FRONTEND_RETIRED_DSB_MISS 0x01 0x00 0x11
+UMASK_FRONTEND_RETIRED_L1I_MISS 0x01 0x00 0x12
+UMASK_FRONTEND_RETIRED_L2_MISS 0x01 0x00 0x13
+UMASK_FRONTEND_RETIRED_ITLB_MISS 0x01 0x00 0x14
+UMASK_FRONTEND_RETIRED_STLB_MISS 0x01 0x00 0x15
+UMASK_FRONTEND_RETIRED_LATENCY_GE_2 0x01 0x00 0x400206
+UMASK_FRONTEND_RETIRED_LATENCY_GE_2_BUBBLES_GE_2 0x01 0x00 0x200206
+UMASK_FRONTEND_RETIRED_LATENCY_GE_4 0x01 0x00 0x400406
+
+EVENT_UOPS_EXECUTED 0xB1 PMC
+UMASK_UOPS_EXECUTED_THREAD 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_NONE_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CYCLES_NONE_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_NONE_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_NONE_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+UMASK_UOPS_EXECUTED_X87 0x10
+
+
+EVENT_EXE_ACTIVITY 0xA6 PMC
+UMASK_EXE_ACTIVITY_EXE_BOUND_0_PORTS 0x01
+UMASK_EXE_ACTIVITY_1_PORTS_UTIL 0x02
+UMASK_EXE_ACTIVITY_2_PORTS_UTIL 0x04
+UMASK_EXE_ACTIVITY_3_PORTS_UTIL 0x08
+UMASK_EXE_ACTIVITY_4_PORTS_UTIL 0x10
+UMASK_EXE_ACTIVITY_BOUND_ON_STORES 0x40
+
+EVENT_UOPS_DISPATCHED_PORT 0xA1 PMC
+UMASK_UOPS_DISPATCHED_PORT_PORT_0 0x01
+UMASK_UOPS_DISPATCHED_PORT_PORT_1 0x02
+UMASK_UOPS_DISPATCHED_PORT_PORT_2 0x04
+UMASK_UOPS_DISPATCHED_PORT_PORT_3 0x08
+UMASK_UOPS_DISPATCHED_PORT_PORT_4 0x10
+UMASK_UOPS_DISPATCHED_PORT_PORT_5 0x20
+UMASK_UOPS_DISPATCHED_PORT_PORT_6 0x40
+UMASK_UOPS_DISPATCHED_PORT_PORT_7 0x80
+UMASK_UOPS_DISPATCHED_PORT_ARITH_PORTS 0x63
+DEFAULT_OPTIONS_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE 0x63
+DEFAULT_OPTIONS_UOPS_DISPATCHED_PORT_DATA_PORTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_DISPATCHED_PORT_DATA_PORTS 0x9C
+
+EVENT_CYCLE_ACTIVITY_STALLS_TOTAL 0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_TOTAL EVENT_OPTION_THRESHOLD=0x4
+UMASK_CYCLE_ACTIVITY_STALLS_TOTAL 0x04
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L1D_MISS EVENT_OPTION_THRESHOLD=0x8
+UMASK_CYCLE_ACTIVITY_CYCLES_L1D_MISS 0x08
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L1D_MISS EVENT_OPTION_THRESHOLD=0xC
+UMASK_CYCLE_ACTIVITY_STALLS_L1D_MISS 0x0C
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L2_MISS EVENT_OPTION_THRESHOLD=0x1
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_MISS 0x01
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L2_MISS EVENT_OPTION_THRESHOLD=0x5
+UMASK_CYCLE_ACTIVITY_STALLS_L2_MISS 0x05
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L3_MISS EVENT_OPTION_THRESHOLD=0x2
+UMASK_CYCLE_ACTIVITY_CYCLES_L3_MISS 0x02
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L3_MISS EVENT_OPTION_THRESHOLD=0x6
+UMASK_CYCLE_ACTIVITY_STALLS_L3_MISS 0x06
+
+
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_MEM_ANY EVENT_OPTION_THRESHOLD=0x10
+UMASK_CYCLE_ACTIVITY_CYCLES_MEM_ANY 0x10
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_MEM_ANY EVENT_OPTION_THRESHOLD=0x14
+UMASK_CYCLE_ACTIVITY_STALLS_MEM_ANY 0x14
+
+
+EVENT_EPT_WALK_PENDING 0x4F PMC
+UMASK_EPT_WALK_PENDING 0x10
+
+EVENT_ITLB_MISSES 0x85 PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK 0x01
+UMASK_ITLB_MISSES_WALK_PENDING 0x10
+UMASK_ITLB_MISSES_STLB_HIT 0x20
+UMASK_ITLB_MISSES_WALK_COMPLETED 0x0E
+DEFAULT_OPTIONS_ITLB_MISSES_WALK_ACTIVE EVENT_OPTION_THRESHOLD=0x1
+UMASK_ITLB_MISSES_WALK_ACTIVE 0x10
+
+EVENT_DTLB_LOAD_MISSES 0x08 PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK 0x01
+UMASK_DTLB_LOAD_MISSES_WALK_PENDING 0x10
+UMASK_DTLB_LOAD_MISSES_STLB_HIT 0x20
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED 0x0E
+DEFAULT_OPTIONS_DTLB_LOAD_MISSES_WALK_ACTIVE EVENT_OPTION_THRESHOLD=0x1
+UMASK_DTLB_LOAD_MISSES_WALK_ACTIVE 0x10
+
+EVENT_DTLB_STORE_MISSES 0x49 PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK 0x01
+UMASK_DTLB_STORE_MISSES_WALK_PENDING 0x10
+UMASK_DTLB_STORE_MISSES_STLB_HIT 0x20
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED 0x0E
+DEFAULT_OPTIONS_DTLB_STORE_MISSES_WALK_ACTIVE EVENT_OPTION_THRESHOLD=0x1
+UMASK_DTLB_STORE_MISSES_WALK_ACTIVE 0x10
+
+EVENT_TLB_FLUSH 0xBD PMC
+UMASK_TLB_FLUSH_DTLB_THREAD 0x01
+UMASK_TLB_FLUSH_STLB_ANY 0x20
+
+EVENT_L1D 0x51 PMC
+UMASK_L1D_REPLACEMENT 0x01
+UMASK_L1D_M_EVICT 0x04
+
+EVENT_TX_MEM 0x54 PMC
+UMASK_TX_MEM_ABORT_CONFLICT 0x01
+UMASK_TX_MEM_ABORT_CAPACITY 0x02
+UMASK_TX_MEM_ABORT_HLE_STORE_TO_ELIDED_LOCK 0x04
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_NOT_EMPTY 0x08
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_MISMATCH 0x10
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGNMENT 0x20
+UMASK_TX_MEM_HLE_ELISION_BUFFER_FULL 0x40
+
+EVENT_L1D_PEND_MISS 0x48 PMC
+UMASK_L1D_PEND_MISS_PENDING 0x01
+UMASK_L1D_PEND_MISS_FB_FULL 0x02
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_PEND_MISS_PENDING_CYCLES 0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES_ANY EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=0x1
+UMASK_L1D_PEND_MISS_PENDING_CYCLES_ANY 0x01
+
+EVENT_LOAD_HIT_PRE_SW_PF 0x4C PMC
+UMASK_LOAD_HIT_PRE_SW_PF 0x01
+
+EVENT_LOCK_CYCLES_CACHE_LOCK_DURATION 0x63 PMC
+UMASK_LOCK_CYCLES_CACHE_LOCK_DURATION 0x02
+DEFAULT_OPTIONS_LOCK_CYCLES_CACHE_LOCK_COUNT EVENT_OPTION_EDGE=1
+UMASK_LOCK_CYCLES_CACHE_LOCK_COUNT 0x02
+
+EVENT_LD_BLOCKS 0x03 PMC
+UMASK_LD_BLOCKS_STORE_FORWARD 0x02
+UMASK_LD_BLOCKS_NO_SR 0x08
+
+EVENT_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS 0x07 PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS 0x01
+
+EVENT_OFFCORE_REQUESTS 0xB0 PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD 0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD 0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO 0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD 0x08
+UMASK_OFFCORE_REQUESTS_L3_MISS_DEMAND_DATA_RD 0x10
+UMASK_OFFCORE_REQUESTS_ALL_REQUESTS 0x80
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING 0x60 PMC
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD 0x01
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD_GE_6 EVENT_OPTION_THRESHOLD=0x6
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD_GE_6 0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD 0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO 0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD 0x08
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_L3_MISS_DEMAND_DATA_RD 0x10
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_DATA_RD EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_DATA_RD 0x01
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DATA_RD EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DATA_RD 0x08
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_CODE_RD EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_CODE_RD 0x02
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_RFO EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_RFO 0x04
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_L3_MISS_DEMAND_DATA_RD EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_L3_MISS_DEMAND_DATA_RD 0x10
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_L3_MISS_DEMAND_DATA_RD_GE_6 EVENT_OPTION_THRESHOLD=0x6
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_L3_MISS_DEMAND_DATA_RD_GE_6 0x10
+
+
+EVENT_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0xB2 PMC
+UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0x01
+
+EVENT_L2_TRANS 0xF0 PMC
+UMASK_L2_TRANS_L2_WB 0x40
+UMASK_L2_TRANS_ALL_REQUESTS 0x80
+
+EVENT_LONGEST_LAT_CACHE 0x2E PMC
+UMASK_LONGEST_LAT_CACHE_MISS 0x41
+UMASK_LONGEST_LAT_CACHE_REFERENCE 0x4F
+
+
+EVENT_L2_RQSTS 0x24 PMC
+UMASK_L2_RQSTS_DEMAND_DATA_RD_MISS 0x21
+UMASK_L2_RQSTS_DEMAND_DATA_RD_HIT 0x41
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD 0xE1
+UMASK_L2_RQSTS_ALL_RFO 0xE2
+UMASK_L2_RQSTS_ALL_CODE_RD 0xE4
+UMASK_L2_RQSTS_ALL_PF 0xF8
+UMASK_L2_RQSTS_PF_MISS 0x38
+UMASK_L2_RQSTS_PF_HIT 0xD8
+UMASK_L2_RQSTS_RFO_HIT 0x42
+UMASK_L2_RQSTS_RFO_MISS 0x22
+UMASK_L2_RQSTS_CODE_RD_HIT 0x44
+UMASK_L2_RQSTS_CODE_RD_MISS 0x24
+UMASK_L2_RQSTS_ALL_DEMAND_MISS 0x27
+UMASK_L2_RQSTS_ALL_DEMAND_REFERENCES 0xE7
+UMASK_L2_RQSTS_MISS 0x3F
+UMASK_L2_RQSTS_REFERENCES 0xFF
+
+EVENT_IDQ_MS 0x79 PMC
+UMASK_IDQ_MS_UOPS 0x30
+DEFAULT_OPTIONS_IDQ_MS_SWITCHES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=0x1
+UMASK_IDQ_MS_SWITCHES 0x30
+
+EVENT_L2_LINES 0xF1 PMC
+UMASK_L2_LINES_IN_ALL 0x07
+
+EVENT_ARITH_DIVIDER_ACTIVE 0x14 PMC
+UMASK_ARITH_DIVIDER_ACTIVE 0x01
+DEFAULT_OPTIONS_ARITH_DIVIDER_COUNT EVENT_OPTION_EDGE=0x1
+UMASK_ARITH_DIVIDER_COUNT 0x01
+
+EVENT_LSD_CYCLES 0xA8 PMC
+DEFAULT_OPTIONS_LSD_CYCLES_ACTIVE EVENT_OPTION_THRESHOLD=0x1
+UMASK_LSD_CYCLES_ACTIVE 0x01
+DEFAULT_OPTIONS_LSD_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_LSD_CYCLES_4_UOPS 0x01
+
+EVENT_OTHER_ASSISTS_ANY 0xC1 PMC
+UMASK_OTHER_ASSISTS_ANY 0x3F
+
+EVENT_FRONTEND_RETIRED_LATENCY 0xC6 PMC
+UMASK_FRONTEND_RETIRED_LATENCY_GE_8 0x01 0x00 0x400806
+UMASK_FRONTEND_RETIRED_LATENCY_GE_16 0x01 0x00 0x401006
+UMASK_FRONTEND_RETIRED_LATENCY_GE_32 0x01 0x00 0x402006
+UMASK_FRONTEND_RETIRED_LATENCY_GE_64 0x01 0x00 0x404006
+UMASK_FRONTEND_RETIRED_LATENCY_GE_128 0x01 0x00 0x408006
+UMASK_FRONTEND_RETIRED_LATENCY_GE_256 0x01 0x00 0x410006
+UMASK_FRONTEND_RETIRED_LATENCY_GE_512 0x01 0x00 0x420006
+UMASK_FRONTEND_RETIRED_LATENCY_GE_2_BUBBLES_GE_1 0x01 0x00 0x100206
+UMASK_FRONTEND_RETIRED_LATENCY_GE_2_BUBBLES_GE_3 0x01 0x00 0x300206
+
+EVENT_OFFCORE_RESPONSE_0 0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS 0x01 0xFF 0xFF
+
+EVENT_OFFCORE_RESPONSE_1 0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS 0x01 0xFF 0xFF
+
+EVENT_CACHE_LOOKUP 0x34 CBOX
+UMASK_CACHE_LOOKUP_M 0x01
+UMASK_CACHE_LOOKUP_E 0x02
+UMASK_CACHE_LOOKUP_S 0x04
+UMASK_CACHE_LOOKUP_I 0x08
+UMASK_CACHE_LOOKUP_READ_FILTER 0x10
+UMASK_CACHE_LOOKUP_WRITE_FILTER 0x20
+UMASK_CACHE_LOOKUP_EXTSNP_FILTER 0x40
+UMASK_CACHE_LOOKUP_ANY_REQUEST_FILTER 0x80
+UMASK_CACHE_LOOKUP_READ_M 0x11
+UMASK_CACHE_LOOKUP_WRITE_M 0x21
+UMASK_CACHE_LOOKUP_EXTSNP_M 0x41
+UMASK_CACHE_LOOKUP_ANY_M 0x81
+UMASK_CACHE_LOOKUP_READ_E 0x12
+UMASK_CACHE_LOOKUP_WRITE_E 0x22
+UMASK_CACHE_LOOKUP_EXTSNP_E 0x42
+UMASK_CACHE_LOOKUP_ANY_E 0x82
+UMASK_CACHE_LOOKUP_READ_S 0x14
+UMASK_CACHE_LOOKUP_WRITE_S 0x24
+UMASK_CACHE_LOOKUP_EXTSNP_S 0x44
+UMASK_CACHE_LOOKUP_ANY_S 0x84
+UMASK_CACHE_LOOKUP_READ_ES 0x16
+UMASK_CACHE_LOOKUP_WRITE_ES 0x26
+UMASK_CACHE_LOOKUP_EXTSNP_ES 0x46
+UMASK_CACHE_LOOKUP_ANY_ES 0x86
+UMASK_CACHE_LOOKUP_READ_I 0x18
+UMASK_CACHE_LOOKUP_WRITE_I 0x28
+UMASK_CACHE_LOOKUP_EXTSNP_I 0x48
+UMASK_CACHE_LOOKUP_ANY_I 0x88
+UMASK_CACHE_LOOKUP_READ_MESI 0x1F
+UMASK_CACHE_LOOKUP_WRITE_MESI 0x2F
+UMASK_CACHE_LOOKUP_EXTSNP_MESI 0x4F
+UMASK_CACHE_LOOKUP_ANY_MESI 0x8F
+
+EVENT_XSNP_RESPONSE 0x22 CBOX
+UMASK_XSNP_RESPONSE_MISS_EXTERNAL 0x21
+UMASK_XSNP_RESPONSE_MISS_XCORE 0x41
+UMASK_XSNP_RESPONSE_MISS_EVICTION 0x81
+UMASK_XSNP_RESPONSE_HIT_EXTERNAL 0x24
+UMASK_XSNP_RESPONSE_HIT_XCORE 0x44
+UMASK_XSNP_RESPONSE_HIT_EVICTION 0x84
+UMASK_XSNP_RESPONSE_HITM_EXTERNAL 0x28
+UMASK_XSNP_RESPONSE_HITM_XCORE 0x48
+UMASK_XSNP_RESPONSE_HITM_EVICTION 0x88
+
+EVENT_TRK_OCCUPANCY_ALL 0x80 UBOX0
+UMASK_TRK_OCCUPANCY_ALL 0x01
+
+EVENT_TRK_REQUESTS 0x81 UBOX
+UMASK_TRK_REQUESTS_ALL 0x01
+UMASK_TRK_REQUESTS_WRITES 0x20
+
+EVENT_COH_TRK_OCCUPANCY 0x83 UBOX0
+UMASK_COH_TRK_OCCUPANCY 0x01
+
+EVENT_COH_TRK_REQUESTS 0x84 UBOX
+UMASK_COH_TRK_REQUESTS_ALL 0x01
+
+EVENT_UNCORE_CLOCK 0x00 UBOXFIX
+UMASK_UNCORE_CLOCK 0x01
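Each EVENT_ line above pairs an event ID with the counter type(s) it is allowed
to run on, and every UMASK_ line derives a named event together with its unit
mask. For core-local PMC counters these two values end up in the low 16 bits of
the counter's config register, next to the user-mode and enable bits, as the
perfmon setup code further down in this patch shows. A minimal sketch of that
encoding (the helper and the printed example are illustrative, not taken from
the sources):

    #include <stdint.h>
    #include <stdio.h>

    /* Pack an EVENT_/UMASK_ pair into a core PMC config value:
     * event ID in bits [7:0], umask in bits [15:8],
     * user mode (bit 16), enable (bit 22). */
    static uint64_t encode_core_event(uint8_t eventId, uint8_t umask)
    {
        uint64_t flags = (1ULL << 22) | (1ULL << 16);
        flags |= ((uint64_t)umask << 8) | eventId;
        return flags;
    }

    int main(void)
    {
        /* EVENT_BR_INST_RETIRED 0xC4 with UMASK_BR_INST_RETIRED_NEAR_TAKEN 0x20 */
        printf("0x%llx\n", (unsigned long long)encode_core_event(0xC4, 0x20));
        return 0;
    }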
diff --git a/src/includes/perfmon_types.h b/src/includes/perfmon_types.h
index 1f0663a..c93874e 100644
--- a/src/includes/perfmon_types.h
+++ b/src/includes/perfmon_types.h
@@ -7,13 +7,14 @@
* Configures and reads out performance counters
* on x86 based architectures. Supports multi threading.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -34,145 +35,228 @@
#define PERFMON_TYPES_H
#include <bstrlib.h>
-#include <perfmon_group_types.h>
+#include <timer.h>
+#include <inttypes.h>
+#include <perfgroup.h>
+
+#define MAX_EVENT_OPTIONS NUM_EVENT_OPTIONS
/* ##### EXPORTED TYPE DEFINITIONS #################################### */
+/** \addtogroup PerfMon
+ * @{
+ */
+/////////////////////////////////////////////
+
+/*! \brief Enum of possible event and counter options
+
+List of internally used IDs for all event and counter options that are supported
+by LIKWID.
+\extends PerfmonEventOption
+*/
typedef enum {
- PMC0 = 0,
- PMC1, PMC2, PMC3, PMC4, PMC5, PMC6,
- PMC7, PMC8, PMC9, PMC10, PMC11, PMC12,
- PMC13, PMC14, PMC15, PMC16, PMC17, PMC18,
- PMC19, PMC20, PMC21, PMC22, PMC23, PMC24,
- PMC25, PMC26, PMC27, PMC28, PMC29, PMC30,
- PMC31, PMC32, PMC33, PMC34, PMC35, PMC36,
- PMC37, PMC38, PMC39, PMC40, PMC41, PMC42,
- PMC43, PMC44, PMC45, PMC46, PMC47, PMC48,
- PMC49, PMC50, PMC51, PMC52, PMC53, PMC54,
- PMC55, PMC56, PMC57, PMC58, PMC59, PMC60,
- PMC61, PMC62, PMC63, PMC64, PMC65, PMC66,
- PMC67, PMC68, PMC69, PMC70, PMC71, PMC72,
- PMC73, PMC74, PMC75, PMC76, PMC77, PMC78,
- PMC79, PMC80, PMC81, PMC82, PMC83, PMC84,
- PMC85, PMC86, PMC87, PMC88, PMC89, PMC90,
- PMC91, PMC92, PMC93, PMC94, PMC95, PMC96,
- PMC97, PMC98, PMC99, PMC100, PMC101, PMC102,
- PMC103, PMC104, PMC105, PMC106, PMC107, PMC108,
- NUM_PMC} PerfmonCounterIndex;
+ EVENT_OPTION_NONE = 0, /*!< \brief No option, used as False value */
+ EVENT_OPTION_OPCODE, /*!< \brief Match opcode */
+ EVENT_OPTION_MATCH0, /*!< \brief Match0 register */
+ EVENT_OPTION_MATCH1, /*!< \brief Match1 register */
+ EVENT_OPTION_MATCH2, /*!< \brief Match2 register */
+ EVENT_OPTION_MATCH3, /*!< \brief Match3 register */
+ EVENT_OPTION_MASK0, /*!< \brief Mask0 register */
+ EVENT_OPTION_MASK1, /*!< \brief Mask1 register */
+ EVENT_OPTION_MASK2, /*!< \brief Mask2 register */
+ EVENT_OPTION_MASK3, /*!< \brief Mask3 register */
+ EVENT_OPTION_NID, /*!< \brief Set NUMA node ID */
+ EVENT_OPTION_TID, /*!< \brief Set Thread ID */
+ EVENT_OPTION_STATE, /*!< \brief Match for state */
+ EVENT_OPTION_EDGE, /*!< \brief Increment counter at each edge */
+ EVENT_OPTION_THRESHOLD, /*!< \brief Increment only if exceeding threshold */
+ EVENT_OPTION_INVERT, /*!< \brief Invert behavior of EVENT_OPTION_THRESHOLD, hence increment only below threshold */
+ EVENT_OPTION_COUNT_KERNEL, /*!< \brief Also count events when in kernel space */
+ EVENT_OPTION_ANYTHREAD, /*!< \brief Increment counter at events of all HW threads in the core */
+ EVENT_OPTION_OCCUPANCY, /*!< \brief Count occupancy not occurrences */
+ EVENT_OPTION_OCCUPANCY_FILTER, /*!< \brief Filter for occupancy counting */
+ EVENT_OPTION_OCCUPANCY_EDGE, /*!< \brief Increment occupancy counter at detection of an edge */
+ EVENT_OPTION_OCCUPANCY_INVERT, /*!< \brief Invert filter for occupancy counting */
+ EVENT_OPTION_IN_TRANS, /*!< \brief Count events during transactions */
+ EVENT_OPTION_IN_TRANS_ABORT, /*!< \brief Count events that aborted during transactions */
+ NUM_EVENT_OPTIONS /*!< \brief Amount of defined options */
+} EventOptionType;
+
+/*! \brief Enum of possible states of an event group
+List of states for event groups
+*/
typedef enum {
- PMC = 0,
- FIXED,
- THERMAL,
- UNCORE,
- MBOX0,
- MBOX1,
- MBOX2,
- MBOX3,
- MBOXFIX,
- BBOX0,
- BBOX1,
- RBOX0,
- RBOX1,
- WBOX,
- SBOX0,
- SBOX1,
- SBOX2,
- CBOX0,
- CBOX1,
- CBOX2,
- CBOX3,
- CBOX4,
- CBOX5,
- CBOX6,
- CBOX7,
- CBOX8,
- CBOX9,
- CBOX10,
- CBOX11,
- CBOX12,
- CBOX13,
- CBOX14,
- PBOX,
- POWER,
- UBOX,
- NUM_UNITS} PerfmonType;
+ STATE_NONE = 0, /*!< \brief Not configured, not started and not stopped */
+ STATE_SETUP, /*!< \brief The event set held by the group is configured */
+ STATE_START, /*!< \brief The event set held by the group is currently running */
+} GroupState;
-typedef struct {
- char* key;
- PerfmonCounterIndex index;
- PerfmonType type;
- uint64_t configRegister;
- uint64_t counterRegister;
- uint64_t counterRegister2;
- PciDeviceIndex device;
-} PerfmonCounterMap;
+/*! \brief List of option names
+List of strings for all event and counter options used for matching and output
+*/
+extern char* eventOptionTypeName[NUM_EVENT_OPTIONS];
+
+/** \brief Bitmask with no event/counter option set */
+#define EVENT_OPTION_NONE_MASK 0x0ULL
+/** \brief Define for easily creating a bitmask of all configured event/counter options */
+#define OPTIONS_TYPE_MASK(type) \
+ (((type == EVENT_OPTION_NONE)||(type >= NUM_EVENT_OPTIONS)) ? \
+ EVENT_OPTION_NONE_MASK : \
+ (1ULL<<type))
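+/* Illustrative example: an event configured with an EDGE and a THRESHOLD option
+carries optionMask = OPTIONS_TYPE_MASK(EVENT_OPTION_EDGE) |
+OPTIONS_TYPE_MASK(EVENT_OPTION_THRESHOLD), so a test like
+(optionMask & EVENT_OPTION_EDGE_MASK) avoids walking the options list. */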
+
+
+/** @cond */
+#define EVENT_OPTION_OPCODE_MASK (1ULL<<EVENT_OPTION_OPCODE)
+#define EVENT_OPTION_MATCH0_MASK (1ULL<<EVENT_OPTION_MATCH0)
+#define EVENT_OPTION_MATCH1_MASK (1ULL<<EVENT_OPTION_MATCH1)
+#define EVENT_OPTION_MATCH2_MASK (1ULL<<EVENT_OPTION_MATCH2)
+#define EVENT_OPTION_MATCH3_MASK (1ULL<<EVENT_OPTION_MATCH3)
+#define EVENT_OPTION_MASK0_MASK (1ULL<<EVENT_OPTION_MASK0)
+#define EVENT_OPTION_MASK1_MASK (1ULL<<EVENT_OPTION_MASK1)
+#define EVENT_OPTION_MASK2_MASK (1ULL<<EVENT_OPTION_MASK2)
+#define EVENT_OPTION_MASK3_MASK (1ULL<<EVENT_OPTION_MASK3)
+#define EVENT_OPTION_NID_MASK (1ULL<<EVENT_OPTION_NID)
+#define EVENT_OPTION_TID_MASK (1ULL<<EVENT_OPTION_TID)
+#define EVENT_OPTION_STATE_MASK (1ULL<<EVENT_OPTION_STATE)
+#define EVENT_OPTION_EDGE_MASK (1ULL<<EVENT_OPTION_EDGE)
+#define EVENT_OPTION_THRESHOLD_MASK (1ULL<<EVENT_OPTION_THRESHOLD)
+#define EVENT_OPTION_INVERT_MASK (1ULL<<EVENT_OPTION_INVERT)
+#define EVENT_OPTION_COUNT_KERNEL_MASK (1ULL<<EVENT_OPTION_COUNT_KERNEL)
+#define EVENT_OPTION_ANYTHREAD_MASK (1ULL<<EVENT_OPTION_ANYTHREAD)
+#define EVENT_OPTION_OCCUPANCY_MASK (1ULL<<EVENT_OPTION_OCCUPANCY)
+#define EVENT_OPTION_OCCUPANCY_FILTER_MASK (1ULL<<EVENT_OPTION_OCCUPANCY_FILTER)
+#define EVENT_OPTION_OCCUPANCY_EDGE_MASK (1ULL<<EVENT_OPTION_OCCUPANCY_EDGE)
+#define EVENT_OPTION_OCCUPANCY_INVERT_MASK (1ULL<<EVENT_OPTION_OCCUPANCY_INVERT)
+#define EVENT_OPTION_IN_TRANS_MASK (1ULL<<EVENT_OPTION_IN_TRANS)
+#define EVENT_OPTION_IN_TRANS_ABORT_MASK (1ULL<<EVENT_OPTION_IN_TRANS_ABORT)
+/** @endcond */
+
+/*! \brief Structure specifying thread to CPU relation
+
+Threads are always numbered incrementally. This structure is used in order to
+resolve the real HW thread ID.
+\extends PerfmonGroupSet
+*/
typedef struct {
- const char* key;
- PerfmonGroup index;
- int isUncore;
- const char* info;
- const char* config;
- int derivedCounters;
- const char ** derivedCounterNames;
-} PerfmonGroupMap;
+ int thread_id; /*!< \brief Thread ID how it is used internally */
+ int processorId; /*!< \brief Real HW thread ID */
+} PerfmonThread;
+/*! \brief Structure specifying event/counter options and their value
+
+Most options set a bitfield in registers and their values are stored in this structure.
+If an option is a binary option, the value is set to 1.
+\extends PerfmonEvent
+*/
typedef struct {
- char* key;
- char* msg;
-} PerfmonGroupHelp;
+ EventOptionType type; /*!< \brief Type of the option */
+ uint64_t value; /*!< \brief Value of the option */
+} PerfmonEventOption;
+
+/*! \brief Structure specifying a performance monitoring event
-/* only used in westmereEX at the moment */
+This structure holds the configuration data for an event. It groups the name,
+the allowed counters and internally used values like event ID and masks. Moreover,
+the event options are held here.
+\extends PerfmonEventSetEntry
+*/
typedef struct {
- uint32_t ctrlRegister;
- uint32_t statusRegister;
- uint32_t ovflRegister;
-} PerfmonUnit;
+ const char* name; /*!< \brief Name of the event */
+ const char* limit; /*!< \brief Valid counters for the event */
+ uint16_t eventId; /*!< \brief ID of the event */
+ uint8_t umask; /*!< \brief Most events need to specify a mask to limit counting */
+ uint8_t cfgBits; /*!< \brief Misc configuration bits */
+ uint64_t cmask; /*!< \brief Misc mask bits */
+ uint8_t numberOfOptions; /*!< \brief Number of options for the event */
+ uint64_t optionMask; /*!< \brief Bitmask for fast check of set options */
+ PerfmonEventOption options[NUM_EVENT_OPTIONS]; /*!< \brief List of options */
+} PerfmonEvent;
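+/* Illustrative example: the lines "EVENT_L2_RQSTS 0x24 PMC" and
+"UMASK_L2_RQSTS_MISS 0x3F" in perfmon_skylake_events.txt become a PerfmonEvent
+with name "L2_RQSTS_MISS", limit "PMC", eventId 0x24 and umask 0x3F; any
+DEFAULT_OPTIONS_ entries are stored in options[] and flagged in optionMask. */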
+/*! \brief Structure describing performance monitoring counter data
+
+Each event holds one of these structures for each thread to store whether the
+counter is configured, the intermediate counter data and the number of overflows that occurred.
+\extends PerfmonEventSetEntry
+*/
typedef struct {
- int init;
- int id; /* TODO id is only used for EX type processors */
- double counterData;
+ int init; /*!< \brief Flag if corresponding control register is set up properly */
+ int id; /*!< \brief Offset in higher level control register, e.g. position of enable bit */
+ int overflows; /*!< \brief Amount of overflows */
+ uint64_t startData; /*!< \brief Start data from the counter */
+ uint64_t counterData; /*!< \brief Intermediate data from the counters */
+ double lastResult; /*!< \brief Last measurement result*/
+ double fullResult; /*!< \brief Aggregated measurement result */
} PerfmonCounter;
-typedef struct {
- int processorId;
- PerfmonCounter counters[NUM_PMC];
-} PerfmonThread;
-typedef struct {
- const char* name;
- const char* limit;
- uint16_t eventId;
- uint8_t umask;
- uint8_t cfgBits;
- uint8_t cmask;
-} PerfmonEvent;
+/*! \brief Structure specifying an entry of a performance monitoring event set
+An eventSet entry consists of an event, the counter it runs on and the counter values read for each thread.
+\extends PerfmonEventSet
+*/
typedef struct {
- PerfmonEvent event;
- PerfmonCounterIndex index;
- double* result;
+ PerfmonEvent event; /*!< \brief Event configuration */
+ RegisterIndex index; /*!< \brief Index of the counter register in the counter map */
+ RegisterType type; /*!< \brief Type of the counter register and event */
+ PerfmonCounter* threadCounter; /*!< \brief List of counter data for each thread, list length is \a numberOfThreads in PerfmonGroupSet */
} PerfmonEventSetEntry;
+/*! \brief Structure specifying a performance monitoring event group
+
+A PerfmonEventSet holds a set of event and counter combinations together with some global information about all of its entries.
+\extends PerfmonGroupSet
+*/
typedef struct {
- int numberOfEvents;
- PerfmonEventSetEntry* events;
+ int numberOfEvents; /*!< \brief Number of events in \a events */
+ PerfmonEventSetEntry* events; /*!< \brief List of eventSets */
+ TimerData timer; /*!< \brief Time information how long the counters were running */
+ double rdtscTime; /*!< \brief Evaluation of the Time information in seconds */
+ double runTime; /*!< \brief Sum of all time information in seconds that the group was running */
+#ifdef __x86_64
+ __uint128_t regTypeMask; /*!< \brief Bitmask for easy checks which types are included in the eventSet */
+#else
+ uint64_t regTypeMask; /*!< \brief Bitmask for easy checks which types are included in the eventSet */
+#endif
+ GroupState state; /*!< \brief Current state of the event group (configured, started, none) */
+ GroupInfo group; /*!< \brief Structure holding the performance group information */
} PerfmonEventSet;
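+/* Illustrative example: an event set containing one PMC event and one CBOX
+event would have the PMC and CBOX bits set in regTypeMask, so code handling
+the group can quickly skip register types that are not used. */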
+/*! \brief Structure specifying all performance monitoring event groups
+The global PerfmonGroupSet structure holds all eventSets and threads that are
+configured to measure. Only one eventSet can be measured at a time, but the groups
+can be switched to provide a simple form of multiplexing.
+*/
typedef struct {
- bstring label;
- double* value;
-} PerfmonResult;
+ int numberOfGroups; /*!< \brief List length of \a groups */
+ int numberOfActiveGroups; /*!< \brief Amount of added eventSets. Only those eventSets can be accessed in \a groups. */
+ int activeGroup; /*!< \brief Currently active eventSet */
+ PerfmonEventSet* groups; /*!< \brief List of eventSets */
+ int numberOfThreads; /*!< \brief Amount of threads in \a threads */
+ PerfmonThread* threads; /*!< \brief List of threads */
+} PerfmonGroupSet;
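+/* Sketch of the intended use (function names as exported in likwid.h):
+perfmon_addEventSet() appends a new PerfmonEventSet to groups, while
+perfmon_setupCounters(), perfmon_startCounters() and perfmon_stopCounters()
+act on the eventSet selected by activeGroup; calling perfmon_switchActiveGroup()
+between measurements yields the multiplexing mentioned above. */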
-typedef struct {
- bstrList* header;
- int numRows;
- int numColumns;
- PerfmonResult* rows;
-} PerfmonResultTable;
+/** \brief List of counters with name, config register, counter registers and,
+if needed, PCI device */
+extern RegisterMap* counter_map;
+/** \brief List of boxes with name, config register, counter registers and,
+if needed, PCI device. Mainly used for Uncore handling, but core-local counters
+are also defined as boxes. */
+extern BoxMap* box_map;
+/** \brief List of events available for the current architecture */
+extern PerfmonEvent* eventHash;
+/** \brief List of PCI devices available for the current architecture */
+extern PciDevice* pci_devices;
+/** @}*/
+
+/* perfmon datatypes */
+extern PerfmonGroupSet *groupSet;
+extern int perfmon_numCounters;
+extern int perfmon_numCoreCounters;
+extern int perfmon_numUncoreCounters;
+extern int perfmon_numArchEvents;
#endif /*PERFMON_TYPES_H*/
diff --git a/src/includes/perfmon_westmere.h b/src/includes/perfmon_westmere.h
index c469766..056a2a7 100644
--- a/src/includes/perfmon_westmere.h
+++ b/src/includes/perfmon_westmere.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_westmere.h
*
- * Description: Header File of perfmon module for Westmere.
+ * Description: Header File of perfmon module for Intel Westmere.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -29,8 +30,6 @@
*/
#include <perfmon_westmere_events.h>
-#include <perfmon_westmere_groups.h>
-static int perfmon_numGroupsWestmere = NUM_GROUPS_WESTMERE;
static int perfmon_numArchEventsWestmere = NUM_ARCH_EVENTS_WESTMERE;
diff --git a/src/includes/perfmon_westmereEX.h b/src/includes/perfmon_westmereEX.h
index 8cbc921..a0c52ac 100644
--- a/src/includes/perfmon_westmereEX.h
+++ b/src/includes/perfmon_westmereEX.h
@@ -3,15 +3,16 @@
*
* Filename: perfmon_westmereEX.h
*
- * Description: Header File of perfmon module for Westmere EX.
+ * Description: Header File of perfmon module for Intel Westmere EX.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -29,736 +30,982 @@
*/
#include <perfmon_westmereEX_events.h>
-#include <perfmon_westmereEX_groups.h>
#include <perfmon_westmereEX_counters.h>
+#include <perfmon_nehalemEX_westmereEX_common.h>
+#include <error.h>
+#include <affinity.h>
static int perfmon_numCountersWestmereEX = NUM_COUNTERS_WESTMEREEX;
-static int perfmon_numGroupsWestmereEX = NUM_GROUPS_WESTMEREEX;
static int perfmon_numArchEventsWestmereEX = NUM_ARCH_EVENTS_WESTMEREEX;
-static PerfmonUnit westmereEX_PMunits[NUM_UNITS];
/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-void perfmon_init_westmereEX(PerfmonThread *thread)
+int perfmon_init_westmereEX(int cpu_id)
{
+ lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+ lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+ return 0;
+}
+
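+/* In MSR_PERF_FIXED_CTR_CTRL each fixed counter i owns a 4-bit field: bit 4*i
+enables kernel-mode counting, bit 4*i+1 user-mode counting and bit 4*i+2 the
+AnyThread mode. The shifts below build that field for the counter selected by
+index. */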
+uint32_t wex_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint32_t flags = (1ULL<<(1+(index*4)));
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<(index*4));
+ break;
+ case EVENT_OPTION_ANYTHREAD:
+ flags |= (1ULL<<(2+(index*4)));
+ default:
+ break;
+ }
+ }
+ return flags;
+}
+
+int wex_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
uint64_t flags = 0x0ULL;
- int cpu_id = thread->processorId;
-
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
- msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
- msr_write(cpu_id, MSR_PMC0, 0x0ULL);
- msr_write(cpu_id, MSR_PMC1, 0x0ULL);
- msr_write(cpu_id, MSR_PMC2, 0x0ULL);
- msr_write(cpu_id, MSR_PMC3, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
-
- /* initialize fixed counters
- * FIXED 0: Instructions retired
- * FIXED 1: Clocks unhalted core
- * FIXED 2: Clocks unhalted ref */
- //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
- /* Preinit of PERFEVSEL registers */
- //flags |= (1<<22); /* enable flag */
- //flags |= (1<<16); /* user mode flag */
-
- /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
- msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
-
- /* Initialize uncore */
- /* MBOX */
- thread->counters[PMC7].id = 0;
- thread->counters[PMC8].id = 1;
- thread->counters[PMC9].id = 2;
- thread->counters[PMC10].id = 3;
- thread->counters[PMC11].id = 4;
- thread->counters[PMC12].id = 5;
- westmereEX_PMunits[MBOX0].ctrlRegister = MSR_M0_PMON_BOX_CTRL;
- westmereEX_PMunits[MBOX0].statusRegister = MSR_M0_PMON_BOX_STATUS;
- westmereEX_PMunits[MBOX0].ovflRegister = MSR_M0_PMON_BOX_OVF_CTRL;
-
- thread->counters[PMC13].id = 0;
- thread->counters[PMC14].id = 1;
- thread->counters[PMC15].id = 2;
- thread->counters[PMC16].id = 3;
- thread->counters[PMC17].id = 4;
- thread->counters[PMC18].id = 5;
- westmereEX_PMunits[MBOX1].ctrlRegister = MSR_M1_PMON_BOX_CTRL;
- westmereEX_PMunits[MBOX1].statusRegister = MSR_M1_PMON_BOX_STATUS;
- westmereEX_PMunits[MBOX1].ovflRegister = MSR_M1_PMON_BOX_OVF_CTRL;
-
- /* BBOX */
- thread->counters[PMC19].id = 0;
- thread->counters[PMC20].id = 1;
- thread->counters[PMC21].id = 2;
- thread->counters[PMC22].id = 3;
- westmereEX_PMunits[BBOX0].ctrlRegister = MSR_B0_PMON_BOX_CTRL;
- westmereEX_PMunits[BBOX0].statusRegister = MSR_B0_PMON_BOX_STATUS;
- westmereEX_PMunits[BBOX0].ovflRegister = MSR_B0_PMON_BOX_OVF_CTRL;
-
- thread->counters[PMC23].id = 0;
- thread->counters[PMC24].id = 1;
- thread->counters[PMC25].id = 2;
- thread->counters[PMC26].id = 3;
- westmereEX_PMunits[BBOX1].ctrlRegister = MSR_B1_PMON_BOX_CTRL;
- westmereEX_PMunits[BBOX1].statusRegister = MSR_B1_PMON_BOX_STATUS;
- westmereEX_PMunits[BBOX1].ovflRegister = MSR_B1_PMON_BOX_OVF_CTRL;
-
- /* RBOX */
- thread->counters[PMC27].id = 0;
- thread->counters[PMC28].id = 1;
- thread->counters[PMC29].id = 2;
- thread->counters[PMC30].id = 3;
- thread->counters[PMC31].id = 4;
- thread->counters[PMC32].id = 5;
- thread->counters[PMC33].id = 6;
- thread->counters[PMC34].id = 7;
- westmereEX_PMunits[RBOX0].ctrlRegister = MSR_R0_PMON_BOX_CTRL;
- westmereEX_PMunits[RBOX0].statusRegister = MSR_R0_PMON_BOX_STATUS;
- westmereEX_PMunits[RBOX0].ovflRegister = MSR_R0_PMON_BOX_OVF_CTRL;
-
- thread->counters[PMC35].id = 0;
- thread->counters[PMC36].id = 1;
- thread->counters[PMC37].id = 2;
- thread->counters[PMC38].id = 3;
- thread->counters[PMC39].id = 4;
- thread->counters[PMC40].id = 5;
- thread->counters[PMC41].id = 6;
- thread->counters[PMC42].id = 7;
- westmereEX_PMunits[RBOX1].ctrlRegister = MSR_R1_PMON_BOX_CTRL;
- westmereEX_PMunits[RBOX1].statusRegister = MSR_R1_PMON_BOX_STATUS;
- westmereEX_PMunits[RBOX1].ovflRegister = MSR_R1_PMON_BOX_OVF_CTRL;
-
- /* WBOX */
- thread->counters[PMC43].id = 0;
- thread->counters[PMC44].id = 1;
- thread->counters[PMC45].id = 2;
- thread->counters[PMC46].id = 3;
- thread->counters[PMC47].id = 31;
- westmereEX_PMunits[WBOX].ctrlRegister = MSR_W_PMON_BOX_CTRL;
- westmereEX_PMunits[WBOX].statusRegister = MSR_W_PMON_BOX_STATUS;
- westmereEX_PMunits[WBOX].ovflRegister = MSR_W_PMON_BOX_OVF_CTRL;
-
- thread->counters[PMC48].id = 0;
- westmereEX_PMunits[UBOX].ctrlRegister = MSR_U_PMON_GLOBAL_CTRL;
- westmereEX_PMunits[UBOX].statusRegister = MSR_U_PMON_GLOBAL_STATUS;
- westmereEX_PMunits[UBOX].ovflRegister = MSR_U_PMON_GLOBAL_OVF_CTRL;
-
- /* Set IDs for all CBOXes */
- int walker = 0;
- for (int i=PMC49; i<=PMC98; i++)
- {
- thread->counters[i].id = walker;
- walker = (walker == 4 ? 0 : walker + 1);
- }
- westmereEX_PMunits[CBOX0].ctrlRegister = MSR_C0_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX0].statusRegister = MSR_C0_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX0].ovflRegister = MSR_C0_PMON_BOX_OVF_CTRL;
- westmereEX_PMunits[CBOX1].ctrlRegister = MSR_C1_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX1].statusRegister = MSR_C1_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX1].ovflRegister = MSR_C1_PMON_BOX_OVF_CTRL;
- westmereEX_PMunits[CBOX2].ctrlRegister = MSR_C2_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX2].statusRegister = MSR_C2_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX2].ovflRegister = MSR_C2_PMON_BOX_OVF_CTRL;
- westmereEX_PMunits[CBOX3].ctrlRegister = MSR_C3_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX3].statusRegister = MSR_C3_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX3].ovflRegister = MSR_C3_PMON_BOX_OVF_CTRL;
- westmereEX_PMunits[CBOX4].ctrlRegister = MSR_C4_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX4].statusRegister = MSR_C4_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX4].ovflRegister = MSR_C4_PMON_BOX_OVF_CTRL;
- westmereEX_PMunits[CBOX5].ctrlRegister = MSR_C5_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX5].statusRegister = MSR_C5_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX5].ovflRegister = MSR_C5_PMON_BOX_OVF_CTRL;
- westmereEX_PMunits[CBOX6].ctrlRegister = MSR_C6_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX6].statusRegister = MSR_C6_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX6].ovflRegister = MSR_C6_PMON_BOX_OVF_CTRL;
- westmereEX_PMunits[CBOX7].ctrlRegister = MSR_C7_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX7].statusRegister = MSR_C7_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX7].ovflRegister = MSR_C7_PMON_BOX_OVF_CTRL;
- westmereEX_PMunits[CBOX8].ctrlRegister = MSR_C8_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX8].statusRegister = MSR_C8_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX8].ovflRegister = MSR_C8_PMON_BOX_OVF_CTRL;
- westmereEX_PMunits[CBOX9].ctrlRegister = MSR_C9_PMON_BOX_CTRL;
- westmereEX_PMunits[CBOX9].statusRegister = MSR_C9_PMON_BOX_STATUS;
- westmereEX_PMunits[CBOX9].ovflRegister = MSR_C9_PMON_BOX_OVF_CTRL;
-
- thread->counters[PMC99].id = 0;
- thread->counters[PMC100].id = 1;
- thread->counters[PMC101].id = 2;
- thread->counters[PMC102].id = 3;
- westmereEX_PMunits[SBOX0].ctrlRegister = MSR_S0_PMON_BOX_CTRL;
- westmereEX_PMunits[SBOX0].statusRegister = MSR_S0_PMON_BOX_STATUS;
- westmereEX_PMunits[SBOX0].ovflRegister = MSR_S0_PMON_BOX_OVF_CTRL;
- thread->counters[PMC103].id = 0;
- thread->counters[PMC104].id = 1;
- thread->counters[PMC105].id = 2;
- thread->counters[PMC106].id = 3;
- westmereEX_PMunits[SBOX1].ctrlRegister = MSR_S1_PMON_BOX_CTRL;
- westmereEX_PMunits[SBOX1].statusRegister = MSR_S1_PMON_BOX_STATUS;
- westmereEX_PMunits[SBOX1].ovflRegister = MSR_S1_PMON_BOX_OVF_CTRL;
-
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
- lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
- {
- msr_write(cpu_id, MSR_W_PMON_BOX_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_W_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_W_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_W_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_W_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_W_PMON_FIXED_CTR, 0x0ULL);
-
- msr_write(cpu_id, MSR_M0_PMON_BOX_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL4, 0x0ULL);
- msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL5, 0x0ULL);
-
- msr_write(cpu_id, MSR_M1_PMON_BOX_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL4, 0x0ULL);
- msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL5, 0x0ULL);
-
- msr_write(cpu_id, MSR_B0_PMON_BOX_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL3, 0x0ULL);
-
- msr_write(cpu_id, MSR_B1_PMON_BOX_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL3, 0x0ULL);
-
- msr_write(cpu_id, MSR_R0_PMON_BOX_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL4, 0x0ULL);
- msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL5, 0x0ULL);
- msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL6, 0x0ULL);
- msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL7, 0x0ULL);
-
- msr_write(cpu_id, MSR_R1_PMON_BOX_CTRL, 0x0ULL);
- msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL8, 0x0ULL);
- msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL9, 0x0ULL);
- msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL10, 0x0ULL);
- msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL11, 0x0ULL);
- msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL12, 0x0ULL);
- msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL13, 0x0ULL);
- msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL14, 0x0ULL);
- msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL15, 0x0ULL);
-
- msr_write(cpu_id, MSR_U_PMON_GLOBAL_EVNT_SEL, 0x0ULL);
-
- msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL3, 0x0ULL);
- msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL4, 0x0ULL);
-
- msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL3, 0x0ULL);
-
- msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL0, 0x0ULL);
- msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL1, 0x0ULL);
- msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL2, 0x0ULL);
- msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL3, 0x0ULL);
+ uint64_t offcore_flags = 0x0ULL;
+ flags = (1ULL<<22)|(1ULL<<16);
+ /* Intel with standard 8 bit event mask: [7:0] */
+ flags |= (event->umask<<8) + event->eventId;
+
+ /* set custom cfg and cmask */
+ if ((event->cfgBits != 0) &&
+ (event->eventId != 0xB7) &&
+ (event->eventId != 0xBB))
+ {
+ flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+ }
+
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_COUNT_KERNEL:
+ flags |= (1ULL<<17);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= (event->options[j].value & 0xFFULL)<<24;
+ break;
+ case EVENT_OPTION_MATCH0:
+ offcore_flags |= (event->options[j].value & 0xFFULL);
+ break;
+ case EVENT_OPTION_MATCH1:
+ offcore_flags |= (event->options[j].value & 0xF7ULL)<<8;
+ break;
+ default:
+ break;
+ }
+ }
+ }
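+ /* Offcore-response events (event IDs 0xB7 and 0xBB) additionally program a
+ match value into MSR_OFFCORE_RESP0/1, taken either from the cfgBits/cmask
+ fields of the event definition or from the MATCH0/MATCH1 options handled
+ above. */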
+ if (event->eventId == 0xB7)
+ {
+ if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
{
- uint32_t ubflags = 0x0UL;
- ubflags |= (1<<29); /* reset all */
- msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
+ offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
}
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
}
+ else if (event->eventId == 0xBB)
+ {
+ if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+ {
+ offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+ }
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister , flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
}
-#define MBOX_GATE(NUM) \
- flags = 0x41ULL; \
-switch (event->cfgBits) \
-{ \
- case 0x00: /* primary Event */ \
- flags |= (event->eventId<<9); \
- break; \
- case 0x01: /* secondary Events */ \
- /* TODO fvid index is missing defaults to 0 */ \
- flags |= (1<<7); /* toggle flag mode */ \
- flags |= (event->eventId<<19); \
- switch (event->eventId) \
- { \
- case 0x00: /* CYCLES_DSP_FILL: DSP */ \
- { \
- uint64_t dsp_flags = 0x0ULL; \
- dsp_flags |= (event->umask<<7); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags); \
- } \
- break; \
- case 0x01: /* CYCLES_SCHED_MODE: ISS */ \
- { \
- uint32_t iss_flags = 0x0UL; \
- iss_flags |= (event->umask<<4); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags); \
- } \
- break; \
- case 0x05: /* CYCLES_PGT_STATE: PGT */ \
- { \
- uint32_t pgt_flags = 0x0UL; \
- pgt_flags |= (event->umask<<6); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags); \
- } \
- break; \
- case 0x06: /* BCMD_SCHEDQ_OCCUPANCY: MAP */ \
- { \
- uint32_t map_flags = 0x0UL; \
- map_flags |= (event->umask<<6); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_MAP, map_flags); \
- } \
- break; \
- } \
- break; \
- case 0x02: /* DRAM_CMD: PLD/ISS */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t pld_flags = 0x0UL; \
- uint32_t iss_flags = 0x0UL; \
- pld_flags |= (event->umask<<8); \
- if (event->cmask != 0) \
- { \
- iss_flags |= (event->cmask<<7); \
- pld_flags |= 1; /* toggle cmd flag */ \
- } \
- msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags); \
- } \
- break; \
- case 0x03: /* DSP_FILL: DSP */ \
- flags |= (event->eventId<<9); \
- { \
- uint64_t dsp_flags = 0x0ULL; \
- dsp_flags |= (event->umask<<7); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags); \
- } \
- break; \
- case 0x04: /* DRAM_MISC: PLD */ \
- flags |= (event->eventId<<9); \
- { \
- uint64_t pld_flags = 0x0ULL; \
- switch (event->cmask) \
- { \
- case 0x0: \
- pld_flags |= (1<<16); \
- pld_flags |= (event->umask<<19); \
- break; \
- case 0x1: \
- pld_flags |= (event->umask<<18); \
- break; \
- case 0x2: \
- pld_flags |= (event->umask<<17); \
- break; \
- case 0x3: \
- pld_flags |= (event->umask<<7); \
- break; \
- } \
- msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags); \
- } \
- break; \
- case 0x05: /* FRM_TYPE: ISS */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t iss_flags = 0x0UL; \
- iss_flags |= event->umask; \
- msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags); \
- } \
- break; \
- case 0x06: /* FVC_EV0: FVC */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t fvc_flags = 0x0UL; \
- fvc_flags |= (event->umask<<12); \
- if (event->umask == 0x5) \
- { \
- fvc_flags |= (event->cmask<<6); \
- } \
- else \
- { \
- fvc_flags |= (event->cmask<<9); \
- } \
- msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags); \
- VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV0) \
- } \
- break; \
- case 0x07: /* FVC_EV1: FVC */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t fvc_flags = 0x0UL; \
- fvc_flags |= (event->umask<<15); \
- if (event->umask == 0x5) \
- { \
- fvc_flags |= (event->cmask<<6); \
- } \
- else \
- { \
- fvc_flags |= (event->cmask<<9); \
- } \
- msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags); \
- VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV1) \
- } \
- break; \
- case 0x08: /* FVC_EV2: FVC */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t fvc_flags = 0x0UL; \
- fvc_flags |= (event->umask<<18); \
- if (event->umask == 0x5) \
- { \
- fvc_flags |= (event->cmask<<6); \
- } \
- else \
- { \
- fvc_flags |= (event->cmask<<9); \
- } \
- msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags); \
- VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV2) \
- } \
- break; \
- case 0x09: /* FVC_EV3: FVC(ZDP) */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t fvc_flags = 0x0UL; \
- fvc_flags |= (event->umask<<21); \
- if (event->umask == 0x5) \
- { \
- fvc_flags |= (event->cmask<<6); \
- } \
- else \
- { \
- fvc_flags |= (event->cmask<<9); \
- } \
- msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags); \
- } \
- break; \
- case 0x0A: /* ISS_SCHED: ISS */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t iss_flags = 0x0UL; \
- iss_flags |= (event->umask<<10); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags); \
- } \
- break; \
- case 0x0B: /* PGT_PAGE_EV: PGT */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t pgt_flags = 0x0UL; \
- pgt_flags |= event->umask; \
- msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags); \
- } \
- break; \
- case 0x0C: /* PGT_PAGE_EV2: PGT */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t pgt_flags = 0x0UL; \
- pgt_flags |= (event->umask<<11); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags); \
- } \
- break; \
- case 0x0D: /* THERM_TRP_DN: THR */ \
- flags |= (event->eventId<<9); \
- { \
- uint32_t thr_flags = 0x0UL; \
- thr_flags |= (1<<3); \
- thr_flags |= (event->umask<<9); \
- msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, thr_flags); \
- } \
- break; \
+int wex_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ RegisterType type = counter_map[index].type;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+
+ flags = 0x1ULL;
+ flags |= (event->eventId<<1);
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_MATCH0:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, event->options[j].value));
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, event->options[j].value, SETUP_BBOX_MATCH);
+ break;
+ case EVENT_OPTION_MASK0:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, event->options[j].value));
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, event->options[j].value, SETUP_BBOX_MASK);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_BBOX);
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
}
-/* RBOX macros */
-#define RBOX_GATE(NUM) \
- flags = 0x01ULL; /* set local enable flag */ \
-switch (event->eventId) { \
- case 0x00: \
- flags |= (event->umask<<1); /* configure sub register */ \
- { \
- uint32_t iperf_flags = 0x0UL; \
- iperf_flags |= (event->cfgBits<<event->cmask); /* configure event */ \
- switch (event->umask) { /* pick correct iperf register */ \
- case 0x00: \
- msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF0_P0, iperf_flags); \
- break; \
- case 0x01: \
- msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF1_P0, iperf_flags); \
- break; \
- case 0x06: \
- msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF0_P1, iperf_flags); \
- break; \
- case 0x07: \
- msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF1_P1, iperf_flags); \
- break; \
- case 0x0C: \
- msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF0_P2, iperf_flags); \
- break; \
- case 0x0D: \
- msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF1_P2, iperf_flags); \
- break; \
- case 0x12: \
- msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF0_P3, iperf_flags); \
- break; \
- case 0x13: \
- msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF1_P3, iperf_flags); \
- break; \
- } } \
- break; \
- case 0x01: \
- flags |= (event->umask<<1); /* configure sub register */ \
- { \
- uint32_t qlx_flags = 0x0UL; \
- qlx_flags |= (event->cfgBits); /* configure event */ \
- if (event->cmask) qlx_flags |= (event->cmask<<4); \
- switch (event->umask) { /* pick correct qlx register */ \
- case 0x02: \
- msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P0, qlx_flags); \
- break; \
- case 0x03: \
- msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P0, (qlx_flags<<8)); \
- break; \
- case 0x08: \
- msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P0, qlx_flags); \
- break; \
- case 0x09: \
- msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P1, (qlx_flags<<8)); \
- break; \
- case 0x0E: \
- msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P0, qlx_flags); \
- break; \
- case 0x0F: \
- msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P2, (qlx_flags<<8)); \
- break; \
- case 0x14: \
- msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P0, qlx_flags); \
- break; \
- case 0x15: \
- msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P3, (qlx_flags<<8)); \
- break; \
- } } \
- break; \
+int wex_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+ uint64_t reg = counter_map[index].configRegister;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+
+ flags = (1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0x1FULL) << 24);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+ VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_CBOX);
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
}
+int wex_wbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint64_t flags = 0x0ULL;
+ uint64_t reg = counter_map[index].configRegister;
+ int j;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ flags |= (1ULL<<22); /* set enable bit */
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
+ {
+ for (j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0xFFULL) << 24);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+ VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_WBOX);
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
-void perfmon_setupCounterThread_westmereEX(
- int thread_id,
- PerfmonEvent* event,
- PerfmonCounterIndex index)
+int wex_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
{
- int haveLock = 0;
+ int j;
uint64_t flags = 0x0ULL;
- uint64_t reg = westmereEX_counter_map[index].configRegister;
- int cpu_id = perfmon_threadData[thread_id].processorId;
- uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
- perfmon_threadData[thread_id].counters[index].init = TRUE;
+ int write_mm_cfg = 0;
+ RegisterType type = counter_map[index].type;
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
{
- haveLock = 1;
+ return 0;
}
- switch (westmereEX_counter_map[index].type)
+ flags = (1ULL<<22);
+ flags |= (event->umask<<8) + event->eventId;
+ if (event->numberOfOptions > 0)
{
- case PMC:
- flags = (1<<22)|(1<<16);
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ case EVENT_OPTION_INVERT:
+ flags |= (1ULL<<23);
+ break;
+ case EVENT_OPTION_THRESHOLD:
+ flags |= ((event->options[j].value & 0xFFULL) << 24);
+ break;
+ case EVENT_OPTION_MATCH0:
+ if (event->eventId == 0x0)
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1,event->options[j].value));
+ VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, event->options[j].value, SETUP_SBOX_MATCH);
+ write_mm_cfg = 1;
+ }
+ break;
+ case EVENT_OPTION_MASK0:
+ if (event->eventId == 0x0)
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2,event->options[j].value));
+                        VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, event->options[j].value, SETUP_SBOX_MASK);
+ write_mm_cfg = 1;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (write_mm_cfg && event->eventId == 0x0)
+ {
+ if (type == SBOX0)
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_S0_PMON_MM_CFG, (1ULL<<63), SETUP_SBOX_MATCH_CTRL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S0_PMON_MM_CFG ,(1ULL<<63)));
+ }
+ else if (type == SBOX1)
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_S1_PMON_MM_CFG, (1ULL<<63), SETUP_SBOX_MATCH_CTRL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S1_PMON_MM_CFG ,(1ULL<<63)));
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_SBOX);
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
- /* Intel with standard 8 bit event mask: [7:0] */
- flags |= (event->umask<<8) + event->eventId;
+int wex_ubox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ int j;
+ uint64_t flags = 0x0ULL;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
- if (event->cfgBits != 0) /* set custom cfg and cmask */
+ flags = (1ULL<<22);
+ flags |= (event->eventId & 0xFF);
+ if (event->numberOfOptions > 0)
+ {
+ for(j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
{
- flags &= ~(0xFFFFU<<16); /* clear upper 16bits */
- flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+ case EVENT_OPTION_EDGE:
+ flags |= (1ULL<<18);
+ break;
+ default:
+ break;
}
+ }
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, UBOX_CTRL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister , flags));
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, PMC_EV_SEL)
- break;
- case FIXED:
- fixed_flags |= (0x2 <<(index*4));
- msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
- break;
- case MBOX0:
- if (haveLock)
+
+int wex_mbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint64_t flags = 0x41ULL;
+ uint64_t subflags1 = 0x0ULL;
+ uint64_t subflags2 = 0x0ULL;
+ int number;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+    if (((counter_map[index].configRegister & 0xFF0) == 0xCA0) ||
+        ((counter_map[index].configRegister & 0xFF0) == 0xCB0))
+ number = 0;
+ else
+ number = 1;
+
+ if (event->numberOfOptions > 0 && (event->cfgBits == 0x02 || event->cfgBits == 0x04))
+ {
+ for (int j=0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_MATCH0:
+ subflags2 = (event->options[j].value & 0x3FFFFFFFFULL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ADDR_MATCH], subflags2));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ADDR_MATCH], subflags2, SETUP_MBOX_ADDR_MATCH);
+ break;
+ case EVENT_OPTION_MASK0:
+ subflags2 = ((event->options[j].value & 0x1FFFFFFC0ULL)>>6);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ADDR_MASK], subflags2));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ADDR_MASK], subflags2, SETUP_MBOX_ADDR_MASK);
+ break;
+ default:
+ break;
+ }
+ }
+ subflags2 = 0x0ULL;
+ }
+ switch (event->cfgBits)
+ {
+ case 0x00:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ break;
+ case 0x01:
+ flags |= (1ULL<<7);
+ flags |= (event->eventId & 0x7ULL)<<19;
+ switch (event->eventId)
{
- MBOX_GATE(0);
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, MBOX0_CTRL)
+ case 0x00:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][DSP], &subflags1));
+ subflags1 |= (event->umask & 0xFULL)<<7;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][DSP], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][DSP], subflags1, SETUP_MBOX_DSP);
+ break;
+ case 0x01:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], &subflags1));
+ subflags1 |= (event->umask & 0x7ULL)<<4;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ISS], subflags1, SETUP_MBOX_ISS);
+ break;
+ case 0x05:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], &subflags1));
+ subflags1 |= (event->umask & 0x1ULL)<<6;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PGT], subflags1, SETUP_MBOX_PGT);
+ break;
+ case 0x06:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][MAP], &subflags1));
+ subflags1 |= (event->umask & 0x7ULL)<<6;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][MAP], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][MAP], subflags1, SETUP_MBOX_MAP);
+ break;
}
break;
-
- case MBOX1:
- if (haveLock)
+ case 0x02:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PLD], &subflags1));
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], &subflags2));
+ subflags1 |= (event->umask & 0x1FULL)<<8;
+ if ((event->cmask & 0xF0ULL) != 0)
+ {
+ subflags1 |= (1ULL<<0);
+ }
+ if ((event->cmask & 0xFULL) != 0)
{
- MBOX_GATE(1);
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, MBOX1_CTRL)
+ subflags2 |= (event->cmask & 0x7ULL)<<7;
}
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PLD], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PLD], subflags1, SETUP_MBOX_PLD);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], subflags2));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ISS], subflags2, SETUP_MBOX_ISS);
break;
-
- case BBOX0:
-
- case BBOX1:
- if (haveLock)
+ case 0x03:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][DSP], &subflags1));
+ subflags1 |= (event->umask & 0xFULL)<<7;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][DSP], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][DSP], subflags1, SETUP_MBOX_DSP);
+ break;
+ case 0x04:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PLD], &subflags1));
+ switch (event->cmask)
{
- flags = 0x1ULL; /* set enable bit */
- flags |= (event->eventId<<1);
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, BBOX_CTRL)
+ case 0x0:
+ subflags1 |= (1ULL<<16);
+ subflags1 |= (event->umask & 0x1FULL)<<19;
+ break;
+ case 0x1:
+ subflags1 |= (event->umask & 0x1ULL)<<18;
+ break;
+ case 0x2:
+ subflags1 |= (event->umask & 0x1ULL)<<17;
+ break;
+ case 0x3:
+ subflags1 |= (event->umask & 0x1ULL)<<7;
+ break;
}
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PLD], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PLD], subflags1, SETUP_MBOX_PLD);
break;
-
- case RBOX0:
- if (haveLock)
+ case 0x05:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], &subflags1));
+ subflags1 |= (event->umask & 0xFULL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ISS], subflags1, SETUP_MBOX_ISS);
+ break;
+ case 0x06:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], &subflags1));
+ subflags1 |= (event->umask & 0x7ULL)<<12;
+ if (event->umask == 0x5)
+ {
+ subflags1 |= (event->cmask & 0x7ULL)<<6;
+ }
+ else
+ {
+ subflags1 |= (event->cmask & 0x7ULL)<<9;
+ }
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ZDP], subflags1, SETUP_MBOX_ZDP);
+ break;
+ case 0x07:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], &subflags1));
+ subflags1 |= (event->umask & 0x7ULL)<<15;
+ if (event->umask == 0x5)
+ {
+ subflags1 |= (event->cmask & 0x7ULL)<<6;
+ }
+ else
+ {
+ subflags1 |= (event->cmask & 0x7ULL)<<9;
+ }
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ZDP], subflags1, SETUP_MBOX_ZDP);
+ break;
+ case 0x08:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], &subflags1));
+ subflags1 |= (event->umask & 0x7ULL)<<18;
+ if (event->umask == 0x5)
+ {
+ subflags1 |= (event->cmask & 0x7ULL)<<6;
+ }
+ else
+ {
+ subflags1 |= (event->cmask & 0x7ULL)<<9;
+ }
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ZDP], subflags1, SETUP_MBOX_ZDP);
+ break;
+ case 0x09:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], &subflags1));
+ subflags1 |= (event->umask & 0x7ULL)<<21;
+ if (event->umask == 0x5)
+ {
+ subflags1 |= (event->cmask & 0x7ULL)<<6;
+ }
+ else
+ {
+ subflags1 |= (event->cmask & 0x7ULL)<<9;
+ }
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ZDP], subflags1, SETUP_MBOX_ZDP);
+ break;
+ case 0x0A:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], &subflags1));
+ subflags1 |= (event->umask & 0x1ULL)<<10;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ISS], subflags1, SETUP_MBOX_ISS);
+ break;
+ case 0x0B:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], &subflags1));
+ subflags1 |= (event->umask & 0x1ULL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PGT], subflags1, SETUP_MBOX_PGT);
+ break;
+ case 0x0C:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], &subflags1));
+ subflags1 |= (event->umask & 0x1ULL)<<11;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PGT], subflags1, SETUP_MBOX_PGT);
+ break;
+ case 0x0D:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][THR], &subflags1));
+ subflags1 |= (event->umask & 0x3ULL)<<9;
+ if (event->cmask == 0x0)
+ {
+ subflags1 |= (1ULL<<3);
+ }
+ else
+ {
+ subflags1 &= ~(1ULL<<3);
+ subflags1 |= (event->cmask & 0x7ULL)<<4;
+ }
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][THR], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][THR], subflags1, SETUP_MBOX_THR);
+ break;
+ case 0x0E:
+ flags |= (event->eventId & 0x1FULL)<<9;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][THR], &subflags1));
+ subflags1 |= (event->umask & 0x3ULL)<<7;
+ if (event->cmask == 0x0)
+ {
+ subflags1 |= (1ULL<<3);
+ }
+ else
{
- RBOX_GATE(0);
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, RBOX0_CTRL)
+ subflags1 &= ~(1ULL<<3);
+ subflags1 |= (event->cmask & 0x7ULL)<<4;
}
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][THR], subflags1));
+ VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][THR], subflags1, SETUP_MBOX_THR);
break;
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_MBOX)
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
- case RBOX1:
- if (haveLock)
+
+int wex_rbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+ uint64_t flags = 0x01ULL;
+ uint64_t subflags = 0x0ULL;
+ int number;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if ((counter_map[index].configRegister & 0xFF0) == 0xE10)
+ number = 0;
+ else if ((counter_map[index].configRegister & 0xFF0) == 0xE30)
+ number = 1;
+
+ switch (event->eventId) {
+ case 0x00:
+ flags |= (event->umask & 0x1FULL)<<1;
+ subflags |= (event->cfgBits<<event->cmask);
+ switch (event->umask)
{
- RBOX_GATE(1);
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, RBOX1_CTRL)
+ case 0x00:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF0][0], subflags));
+ break;
+ case 0x01:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF1][0], subflags));
+ break;
+ case 0x06:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF0][1], subflags));
+ break;
+ case 0x07:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF1][1], subflags));
+ break;
+ case 0x0C:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF0][2], subflags));
+ break;
+ case 0x0D:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF1][2], subflags));
+ break;
+ case 0x12:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF0][3], subflags));
+ break;
+ case 0x13:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF1][3], subflags));
+ break;
}
break;
+ case 0x01:
+ flags |= (event->umask & 0x1FULL)<<1;
+ subflags |= (event->cfgBits & 0xFULL);
+ if (event->cmask != 0x0)
+ {
+ subflags |= (event->cmask & 0xFULL)<<4;
+ }
+ switch (event->umask)
+ {
+ case 0x02:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][0], subflags));
+ break;
+ case 0x03:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][0], (subflags<<8)));
+ break;
+ case 0x08:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][1], subflags));
+ break;
+ case 0x09:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][1], (subflags<<8)));
+ break;
+ case 0x0E:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][2], subflags));
+ break;
+ case 0x0F:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][2], (subflags<<8)));
+ break;
+ case 0x14:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][3], subflags));
+ break;
+ case 0x15:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][3], (subflags<<8)));
+ break;
+ }
+ break;
+ }
+ if (flags != currentConfig[cpu_id][index])
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+ VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_RBOX)
+ currentConfig[cpu_id][index] = flags;
+ }
+ return 0;
+}
+
+
+int wex_uncore_freeze(int cpu_id, PerfmonEventSet* eventSet, int flags)
+{
+ uint64_t freeze_flags = 0x0ULL;
- case WBOX:
- if (haveLock)
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (eventSet->regTypeMask & ~(0xF))
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &freeze_flags));
+ freeze_flags &= ~(1ULL<<28);
+ VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST freeze_flags, FREEZE_UNCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, freeze_flags));
+ }
+ if (flags != FREEZE_FLAG_ONLYFREEZE)
+ {
+ if (flags & FREEZE_FLAG_CLEAR_CTR)
+ {
+ uint64_t clear_flags = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &clear_flags));
+            clear_flags |= (1ULL<<29); /* bit 29: reset all uncore counters */
+            VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST clear_flags, CLEAR_UNCORE_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, clear_flags));
+ }
+ else if (flags & FREEZE_FLAG_CLEAR_CTL)
+ {
+ int ret = 0;
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (event->eventId == 0xFF) /* Fixed Counter */
+ uint32_t reg = counter_map[eventSet->events[i].index].configRegister;
+ if (reg != 0x0ULL)
{
- flags = 0x1ULL; /* set enable bit */
+ ret = HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL);
+ if (ret != 0)
+ continue;
+ VERBOSEPRINTREG(cpu_id, reg, 0x0ULL, CLEAR_UNCORE_CTL);
}
- else
+ }
+ }
+
+ }
+ return 0;
+}
+
+int wex_uncore_unfreeze(int cpu_id, PerfmonEventSet* eventSet, int flags)
+{
+ uint64_t unfreeze_flags = 0x0ULL;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+ {
+ return 0;
+ }
+ if (flags != FREEZE_FLAG_ONLYFREEZE)
+ {
+ if (flags & FREEZE_FLAG_CLEAR_CTR)
+ {
+ uint64_t clear_flags = 0x0ULL;
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &clear_flags));
+            clear_flags |= (1ULL<<29); /* bit 29: reset all uncore counters */
+ VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST clear_flags, CLEAR_UNCORE_CTR);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, clear_flags));
+ }
+ else if (flags & FREEZE_FLAG_CLEAR_CTL)
+ {
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ uint32_t reg = counter_map[eventSet->events[i].index].configRegister;
+ if (reg != 0x0ULL)
{
- flags |= (1<<22); /* set enable bit */
- flags |= (event->umask<<8) + event->eventId;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, reg, 0x0ULL, CLEAR_UNCORE_CTL);
}
- msr_write(cpu_id, reg , flags);
- VERBOSEPRINTREG(cpu_id, reg, flags, WBOX_CTRL)
}
- break;
+ }
+ }
+ if (eventSet->regTypeMask & ~(0xF))
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &unfreeze_flags));
+ unfreeze_flags |= (1ULL<<28);
+ VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST unfreeze_flags, UNFREEZE_UNCORE);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, unfreeze_flags));
+ }
+ return 0;
+}
+
+#define WEX_RESET_OVF_BOX(id) \
+ if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(id))) \
+ { \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ovflRegister, 0xFFFFFFFF)); \
+ }
+
+
+int perfmon_setupCounterThread_westmereEX(int thread_id, PerfmonEventSet* eventSet)
+{
+ int haveLock = 0;
+ uint64_t flags = 0x0ULL;
+ uint64_t fixed_flags = 0x0ULL;
+ uint64_t ubox_flags = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ uint32_t uflags[NUM_UNITS] = { [0 ... NUM_UNITS-1] = 0x0U };
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+ {
+ haveLock = 1;
+ }
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL));
+ }
+
+ if (haveLock && (eventSet->regTypeMask & ~(0xF)))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL));
+ }
+
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX0))))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_TIMESTAMP, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_DSP, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_ISS, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_MAP, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_MSC_THR, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_PGT, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_PLD, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_ZDP, 0x0ULL));
+ }
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX1))))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_TIMESTAMP, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_DSP, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_ISS, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_MAP, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_MSC_THR, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_PGT, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_PLD, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_ZDP, 0x0ULL));
+ }
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(RBOX0))))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P0, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P1, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P2, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P3, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P0, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P1, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P2, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P3, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P0, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P1, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P2, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P3, 0x0ULL));
+ }
+ if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(RBOX1))))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P0, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P1, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P2, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P3, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P0, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P1, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P2, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P3, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P0, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P1, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P2, 0x0ULL));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P3, 0x0ULL));
+ }
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ PerfmonEvent *event = &(eventSet->events[i].event);
+ uint64_t reg = counter_map[index].configRegister;
+ eventSet->events[i].threadCounter[thread_id].init = TRUE;
+ flags = 0x0ULL;
+ switch (type)
+ {
+ case PMC:
+ wex_pmc_setup(cpu_id, index, event);
+ break;
+
+ case FIXED:
+ fixed_flags |= wex_fixed_setup(cpu_id, index, event);
+ break;
+
+ case MBOX0:
+ case MBOX1:
+ wex_mbox_setup(cpu_id, index, event);
+ break;
+
+ case BBOX0:
+ case BBOX1:
+ wex_bbox_setup(cpu_id, index, event);
+ break;
- case UBOX:
- if (haveLock)
- {
- flags = 0x0ULL;
- flags |= (1<<22);
- flags |= (event->eventId);
- msr_write(cpu_id, reg , flags);
- }
-
- case CBOX0:
- case CBOX1:
- case CBOX2:
- case CBOX3:
- case CBOX4:
- case CBOX5:
- case CBOX6:
- case CBOX7:
- case CBOX8:
- case CBOX9:
- case SBOX0:
- case SBOX1:
- if (haveLock)
- {
- flags = 0x0ULL;
- flags |= (1<<22);
- flags |= (event->umask<<8);
- flags |= (event->eventId);
- msr_write(cpu_id, reg , flags);
+ case RBOX0:
+ case RBOX1:
+ wex_rbox_setup(cpu_id, index, event);
+ break;
+
+ case WBOX:
+ wex_wbox_setup(cpu_id, index, event);
+ break;
+
+ case CBOX0:
+ case CBOX1:
+ case CBOX2:
+ case CBOX3:
+ case CBOX4:
+ case CBOX5:
+ case CBOX6:
+ case CBOX7:
+ case CBOX8:
+ case CBOX9:
+ wex_cbox_setup(cpu_id, index, event);
+ break;
+
+ case WBOX0FIX:
+ if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(WBOX0FIX)))
+ {
+ flags = 0x1;
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+ VERBOSEPRINTREG(cpu_id, reg, LLU_CAST flags, WBOX0FIX_CTRL);
+ eventSet->regTypeMask |= REG_TYPE_MASK(WBOX);
+ }
+ break;
+
+ case UBOX:
+ wex_ubox_setup(cpu_id, index, event);
+                ubox_flags = 0x1ULL;
+                break;
+
+ case SBOX0:
+ case SBOX1:
+ wex_sbox_setup(cpu_id, index, event);
+ break;
+ default:
+ break;
+ }
+ if (type != WBOX0FIX)
+ {
+ uflags[type] |= (1U<<getCounterTypeOffset(index));
+ }
+ else
+ {
+ uflags[WBOX] |= (1<<31);
+ }
+ }
+
+ if (haveLock && (eventSet->regTypeMask & ~(0xF)))
+ {
+ for ( int i=0; i<NUM_UNITS; i++ )
+ {
+ if ((uflags[i] != 0x0ULL) && (i != WBOX0FIX))
+ {
+ VERBOSEPRINTPCIREG(cpu_id, MSR_DEV, box_map[i].ctrlRegister, uflags[i], CLEAR_CTL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[i].ctrlRegister, uflags[i]));
+ VERBOSEPRINTPCIREG(cpu_id, MSR_DEV, box_map[i].ovflRegister, uflags[i], CLEAR_OVF_CTL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[i].ovflRegister, uflags[i]));
}
- break;
+ }
+ }
- default:
- /* should never be reached */
- break;
+ if (fixed_flags != 0x0ULL)
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
}
+ if (ubox_flags != 0x0ULL)
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST ubox_flags, ACTIVATE_UBOX);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, ubox_flags));
+ }
+ return 0;
}
/* Actions for Performance Monitoring Session:
@@ -777,167 +1024,323 @@ void perfmon_setupCounterThread_westmereEX(
* 3) Set enable bit in global U Box control register
* */
-void perfmon_startCountersThread_westmereEX(int thread_id)
+
+int perfmon_startCountersThread_westmereEX(int thread_id, PerfmonEventSet* eventSet)
{
int haveLock = 0;
- uint64_t flags = 0x0ULL;
- uint32_t uflags[NUM_UNITS];
- int enable_ubox = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
-
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+ uint64_t core_ctrl_flags = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
- uint32_t ubflags = 0x0UL;
- ubflags |= (1<<29); /* reset all */
haveLock = 1;
- // msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
- // VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags, UBOX_GLOBAL_CTRL)
}
- for ( int i=0; i<NUM_UNITS; i++ )
- {
- uflags[i] = 0x0UL;
- }
+ //wex_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTR);
- for ( int i=0; i<NUM_PMC; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE) {
- if (westmereEX_counter_map[i].type == PMC)
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+ {
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
- flags |= (1<<(i-OFFSET_PMC)); /* enable counter */
+ continue;
}
- else if (westmereEX_counter_map[i].type == FIXED)
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ eventSet->events[i].threadCounter[thread_id].startData = 0;
+ eventSet->events[i].threadCounter[thread_id].counterData = 0;
+ switch (type)
{
- msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
- flags |= (1ULL<<(i+32)); /* enable fixed counter */
- }
- else if (westmereEX_counter_map[i].type > UNCORE)
- {
- if(haveLock)
- {
- msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
- uflags[westmereEX_counter_map[i].type] |=
- (1<<(perfmon_threadData[thread_id].counters[i].id)); /* enable uncore counter */
- if (westmereEX_counter_map[i].type == UBOX)
- {
- enable_ubox = 1;
- }
- }
+ case PMC:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+ core_ctrl_flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+ break;
+ case FIXED:
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+ core_ctrl_flags |= (1ULL<<(index+32));
+ break;
+ default:
+ break;
}
}
}
- VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, GLOBAL_CTRL);
- if (haveLock)
+ wex_uncore_unfreeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTR);
+
+ /* Finally enable counters */
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST core_ctrl_flags, GLOBAL_CTRL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, core_ctrl_flags));
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|core_ctrl_flags));
+ }
+ return 0;
+}
+
+#define WEX_CHECK_OVERFLOW(id, offset) \
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+ { \
+ uint64_t tmp = 0x0ULL; \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[id].statusRegister, &tmp)); \
+ if (tmp & (1ULL<<offset)) \
+ { \
+ eventSet->events[i].threadCounter[thread_id].overflows++; \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ovflRegister, (1ULL<<offset))); \
+ } \
+ }
+
+#define WEX_CLEAR_OVERFLOW(id, offset) \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, (1<<offset)));
+
+
+#define WEX_CHECK_UNCORE_OVERFLOW(id, offset) \
+ if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+ { \
+ uint64_t tmp = 0x0ULL; \
+ int check_local = 0; \
+ if ((id == SBOX0) || (id == SBOX1) || (id == WBOX) || (id == UBOX)) \
+ { \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_STATUS, &tmp)); \
+ int gl_offset = -1; \
+ switch (id) \
+ { \
+ case UBOX: \
+ gl_offset = 0; \
+ break; \
+ case WBOX: \
+ gl_offset = 1; \
+ break; \
+ case SBOX1: \
+ gl_offset = 2; \
+ break; \
+ case SBOX0: \
+ gl_offset = 3; \
+ break; \
+ default: \
+ break; \
+ } \
+ if ((gl_offset != -1) && (tmp & (1ULL<<gl_offset))) \
+ { \
+ check_local = 1; \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_OVF_CTRL, (1ULL<<gl_offset))); \
+ } \
+ } \
+ else \
+ { \
+ check_local = 1; \
+ } \
+ if (check_local) \
+ { \
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[id].statusRegister, &tmp)); \
+ if (tmp & (1ULL<<offset)) \
+ { \
+ eventSet->events[i].threadCounter[thread_id].overflows++; \
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ovflRegister, (1ULL<<offset))); \
+ } \
+ } \
+ }
+
+int perfmon_stopCountersThread_westmereEX(int thread_id, PerfmonEventSet* eventSet)
+{
+ int i;
+ int haveLock = 0;
+ uint64_t counter_result = 0x0ULL;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+
+ if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
- for ( int i=0; i<NUM_UNITS; i++ )
+ haveLock = 1;
+ }
+
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, GLOBAL_CTRL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ }
+ wex_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTL);
+
+ for (i = 0; i < eventSet->numberOfEvents; i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- /* if counters are enabled write the according box ctrl register */
- if (uflags[i])
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- msr_write(cpu_id, westmereEX_PMunits[i].ctrlRegister, uflags[i]);
- VERBOSEPRINTREG(cpu_id, westmereEX_PMunits[i].ctrlRegister, LLU_CAST uflags[i], BOXCTRL);
+ continue;
}
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ switch (type)
+ {
+ case PMC:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ WEX_CHECK_OVERFLOW(PMC, index-cpuid_info.perf_num_fixed_ctr);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC);
+ break;
+ case FIXED:
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ WEX_CHECK_OVERFLOW(PMC, index+32);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED);
+ break;
+ default:
+ if(haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(type)))
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ WEX_CHECK_UNCORE_OVERFLOW(type, index);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_UNCORE);
+ }
+ break;
+ }
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
}
-
- /* set global enable flag in U BOX ctrl register */
- uint32_t ubflags = 0x0UL;
- ubflags |= (1<<28); /* enable all */
- if (enable_ubox)
- {
- ubflags |= (1<<0);
- }
- VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST ubflags, UBOX_GLOBAL_CTRL);
- msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
}
- /* Finally enable counters */
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
- msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+
+ return 0;
}
-void perfmon_stopCountersThread_westmereEX(int thread_id)
+int perfmon_readCountersThread_westmereEX(int thread_id, PerfmonEventSet* eventSet)
{
int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
-
- msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ uint64_t counter_result = 0x0ULL;
+ uint64_t core_ctrl_flags = 0x0ULL;
if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
- uint32_t ubflags = 0x0UL;
haveLock = 1;
- // ubflags |= (1<<29); /* reset all */
- msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
}
- for ( int i=0; i<NUM_COUNTERS_WESTMEREEX; i++ )
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &core_ctrl_flags));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST core_ctrl_flags, SAFE_PMC_FLAGS)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, RESET_PMC_FLAGS)
+ }
+ wex_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE);
+
+ for (int i=0;i < eventSet->numberOfEvents;i++)
+ {
+ if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
{
- if (westmereEX_counter_map[i].type > UNCORE)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+ {
+ continue;
+ }
+ counter_result = 0x0ULL;
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t counter1 = counter_map[index].counterRegister;
+ if (type > UNCORE)
{
if(haveLock)
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
-
- VERBOSEPRINTREG(cpu_id, westmereEX_counter_map[i].counterRegister,
- LLU_CAST perfmon_threadData[thread_id].counters[i].counterData, READ_UNCORE);
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ WEX_CHECK_UNCORE_OVERFLOW(counter_map[index].type, getCounterTypeOffset(index));
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_UNCORE);
}
}
- else
+ else if (type == FIXED)
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
-
- VERBOSEPRINTREG(cpu_id, westmereEX_counter_map[i].counterRegister,
- LLU_CAST perfmon_threadData[thread_id].counters[i].counterData, READ_CORE);
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ WEX_CHECK_OVERFLOW(PMC, index+32);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED);
+ }
+ else if (type == PMC)
+ {
+ CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+ WEX_CHECK_OVERFLOW(PMC, index-cpuid_info.perf_num_fixed_ctr);
+ VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC);
}
+ eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
}
}
-#if 0
- flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
- printf ("Status: 0x%llX \n", LLU_CAST flags);
- if((flags & 0x3) || (flags & (0x3ULL<<32)) )
+ wex_uncore_unfreeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE);
+ if ((eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))) && (core_ctrl_flags != 0x0ULL))
{
- printf ("Overflow occured \n");
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST core_ctrl_flags, RESTORE_PMC_FLAGS)
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, core_ctrl_flags));
}
-#endif
+ return 0;
}
-void perfmon_readCountersThread_westmereEX(int thread_id)
+
+int perfmon_finalizeCountersThread_westmereEX(int thread_id, PerfmonEventSet* eventSet)
{
int haveLock = 0;
- int cpu_id = perfmon_threadData[thread_id].processorId;
+ int haveTileLock = 0;
+ int cpu_id = groupSet->threads[thread_id].processorId;
+ uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62);
if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
{
haveLock = 1;
}
+ if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+ {
+ haveTileLock = 1;
+ }
- for ( int i=0; i<NUM_COUNTERS_WESTMEREEX; i++ )
+ for (int i=0;i < eventSet->numberOfEvents;i++)
{
- if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+ RegisterType type = eventSet->events[i].type;
+ if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
{
- if (westmereEX_counter_map[i].type > UNCORE)
- {
- if(haveLock)
+ continue;
+ }
+ RegisterIndex index = eventSet->events[i].index;
+ uint64_t reg = counter_map[index].configRegister;
+ PciDeviceIndex dev = counter_map[index].device;
+ switch (type)
+ {
+ case PMC:
+ ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+ if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))
{
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
}
- }
- else
- {
- perfmon_threadData[thread_id].counters[i].counterData =
- msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
- }
+ else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+ }
+ break;
+ case FIXED:
+ ovf_values_core |= (1ULL<<(index+32));
+ break;
+ default:
+ if (((haveLock) && (type > UNCORE)))
+ {
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+ }
+ break;
+ }
+ if ((reg) && (((dev == MSR_DEV) && (type < UNCORE)) || (((haveLock) && (type > UNCORE)))))
+ {
+ VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
}
+ eventSet->events[i].threadCounter[thread_id].init = FALSE;
}
+ if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, CLEAR_PMC_AND_FIXED_CTL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core, CLEAR_PMC_AND_FIXED_OVERFLOW);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+ }
+ if (haveLock && (eventSet->regTypeMask & ~(0xF)))
+ {
+ VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL, CLEAR_UNCORE_CTL);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL));
+ VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_OVF_CTRL, 0x0ULL, CLEAR_UNCORE_OVERFLOW);
+ CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_OVF_CTRL, 0x0ULL));
+ }
+ return 0;
}
-
diff --git a/src/includes/perfmon_westmereEX_counters.h b/src/includes/perfmon_westmereEX_counters.h
index fd65746..85e4c6d 100644
--- a/src/includes/perfmon_westmereEX_counters.h
+++ b/src/includes/perfmon_westmereEX_counters.h
@@ -5,13 +5,14 @@
*
* Description: Counter Header File of perfmon module for Westmere EX.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -29,125 +30,170 @@
*/
#define NUM_COUNTERS_CORE_WESTMEREEX 7
-#define NUM_COUNTERS_UNCORE_WESTMEREEX 107
-#define NUM_COUNTERS_WESTMEREEX 107
+#define NUM_COUNTERS_UNCORE_WESTMEREEX 117
+#define NUM_COUNTERS_WESTMEREEX 117
-static PerfmonCounterMap westmereEX_counter_map[NUM_COUNTERS_WESTMEREEX] = {
+#define WEX_VALID_OPTIONS_FIXED EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK
+#define WEX_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_THRESHOLD_MASK
+
+#define WEX_VALID_OPTIONS_MBOX EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+#define WEX_VALID_OPTIONS_BBOX EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+#define WEX_VALID_OPTIONS_CBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK
+#define WEX_VALID_OPTIONS_SBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+#define WEX_VALID_OPTIONS_WBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK
+
+static RegisterMap westmereEX_counter_map[NUM_COUNTERS_WESTMEREEX] = {
/* Fixed Counters: instructions retired, cycles unhalted core */
- {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
- {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
- {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+ {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, WEX_VALID_OPTIONS_FIXED},
+ {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, WEX_VALID_OPTIONS_FIXED},
+ {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, WEX_VALID_OPTIONS_FIXED},
/* PMC Counters: 4 48bit wide */
- {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
- {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
- {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0},
- {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0},
+ {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, WEX_VALID_OPTIONS_PMC},
+ {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, WEX_VALID_OPTIONS_PMC},
+ {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, WEX_VALID_OPTIONS_PMC},
+ {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, WEX_VALID_OPTIONS_PMC},
/* MBOX */
- {"MBOX0C0",PMC7, MBOX0, MSR_M0_PMON_EVNT_SEL0, MSR_M0_PMON_CTR0, 0, 0},
- {"MBOX0C1",PMC8, MBOX0, MSR_M0_PMON_EVNT_SEL1, MSR_M0_PMON_CTR1, 0, 0},
- {"MBOX0C2",PMC9, MBOX0, MSR_M0_PMON_EVNT_SEL2, MSR_M0_PMON_CTR2, 0, 0},
- {"MBOX0C3",PMC10, MBOX0, MSR_M0_PMON_EVNT_SEL3, MSR_M0_PMON_CTR3, 0, 0},
- {"MBOX0C4",PMC11, MBOX0, MSR_M0_PMON_EVNT_SEL4, MSR_M0_PMON_CTR4, 0, 0},
- {"MBOX0C5",PMC12, MBOX0, MSR_M0_PMON_EVNT_SEL5, MSR_M0_PMON_CTR5, 0, 0},
- {"MBOX1C0",PMC13, MBOX1, MSR_M1_PMON_EVNT_SEL0, MSR_M1_PMON_CTR0, 0, 0},
- {"MBOX1C1",PMC14, MBOX1, MSR_M1_PMON_EVNT_SEL1, MSR_M1_PMON_CTR1, 0, 0},
- {"MBOX1C2",PMC15, MBOX1, MSR_M1_PMON_EVNT_SEL2, MSR_M1_PMON_CTR2, 0, 0},
- {"MBOX1C3",PMC16, MBOX1, MSR_M1_PMON_EVNT_SEL3, MSR_M1_PMON_CTR3, 0, 0},
- {"MBOX1C4",PMC17, MBOX1, MSR_M1_PMON_EVNT_SEL4, MSR_M1_PMON_CTR4, 0, 0},
- {"MBOX1C5",PMC18, MBOX1, MSR_M1_PMON_EVNT_SEL5, MSR_M1_PMON_CTR5, 0, 0},
+ {"MBOX0C0",PMC7, MBOX0, MSR_M0_PMON_EVNT_SEL0, MSR_M0_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_MBOX},
+ {"MBOX0C1",PMC8, MBOX0, MSR_M0_PMON_EVNT_SEL1, MSR_M0_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_MBOX},
+ {"MBOX0C2",PMC9, MBOX0, MSR_M0_PMON_EVNT_SEL2, MSR_M0_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_MBOX},
+ {"MBOX0C3",PMC10, MBOX0, MSR_M0_PMON_EVNT_SEL3, MSR_M0_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_MBOX},
+ {"MBOX0C4",PMC11, MBOX0, MSR_M0_PMON_EVNT_SEL4, MSR_M0_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_MBOX},
+ {"MBOX0C5",PMC12, MBOX0, MSR_M0_PMON_EVNT_SEL5, MSR_M0_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_MBOX},
+ {"MBOX1C0",PMC13, MBOX1, MSR_M1_PMON_EVNT_SEL0, MSR_M1_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_MBOX},
+ {"MBOX1C1",PMC14, MBOX1, MSR_M1_PMON_EVNT_SEL1, MSR_M1_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_MBOX},
+ {"MBOX1C2",PMC15, MBOX1, MSR_M1_PMON_EVNT_SEL2, MSR_M1_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_MBOX},
+ {"MBOX1C3",PMC16, MBOX1, MSR_M1_PMON_EVNT_SEL3, MSR_M1_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_MBOX},
+ {"MBOX1C4",PMC17, MBOX1, MSR_M1_PMON_EVNT_SEL4, MSR_M1_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_MBOX},
+ {"MBOX1C5",PMC18, MBOX1, MSR_M1_PMON_EVNT_SEL5, MSR_M1_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_MBOX},
/* BBOX */
- {"BBOX0C0",PMC19, BBOX0, MSR_B0_PMON_EVNT_SEL0, MSR_B0_PMON_CTR0, 0, 0},
- {"BBOX0C1",PMC20, BBOX0, MSR_B0_PMON_EVNT_SEL1, MSR_B0_PMON_CTR1, 0, 0},
- {"BBOX0C2",PMC21, BBOX0, MSR_B0_PMON_EVNT_SEL2, MSR_B0_PMON_CTR2, 0, 0},
- {"BBOX0C3",PMC22, BBOX0, MSR_B0_PMON_EVNT_SEL3, MSR_B0_PMON_CTR3, 0, 0},
- {"BBOX1C0",PMC23, BBOX1, MSR_B1_PMON_EVNT_SEL0, MSR_B1_PMON_CTR0, 0, 0},
- {"BBOX1C1",PMC24, BBOX1, MSR_B1_PMON_EVNT_SEL1, MSR_B1_PMON_CTR1, 0, 0},
- {"BBOX1C2",PMC25, BBOX1, MSR_B1_PMON_EVNT_SEL2, MSR_B1_PMON_CTR2, 0, 0},
- {"BBOX1C3",PMC26, BBOX1, MSR_B1_PMON_EVNT_SEL3, MSR_B1_PMON_CTR3, 0, 0},
+ {"BBOX0C0",PMC19, BBOX0, MSR_B0_PMON_EVNT_SEL0, MSR_B0_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_BBOX},
+ {"BBOX0C1",PMC20, BBOX0, MSR_B0_PMON_EVNT_SEL1, MSR_B0_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_BBOX},
+ {"BBOX0C2",PMC21, BBOX0, MSR_B0_PMON_EVNT_SEL2, MSR_B0_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_BBOX},
+ {"BBOX0C3",PMC22, BBOX0, MSR_B0_PMON_EVNT_SEL3, MSR_B0_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_BBOX},
+ {"BBOX1C0",PMC23, BBOX1, MSR_B1_PMON_EVNT_SEL0, MSR_B1_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_BBOX},
+ {"BBOX1C1",PMC24, BBOX1, MSR_B1_PMON_EVNT_SEL1, MSR_B1_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_BBOX},
+ {"BBOX1C2",PMC25, BBOX1, MSR_B1_PMON_EVNT_SEL2, MSR_B1_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_BBOX},
+ {"BBOX1C3",PMC26, BBOX1, MSR_B1_PMON_EVNT_SEL3, MSR_B1_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_BBOX},
/* RBOX */
- {"RBOX0C0",PMC27, RBOX0, MSR_R0_PMON_EVNT_SEL0, MSR_R0_PMON_CTR0, 0, 0},
- {"RBOX0C1",PMC28, RBOX0, MSR_R0_PMON_EVNT_SEL1, MSR_R0_PMON_CTR1, 0, 0},
- {"RBOX0C2",PMC29, RBOX0, MSR_R0_PMON_EVNT_SEL2, MSR_R0_PMON_CTR2, 0, 0},
- {"RBOX0C3",PMC30, RBOX0, MSR_R0_PMON_EVNT_SEL3, MSR_R0_PMON_CTR3, 0, 0},
- {"RBOX0C4",PMC31, RBOX0, MSR_R0_PMON_EVNT_SEL4, MSR_R0_PMON_CTR4, 0, 0},
- {"RBOX0C5",PMC32, RBOX0, MSR_R0_PMON_EVNT_SEL5, MSR_R0_PMON_CTR5, 0, 0},
- {"RBOX0C6",PMC33, RBOX0, MSR_R0_PMON_EVNT_SEL6, MSR_R0_PMON_CTR6, 0, 0},
- {"RBOX0C7",PMC34, RBOX0, MSR_R0_PMON_EVNT_SEL7, MSR_R0_PMON_CTR7, 0, 0},
- {"RBOX1C0",PMC35, RBOX1, MSR_R1_PMON_EVNT_SEL8, MSR_R1_PMON_CTR8, 0, 0},
- {"RBOX1C1",PMC36, RBOX1, MSR_R1_PMON_EVNT_SEL9, MSR_R1_PMON_CTR9, 0, 0},
- {"RBOX1C2",PMC37, RBOX1, MSR_R1_PMON_EVNT_SEL10, MSR_R1_PMON_CTR10, 0, 0},
- {"RBOX1C3",PMC38, RBOX1, MSR_R1_PMON_EVNT_SEL11, MSR_R1_PMON_CTR11, 0, 0},
- {"RBOX1C4",PMC39, RBOX1, MSR_R1_PMON_EVNT_SEL12, MSR_R1_PMON_CTR12, 0, 0},
- {"RBOX1C5",PMC40, RBOX1, MSR_R1_PMON_EVNT_SEL13, MSR_R1_PMON_CTR13, 0, 0},
- {"RBOX1C6",PMC41, RBOX1, MSR_R1_PMON_EVNT_SEL14, MSR_R1_PMON_CTR14, 0, 0},
- {"RBOX1C7",PMC42, RBOX1, MSR_R1_PMON_EVNT_SEL15, MSR_R1_PMON_CTR15, 0, 0},
+ {"RBOX0C0",PMC27, RBOX0, MSR_R0_PMON_EVNT_SEL0, MSR_R0_PMON_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX0C1",PMC28, RBOX0, MSR_R0_PMON_EVNT_SEL1, MSR_R0_PMON_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX0C2",PMC29, RBOX0, MSR_R0_PMON_EVNT_SEL2, MSR_R0_PMON_CTR2, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX0C3",PMC30, RBOX0, MSR_R0_PMON_EVNT_SEL3, MSR_R0_PMON_CTR3, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX0C4",PMC31, RBOX0, MSR_R0_PMON_EVNT_SEL4, MSR_R0_PMON_CTR4, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX0C5",PMC32, RBOX0, MSR_R0_PMON_EVNT_SEL5, MSR_R0_PMON_CTR5, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX0C6",PMC33, RBOX0, MSR_R0_PMON_EVNT_SEL6, MSR_R0_PMON_CTR6, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX0C7",PMC34, RBOX0, MSR_R0_PMON_EVNT_SEL7, MSR_R0_PMON_CTR7, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX1C0",PMC35, RBOX1, MSR_R1_PMON_EVNT_SEL8, MSR_R1_PMON_CTR8, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX1C1",PMC36, RBOX1, MSR_R1_PMON_EVNT_SEL9, MSR_R1_PMON_CTR9, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX1C2",PMC37, RBOX1, MSR_R1_PMON_EVNT_SEL10, MSR_R1_PMON_CTR10, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX1C3",PMC38, RBOX1, MSR_R1_PMON_EVNT_SEL11, MSR_R1_PMON_CTR11, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX1C4",PMC39, RBOX1, MSR_R1_PMON_EVNT_SEL12, MSR_R1_PMON_CTR12, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX1C5",PMC40, RBOX1, MSR_R1_PMON_EVNT_SEL13, MSR_R1_PMON_CTR13, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX1C6",PMC41, RBOX1, MSR_R1_PMON_EVNT_SEL14, MSR_R1_PMON_CTR14, 0, 0, EVENT_OPTION_NONE_MASK},
+ {"RBOX1C7",PMC42, RBOX1, MSR_R1_PMON_EVNT_SEL15, MSR_R1_PMON_CTR15, 0, 0, EVENT_OPTION_NONE_MASK},
/* WBOX */
- {"WBOX0",PMC43, WBOX, MSR_W_PMON_EVNT_SEL0, MSR_W_PMON_CTR0, 0, 0},
- {"WBOX1",PMC44, WBOX, MSR_W_PMON_EVNT_SEL1, MSR_W_PMON_CTR1, 0, 0},
- {"WBOX2",PMC45, WBOX, MSR_W_PMON_EVNT_SEL2, MSR_W_PMON_CTR2, 0, 0},
- {"WBOX3",PMC46, WBOX, MSR_W_PMON_EVNT_SEL3, MSR_W_PMON_CTR3, 0, 0},
- {"WBOX4",PMC47, WBOX, MSR_W_PMON_FIXED_CTR_CTL, MSR_W_PMON_FIXED_CTR, 0, 0},
+ {"WBOX0",PMC43, WBOX, MSR_W_PMON_EVNT_SEL0, MSR_W_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_WBOX},
+ {"WBOX1",PMC44, WBOX, MSR_W_PMON_EVNT_SEL1, MSR_W_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_WBOX},
+ {"WBOX2",PMC45, WBOX, MSR_W_PMON_EVNT_SEL2, MSR_W_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_WBOX},
+ {"WBOX3",PMC46, WBOX, MSR_W_PMON_EVNT_SEL3, MSR_W_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_WBOX},
+ {"WBOXFIX",PMC47, WBOX0FIX, MSR_W_PMON_FIXED_CTR_CTL, MSR_W_PMON_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
/* UBOX */
- {"UBOX0",PMC48, UBOX, MSR_U_PMON_GLOBAL_EVNT_SEL, MSR_U_PMON_GLOBAL_CTR, 0, 0},
+ {"UBOX0",PMC48, UBOX, MSR_U_PMON_GLOBAL_EVNT_SEL, MSR_U_PMON_GLOBAL_CTR, 0, 0, EVENT_OPTION_EDGE_MASK},
/* CBOXes */
- {"CBOX0C0",PMC49, CBOX0, MSR_C0_PMON_EVNT_SEL0, MSR_C0_PMON_CTR0, 0, 0},
- {"CBOX0C1",PMC50, CBOX0, MSR_C0_PMON_EVNT_SEL1, MSR_C0_PMON_CTR1, 0, 0},
- {"CBOX0C2",PMC51, CBOX0, MSR_C0_PMON_EVNT_SEL2, MSR_C0_PMON_CTR2, 0, 0},
- {"CBOX0C3",PMC52, CBOX0, MSR_C0_PMON_EVNT_SEL3, MSR_C0_PMON_CTR3, 0, 0},
- {"CBOX0C4",PMC53, CBOX0, MSR_C0_PMON_EVNT_SEL4, MSR_C0_PMON_CTR4, 0, 0},
- {"CBOX1C0",PMC54, CBOX1, MSR_C1_PMON_EVNT_SEL0, MSR_C1_PMON_CTR0, 0, 0},
- {"CBOX1C1",PMC55, CBOX1, MSR_C1_PMON_EVNT_SEL1, MSR_C1_PMON_CTR1, 0, 0},
- {"CBOX1C2",PMC56, CBOX1, MSR_C1_PMON_EVNT_SEL2, MSR_C1_PMON_CTR2, 0, 0},
- {"CBOX1C3",PMC57, CBOX1, MSR_C1_PMON_EVNT_SEL3, MSR_C1_PMON_CTR3, 0, 0},
- {"CBOX1C4",PMC58, CBOX1, MSR_C1_PMON_EVNT_SEL4, MSR_C1_PMON_CTR4, 0, 0},
- {"CBOX2C0",PMC59, CBOX2, MSR_C2_PMON_EVNT_SEL0, MSR_C2_PMON_CTR0, 0, 0},
- {"CBOX2C1",PMC60, CBOX2, MSR_C2_PMON_EVNT_SEL1, MSR_C2_PMON_CTR1, 0, 0},
- {"CBOX2C2",PMC61, CBOX2, MSR_C2_PMON_EVNT_SEL2, MSR_C2_PMON_CTR2, 0, 0},
- {"CBOX2C3",PMC62, CBOX2, MSR_C2_PMON_EVNT_SEL3, MSR_C2_PMON_CTR3, 0, 0},
- {"CBOX2C4",PMC63, CBOX2, MSR_C2_PMON_EVNT_SEL4, MSR_C2_PMON_CTR4, 0, 0},
- {"CBOX3C0",PMC64, CBOX3, MSR_C3_PMON_EVNT_SEL0, MSR_C3_PMON_CTR0, 0, 0},
- {"CBOX3C1",PMC65, CBOX3, MSR_C3_PMON_EVNT_SEL1, MSR_C3_PMON_CTR1, 0, 0},
- {"CBOX3C2",PMC66, CBOX3, MSR_C3_PMON_EVNT_SEL2, MSR_C3_PMON_CTR2, 0, 0},
- {"CBOX3C3",PMC67, CBOX3, MSR_C3_PMON_EVNT_SEL3, MSR_C3_PMON_CTR3, 0, 0},
- {"CBOX3C4",PMC68, CBOX3, MSR_C3_PMON_EVNT_SEL4, MSR_C3_PMON_CTR4, 0, 0},
- {"CBOX4C0",PMC69, CBOX4, MSR_C4_PMON_EVNT_SEL0, MSR_C4_PMON_CTR0, 0, 0},
- {"CBOX4C1",PMC70, CBOX4, MSR_C4_PMON_EVNT_SEL1, MSR_C4_PMON_CTR1, 0, 0},
- {"CBOX4C2",PMC71, CBOX4, MSR_C4_PMON_EVNT_SEL2, MSR_C4_PMON_CTR2, 0, 0},
- {"CBOX4C3",PMC72, CBOX4, MSR_C4_PMON_EVNT_SEL3, MSR_C4_PMON_CTR3, 0, 0},
- {"CBOX4C4",PMC73, CBOX4, MSR_C4_PMON_EVNT_SEL4, MSR_C4_PMON_CTR4, 0, 0},
- {"CBOX5C0",PMC74, CBOX5, MSR_C5_PMON_EVNT_SEL0, MSR_C5_PMON_CTR0, 0, 0},
- {"CBOX5C1",PMC75, CBOX5, MSR_C5_PMON_EVNT_SEL1, MSR_C5_PMON_CTR1, 0, 0},
- {"CBOX5C2",PMC76, CBOX5, MSR_C5_PMON_EVNT_SEL2, MSR_C5_PMON_CTR2, 0, 0},
- {"CBOX5C3",PMC77, CBOX5, MSR_C5_PMON_EVNT_SEL3, MSR_C5_PMON_CTR3, 0, 0},
- {"CBOX5C4",PMC78, CBOX5, MSR_C5_PMON_EVNT_SEL4, MSR_C5_PMON_CTR4, 0, 0},
- {"CBOX6C0",PMC79, CBOX6, MSR_C6_PMON_EVNT_SEL0, MSR_C6_PMON_CTR0, 0, 0},
- {"CBOX6C1",PMC80, CBOX6, MSR_C6_PMON_EVNT_SEL1, MSR_C6_PMON_CTR1, 0, 0},
- {"CBOX6C2",PMC81, CBOX6, MSR_C6_PMON_EVNT_SEL2, MSR_C6_PMON_CTR2, 0, 0},
- {"CBOX6C3",PMC82, CBOX6, MSR_C6_PMON_EVNT_SEL3, MSR_C6_PMON_CTR3, 0, 0},
- {"CBOX6C4",PMC83, CBOX6, MSR_C6_PMON_EVNT_SEL4, MSR_C6_PMON_CTR4, 0, 0},
- {"CBOX7C0",PMC84, CBOX7, MSR_C7_PMON_EVNT_SEL0, MSR_C7_PMON_CTR0, 0, 0},
- {"CBOX7C1",PMC85, CBOX7, MSR_C7_PMON_EVNT_SEL1, MSR_C7_PMON_CTR1, 0, 0},
- {"CBOX7C2",PMC86, CBOX7, MSR_C7_PMON_EVNT_SEL2, MSR_C7_PMON_CTR2, 0, 0},
- {"CBOX7C3",PMC87, CBOX7, MSR_C7_PMON_EVNT_SEL3, MSR_C7_PMON_CTR3, 0, 0},
- {"CBOX7C4",PMC88, CBOX7, MSR_C7_PMON_EVNT_SEL4, MSR_C7_PMON_CTR4, 0, 0},
- {"CBOX8C0",PMC89, CBOX8, MSR_C8_PMON_EVNT_SEL0, MSR_C8_PMON_CTR0, 0, 0},
- {"CBOX8C1",PMC90, CBOX8, MSR_C8_PMON_EVNT_SEL1, MSR_C8_PMON_CTR1, 0, 0},
- {"CBOX8C2",PMC91, CBOX8, MSR_C8_PMON_EVNT_SEL2, MSR_C8_PMON_CTR2, 0, 0},
- {"CBOX8C3",PMC92, CBOX8, MSR_C8_PMON_EVNT_SEL3, MSR_C8_PMON_CTR3, 0, 0},
- {"CBOX8C4",PMC93, CBOX8, MSR_C8_PMON_EVNT_SEL4, MSR_C8_PMON_CTR4, 0, 0},
- {"CBOX9C0",PMC94, CBOX9, MSR_C9_PMON_EVNT_SEL0, MSR_C9_PMON_CTR0, 0, 0},
- {"CBOX9C1",PMC95, CBOX9, MSR_C9_PMON_EVNT_SEL1, MSR_C9_PMON_CTR1, 0, 0},
- {"CBOX9C2",PMC96, CBOX9, MSR_C9_PMON_EVNT_SEL2, MSR_C9_PMON_CTR2, 0, 0},
- {"CBOX9C3",PMC97, CBOX9, MSR_C9_PMON_EVNT_SEL3, MSR_C9_PMON_CTR3, 0, 0},
- {"CBOX9C4",PMC98, CBOX9, MSR_C9_PMON_EVNT_SEL4, MSR_C9_PMON_CTR4, 0, 0},
+ {"CBOX0C0",PMC49, CBOX0, MSR_C0_PMON_EVNT_SEL0, MSR_C0_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX0C1",PMC50, CBOX0, MSR_C0_PMON_EVNT_SEL1, MSR_C0_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX0C2",PMC51, CBOX0, MSR_C0_PMON_EVNT_SEL2, MSR_C0_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX0C3",PMC52, CBOX0, MSR_C0_PMON_EVNT_SEL3, MSR_C0_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX0C4",PMC53, CBOX0, MSR_C0_PMON_EVNT_SEL4, MSR_C0_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX0C5",PMC54, CBOX0, MSR_C0_PMON_EVNT_SEL5, MSR_C0_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX1C0",PMC55, CBOX1, MSR_C1_PMON_EVNT_SEL0, MSR_C1_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX1C1",PMC56, CBOX1, MSR_C1_PMON_EVNT_SEL1, MSR_C1_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX1C2",PMC57, CBOX1, MSR_C1_PMON_EVNT_SEL2, MSR_C1_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX1C3",PMC58, CBOX1, MSR_C1_PMON_EVNT_SEL3, MSR_C1_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX1C4",PMC59, CBOX1, MSR_C1_PMON_EVNT_SEL4, MSR_C1_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX1C5",PMC60, CBOX1, MSR_C1_PMON_EVNT_SEL5, MSR_C1_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX2C0",PMC61, CBOX2, MSR_C2_PMON_EVNT_SEL0, MSR_C2_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX2C1",PMC62, CBOX2, MSR_C2_PMON_EVNT_SEL1, MSR_C2_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX2C2",PMC63, CBOX2, MSR_C2_PMON_EVNT_SEL2, MSR_C2_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX2C3",PMC64, CBOX2, MSR_C2_PMON_EVNT_SEL3, MSR_C2_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX2C4",PMC65, CBOX2, MSR_C2_PMON_EVNT_SEL4, MSR_C2_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX2C5",PMC66, CBOX2, MSR_C2_PMON_EVNT_SEL5, MSR_C2_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX3C0",PMC67, CBOX3, MSR_C3_PMON_EVNT_SEL0, MSR_C3_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX3C1",PMC68, CBOX3, MSR_C3_PMON_EVNT_SEL1, MSR_C3_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX3C2",PMC69, CBOX3, MSR_C3_PMON_EVNT_SEL2, MSR_C3_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX3C3",PMC70, CBOX3, MSR_C3_PMON_EVNT_SEL3, MSR_C3_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX3C4",PMC71, CBOX3, MSR_C3_PMON_EVNT_SEL4, MSR_C3_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX3C5",PMC72, CBOX3, MSR_C3_PMON_EVNT_SEL5, MSR_C3_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX4C0",PMC73, CBOX4, MSR_C4_PMON_EVNT_SEL0, MSR_C4_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX4C1",PMC74, CBOX4, MSR_C4_PMON_EVNT_SEL1, MSR_C4_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX4C2",PMC75, CBOX4, MSR_C4_PMON_EVNT_SEL2, MSR_C4_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX4C3",PMC76, CBOX4, MSR_C4_PMON_EVNT_SEL3, MSR_C4_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX4C4",PMC77, CBOX4, MSR_C4_PMON_EVNT_SEL4, MSR_C4_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX4C5",PMC78, CBOX4, MSR_C4_PMON_EVNT_SEL5, MSR_C4_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX5C0",PMC79, CBOX5, MSR_C5_PMON_EVNT_SEL0, MSR_C5_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX5C1",PMC80, CBOX5, MSR_C5_PMON_EVNT_SEL1, MSR_C5_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX5C2",PMC81, CBOX5, MSR_C5_PMON_EVNT_SEL2, MSR_C5_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX5C3",PMC82, CBOX5, MSR_C5_PMON_EVNT_SEL3, MSR_C5_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX5C4",PMC83, CBOX5, MSR_C5_PMON_EVNT_SEL4, MSR_C5_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX5C5",PMC84, CBOX5, MSR_C5_PMON_EVNT_SEL5, MSR_C5_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX6C0",PMC85, CBOX6, MSR_C6_PMON_EVNT_SEL0, MSR_C6_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX6C1",PMC86, CBOX6, MSR_C6_PMON_EVNT_SEL1, MSR_C6_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX6C2",PMC87, CBOX6, MSR_C6_PMON_EVNT_SEL2, MSR_C6_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX6C3",PMC88, CBOX6, MSR_C6_PMON_EVNT_SEL3, MSR_C6_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX6C4",PMC89, CBOX6, MSR_C6_PMON_EVNT_SEL4, MSR_C6_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX6C5",PMC90, CBOX6, MSR_C6_PMON_EVNT_SEL5, MSR_C6_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX7C0",PMC91, CBOX7, MSR_C7_PMON_EVNT_SEL0, MSR_C7_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX7C1",PMC92, CBOX7, MSR_C7_PMON_EVNT_SEL1, MSR_C7_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX7C2",PMC93, CBOX7, MSR_C7_PMON_EVNT_SEL2, MSR_C7_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX7C3",PMC94, CBOX7, MSR_C7_PMON_EVNT_SEL3, MSR_C7_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX7C4",PMC95, CBOX7, MSR_C7_PMON_EVNT_SEL4, MSR_C7_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX7C5",PMC96, CBOX7, MSR_C7_PMON_EVNT_SEL5, MSR_C7_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX8C0",PMC97, CBOX8, MSR_C8_PMON_EVNT_SEL0, MSR_C8_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX8C1",PMC98, CBOX8, MSR_C8_PMON_EVNT_SEL1, MSR_C8_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX8C2",PMC99, CBOX8, MSR_C8_PMON_EVNT_SEL2, MSR_C8_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX8C3",PMC100, CBOX8, MSR_C8_PMON_EVNT_SEL3, MSR_C8_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX8C4",PMC101, CBOX8, MSR_C8_PMON_EVNT_SEL4, MSR_C8_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX8C5",PMC102, CBOX8, MSR_C8_PMON_EVNT_SEL5, MSR_C8_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX9C0",PMC103, CBOX9, MSR_C9_PMON_EVNT_SEL0, MSR_C9_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX9C1",PMC104, CBOX9, MSR_C9_PMON_EVNT_SEL1, MSR_C9_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX9C2",PMC105, CBOX9, MSR_C9_PMON_EVNT_SEL2, MSR_C9_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX9C3",PMC106, CBOX9, MSR_C9_PMON_EVNT_SEL3, MSR_C9_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX9C4",PMC107, CBOX9, MSR_C9_PMON_EVNT_SEL4, MSR_C9_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+ {"CBOX9C5",PMC108, CBOX9, MSR_C9_PMON_EVNT_SEL5, MSR_C9_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
/* SBOXes */
- {"SBOX0C0",PMC99 , SBOX0, MSR_S0_PMON_EVNT_SEL0, MSR_S0_PMON_CTR0, 0, 0},
- {"SBOX0C1",PMC100, SBOX0, MSR_S0_PMON_EVNT_SEL1, MSR_S0_PMON_CTR1, 0, 0},
- {"SBOX0C2",PMC101, SBOX0, MSR_S0_PMON_EVNT_SEL2, MSR_S0_PMON_CTR2, 0, 0},
- {"SBOX0C3",PMC102, SBOX0, MSR_S0_PMON_EVNT_SEL3, MSR_S0_PMON_CTR3, 0, 0},
- {"SBOX1C0",PMC103, SBOX1, MSR_S1_PMON_EVNT_SEL0, MSR_S1_PMON_CTR0, 0, 0},
- {"SBOX1C1",PMC104, SBOX1, MSR_S1_PMON_EVNT_SEL1, MSR_S1_PMON_CTR1, 0, 0},
- {"SBOX1C2",PMC105, SBOX1, MSR_S1_PMON_EVNT_SEL2, MSR_S1_PMON_CTR2, 0, 0},
- {"SBOX1C3",PMC106, SBOX1, MSR_S1_PMON_EVNT_SEL3, MSR_S1_PMON_CTR3, 0, 0}
+ {"SBOX0C0",PMC109 , SBOX0, MSR_S0_PMON_EVNT_SEL0, MSR_S0_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_SBOX},
+ {"SBOX0C1",PMC110, SBOX0, MSR_S0_PMON_EVNT_SEL1, MSR_S0_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_SBOX},
+ {"SBOX0C2",PMC111, SBOX0, MSR_S0_PMON_EVNT_SEL2, MSR_S0_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_SBOX},
+ {"SBOX0C3",PMC112, SBOX0, MSR_S0_PMON_EVNT_SEL3, MSR_S0_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_SBOX},
+ {"SBOX1C0",PMC113, SBOX1, MSR_S1_PMON_EVNT_SEL0, MSR_S1_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_SBOX},
+ {"SBOX1C1",PMC114, SBOX1, MSR_S1_PMON_EVNT_SEL1, MSR_S1_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_SBOX},
+ {"SBOX1C2",PMC115, SBOX1, MSR_S1_PMON_EVNT_SEL2, MSR_S1_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_SBOX},
+ {"SBOX1C3",PMC116, SBOX1, MSR_S1_PMON_EVNT_SEL3, MSR_S1_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_SBOX}
};
+
+static BoxMap westmereEX_box_map[NUM_UNITS] = {
+ [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+ [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+ [MBOX0] = {MSR_M0_PMON_BOX_CTRL, MSR_M0_PMON_BOX_STATUS, MSR_M0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_M0_PMON_ADDR_MATCH, MSR_M0_PMON_ADDR_MASK},
+ [MBOX1] = {MSR_M1_PMON_BOX_CTRL, MSR_M1_PMON_BOX_STATUS, MSR_M1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_M1_PMON_ADDR_MATCH, MSR_M1_PMON_ADDR_MASK},
+ [BBOX0] = {MSR_B0_PMON_BOX_CTRL, MSR_B0_PMON_BOX_STATUS, MSR_B0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_B0_PMON_MATCH,MSR_B0_PMON_MASK},
+ [BBOX1] = {MSR_B1_PMON_BOX_CTRL, MSR_B1_PMON_BOX_STATUS, MSR_B1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_B1_PMON_MATCH,MSR_B1_PMON_MASK},
+ [RBOX0] = {MSR_R0_PMON_BOX_CTRL, MSR_R0_PMON_BOX_STATUS, MSR_R0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [RBOX1] = {MSR_R1_PMON_BOX_CTRL, MSR_R1_PMON_BOX_STATUS, MSR_R1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [SBOX0] = {MSR_S0_PMON_BOX_CTRL, MSR_S0_PMON_BOX_STATUS, MSR_S0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_S0_PMON_MATCH, MSR_S0_PMON_MASK},
+ [SBOX1] = {MSR_S1_PMON_BOX_CTRL, MSR_S1_PMON_BOX_STATUS, MSR_S1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_S1_PMON_MATCH, MSR_S1_PMON_MASK},
+ [CBOX0] = {MSR_C0_PMON_BOX_CTRL, MSR_C0_PMON_BOX_STATUS, MSR_C0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [CBOX1] = {MSR_C1_PMON_BOX_CTRL, MSR_C1_PMON_BOX_STATUS, MSR_C1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [CBOX2] = {MSR_C2_PMON_BOX_CTRL, MSR_C2_PMON_BOX_STATUS, MSR_C2_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [CBOX3] = {MSR_C3_PMON_BOX_CTRL, MSR_C3_PMON_BOX_STATUS, MSR_C3_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [CBOX4] = {MSR_C4_PMON_BOX_CTRL, MSR_C4_PMON_BOX_STATUS, MSR_C4_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [CBOX5] = {MSR_C5_PMON_BOX_CTRL, MSR_C5_PMON_BOX_STATUS, MSR_C5_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [CBOX6] = {MSR_C6_PMON_BOX_CTRL, MSR_C6_PMON_BOX_STATUS, MSR_C6_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [CBOX7] = {MSR_C7_PMON_BOX_CTRL, MSR_C7_PMON_BOX_STATUS, MSR_C7_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [CBOX8] = {MSR_C8_PMON_BOX_CTRL, MSR_C8_PMON_BOX_STATUS, MSR_C8_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [CBOX9] = {MSR_C9_PMON_BOX_CTRL, MSR_C9_PMON_BOX_STATUS, MSR_C9_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [WBOX] = {MSR_W_PMON_BOX_CTRL, MSR_W_PMON_BOX_STATUS, MSR_W_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [WBOX0FIX] = {MSR_W_PMON_BOX_CTRL, MSR_W_PMON_BOX_STATUS, MSR_W_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+ [UBOX] = {MSR_U_PMON_GLOBAL_CTRL, MSR_U_PMON_GLOBAL_STATUS, MSR_U_PMON_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+};
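
For orientation: each row in the counter map above binds a counter name to its performance-monitoring unit, the MSR holding its event-select register, the MSR holding the counter itself, and a mask of the options valid for that unit, while the box map ties every unit to its box-level control, status and overflow registers plus what is presumably the 48-bit counter width. A minimal lookup sketch follows; the struct layout and field names are assumptions chosen to mirror the initializers, not the actual likwid type definitions:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical mirror of one counter-map row: name, counter index, unit,
     * event-select MSR, counter MSR, two auxiliary fields, valid-option mask. */
    typedef struct {
        const char *name;
        int         index;
        int         unit;
        uint32_t    configRegister;
        uint32_t    counterRegister;
        uint32_t    counterRegister2;
        uint32_t    device;
        uint64_t    optionMask;
    } CounterEntry;

    /* Return the row whose name matches, or NULL if the counter is unknown. */
    static const CounterEntry *
    find_counter(const CounterEntry *map, size_t n, const char *name)
    {
        for (size_t i = 0; i < n; i++)
            if (strcmp(map[i].name, name) == 0)
                return &map[i];
        return NULL;
    }

With such a helper, resolving e.g. "WBOXFIX" yields the fixed uncore clock counter and its event-select/counter MSR pair in one pass over the table.
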
diff --git a/src/includes/perfmon_westmereEX_events.txt b/src/includes/perfmon_westmereEX_events.txt
index 2aabf8d..014dfa6 100644
--- a/src/includes/perfmon_westmereEX_events.txt
+++ b/src/includes/perfmon_westmereEX_events.txt
@@ -1,16 +1,17 @@
# =======================================================================================
-#
+#
# Filename: perfmon_westmereEX_events.txt
-#
+#
# Description: Event list for Intel WestmereEX
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -68,16 +69,16 @@ EVENT_MEM_STORE_RETIRED_DTLB 0x0C PMC
UMASK_MEM_STORE_RETIRED_DTLB_MISS 0x01
EVENT_UOPS_ISSUED 0x0E PMC
-UMASK_UOPS_ISSUED_ANY 0x01
+UMASK_UOPS_ISSUED_ANY 0x01
UMASK_UOPS_ISSUED_STALLED_CYCLES 0x01 0xC1 0x01
-UMASK_UOPS_ISSUED_FUSED 0x02
+UMASK_UOPS_ISSUED_FUSED 0x02
EVENT_MEM_UNCORE_RETIRED 0x0F PMC
-UMASK_MEM_UNCORE_RETIRED_LOCAL_HITM 0x02
-UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM_AND_REMOTE_CACHE_HIT 0x08
-UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM 0x10
-UMASK_MEM_UNCORE_RETIRED_REMOTE_DRAM 0x20
-UMASK_MEM_UNCORE_RETIRED_UNCACHEABLE 0x80
+UMASK_MEM_UNCORE_RETIRED_LOCAL_HITM 0x02
+UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM_AND_REMOTE_CACHE_HIT 0x08
+UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM 0x10
+UMASK_MEM_UNCORE_RETIRED_REMOTE_DRAM 0x20
+UMASK_MEM_UNCORE_RETIRED_UNCACHEABLE 0x80
EVENT_FP_COMP_OPS_EXE 0x10 PMC
UMASK_FP_COMP_OPS_EXE_X87 0x01
@@ -253,10 +254,10 @@ UMASK_BR_INST_EXEC_INDIRECT_NON_CALL 0x04
UMASK_BR_INST_EXEC_NON_CALLS 0x07
UMASK_BR_INST_EXEC_RETURN_NEAR 0x08
UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL 0x10
-UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL 0x20
-UMASK_BR_INST_EXEC_NEAR_CALLS 0x30
-UMASK_BR_INST_EXEC_TAKEN 0x40
-UMASK_BR_INST_EXEC_ANY 0x7F
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL 0x20
+UMASK_BR_INST_EXEC_NEAR_CALLS 0x30
+UMASK_BR_INST_EXEC_TAKEN 0x40
+UMASK_BR_INST_EXEC_ANY 0x7F
EVENT_BR_MISP_EXEC 0x89 PMC
UMASK_BR_MISP_EXEC_COND 0x01
@@ -473,8 +474,66 @@ UMASK_SIMD_INT_64_PACKED_LOGICAL 0x10
UMASK_SIMD_INT_64_PACKED_ARITH 0x20
UMASK_SIMD_INT_64_SHUFFLE_MOVE 0x40
-EVENT_UNCORE_CYCLES 0xFF WBOX4
-UMASK_UNCORE_CYCLES 0x00
+EVENT_OFFCORE_RESPONSE_0 0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x40
+UMASK_OFFCORE_RESPONSE_0_LOCAL_DRAM 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_LOCAL_CACHE EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_LOCAL_CACHE EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x7
+UMASK_OFFCORE_RESPONSE_0_LOCAL_CACHE 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x20
+UMASK_OFFCORE_RESPONSE_0_REMOTE_DRAM 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_REMOTE_CACHE EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_REMOTE_CACHE EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x18
+UMASK_OFFCORE_RESPONSE_0_REMOTE_CACHE 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1 0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x40
+UMASK_OFFCORE_RESPONSE_1_LOCAL_DRAM 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_LOCAL_CACHE EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_LOCAL_CACHE EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x7
+UMASK_OFFCORE_RESPONSE_1_LOCAL_CACHE 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x20
+UMASK_OFFCORE_RESPONSE_1_REMOTE_DRAM 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_REMOTE_CACHE EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_REMOTE_CACHE EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x18
+UMASK_OFFCORE_RESPONSE_1_REMOTE_CACHE 0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY 0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY 0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY 0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY 0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY 0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY 0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY 0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY 0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY 0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY 0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_ANY 0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY 0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY 0x01 0x0F 0x10
+
+EVENT_UNCORE_CLOCK 0xFF WBOXFIX
+UMASK_UNCORE_CLOCK 0x00
EVENT_C_CYCLES_TURBO 0x04 WBOX
UMASK_C_CYCLES_TURBO_C0 0x01
@@ -488,26 +547,26 @@ UMASK_C_CYCLES_TURBO_C7 0x80
UMASK_C_CYCLES_TURBO_C_ALL 0xFF
EVENT_C_C0_THROTTLE_DIE 0x01 WBOX
-UMASK_C_C0_THROTTLE_DIE_C0 0x01
-UMASK_C_C0_THROTTLE_DIE_C1 0x02
-UMASK_C_C0_THROTTLE_DIE_C2 0x04
-UMASK_C_C0_THROTTLE_DIE_C3 0x08
-UMASK_C_C0_THROTTLE_DIE_C4 0x10
-UMASK_C_C0_THROTTLE_DIE_C5 0x20
-UMASK_C_C0_THROTTLE_DIE_C6 0x40
-UMASK_C_C0_THROTTLE_DIE_C7 0x80
-UMASK_C_C0_THROTTLE_DIE_C_ALL 0xFF
+UMASK_C_C0_THROTTLE_DIE_C0 0x01
+UMASK_C_C0_THROTTLE_DIE_C1 0x02
+UMASK_C_C0_THROTTLE_DIE_C2 0x04
+UMASK_C_C0_THROTTLE_DIE_C3 0x08
+UMASK_C_C0_THROTTLE_DIE_C4 0x10
+UMASK_C_C0_THROTTLE_DIE_C5 0x20
+UMASK_C_C0_THROTTLE_DIE_C6 0x40
+UMASK_C_C0_THROTTLE_DIE_C7 0x80
+UMASK_C_C0_THROTTLE_DIE_C_ALL 0xFF
EVENT_C_C0_THROTTLE_PROCHOT 0x03 WBOX
-UMASK_C_C0_THROTTLE_PROCHOT_C0 0x01
-UMASK_C_C0_THROTTLE_PROCHOT_C1 0x02
-UMASK_C_C0_THROTTLE_PROCHOT_C2 0x04
-UMASK_C_C0_THROTTLE_PROCHOT_C3 0x08
-UMASK_C_C0_THROTTLE_PROCHOT_C4 0x10
-UMASK_C_C0_THROTTLE_PROCHOT_C5 0x20
-UMASK_C_C0_THROTTLE_PROCHOT_C6 0x40
-UMASK_C_C0_THROTTLE_PROCHOT_C7 0x80
-UMASK_C_C0_THROTTLE_PROCHOT_C_ALL 0xFF
+UMASK_C_C0_THROTTLE_PROCHOT_C0 0x01
+UMASK_C_C0_THROTTLE_PROCHOT_C1 0x02
+UMASK_C_C0_THROTTLE_PROCHOT_C2 0x04
+UMASK_C_C0_THROTTLE_PROCHOT_C3 0x08
+UMASK_C_C0_THROTTLE_PROCHOT_C4 0x10
+UMASK_C_C0_THROTTLE_PROCHOT_C5 0x20
+UMASK_C_C0_THROTTLE_PROCHOT_C6 0x40
+UMASK_C_C0_THROTTLE_PROCHOT_C7 0x80
+UMASK_C_C0_THROTTLE_PROCHOT_C_ALL 0xFF
EVENT_C_C0_THROTTLE_TMP 0x00 WBOX
UMASK_C_C0_THROTTLE_TMP_C0 0x01
@@ -559,8 +618,8 @@ UMASK_BCMD_SCHEDQ_OCCUPANCY_F2B 0x06 0x01 0x00
UMASK_BCMD_SCHEDQ_OCCUPANCY_SPRWR 0x07 0x01 0x00
UMASK_BCMD_SCHEDQ_OCCUPANCY_ALL 0x08 0x01 0x00
-EVENT_BBOX_CYCLES 0x1B MBOX
-UMASK_BBOX_CYCLES 0xFF
+EVENT_MBOX_CLOCKTICKS 0x1B MBOX0C0|MBOX1C0
+UMASK_MBOX_CLOCKTICKS 0xFF
EVENT_CYCLES_DSP_FILL 0x00 MBOX
UMASK_CYCLES_DSP_FILL_RDQ_FULL 0x01 0x01 0x00
@@ -588,34 +647,35 @@ UMASK_CYCLES_SCHED_MODE_WRPRIO 0x02 0x01 0x00
UMASK_CYCLES_SCHED_MODE_ADAPTIVE 0x03 0x01 0x00
EVENT_DRAM_CMD 0x0A MBOX
+OPTIONS_DRAM_CMD_ALL EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
UMASK_DRAM_CMD_ALL 0x00 0x02 0x00
-UMASK_DRAM_CMD_ILLEGAL 0x01 0x02 0x00
+UMASK_DRAM_CMD_ILLEGAL 0x00 0x02 0x00
UMASK_DRAM_CMD_PREALL 0x01 0x02 0x00
-UMASK_DRAM_CMD_PREALL_TRDOFF 0x01 0x02 0x00
-UMASK_DRAM_CMD_PREALL_RDPRIO 0x01 0x02 0x01
-UMASK_DRAM_CMD_PREALL_WRPRIO 0x01 0x02 0x02
-UMASK_DRAM_CMD_PREALL_ADAPTIVE 0x01 0x02 0x02
+UMASK_DRAM_CMD_PREALL_TRDOFF 0x01 0x02 0x10
+UMASK_DRAM_CMD_PREALL_RDPRIO 0x01 0x02 0x11
+UMASK_DRAM_CMD_PREALL_WRPRIO 0x01 0x02 0x12
+UMASK_DRAM_CMD_PREALL_ADAPTIVE 0x01 0x02 0x13
UMASK_DRAM_CMD_RAS 0x02 0x02 0x00
-UMASK_DRAM_CMD_RAS_TRDOFF 0x02 0x02 0x00
-UMASK_DRAM_CMD_RAS_RDPRIO 0x02 0x02 0x01
-UMASK_DRAM_CMD_RAS_WRPRIO 0x02 0x02 0x02
-UMASK_DRAM_CMD_RAS_ADAPTIVE 0x02 0x02 0x02
-UMASK_DRAM_CMD_CAS_RD_OPN 0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN 0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN_TRDOFF 0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN_RDPRIO 0x04 0x02 0x01
-UMASK_DRAM_CMD_CAS_WR_OPN_WRPRIO 0x04 0x02 0x02
-UMASK_DRAM_CMD_CAS_WR_OPN_ADAPTIVE 0x04 0x02 0x03
+UMASK_DRAM_CMD_RAS_TRDOFF 0x02 0x02 0x10
+UMASK_DRAM_CMD_RAS_RDPRIO 0x02 0x02 0x11
+UMASK_DRAM_CMD_RAS_WRPRIO 0x02 0x02 0x12
+UMASK_DRAM_CMD_RAS_ADAPTIVE 0x02 0x02 0x13
+UMASK_DRAM_CMD_CAS_RD_OPN 0x03 0x02 0x00
+UMASK_DRAM_CMD_CAS_WR_OPN 0x04 0x02 0x00
+UMASK_DRAM_CMD_CAS_WR_OPN_TRDOFF 0x04 0x02 0x10
+UMASK_DRAM_CMD_CAS_WR_OPN_RDPRIO 0x04 0x02 0x11
+UMASK_DRAM_CMD_CAS_WR_OPN_WRPRIO 0x04 0x02 0x12
+UMASK_DRAM_CMD_CAS_WR_OPN_ADAPTIVE 0x04 0x02 0x13
UMASK_DRAM_CMD_CAS_RD_CLS 0x05 0x02 0x00
-UMASK_DRAM_CMD_CAS_RD_CLS_TRDOFF 0x05 0x02 0x00
-UMASK_DRAM_CMD_CAS_RD_CLS_RDPRIO 0x05 0x02 0x01
-UMASK_DRAM_CMD_CAS_RD_CLS_WRPRIO 0x05 0x02 0x02
-UMASK_DRAM_CMD_CAS_RD_CLS_ADAPTIVE 0x05 0x02 0x03
+UMASK_DRAM_CMD_CAS_RD_CLS_TRDOFF 0x05 0x02 0x10
+UMASK_DRAM_CMD_CAS_RD_CLS_RDPRIO 0x05 0x02 0x11
+UMASK_DRAM_CMD_CAS_RD_CLS_WRPRIO 0x05 0x02 0x12
+UMASK_DRAM_CMD_CAS_RD_CLS_ADAPTIVE 0x05 0x02 0x13
UMASK_DRAM_CMD_CAS_WR_CLS 0x06 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_CLS_TRDOFF 0x06 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_CLS_RDPRIO 0x06 0x02 0x01
-UMASK_DRAM_CMD_CAS_WR_CLS_WRPRIO 0x06 0x02 0x02
-UMASK_DRAM_CMD_CAS_WR_CLS_ADAPTIVE 0x06 0x02 0x03
+UMASK_DRAM_CMD_CAS_WR_CLS_TRDOFF 0x06 0x02 0x10
+UMASK_DRAM_CMD_CAS_WR_CLS_RDPRIO 0x06 0x02 0x11
+UMASK_DRAM_CMD_CAS_WR_CLS_WRPRIO 0x06 0x02 0x12
+UMASK_DRAM_CMD_CAS_WR_CLS_ADAPTIVE 0x06 0x02 0x13
UMASK_DRAM_CMD_MRS 0x07 0x02 0x00
UMASK_DRAM_CMD_RFR 0x09 0x02 0x00
UMASK_DRAM_CMD_ENSR 0x0A 0x02 0x00
@@ -647,7 +707,6 @@ UMASK_DRAM_MISC_RETRIES_ALL 0x00 0x04 0x03
UMASK_DRAM_MISC_RETRIES_FVID 0x01 0x04 0x03
UMASK_DRAM_MISC_VALID 0x01 0x04 0x02
UMASK_DRAM_MISC_NON_NOP_TRKL 0x01 0x04 0x01
-
UMASK_DRAM_MISC_ILLEGAL 0x00 0x04 0x00
UMASK_DRAM_MISC_PREALL 0x01 0x04 0x00
UMASK_DRAM_MISC_RAS 0x02 0x04 0x00
@@ -704,12 +763,12 @@ UMASK_FVC_EV1_FAST_RESET 0x04 0x07 0x00
UMASK_FVC_EV1_BBOX_CMDS_READS 0x05 0x07 0x00
UMASK_FVC_EV1_BBOX_CMDS_WRITES 0x05 0x07 0x01
UMASK_FVC_EV1_BBOX_RSP_ACK 0x06 0x07 0x00
-UMASK_FVC_EV1_BBOX_RSP_RETRY 0x06 0x07 0x10
-UMASK_FVC_EV1_BBOX_RSP_COR 0x06 0x07 0x20
-UMASK_FVC_EV1_BBOX_RSP_UNCOR 0x06 0x07 0x30
-UMASK_FVC_EV1_BBOX_RSP_SPEC_ACK 0x06 0x07 0x40
-UMASK_FVC_EV1_BBOX_RSP_SPR_ACK 0x06 0x07 0x50
-UMASK_FVC_EV1_BBOX_RSP_SPR_UNCORE 0x06 0x07 0x70
+UMASK_FVC_EV1_BBOX_RSP_RETRY 0x06 0x07 0x01
+UMASK_FVC_EV1_BBOX_RSP_COR 0x06 0x07 0x02
+UMASK_FVC_EV1_BBOX_RSP_UNCOR 0x06 0x07 0x03
+UMASK_FVC_EV1_BBOX_RSP_SPEC_ACK 0x06 0x07 0x04
+UMASK_FVC_EV1_BBOX_RSP_SPR_ACK 0x06 0x07 0x05
+UMASK_FVC_EV1_BBOX_RSP_SPR_UNCORE 0x06 0x07 0x07
UMASK_FVC_EV1_SMI_NB_TRIG 0x07 0x07 0x00
EVENT_FVC_EV2 0x0F MBOX
@@ -721,30 +780,30 @@ UMASK_FVC_EV2_FAST_RESET 0x04 0x08 0x00
UMASK_FVC_EV2_BBOX_CMDS_READS 0x05 0x08 0x00
UMASK_FVC_EV2_BBOX_CMDS_WRITES 0x05 0x08 0x01
UMASK_FVC_EV2_BBOX_RSP_ACK 0x06 0x08 0x00
-UMASK_FVC_EV2_BBOX_RSP_RETRY 0x06 0x08 0x10
-UMASK_FVC_EV2_BBOX_RSP_COR 0x06 0x08 0x20
-UMASK_FVC_EV2_BBOX_RSP_UNCOR 0x06 0x08 0x30
-UMASK_FVC_EV2_BBOX_RSP_SPEC_ACK 0x06 0x08 0x40
-UMASK_FVC_EV2_BBOX_RSP_SPR_ACK 0x06 0x08 0x50
-UMASK_FVC_EV2_BBOX_RSP_SPR_UNCORE 0x06 0x08 0x70
+UMASK_FVC_EV2_BBOX_RSP_RETRY 0x06 0x08 0x01
+UMASK_FVC_EV2_BBOX_RSP_COR 0x06 0x08 0x02
+UMASK_FVC_EV2_BBOX_RSP_UNCOR 0x06 0x08 0x03
+UMASK_FVC_EV2_BBOX_RSP_SPEC_ACK 0x06 0x08 0x04
+UMASK_FVC_EV2_BBOX_RSP_SPR_ACK 0x06 0x08 0x05
+UMASK_FVC_EV2_BBOX_RSP_SPR_UNCORE 0x06 0x08 0x07
UMASK_FVC_EV2_SMI_NB_TRIG 0x07 0x08 0x00
EVENT_FVC_EV3 0x10 MBOX
UMASK_FVC_EV3_SMI_CRC_ERR 0x00 0x09 0x00
-UMASK_FVC_EV3_MEM_ECC_ERR 0x00 0x09 0x00
-UMASK_FVC_EV3_POISON_TXN 0x00 0x09 0x00
-UMASK_FVC_EV3_ALERT_FRAMES 0x00 0x09 0x00
-UMASK_FVC_EV3_FAST_RESET 0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_CMDS_READS 0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_CMDS_WRITES 0x00 0x09 0x01
-UMASK_FVC_EV3_BBOX_RSP_ACK 0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_RSP_RETRY 0x00 0x09 0x10
-UMASK_FVC_EV3_BBOX_RSP_COR 0x00 0x09 0x20
-UMASK_FVC_EV3_BBOX_RSP_UNCOR 0x00 0x09 0x30
-UMASK_FVC_EV3_BBOX_RSP_SPEC_ACK 0x00 0x09 0x40
-UMASK_FVC_EV3_BBOX_RSP_SPR_ACK 0x00 0x09 0x50
-UMASK_FVC_EV3_BBOX_RSP_SPR_UNCORE 0x00 0x09 0x70
-UMASK_FVC_EV3_SMI_NB_TRIG 0x00 0x09 0x00
+UMASK_FVC_EV3_MEM_ECC_ERR 0x01 0x09 0x00
+UMASK_FVC_EV3_POISON_TXN 0x02 0x09 0x00
+UMASK_FVC_EV3_ALERT_FRAMES 0x03 0x09 0x00
+UMASK_FVC_EV3_FAST_RESET 0x04 0x09 0x00
+UMASK_FVC_EV3_BBOX_CMDS_READS 0x05 0x09 0x00
+UMASK_FVC_EV3_BBOX_CMDS_WRITES 0x05 0x09 0x01
+UMASK_FVC_EV3_BBOX_RSP_ACK 0x06 0x09 0x00
+UMASK_FVC_EV3_BBOX_RSP_RETRY 0x06 0x09 0x01
+UMASK_FVC_EV3_BBOX_RSP_COR 0x06 0x09 0x02
+UMASK_FVC_EV3_BBOX_RSP_UNCOR 0x06 0x09 0x03
+UMASK_FVC_EV3_BBOX_RSP_SPEC_ACK 0x06 0x09 0x04
+UMASK_FVC_EV3_BBOX_RSP_SPR_ACK 0x06 0x09 0x05
+UMASK_FVC_EV3_BBOX_RSP_SPR_UNCORE 0x06 0x09 0x07
+UMASK_FVC_EV3_SMI_NB_TRIG 0x07 0x09 0x00
EVENT_FVID_RACE 0x18 MBOX
UMASK_FVID_RACE 0x00 0x00 0x00
@@ -799,12 +858,44 @@ UMASK_THERM_TRP_DN_ALL_GT_MID_RISE 0x03 0x0D 0x00
UMASK_THERM_TRP_DN_ALL_GT_MID_FALL 0x02 0x0D 0x00
UMASK_THERM_TRP_DN_ALL_GT_LO 0x01 0x0D 0x00
UMASK_THERM_TRP_DN_ALL_LT_LO 0x00 0x0D 0x00
+UMASK_THERM_TRP_DN_DIMM0_GT_MID_RISE 0x03 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_MID_RISE 0x03 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_MID_RISE 0x03 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_MID_RISE 0x03 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_GT_MID_FALL 0x02 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_MID_FALL 0x02 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_MID_FALL 0x02 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_MID_FALL 0x02 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_GT_LO 0x01 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_LO 0x01 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_LO 0x01 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_LO 0x01 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_LT_LO 0x00 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_LT_LO 0x00 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_LT_LO 0x00 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_LT_LO 0x00 0x0D 0x04
EVENT_THERM_TRP_UP 0x04 MBOX
UMASK_THERM_TRP_UP_ALL_GT_MID_RISE 0x03 0x0E 0x00
UMASK_THERM_TRP_UP_ALL_GT_MID_FALL 0x02 0x0E 0x00
UMASK_THERM_TRP_UP_ALL_GT_LO 0x01 0x0E 0x00
UMASK_THERM_TRP_UP_ALL_LT_LO 0x00 0x0E 0x00
+UMASK_THERM_TRP_UP_DIMM0_GT_MID_RISE 0x03 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_MID_RISE 0x03 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_MID_RISE 0x03 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_MID_RISE 0x03 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_GT_MID_FALL 0x02 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_MID_FALL 0x02 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_MID_FALL 0x02 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_MID_FALL 0x02 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_GT_LO 0x01 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_LO 0x01 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_LO 0x01 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_LO 0x01 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_LT_LO 0x00 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_LT_LO 0x00 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_LT_LO 0x00 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_LT_LO 0x00 0x0E 0x04
EVENT_TRANS_CMDS 0x12 MBOX
UMASK_TRANS_CMDS 0x00 0x00 0x00
@@ -813,112 +904,165 @@ EVENT_TT_CMD_CONFLICT 0x19 MBOX
UMASK_TT_CMD_CONFLICT 0x00 0x00 0x00
EVENT_ACK_BEFORE_LAST_SNP 0x19 BBOX0C3|BBOX1C3
-UMASK_ACK_BEFORE_LAST_SNP 0x03
-
-EVENT_ADDR_IN_MATCH 0x04 BBOX0C2|BBOX1C2
-UMASK_ADDR_IN_MATCH 0x02
+UMASK_ACK_BEFORE_LAST_SNP 0x00
EVENT_CONFLICTS 0x17 BBOX0C3|BBOX1C3
-UMASK_CONFLICTS 0x03
+UMASK_CONFLICTS 0x00
EVENT_COHQ_BYPASS 0x0E BBOX0C3|BBOX1C3
-UMASK_COHQ_BYPASS 0x03
+UMASK_COHQ_BYPASS 0x00
EVENT_COHQ_IMT_ALLOC_WAIT 0x0E BBOX0C3|BBOX1C3
-UMASK_COHQ_IMT_ALLOC_WAIT 0x03
+UMASK_COHQ_IMT_ALLOC_WAIT 0x00
EVENT_DIRQ_INSERTS 0x17 BBOX0C1|BBOX1C1
-UMASK_DIRQ_INSERTS 0x01
+UMASK_DIRQ_INSERTS 0x00
EVENT_DIRQ_OCCUPANCY 0x17 BBOX0C0|BBOX1C0
UMASK_DIRQ_OCCUPANCY 0x00
EVENT_DEMAND_FETCH 0x0F BBOX0C3|BBOX1C3
-UMASK_DEMAND_FETCH 0x03
+UMASK_DEMAND_FETCH 0x00
EVENT_DRSQ_INSERTS 0x09 BBOX0C1|BBOX1C1
-UMASK_DRSQ_INSERTS 0x01
+UMASK_DRSQ_INSERTS 0x00
EVENT_DRSQ_OCCUPANCY 0x09 BBOX0C0|BBOX1C0
UMASK_DRSQ_OCCUPANCY 0x00
EVENT_EARLY_ACK 0x02 BBOX0C3|BBOX1C3
-UMASK_EARLY_ACK 0x03
+UMASK_EARLY_ACK 0x00
EVENT_IMPLICIT_WBS 0x12 BBOX0C3|BBOX1C3
-UMASK_IMPLICIT_WBS 0x03
+UMASK_IMPLICIT_WBS 0x00
EVENT_IMT_FULL 0x12 BBOX0C3|BBOX1C3
-UMASK_IMT_FULL 0x03
+UMASK_IMT_FULL 0x00
EVENT_IMT_INSERTS_ALL 0x07 BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_ALL 0x01
+UMASK_IMT_INSERTS_ALL 0x00
EVENT_IMT_INSERTS_INVITOE 0x0F BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_INVITOE 0x01
+UMASK_IMT_INSERTS_INVITOE 0x00
EVENT_IMT_INSERTS_IOH 0x0A BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH 0x01
+UMASK_IMT_INSERTS_IOH 0x00
EVENT_IMT_INSERTS_IOH_INVITOE 0x10 BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH_INVITOE 0x01
+UMASK_IMT_INSERTS_IOH_INVITOE 0x00
EVENT_IMT_INSERTS_IOH_WR 0x0D BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH_WR 0x01
+UMASK_IMT_INSERTS_IOH_WR 0x00
EVENT_IMT_INSERTS_NON_IOH 0x0B BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH 0x01
+UMASK_IMT_INSERTS_NON_IOH 0x00
EVENT_IMT_INSERTS_NON_IOH_INVITOE 0x1C BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH_INVITOE 0x01
+UMASK_IMT_INSERTS_NON_IOH_INVITOE 0x00
-EVENT_INSERTS_NON_IOH_RD 0x1F BBOX0C1|BBOX1C1
-UMASK_INSERTS_NON_IOH_RD 0x01
+EVENT_IMT_INSERTS_NON_IOH_RD 0x1F BBOX0C1|BBOX1C1
+UMASK_IMT_INSERTS_NON_IOH_RD 0x00
EVENT_IMT_INSERTS_NON_IOH_WR 0x0E BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH_WR 0x01
+UMASK_IMT_INSERTS_NON_IOH_WR 0x00
EVENT_IMT_INSERTS_RD 0x1D BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_RD 0x01
+UMASK_IMT_INSERTS_RD 0x00
EVENT_IMT_INSERTS_WR 0x0C BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_WR 0x01
+UMASK_IMT_INSERTS_WR 0x00
EVENT_IMT_NE_CYCLES 0x07 BBOX0C2|BBOX1C2
-UMASK_IMT_NE_CYCLES 0x02
+UMASK_IMT_NE_CYCLES 0x00
EVENT_IMT_PREALLOC 0x06 BBOX0C3|BBOX1C3
-UMASK_IMT_PREALLOC 0x03
+UMASK_IMT_PREALLOC 0x00
EVENT_IMT_VALID_OCCUPANCY 0x07 BBOX0C0|BBOX1C0
UMASK_IMT_VALID_OCCUPANCY 0x00
-EVENT_MSG_ADDR_IN_MATCH 0x01 BBOX0C0|BBOX1C0
-UMASK_MSG_ADDR_IN_MATCH 0x00
+EVENT_MSGS_B_TO_S 0x03 BBOX0C2|BBOX1C2
+UMASK_MSGS_B_TO_S 0x00
-EVENT_MSGS_B_TO_S 0x03 BBOX0C1|BBOX1C1
-UMASK_MSGS_B_TO_S 0x01
+EVENT_MSGS_S_TO_B 0x02 BBOX0C2|BBOX1C2
+UMASK_MSGS_S_TO_B 0x00
-EVENT_MSGS_B_TO_S 0x03 BBOX0C2|BBOX1C2
-UMASK_MSGS_B_TO_S 0x02
+EVENT_MSGS_IN_NON_SNP 0x01 BBOX0C2|BBOX1C2
+UMASK_MSGS_IN_NON_SNP 0x00
EVENT_MSG_IN_MATCH 0x01 BBOX0C1|BBOX1C1
-UMASK_MSG_IN_MATCH 0x01
+OPTIONS_MSG_IN_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_IN_MATCH 0x00
-EVENT_MSGS_IN_NON_SNP 0x01 BBOX0C2|BBOX1C2
-UMASK_MSGS_IN_NON_SNP 0x02
+EVENT_MSG_ADDR_IN_MATCH 0x01 BBOX0C0|BBOX1C0
+OPTIONS_MSG_ADDR_IN_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_ADDR_IN_MATCH 0x00
EVENT_MSG_OPCODE_ADDR_IN_MATCH 0x03 BBOX0C0|BBOX1C0
+OPTIONS_MSG_OPCODE_ADDR_IN_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
UMASK_MSG_OPCODE_ADDR_IN_MATCH 0x00
EVENT_MSG_OPCODE_IN_MATCH 0x05 BBOX0C1|BBOX1C1
-UMASK_MSG_OPCODE_IN_MATCH 0x01
+OPTIONS_MSG_OPCODE_IN_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OPCODE_IN_MATCH 0x00
EVENT_MSG_OPCODE_OUT_MATCH 0x06 BBOX0C1|BBOX1C1
-UMASK_MSG_OPCODE_OUT_MATCH 0x01
+OPTIONS_MSG_OPCODE_OUT_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OPCODE_OUT_MATCH 0x00
EVENT_MSG_OUT_MATCH 0x02 BBOX0C1|BBOX1C1
-UMASK_MSG_OUT_MATCH 0x01
+OPTIONS_MSG_OUT_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OUT_MATCH 0x00
+
+EVENT_OPCODE_ADDR_IN_MATCH 0x02 BBOX0C0|BBOX1C0
+OPTIONS_OPCODE_ADDR_IN_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_ADDR_IN_MATCH 0x00
+
+EVENT_OPCODE_IN_MATCH 0x03 BBOX0C1|BBOX1C1
+OPTIONS_OPCODE_IN_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_IN_MATCH 0x00
+
+EVENT_OPCODE_OUT_MATCH 0x04 BBOX0C1|BBOX1C1
+OPTIONS_OPCODE_OUT_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_OUT_MATCH 0x00
+
+EVENT_ADDR_IN_MATCH 0x04 BBOX0C2|BBOX1C2
+OPTIONS_ADDR_IN_MATCH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_ADDR_IN_MATCH 0x00
+
+EVENT_RBOX_VNA_UNAVAIL 0x15 BBOX0C3|BBOX1C3
+UMASK_RBOX_VNA_UNAVAIL 0x00
+
+EVENT_SBOX_VN0_UNAVAIL 0x14 BBOX0C3|BBOX1C3
+UMASK_SBOX_VN0_UNAVAIL 0x00
+
+EVENT_SNPOQ_INSERTS 0x12 BBOX0C1|BBOX1C1
+UMASK_SNPOQ_INSERTS 0x00
+
+EVENT_SNPOQ_OCCUPANCY 0x12 BBOX0C0|BBOX1C0
+UMASK_SNPOQ_OCCUPANCY 0x00
+
+EVENT_TF_ALL 0x04 BBOX0C0|BBOX1C0
+UMASK_TF_ALL 0x00
+
+EVENT_TF_INVITOE 0x06 BBOX0C0|BBOX1C0
+UMASK_TF_INVITOE 0x00
+
+EVENT_TF_IOH 0x0B BBOX0C0|BBOX1C0
+UMASK_TF_IOH 0x00
+
+EVENT_TF_IOH_INVITOE 0x0F BBOX0C0|BBOX1C0
+UMASK_TF_IOH_INVITOE 0x00
+
+EVENT_TF_IOH_NON_INVITOE_RD 0x1C BBOX0C0|BBOX1C0
+UMASK_TF_IOH_NON_INVITOE_RD 0x00
+
+EVENT_TF_IOH_WR 0x0D BBOX0C0|BBOX1C0
+UMASK_TF_IOH_WR 0x00
+
+EVENT_TF_WR 0x05 BBOX0C0|BBOX1C0
+UMASK_TF_WR 0x00
+
EVENT_ALLOC_TO_ARB 0x00 RBOX0
UMASK_ALLOC_TO_ARB_PORT0_IPERF0_NCB 0x00 0x01 0x09
@@ -3012,6 +3156,7 @@ EVENT_TRANS_VIQ 0x1D CBOX
UMASK_TRANS_VIQ 0x00
EVENT_TO_R_PROG_EV 0x00 SBOX
+OPTIONS_TO_R_PROG_EV EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MATCH0_MASK
UMASK_TO_R_PROG_EV 0x00
EVENT_TO_R_B_HOM_MSGQ_CYCLES_FULL 0x03 SBOX
diff --git a/src/includes/perfmon_westmere_events.txt b/src/includes/perfmon_westmere_events.txt
index 3c3e66f..7032ae3 100644
--- a/src/includes/perfmon_westmere_events.txt
+++ b/src/includes/perfmon_westmere_events.txt
@@ -1,16 +1,17 @@
# =======================================================================================
-#
+#
# Filename: perfmon_westmere_events.txt
-#
+#
# Description: Event list for Intel Westmere
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
-# Author: Jan Treibig (jt), jan.treibig at gmail.com
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
+# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -68,16 +69,21 @@ EVENT_MEM_STORE_RETIRED_DTLB 0x0C PMC
UMASK_MEM_STORE_RETIRED_DTLB_MISS 0x01
EVENT_UOPS_ISSUED 0x0E PMC
-UMASK_UOPS_ISSUED_ANY 0x01
-UMASK_UOPS_ISSUED_STALLED_CYCLES 0x01 0xC1 0x01
-UMASK_UOPS_ISSUED_FUSED 0x02
+UMASK_UOPS_ISSUED_ANY 0x01
+UMASK_UOPS_ISSUED_FUSED 0x02
+DEFAULT_OPTIONS_UOPS_TOTAL_STALL_CYCLES EVENT_OPTION_THRESHOLD=0xF,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_ACTIVE_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES 0x01
EVENT_MEM_UNCORE_RETIRED 0x0F PMC
-UMASK_MEM_UNCORE_RETIRED_LOCAL_HITM 0x02
-UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM_AND_REMOTE_CACHE_HIT 0x08
-UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM 0x10
-UMASK_MEM_UNCORE_RETIRED_REMOTE_DRAM 0x20
-UMASK_MEM_UNCORE_RETIRED_UNCACHEABLE 0x80
+UMASK_MEM_UNCORE_RETIRED_LOCAL_HITM 0x02
+UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM_AND_REMOTE_CACHE_HIT 0x08
+UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM 0x10
+UMASK_MEM_UNCORE_RETIRED_REMOTE_DRAM 0x20
+UMASK_MEM_UNCORE_RETIRED_UNCACHEABLE 0x80
EVENT_FP_COMP_OPS_EXE 0x10 PMC
UMASK_FP_COMP_OPS_EXE_X87 0x01
@@ -106,7 +112,8 @@ UMASK_LOAD_DISPATCH_ANY 0x07
EVENT_ARITH 0x14 PMC
UMASK_ARITH_CYCLES_DIV_BUSY 0x01
-UMASK_ARITH_NUM_DIV 0x01 0xC5 0x01
+DEFAULT_OPTIONS_ARITH_NUM_DIV EVENT_OPTION_EDGE=1,EVENT_OPTION_INVERT=1,EVENT_OPTION_THRESHOLD=0x1
+UMASK_ARITH_NUM_DIV 0x01
UMASK_ARITH_MUL 0x02
EVENT_INST_QUEUE 0x17 PMC
@@ -177,9 +184,15 @@ EVENT_L3_LAT_CACHE 0x2E PMC
UMASK_L3_LAT_CACHE_REFERENCE 0x4F
UMASK_L3_LAT_CACHE_MISS 0x41
-EVENT_CPU_CLOCK_UNHALTED 0x3C PMC
+EVENT_CPU_CLOCK_UNHALTED 0x3C PMC
UMASK_CPU_CLOCK_UNHALTED_THREAD_P 0x00
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_THREAD_P_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P_ANY 0x00
UMASK_CPU_CLOCK_UNHALTED_REF_P 0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_REF_XCLK_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK_ANY 0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES 0x00
EVENT_DTLB_MISSES 0x49 PMC
UMASK_DTLB_MISSES_ANY 0x01
@@ -217,9 +230,13 @@ UMASK_OFFCORE_EVENTS_OUTSTANDING_DEMAND_READ_CODE 0x02
UMASK_OFFCORE_EVENTS_OUTSTANDING_DEMAND_RFO 0x04
UMASK_OFFCORE_EVENTS_OUTSTANDING_ANY_READ 0x08
-EVENT_CACHE_LOCK_CYCLES 0x63 PMC0|PMC1
-UMASK_CACHE_LOCK_CYCLES_L1D_L2 0x01
+EVENT_CACHE_LOCK 0x63 PMC0|PMC1
+UMASK_CACHE_LOCK_CYCLES_L1D_L2 0x01
+DEFAULT_OPTIONS_CACHE_LOCK_COUNT_L1D_L2 EVENT_OPTION_EDGE=1
+UMASK_CACHE_LOCK_COUNT_L1D_L2 0x01
UMASK_CACHE_LOCK_CYCLES_L1D 0x02
+DEFAULT_OPTIONS_CACHE_LOCK_COUNT_L1D EVENT_OPTION_EDGE=1
+UMASK_CACHE_LOCK_COUNT_L1D 0x02
EVENT_IO_TRANSACTIONS 0x6C PMC
UMASK_IO_TRANSACTIONS 0x01
@@ -305,15 +322,38 @@ UMASK_OFFCORE_REQUESTS_ANY 0x80
EVENT_UOPS_EXECUTED 0xB1 PMC
UMASK_UOPS_EXECUTED_PORT0 0x01
UMASK_UOPS_EXECUTED_PORT1 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT2_CORE EVENT_OPTION_ANYTHREAD=1
UMASK_UOPS_EXECUTED_PORT2_CORE 0x04
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT3_CORE EVENT_OPTION_ANYTHREAD=1
UMASK_UOPS_EXECUTED_PORT3_CORE 0x08
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT4_CORE EVENT_OPTION_ANYTHREAD=1
UMASK_UOPS_EXECUTED_PORT4_CORE 0x10
-UMASK_UOPS_EXECUTED_CORE_ACTIVE_CYCLES_NO_PORT5 0x1F
UMASK_UOPS_EXECUTED_PORT5 0x20
-UMASK_UOPS_EXECUTED_CORE_ACTIVE_CYCLES 0x3F
UMASK_UOPS_EXECUTED_PORT015 0x40
-UMASK_UOPS_EXECUTED_PORT015_STALL_CYCLES 0x40 0xC1 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT234_CORE EVENT_OPTION_ANYTHREAD=1
UMASK_UOPS_EXECUTED_PORT234 0x80
+UMASK_UOPS_EXECUTED_THREAD 0xC0
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_CORE_ACTIVE_CYCLES 0x3F
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_ACTIVE_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_EDGE=1
+UMASK_UOPS_EXECUTED_CORE_ACTIVE_COUNT 0x3F
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES 0x3F
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_INVERT=1,EVENT_OPTION_EDGE=1
+UMASK_UOPS_EXECUTED_CORE_STALL_COUNT 0x3F
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_ACTIVE_CYCLES_NO_PORT5 EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_CORE_ACTIVE_CYCLES_NO_PORT5 0x1F
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_ACTIVE_COUNT_NO_PORT5 EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_EDGE=1
+UMASK_UOPS_EXECUTED_CORE_ACTIVE_COUNT_NO_PORT5 0x1F
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES_NO_PORT5 EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES_NO_PORT5 0x1F
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_COUNT_NO_PORT5 EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_EDGE=1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_COUNT_NO_PORT5 0x1F
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT015_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_PORT015_STALL_CYCLES 0x40
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT015_STALL_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_INVERT=1,EVENT_OPTION_EDGE=1
+UMASK_UOPS_EXECUTED_PORT015_STALL_COUNT 0x40
+
EVENT_OFFCORE_REQUESTS_SQ_FULL 0xB2 PMC
UMASK_OFFCORE_REQUESTS_SQ_FULL 0x01
@@ -343,10 +383,14 @@ UMASK_INST_RETIRED_MMX 0x04
EVENT_UOPS_RETIRED 0xC2 PMC
UMASK_UOPS_RETIRED_ANY 0x01
-UMASK_UOPS_RETIRED_STALL_CYCLES 0x01 0xC1 0x01
-UMASK_UOPS_RETIRED_ACTIVE_CYCLES 0x01 0x41 0x01
UMASK_UOPS_RETIRED_RETIRE_SLOTS 0x02
UMASK_UOPS_RETIRED_MACRO_FUSED 0x04
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xF,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_ACTIVE_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES 0x01
EVENT_MACHINE_CLEARS 0xC3 PMC
UMASK_MACHINE_CLEARS_CYCLES 0x01
@@ -392,10 +436,26 @@ EVENT_MACRO_INSTS 0xD0 PMC
UMASK_MACRO_INSTS_DECODED 0x01
EVENT_UOPS_DECODED 0xD1 PMC
-UMASK_UOPS_DECODED_STALL_CYCLES 0x01 0xC1 0x01
+UMASK_UOPS_DECODED_ANY 0x01
+DEFAULT_OPTIONS_UOPS_DECODED_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_DECODED_ACTIVE_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_DECODED_ACTIVE_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_UOPS_DECODED_ACTIVE_COUNT 0x01
+DEFAULT_OPTIONS_UOPS_DECODED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_DECODED_STALL_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_DECODED_STALL_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_DECODED_STALL_COUNT 0x01
UMASK_UOPS_DECODED_MS 0x02
-UMASK_UOPS_DECODED_ESP_FOLDING 0x04
-UMASK_UOPS_DECODED_ESP_SYNC 0x08
+DEFAULT_OPTIONS_UOPS_DECODED_MS_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_DECODED_MS_ACTIVE_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_DECODED_MS_ACTIVE_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_UOPS_DECODED_MS_ACTIVE_COUNT 0x02
+DEFAULT_OPTIONS_UOPS_DECODED_MS_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_DECODED_MS_STALL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_DECODED_MS_STALL_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_DECODED_MS_STALL_COUNT 0x02
+UMASK_UOPS_DECODED_ESP_FOLDING 0x04
+UMASK_UOPS_DECODED_ESP_SYNC 0x08
EVENT_RAT_STALLS 0xD2 PMC
UMASK_RAT_STALLS_FLAGS 0x01
@@ -437,8 +497,8 @@ UMASK_L2_TRANSACTIONS_IFETCH 0x04
UMASK_L2_TRANSACTIONS_PREFETCH 0x08
UMASK_L2_TRANSACTIONS_L1D_WB 0x10
UMASK_L2_TRANSACTIONS_L1D_FILL 0x20
-UMASK_L2_TRANSACTIONS_L1D_WB 0x40
-UMASK_L2_TRANSACTIONS_L1D_ANY 0x80
+UMASK_L2_TRANSACTIONS_L2_WB 0x40
+UMASK_L2_TRANSACTIONS_ANY 0x80
EVENT_L2_LINES_IN 0xF1 PMC
UMASK_L2_LINES_IN_S_STATE 0x02
@@ -450,6 +510,8 @@ UMASK_L2_LINES_OUT_DEMAND_CLEAN 0x01
UMASK_L2_LINES_OUT_DEMAND_DIRTY 0x02
UMASK_L2_LINES_OUT_PREFETCH_CLEAN 0x04
UMASK_L2_LINES_OUT_PREFETCH_DIRTY 0x08
+UMASK_L2_LINES_OUT_CLEAN_ANY 0x05
+UMASK_L2_LINES_OUT_DIRTY_ANY 0x0A
UMASK_L2_LINES_OUT_ANY 0x0F
EVENT_SQ_MISC 0xF4 PMC
@@ -473,6 +535,17 @@ UMASK_SIMD_INT_64_PACKED_LOGICAL 0x10
UMASK_SIMD_INT_64_PACKED_ARITH 0x20
UMASK_SIMD_INT_64_SHUFFLE_MOVE 0x40
+EVENT_OFFCORE_RESPONSE_0 0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS 0x01 0xFF 0xFF
+
+EVENT_OFFCORE_RESPONSE_1 0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS 0x01 0xFF 0xFF
+
+EVENT_UNCORE_CLOCK 0x00 UPMCFIX
+UMASK_UNCORE_CLOCK 0x00
+
EVENT_UNC_GQ_CYCLES_FULL 0x00 UPMC
UMASK_UNC_GQ_CYCLES_FULL_READ_TRACKER 0x01
UMASK_UNC_GQ_CYCLES_FULL_WRITE_TRACKER 0x02
@@ -678,10 +751,36 @@ UMASK_UNC_QHL_SLEEPS_IOH_CONFLICT 0x08
UMASK_UNC_QHL_SLEEPS_REMOTE_CONFLICT 0x10
UMASK_UNC_QHL_SLEEPS_LOCAL_CONFLICT 0x20
-EVENT_UNC_ADDR_OPCODE_MATCH 0x35 UPMC
-UMASK_UNC_ADDR_OPCODE_MATCH_IOH 0x01
-UMASK_UNC_ADDR_OPCODE_MATCH_REMOTE 0x02
-UMASK_UNC_ADDR_OPCODE_MATCH_LOCAL 0x04
+EVENT_UNC_ADDR_OPCODE_MATCH_AND 0x35 UPMC
+OPTIONS_UNC_ADDR_OPCODE_MATCH_AND_IOH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_AND_IOH 0x01 0x02 0x0
+OPTIONS_UNC_ADDR_OPCODE_MATCH_AND_REMOTE EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_AND_REMOTE 0x02 0x02 0x0
+OPTIONS_UNC_ADDR_OPCODE_MATCH_AND_LOCAL EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_AND_LOCAL 0x04 0x02 0x0
+
+EVENT_UNC_ADDR_OPCODE_MATCH_OR 0x35 UPMC
+OPTIONS_UNC_ADDR_OPCODE_MATCH_OR_IOH EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_OR_IOH 0x01 0x0C 0x0
+OPTIONS_UNC_ADDR_OPCODE_MATCH_OR_REMOTE EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_OR_REMOTE 0x02 0x0C 0x0
+OPTIONS_UNC_ADDR_OPCODE_MATCH_OR_LOCAL EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_OR_LOCAL 0x04 0x0C 0x0
+
+EVENT_UNC_ADDR_OPCODE_MATCH_RSPFWDS 0x35 UPMC
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPFWDS_IOH 0x01 0x04 0x1A
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPFWDS_REMOTE 0x02 0x04 0x1A
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPFWDS_LOCAL 0x04 0x04 0x1A
+
+EVENT_UNC_ADDR_OPCODE_MATCH_RSPIWB 0x35 UPMC
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_IOH 0x01 0x04 0x1D
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_REMOTE 0x02 0x04 0x1D
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_LOCAL 0x04 0x04 0x1D
+
+EVENT_UNC_ADDR_OPCODE_MATCH_RSPIWB 0x35 UPMC
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_IOH 0x01 0x04 0x00
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_REMOTE 0x02 0x04 0x00
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_LOCAL 0x04 0x04 0x00
EVENT_UNC_QPI_TX_STALLED_SINGLE_FLIT 0x40 UPMC
UMASK_UNC_QPI_TX_STALLED_SINGLE_FLIT_HOME_LINK_0 0x01
@@ -789,3 +888,4 @@ UMASK_UNC_CYCLES_UNHALTED_L3_FLL_ENABLE 0x02
EVENT_UNC_CYCLES_UNHALTED_L3_FLL_DISABLE 0x86 UPMC
UMASK_UNC_CYCLES_UNHALTED_L3_FLL_DISABLE 0x01
+
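
Several of the derived events added above (the *_STALL_CYCLES, *_ACTIVE_CYCLES and *_COUNT variants) are the same hardware event reconfigured through DEFAULT_OPTIONS: a THRESHOLD (counter mask) turns a per-uop count into a per-cycle test, INVERT flips that test, and EDGE counts the transitions. As a worked illustration of what such a default expands to, using the standard architectural IA32_PERFEVTSELx bit layout — an assumption about how these options are ultimately encoded, not a quote of the likwid code:

    #include <stdint.h>

    /* Event-select value corresponding to UOPS_RETIRED_STALL_CYCLES above:
     * event 0xC2, umask 0x01, THRESHOLD=0x1 and INVERT=1, i.e. count every
     * cycle in which fewer than one uop retires. */
    static uint64_t perfevtsel_uops_retired_stall_cycles(void)
    {
        uint64_t v = 0;
        v |= 0xC2;                  /* event code, bits 7:0          */
        v |= (uint64_t)0x01 << 8;   /* unit mask, bits 15:8          */
        v |= 1ULL << 16;            /* USR: count in user mode       */
        v |= 1ULL << 17;            /* OS: count in kernel mode      */
        v |= 1ULL << 22;            /* EN: enable the counter        */
        v |= 1ULL << 23;            /* INV: invert the cmask compare */
        v |= (uint64_t)0x1 << 24;   /* CMASK (threshold), bits 31:24 */
        return v;
    }

The ANYTHREAD defaults used for the core-wide UOPS_EXECUTED variants would additionally set bit 21 of the same register.
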
diff --git a/src/includes/power.h b/src/includes/power.h
index 6cb5fd3..b6c26d8 100644
--- a/src/includes/power.h
+++ b/src/includes/power.h
@@ -6,13 +6,14 @@
* Description: Header File Power Module
* Implements Intel RAPL Interface.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -35,46 +36,174 @@
#include <types.h>
#include <registers.h>
#include <bitUtil.h>
-#include <msr.h>
+#include <error.h>
+#include <access.h>
-extern PowerInfo power_info;
-extern const uint32_t power_regs[4];
+const char* power_names[NUM_POWER_DOMAINS] = {"PKG", "PP0", "PP1", "DRAM"};
-extern void power_init(int cpuId);
-static inline void power_start(PowerData* data, int cpuId, PowerType type);
-static inline void power_stop(PowerData* data, int cpuId, PowerType type);
-static inline uint32_t power_read(int cpuId, uint64_t reg);
-static inline uint32_t power_tread(int socket_fd, int cpuId, uint64_t reg);
-static inline double power_printEnergy(PowerData* data);
+uint32_t power_regs[NUM_POWER_DOMAINS] = {MSR_PKG_ENERGY_STATUS,
+ MSR_PP0_ENERGY_STATUS,
+ MSR_PP1_ENERGY_STATUS,
+ MSR_DRAM_ENERGY_STATUS};
-static double
+uint32_t limit_regs[NUM_POWER_DOMAINS] = {MSR_PKG_RAPL_POWER_LIMIT,
+ MSR_PP0_RAPL_POWER_LIMIT,
+ MSR_PP1_RAPL_POWER_LIMIT,
+ MSR_DRAM_RAPL_POWER_LIMIT};
+
+uint32_t policy_regs[NUM_POWER_DOMAINS] = {0,
+ MSR_PP0_ENERGY_POLICY,
+ MSR_PP1_ENERGY_POLICY,
+ 0};
+
+uint32_t perf_regs[NUM_POWER_DOMAINS] = {MSR_PKG_PERF_STATUS,
+ MSR_PP0_PERF_STATUS,
+ 0,
+ MSR_DRAM_PERF_STATUS};
+
+uint32_t info_regs[NUM_POWER_DOMAINS] = {MSR_PKG_POWER_INFO,
+ 0,
+ 0,
+ MSR_DRAM_POWER_INFO};
+
+
+double
power_printEnergy(PowerData* data)
{
- return (double) ((data->after - data->before) * power_info.energyUnit);
+ return (double) ((data->after - data->before) * power_info.domains[data->domain].energyUnit);
}
-static void
+int
power_start(PowerData* data, int cpuId, PowerType type)
{
- data->before = extractBitField(msr_read(cpuId, power_regs[type]),32,0);
+ if (power_info.hasRAPL)
+ {
+ if (power_info.domains[type].supportFlags & POWER_DOMAIN_SUPPORT_STATUS)
+ {
+ uint64_t result = 0;
+ data->before = 0;
+ CHECK_MSR_READ_ERROR(HPMread(cpuId, MSR_DEV, power_regs[type], &result))
+ data->before = field64(result, 0, 32);
+ data->domain = type;
+ return 0;
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, RAPL domain %s not supported, power_names[type]);
+ return -EFAULT;
+ }
+ }
+ else
+ {
+ DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, No RAPL support);
+ return -EIO;
+ }
}
-static void
+int
power_stop(PowerData* data, int cpuId, PowerType type)
{
- data->after = extractBitField(msr_read(cpuId, power_regs[type]),32,0);
+ if (power_info.hasRAPL)
+ {
+ if (power_info.domains[type].supportFlags & POWER_DOMAIN_SUPPORT_STATUS)
+ {
+ uint64_t result = 0;
+ data->after = 0;
+ CHECK_MSR_READ_ERROR(HPMread(cpuId, MSR_DEV, power_regs[type], &result))
+ data->after = field64(result, 0, 32);
+ data->domain = type;
+ return 0;
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, RAPL domain %s not supported, power_names[type]);
+ return -EFAULT;
+ }
+ }
+ else
+ {
+ DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, No RAPL support);
+ return -EIO;
+ }
+}
+
+int
+power_read(int cpuId, uint64_t reg, uint32_t *data)
+{
+ int i;
+ PowerType type = -1;
+
+ if (power_info.hasRAPL)
+ {
+ for (i = 0; i < NUM_POWER_DOMAINS; i++)
+ {
+ if (reg == power_regs[i])
+ {
+ type = i;
+ break;
+ }
+ }
+ if (power_info.domains[type].supportFlags & POWER_DOMAIN_SUPPORT_STATUS)
+ {
+ uint64_t result = 0;
+ *data = 0;
+ CHECK_MSR_READ_ERROR(HPMread(cpuId, MSR_DEV, reg, &result))
+ *data = field64(result, 0, 32);
+ return 0;
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, RAPL domain %s not supported, power_names[type]);
+ return -EFAULT;
+ }
+ }
+ else
+ {
+ DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, No RAPL support);
+ return -EIO;
+ }
}
-static uint32_t
-power_read(int cpuId, uint64_t reg)
+int
+power_tread(int socket_fd, int cpuId, uint64_t reg, uint32_t *data)
{
- return extractBitField(msr_read(cpuId, reg),32,0);
+ int i;
+ PowerType type;
+ if (power_info.hasRAPL)
+ {
+ for (i = 0; i < NUM_POWER_DOMAINS; i++)
+ {
+ if (reg == power_regs[i])
+ {
+ type = i;
+ break;
+ }
+ }
+ if (power_info.domains[type].supportFlags & POWER_DOMAIN_SUPPORT_STATUS)
+ {
+ uint64_t result = 0;
+ *data = 0;
+ CHECK_MSR_READ_ERROR(HPMread(cpuId, MSR_DEV, reg, &result))
+ *data = field64(result, 0, 32);
+ return 0;
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, RAPL domain %s not supported, power_names[type]);
+ return -EFAULT;
+ }
+ }
+ else
+ {
+ DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, No RAPL support);
+ return -EIO;
+ }
}
-static uint32_t
-power_tread(int socket_fd, int cpuId, uint64_t reg)
+double
+power_getEnergyUnit(int domain)
{
- return extractBitField(msr_tread(socket_fd, cpuId, reg),32,0);
+ return power_info.domains[domain].energyUnit;
}
#endif /*POWER_H*/
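
The reworked RAPL helpers above now report errors instead of returning void: power_start and power_stop fill PowerData with 32-bit energy-status snapshots and return 0, -EFAULT when the domain lacks status support, or -EIO when RAPL is absent, and power_printEnergy converts the difference using the per-domain energy unit. A minimal usage sketch under the assumptions that PKG is still a valid PowerType, that likwid.h exposes these calls, and that topology/access initialization has already been performed; the wrapper name is hypothetical:

    #include <likwid.h>   /* assumed to declare PowerData, PowerType and the power_* helpers */

    /* Measure the package-domain energy consumed by work() on CPU 0.
     * Returns Joules, or a negative value if RAPL is unavailable. */
    static double measure_pkg_energy(void (*work)(void))
    {
        PowerData pd;
        if (power_start(&pd, 0, PKG) != 0)
            return -1.0;
        work();
        if (power_stop(&pd, 0, PKG) != 0)
            return -1.0;
        return power_printEnergy(&pd);   /* (after - before) * energy unit of the PKG domain */
    }
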
diff --git a/src/includes/power_types.h b/src/includes/power_types.h
index b53ce85..337e091 100644
--- a/src/includes/power_types.h
+++ b/src/includes/power_types.h
@@ -5,13 +5,14 @@
*
* Description: Types file for power module.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -32,36 +33,10 @@
#define POWER_TYPES_H
#include <stdint.h>
+#include <likwid.h>
-typedef enum {
- PKG = 0,
- PP0,
- PP1,
- DRAM
-} PowerType;
-typedef struct {
- int numSteps;
- double* steps;
-} TurboBoost;
-
-typedef struct {
- double baseFrequency;
- double minFrequency;
- TurboBoost turbo;
- double powerUnit;
- double energyUnit;
- double timeUnit;
- double tdp;
- double minPower;
- double maxPower;
- double maxTimeWindow;
-} PowerInfo;
-
-typedef struct {
- uint32_t before;
- uint32_t after;
-} PowerData;
+extern uint32_t power_regs[NUM_POWER_DOMAINS];
#endif /*POWER_TYPES_H*/
diff --git a/src/includes/registers.h b/src/includes/registers.h
index ae80e28..32d975e 100644
--- a/src/includes/registers.h
+++ b/src/includes/registers.h
@@ -5,13 +5,14 @@
*
* Description: Register Defines for the perfmon module
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -47,6 +48,10 @@
#define MSR_PERFEVTSEL1 0x187
#define MSR_PERFEVTSEL2 0x188
#define MSR_PERFEVTSEL3 0x189
+#define MSR_PERFEVTSEL4 0x190
+#define MSR_PERFEVTSEL5 0x191
+#define MSR_PERFEVTSEL6 0x192
+#define MSR_PERFEVTSEL7 0x193
#define MSR_PMC0 0x0C1
#define MSR_PMC1 0x0C2
#define MSR_PMC2 0x0C3
@@ -60,6 +65,7 @@
#define MSR_PERF_GLOBAL_STATUS 0x38E
#define MSR_PERF_GLOBAL_OVF_CTRL 0x390
#define MSR_PEBS_ENABLE 0x3F1
+#define MSR_PEBS_LD_LAT 0x3F6
/* Perfmon V3 */
#define MSR_OFFCORE_RESP0 0x1A6
#define MSR_OFFCORE_RESP1 0x1A7
@@ -85,19 +91,20 @@
#define MSR_UNCORE_PMC5 0x3B5
#define MSR_UNCORE_PMC6 0x3B6
#define MSR_UNCORE_PMC7 0x3B7
-/*
- * Perfmon V4 (starting with Haswell, according to
+/*
+ * Perfmon V3 (starting with Haswell, according to
* Intel software developers guide also for SandyBridge,
* IvyBridge not mentioned in this section)
*/
-#define MSR_UNC_PERF_GLOBAL_CTRL MSR_UNCORE_PERF_GLOBAL_CTRL
-#define MSR_UNC_PERF_GLOBAL_STATUS MSR_UNCORE_PERF_GLOBAL_STATUS
-#define MSR_UNC_PERF_FIXED_CTRL MSR_UNCORE_FIXED_CTR0
-#define MSR_UNC_PERF_FIXED_CTR MSR_UNCORE_FIXED_CTR_CTRL
-#define MSR_UNC_ARB_PERFEVTSEL0 MSR_UNCORE_PMC2
-#define MSR_UNC_ARB_PERFEVTSEL1 MSR_UNCORE_PMC3
-#define MSR_UNC_ARB_CTR0 MSR_UNCORE_PMC0
-#define MSR_UNC_ARB_CTR1 MSR_UNCORE_PMC1
+#define MSR_UNC_PERF_GLOBAL_CTRL 0x391
+#define MSR_UNC_PERF_GLOBAL_STATUS 0x392
+#define MSR_UNC_PERF_GLOBAL_OVF_CTRL 0x393
+#define MSR_UNC_PERF_FIXED_CTRL 0x394
+#define MSR_UNC_PERF_FIXED_CTR 0x395
+#define MSR_UNC_ARB_PERFEVTSEL0 0x3B2
+#define MSR_UNC_ARB_PERFEVTSEL1 0x3B3
+#define MSR_UNC_ARB_CTR0 0x3B0
+#define MSR_UNC_ARB_CTR1 0x3B1
#define MSR_UNC_CBO_CONFIG 0x396
#define MSR_UNC_CBO_0_PERFEVTSEL0 0x700
#define MSR_UNC_CBO_0_PERFEVTSEL1 0x701
@@ -115,6 +122,38 @@
#define MSR_UNC_CBO_3_PERFEVTSEL1 0x731
#define MSR_UNC_CBO_3_CTR0 0x736
#define MSR_UNC_CBO_3_CTR1 0x737
+/* Perfmon V4 starting with Skylake */
+#define MSR_V4_PERF_GLOBAL_STATUS 0x38E
+#define MSR_V4_PERF_GLOBAL_STATUS_SET 0x391
+#define MSR_V4_PERF_GLOBAL_STATUS_RESET 0x390
+#define MSR_V4_PERF_GLOBAL_INUSE 0x392
+#define MSR_V4_PEBS_FRONTEND 0x3F7
+#define MSR_V4_UNC_PERF_GLOBAL_CTRL 0xE01
+#define MSR_V4_UNC_PERF_GLOBAL_STATUS 0xE02
+#define MSR_V4_UNC_PERF_FIXED_CTRL 0x394
+#define MSR_V4_UNC_PERF_FIXED_CTR 0x395
+#define MSR_V4_ARB_PERF_FIXED_CTRL0 0x3B2
+#define MSR_V4_ARB_PERF_FIXED_CTR0 0x3B0
+#define MSR_V4_ARB_PERF_FIXED_CTRL1 0x3B3
+#define MSR_V4_ARB_PERF_FIXED_CTR1 0x3B1
+#define MSR_V4_C0_PERF_FIXED_CTRL0 0x700
+#define MSR_V4_C0_PERF_FIXED_CTR0 0x706
+#define MSR_V4_C0_PERF_FIXED_CTRL1 0x701
+#define MSR_V4_C0_PERF_FIXED_CTR1 0x707
+#define MSR_V4_C1_PERF_FIXED_CTRL0 0x710
+#define MSR_V4_C1_PERF_FIXED_CTR0 0x716
+#define MSR_V4_C1_PERF_FIXED_CTRL1 0x711
+#define MSR_V4_C1_PERF_FIXED_CTR1 0x717
+#define MSR_V4_C2_PERF_FIXED_CTRL0 0x720
+#define MSR_V4_C2_PERF_FIXED_CTR0 0x726
+#define MSR_V4_C2_PERF_FIXED_CTRL1 0x721
+#define MSR_V4_C2_PERF_FIXED_CTR1 0x727
+#define MSR_V4_C3_PERF_FIXED_CTRL0 0x730
+#define MSR_V4_C3_PERF_FIXED_CTR0 0x736
+#define MSR_V4_C3_PERF_FIXED_CTRL1 0x731
+#define MSR_V4_C3_PERF_FIXED_CTR1 0x737
+/* V4 Uncore registers the same as in V3 */
+
/* Xeon Phi */
#define MSR_MIC_TSC 0x010
#define MSR_MIC_PERFEVTSEL0 0x028
@@ -125,6 +164,10 @@
#define MSR_MIC_PERF_GLOBAL_STATUS 0x02D
#define MSR_MIC_PERF_GLOBAL_OVF_CTRL 0x02E
#define MSR_MIC_PERF_GLOBAL_CTRL 0x02F
+/* Xeon Phi (Knights Landing)*/
+#define MSR_MIC2_PMC0 0x4C1
+#define MSR_MIC2_PMC1 0x4C2
+#define MSR_MIC2_TURBO_RATIO_LIMIT 0x1AD
/* Core v1/v2 type uncore
@@ -324,7 +367,10 @@
#define MSR_UNC_PCU_PMON_CTL2 0xC32
#define MSR_UNC_PCU_PMON_CTL3 0xC33
#define MSR_UNC_PCU_PMON_BOX_FILTER 0xC34
-#define MSR_UNC_PCU_PMON_BOX_CTL 0xD24
+#define MSR_UNC_PCU_PMON_BOX_CTL 0xC24
+#define MSR_UNC_PCU_PMON_BOX_STATUS 0xC35
+#define MSR_UNC_PCU_PMON_FIXED_CTR0 0x3FC
+#define MSR_UNC_PCU_PMON_FIXED_CTR1 0x3FD
/* UBox Performance Monitoring */
@@ -342,6 +388,7 @@
/* HA Box Performance Monitoring */
#define PCI_UNC_HA_PMON_BOX_CTL 0xF4
+#define PCI_UNC_HA_PMON_BOX_STATUS 0xF8
#define PCI_UNC_HA_PMON_CTL_0 0xD8
#define PCI_UNC_HA_PMON_CTL_1 0xDC
#define PCI_UNC_HA_PMON_CTL_2 0xE0
@@ -378,9 +425,22 @@
#define PCI_UNC_MC_PMON_CTR_2_B 0xB0
#define PCI_UNC_MC_PMON_CTR_3_B 0xB8
+/* IRP Performance Monitoring */
+#define PCI_UNC_IRP_PMON_BOX_STATUS 0xF8
+#define PCI_UNC_IRP_PMON_BOX_CTL 0xF4
+#define PCI_UNC_IRP0_PMON_CTL_0 0xD8
+#define PCI_UNC_IRP0_PMON_CTL_1 0xDC
+#define PCI_UNC_IRP0_PMON_CTR_0 0xA0
+#define PCI_UNC_IRP0_PMON_CTR_1 0xB0
+#define PCI_UNC_IRP1_PMON_CTL_0 0xE0
+#define PCI_UNC_IRP1_PMON_CTL_1 0xE4
+#define PCI_UNC_IRP1_PMON_CTR_0 0xB8
+#define PCI_UNC_IRP1_PMON_CTR_1 0xC0
+
/* QPI Box Performance Monitoring */
#define PCI_UNC_QPI_PMON_BOX_CTL 0xF4
+#define PCI_UNC_QPI_PMON_BOX_STATUS 0xF8
#define PCI_UNC_QPI_PMON_CTL_0 0xD8
#define PCI_UNC_QPI_PMON_CTL_1 0xDC
#define PCI_UNC_QPI_PMON_CTL_2 0xE0
@@ -402,6 +462,7 @@
/* R2PCIE Box Performance Monitoring */
#define PCI_UNC_R2PCIE_PMON_BOX_CTL 0xF4
+#define PCI_UNC_R2PCIE_PMON_BOX_STATUS 0xF8
#define PCI_UNC_R2PCIE_PMON_CTL_0 0xD8
#define PCI_UNC_R2PCIE_PMON_CTL_1 0xDC
#define PCI_UNC_R2PCIE_PMON_CTL_2 0xE0
@@ -418,6 +479,7 @@
/* R3QPI Box Performance Monitoring */
#define PCI_UNC_R3QPI_PMON_BOX_CTL 0xF4
+#define PCI_UNC_R3QPI_PMON_BOX_STATUS 0xF8
#define PCI_UNC_R3QPI_PMON_CTL_0 0xD8
#define PCI_UNC_R3QPI_PMON_CTL_1 0xDC
#define PCI_UNC_R3QPI_PMON_CTL_2 0xE0
@@ -428,6 +490,438 @@
#define PCI_UNC_R3QPI_PMON_CTR_1_B 0xA8
#define PCI_UNC_R3QPI_PMON_CTR_2_B 0xB0
+/* ########################################################## */
+/* Core v3 type uncore
+ * Naming following Intel Uncore Performance Monitoring Guide
+ * Ref. Nr. 331051-001
+ * */
+
+/* UBox Performance Monitoring */
+#define MSR_UNC_V3_U_PMON_CTR0 0x709
+#define MSR_UNC_V3_U_PMON_CTR1 0x70A
+#define MSR_UNC_V3_U_PMON_CTL0 0x705
+#define MSR_UNC_V3_U_PMON_CTL1 0x706
+#define MSR_UNC_V3_U_UCLK_FIXED_CTR 0x704
+#define MSR_UNC_V3_U_UCLK_FIXED_CTL 0x703
+#define MSR_UNC_V3_U_PMON_BOX_STATUS 0x708
+#define MSR_UNC_V3_U_PMON_GLOBAL_STATUS 0x701
+#define MSR_UNC_V3_U_PMON_GLOBAL_CTL 0x700
+#define MSR_UNC_V3_U_PMON_GLOBAL_CONFIG 0x702
+
+/* CBox Performance Monitoring */
+#define MSR_UNC_V3_C0_PMON_BOX_CTL 0xE00
+#define MSR_UNC_V3_C0_PMON_BOX_STATUS 0xE07
+#define MSR_UNC_V3_C0_PMON_BOX_FILTER0 0xE05
+#define MSR_UNC_V3_C0_PMON_BOX_FILTER1 0xE06
+#define MSR_UNC_V3_C0_PMON_CTL0 0xE01
+#define MSR_UNC_V3_C0_PMON_CTL1 0xE02
+#define MSR_UNC_V3_C0_PMON_CTL2 0xE03
+#define MSR_UNC_V3_C0_PMON_CTL3 0xE04
+#define MSR_UNC_V3_C0_PMON_CTR0 0xE08
+#define MSR_UNC_V3_C0_PMON_CTR1 0xE09
+#define MSR_UNC_V3_C0_PMON_CTR2 0xE0A
+#define MSR_UNC_V3_C0_PMON_CTR3 0xE0B
+
+#define MSR_UNC_V3_C1_PMON_BOX_CTL 0xE10
+#define MSR_UNC_V3_C1_PMON_BOX_STATUS 0xE17
+#define MSR_UNC_V3_C1_PMON_BOX_FILTER0 0xE15
+#define MSR_UNC_V3_C1_PMON_BOX_FILTER1 0xE16
+#define MSR_UNC_V3_C1_PMON_CTL0 0xE11
+#define MSR_UNC_V3_C1_PMON_CTL1 0xE12
+#define MSR_UNC_V3_C1_PMON_CTL2 0xE13
+#define MSR_UNC_V3_C1_PMON_CTL3 0xE14
+#define MSR_UNC_V3_C1_PMON_CTR0 0xE18
+#define MSR_UNC_V3_C1_PMON_CTR1 0xE19
+#define MSR_UNC_V3_C1_PMON_CTR2 0xE1A
+#define MSR_UNC_V3_C1_PMON_CTR3 0xE1B
+
+#define MSR_UNC_V3_C2_PMON_BOX_CTL 0xE20
+#define MSR_UNC_V3_C2_PMON_BOX_STATUS 0xE27
+#define MSR_UNC_V3_C2_PMON_BOX_FILTER0 0xE25
+#define MSR_UNC_V3_C2_PMON_BOX_FILTER1 0xE26
+#define MSR_UNC_V3_C2_PMON_CTL0 0xE21
+#define MSR_UNC_V3_C2_PMON_CTL1 0xE22
+#define MSR_UNC_V3_C2_PMON_CTL2 0xE23
+#define MSR_UNC_V3_C2_PMON_CTL3 0xE24
+#define MSR_UNC_V3_C2_PMON_CTR0 0xE28
+#define MSR_UNC_V3_C2_PMON_CTR1 0xE29
+#define MSR_UNC_V3_C2_PMON_CTR2 0xE2A
+#define MSR_UNC_V3_C2_PMON_CTR3 0xE2B
+
+#define MSR_UNC_V3_C3_PMON_BOX_CTL 0xE30
+#define MSR_UNC_V3_C3_PMON_BOX_STATUS 0xE37
+#define MSR_UNC_V3_C3_PMON_BOX_FILTER0 0xE35
+#define MSR_UNC_V3_C3_PMON_BOX_FILTER1 0xE36
+#define MSR_UNC_V3_C3_PMON_CTL0 0xE31
+#define MSR_UNC_V3_C3_PMON_CTL1 0xE32
+#define MSR_UNC_V3_C3_PMON_CTL2 0xE33
+#define MSR_UNC_V3_C3_PMON_CTL3 0xE34
+#define MSR_UNC_V3_C3_PMON_CTR0 0xE38
+#define MSR_UNC_V3_C3_PMON_CTR1 0xE39
+#define MSR_UNC_V3_C3_PMON_CTR2 0xE3A
+#define MSR_UNC_V3_C3_PMON_CTR3 0xE3B
+
+#define MSR_UNC_V3_C4_PMON_BOX_CTL 0xE40
+#define MSR_UNC_V3_C4_PMON_BOX_STATUS 0xE47
+#define MSR_UNC_V3_C4_PMON_BOX_FILTER0 0xE45
+#define MSR_UNC_V3_C4_PMON_BOX_FILTER1 0xE46
+#define MSR_UNC_V3_C4_PMON_CTL0 0xE41
+#define MSR_UNC_V3_C4_PMON_CTL1 0xE42
+#define MSR_UNC_V3_C4_PMON_CTL2 0xE43
+#define MSR_UNC_V3_C4_PMON_CTL3 0xE44
+#define MSR_UNC_V3_C4_PMON_CTR0 0xE48
+#define MSR_UNC_V3_C4_PMON_CTR1 0xE49
+#define MSR_UNC_V3_C4_PMON_CTR2 0xE4A
+#define MSR_UNC_V3_C4_PMON_CTR3 0xE4B
+
+#define MSR_UNC_V3_C5_PMON_BOX_CTL 0xE50
+#define MSR_UNC_V3_C5_PMON_BOX_STATUS 0xE57
+#define MSR_UNC_V3_C5_PMON_BOX_FILTER0 0xE55
+#define MSR_UNC_V3_C5_PMON_BOX_FILTER1 0xE56
+#define MSR_UNC_V3_C5_PMON_CTL0 0xE51
+#define MSR_UNC_V3_C5_PMON_CTL1 0xE52
+#define MSR_UNC_V3_C5_PMON_CTL2 0xE53
+#define MSR_UNC_V3_C5_PMON_CTL3 0xE54
+#define MSR_UNC_V3_C5_PMON_CTR0 0xE58
+#define MSR_UNC_V3_C5_PMON_CTR1 0xE59
+#define MSR_UNC_V3_C5_PMON_CTR2 0xE5A
+#define MSR_UNC_V3_C5_PMON_CTR3 0xE5B
+
+#define MSR_UNC_V3_C6_PMON_BOX_CTL 0xE60
+#define MSR_UNC_V3_C6_PMON_BOX_STATUS 0xE67
+#define MSR_UNC_V3_C6_PMON_BOX_FILTER0 0xE65
+#define MSR_UNC_V3_C6_PMON_BOX_FILTER1 0xE66
+#define MSR_UNC_V3_C6_PMON_CTL0 0xE61
+#define MSR_UNC_V3_C6_PMON_CTL1 0xE62
+#define MSR_UNC_V3_C6_PMON_CTL2 0xE63
+#define MSR_UNC_V3_C6_PMON_CTL3 0xE64
+#define MSR_UNC_V3_C6_PMON_CTR0 0xE68
+#define MSR_UNC_V3_C6_PMON_CTR1 0xE69
+#define MSR_UNC_V3_C6_PMON_CTR2 0xE6A
+#define MSR_UNC_V3_C6_PMON_CTR3 0xE6B
+
+#define MSR_UNC_V3_C7_PMON_BOX_CTL 0xE70
+#define MSR_UNC_V3_C7_PMON_BOX_STATUS 0xE77
+#define MSR_UNC_V3_C7_PMON_BOX_FILTER0 0xE75
+#define MSR_UNC_V3_C7_PMON_BOX_FILTER1 0xE76
+#define MSR_UNC_V3_C7_PMON_CTL0 0xE71
+#define MSR_UNC_V3_C7_PMON_CTL1 0xE72
+#define MSR_UNC_V3_C7_PMON_CTL2 0xE73
+#define MSR_UNC_V3_C7_PMON_CTL3 0xE74
+#define MSR_UNC_V3_C7_PMON_CTR0 0xE78
+#define MSR_UNC_V3_C7_PMON_CTR1 0xE79
+#define MSR_UNC_V3_C7_PMON_CTR2 0xE7A
+#define MSR_UNC_V3_C7_PMON_CTR3 0xE7B
+
+#define MSR_UNC_V3_C8_PMON_BOX_CTL 0xE80
+#define MSR_UNC_V3_C8_PMON_BOX_STATUS 0xE87
+#define MSR_UNC_V3_C8_PMON_BOX_FILTER0 0xE85
+#define MSR_UNC_V3_C8_PMON_BOX_FILTER1 0xE86
+#define MSR_UNC_V3_C8_PMON_CTL0 0xE81
+#define MSR_UNC_V3_C8_PMON_CTL1 0xE82
+#define MSR_UNC_V3_C8_PMON_CTL2 0xE83
+#define MSR_UNC_V3_C8_PMON_CTL3 0xE84
+#define MSR_UNC_V3_C8_PMON_CTR0 0xE88
+#define MSR_UNC_V3_C8_PMON_CTR1 0xE89
+#define MSR_UNC_V3_C8_PMON_CTR2 0xE8A
+#define MSR_UNC_V3_C8_PMON_CTR3 0xE8B
+
+#define MSR_UNC_V3_C9_PMON_BOX_CTL 0xE90
+#define MSR_UNC_V3_C9_PMON_BOX_STATUS 0xE97
+#define MSR_UNC_V3_C9_PMON_BOX_FILTER0 0xE95
+#define MSR_UNC_V3_C9_PMON_BOX_FILTER1 0xE96
+#define MSR_UNC_V3_C9_PMON_CTL0 0xE91
+#define MSR_UNC_V3_C9_PMON_CTL1 0xE92
+#define MSR_UNC_V3_C9_PMON_CTL2 0xE93
+#define MSR_UNC_V3_C9_PMON_CTL3 0xE94
+#define MSR_UNC_V3_C9_PMON_CTR0 0xE98
+#define MSR_UNC_V3_C9_PMON_CTR1 0xE99
+#define MSR_UNC_V3_C9_PMON_CTR2 0xE9A
+#define MSR_UNC_V3_C9_PMON_CTR3 0xE9B
+
+#define MSR_UNC_V3_C10_PMON_BOX_CTL 0xEA0
+#define MSR_UNC_V3_C10_PMON_BOX_STATUS 0xEA7
+#define MSR_UNC_V3_C10_PMON_BOX_FILTER0 0xEA5
+#define MSR_UNC_V3_C10_PMON_BOX_FILTER1 0xEA6
+#define MSR_UNC_V3_C10_PMON_CTL0 0xEA1
+#define MSR_UNC_V3_C10_PMON_CTL1 0xEA2
+#define MSR_UNC_V3_C10_PMON_CTL2 0xEA3
+#define MSR_UNC_V3_C10_PMON_CTL3 0xEA4
+#define MSR_UNC_V3_C10_PMON_CTR0 0xEA8
+#define MSR_UNC_V3_C10_PMON_CTR1 0xEA9
+#define MSR_UNC_V3_C10_PMON_CTR2 0xEAA
+#define MSR_UNC_V3_C10_PMON_CTR3 0xEAB
+
+#define MSR_UNC_V3_C11_PMON_BOX_CTL 0xEB0
+#define MSR_UNC_V3_C11_PMON_BOX_STATUS 0xEB7
+#define MSR_UNC_V3_C11_PMON_BOX_FILTER0 0xEB5
+#define MSR_UNC_V3_C11_PMON_BOX_FILTER1 0xEB6
+#define MSR_UNC_V3_C11_PMON_CTL0 0xEB1
+#define MSR_UNC_V3_C11_PMON_CTL1 0xEB2
+#define MSR_UNC_V3_C11_PMON_CTL2 0xEB3
+#define MSR_UNC_V3_C11_PMON_CTL3 0xEB4
+#define MSR_UNC_V3_C11_PMON_CTR0 0xEB8
+#define MSR_UNC_V3_C11_PMON_CTR1 0xEB9
+#define MSR_UNC_V3_C11_PMON_CTR2 0xEBA
+#define MSR_UNC_V3_C11_PMON_CTR3 0xEBB
+
+#define MSR_UNC_V3_C12_PMON_BOX_CTL 0xEC0
+#define MSR_UNC_V3_C12_PMON_BOX_STATUS 0xEC7
+#define MSR_UNC_V3_C12_PMON_BOX_FILTER0 0xEC5
+#define MSR_UNC_V3_C12_PMON_BOX_FILTER1 0xEC6
+#define MSR_UNC_V3_C12_PMON_CTL0 0xEC1
+#define MSR_UNC_V3_C12_PMON_CTL1 0xEC2
+#define MSR_UNC_V3_C12_PMON_CTL2 0xEC3
+#define MSR_UNC_V3_C12_PMON_CTL3 0xEC4
+#define MSR_UNC_V3_C12_PMON_CTR0 0xEC8
+#define MSR_UNC_V3_C12_PMON_CTR1 0xEC9
+#define MSR_UNC_V3_C12_PMON_CTR2 0xECA
+#define MSR_UNC_V3_C12_PMON_CTR3 0xECB
+
+#define MSR_UNC_V3_C13_PMON_BOX_CTL 0xED0
+#define MSR_UNC_V3_C13_PMON_BOX_STATUS 0xED7
+#define MSR_UNC_V3_C13_PMON_BOX_FILTER0 0xED5
+#define MSR_UNC_V3_C13_PMON_BOX_FILTER1 0xED6
+#define MSR_UNC_V3_C13_PMON_CTL0 0xED1
+#define MSR_UNC_V3_C13_PMON_CTL1 0xED2
+#define MSR_UNC_V3_C13_PMON_CTL2 0xED3
+#define MSR_UNC_V3_C13_PMON_CTL3 0xED4
+#define MSR_UNC_V3_C13_PMON_CTR0 0xED8
+#define MSR_UNC_V3_C13_PMON_CTR1 0xED9
+#define MSR_UNC_V3_C13_PMON_CTR2 0xEDA
+#define MSR_UNC_V3_C13_PMON_CTR3 0xEDB
+
+#define MSR_UNC_V3_C14_PMON_BOX_CTL 0xEE0
+#define MSR_UNC_V3_C14_PMON_BOX_STATUS 0xEE7
+#define MSR_UNC_V3_C14_PMON_BOX_FILTER0 0xEE5
+#define MSR_UNC_V3_C14_PMON_BOX_FILTER1 0xEE6
+#define MSR_UNC_V3_C14_PMON_CTL0 0xEE1
+#define MSR_UNC_V3_C14_PMON_CTL1 0xEE2
+#define MSR_UNC_V3_C14_PMON_CTL2 0xEE3
+#define MSR_UNC_V3_C14_PMON_CTL3 0xEE4
+#define MSR_UNC_V3_C14_PMON_CTR0 0xEE8
+#define MSR_UNC_V3_C14_PMON_CTR1 0xEE9
+#define MSR_UNC_V3_C14_PMON_CTR2 0xEEA
+#define MSR_UNC_V3_C14_PMON_CTR3 0xEEB
+
+#define MSR_UNC_V3_C15_PMON_BOX_CTL 0xEF0
+#define MSR_UNC_V3_C15_PMON_BOX_STATUS 0xEF7
+#define MSR_UNC_V3_C15_PMON_BOX_FILTER0 0xEF5
+#define MSR_UNC_V3_C15_PMON_BOX_FILTER1 0xEF6
+#define MSR_UNC_V3_C15_PMON_CTL0 0xEF1
+#define MSR_UNC_V3_C15_PMON_CTL1 0xEF2
+#define MSR_UNC_V3_C15_PMON_CTL2 0xEF3
+#define MSR_UNC_V3_C15_PMON_CTL3 0xEF4
+#define MSR_UNC_V3_C15_PMON_CTR0 0xEF8
+#define MSR_UNC_V3_C15_PMON_CTR1 0xEF9
+#define MSR_UNC_V3_C15_PMON_CTR2 0xEFA
+#define MSR_UNC_V3_C15_PMON_CTR3 0xEFB
+
+#define MSR_UNC_V3_C16_PMON_BOX_CTL 0xF00
+#define MSR_UNC_V3_C16_PMON_BOX_STATUS 0xF07
+#define MSR_UNC_V3_C16_PMON_BOX_FILTER0 0xF05
+#define MSR_UNC_V3_C16_PMON_BOX_FILTER1 0xF06
+#define MSR_UNC_V3_C16_PMON_CTL0 0xF01
+#define MSR_UNC_V3_C16_PMON_CTL1 0xF02
+#define MSR_UNC_V3_C16_PMON_CTL2 0xF03
+#define MSR_UNC_V3_C16_PMON_CTL3 0xF04
+#define MSR_UNC_V3_C16_PMON_CTR0 0xF08
+#define MSR_UNC_V3_C16_PMON_CTR1 0xF09
+#define MSR_UNC_V3_C16_PMON_CTR2 0xF0A
+#define MSR_UNC_V3_C16_PMON_CTR3 0xF0B
+
+#define MSR_UNC_V3_C17_PMON_BOX_CTL 0xF10
+#define MSR_UNC_V3_C17_PMON_BOX_STATUS 0xF17
+#define MSR_UNC_V3_C17_PMON_BOX_FILTER0 0xF15
+#define MSR_UNC_V3_C17_PMON_BOX_FILTER1 0xF16
+#define MSR_UNC_V3_C17_PMON_CTL0 0xF11
+#define MSR_UNC_V3_C17_PMON_CTL1 0xF12
+#define MSR_UNC_V3_C17_PMON_CTL2 0xF13
+#define MSR_UNC_V3_C17_PMON_CTL3 0xF14
+#define MSR_UNC_V3_C17_PMON_CTR0 0xF18
+#define MSR_UNC_V3_C17_PMON_CTR1 0xF19
+#define MSR_UNC_V3_C17_PMON_CTR2 0xF1A
+#define MSR_UNC_V3_C17_PMON_CTR3 0xF1B
+
+#define MSR_UNC_V3_C18_PMON_BOX_CTL 0xF20
+#define MSR_UNC_V3_C18_PMON_BOX_STATUS 0xF27
+#define MSR_UNC_V3_C18_PMON_BOX_FILTER0 0xF25
+#define MSR_UNC_V3_C18_PMON_BOX_FILTER1 0xF26
+#define MSR_UNC_V3_C18_PMON_CTL0 0xF21
+#define MSR_UNC_V3_C18_PMON_CTL1 0xF22
+#define MSR_UNC_V3_C18_PMON_CTL2 0xF23
+#define MSR_UNC_V3_C18_PMON_CTL3 0xF24
+#define MSR_UNC_V3_C18_PMON_CTR0 0xF28
+#define MSR_UNC_V3_C18_PMON_CTR1 0xF29
+#define MSR_UNC_V3_C18_PMON_CTR2 0xF2A
+#define MSR_UNC_V3_C18_PMON_CTR3 0xF2B
+
+#define MSR_UNC_V3_C19_PMON_BOX_CTL 0xF30
+#define MSR_UNC_V3_C19_PMON_BOX_STATUS 0xF37
+#define MSR_UNC_V3_C19_PMON_BOX_FILTER0 0xF35
+#define MSR_UNC_V3_C19_PMON_BOX_FILTER1 0xF36
+#define MSR_UNC_V3_C19_PMON_CTL0 0xF31
+#define MSR_UNC_V3_C19_PMON_CTL1 0xF32
+#define MSR_UNC_V3_C19_PMON_CTL2 0xF33
+#define MSR_UNC_V3_C19_PMON_CTL3 0xF34
+#define MSR_UNC_V3_C19_PMON_CTR0 0xF38
+#define MSR_UNC_V3_C19_PMON_CTR1 0xF39
+#define MSR_UNC_V3_C19_PMON_CTR2 0xF3A
+#define MSR_UNC_V3_C19_PMON_CTR3 0xF3B
+
+#define MSR_UNC_V3_C20_PMON_BOX_CTL 0xF40
+#define MSR_UNC_V3_C20_PMON_BOX_STATUS 0xF47
+#define MSR_UNC_V3_C20_PMON_BOX_FILTER0 0xF45
+#define MSR_UNC_V3_C20_PMON_BOX_FILTER1 0xF46
+#define MSR_UNC_V3_C20_PMON_CTL0 0xF41
+#define MSR_UNC_V3_C20_PMON_CTL1 0xF42
+#define MSR_UNC_V3_C20_PMON_CTL2 0xF43
+#define MSR_UNC_V3_C20_PMON_CTL3 0xF44
+#define MSR_UNC_V3_C20_PMON_CTR0 0xF48
+#define MSR_UNC_V3_C20_PMON_CTR1 0xF49
+#define MSR_UNC_V3_C20_PMON_CTR2 0xF4A
+#define MSR_UNC_V3_C20_PMON_CTR3 0xF4B
+
+#define MSR_UNC_V3_C21_PMON_BOX_CTL 0xF50
+#define MSR_UNC_V3_C21_PMON_BOX_STATUS 0xF57
+#define MSR_UNC_V3_C21_PMON_BOX_FILTER0 0xF55
+#define MSR_UNC_V3_C21_PMON_BOX_FILTER1 0xF56
+#define MSR_UNC_V3_C21_PMON_CTL0 0xF51
+#define MSR_UNC_V3_C21_PMON_CTL1 0xF52
+#define MSR_UNC_V3_C21_PMON_CTL2 0xF53
+#define MSR_UNC_V3_C21_PMON_CTL3 0xF54
+#define MSR_UNC_V3_C21_PMON_CTR0 0xF58
+#define MSR_UNC_V3_C21_PMON_CTR1 0xF59
+#define MSR_UNC_V3_C21_PMON_CTR2 0xF5A
+#define MSR_UNC_V3_C21_PMON_CTR3 0xF5B
+
+#define MSR_UNC_V3_C22_PMON_BOX_CTL 0xF60
+#define MSR_UNC_V3_C22_PMON_BOX_STATUS 0xF67
+#define MSR_UNC_V3_C22_PMON_BOX_FILTER0 0xF65
+#define MSR_UNC_V3_C22_PMON_BOX_FILTER1 0xF66
+#define MSR_UNC_V3_C22_PMON_CTL0 0xF61
+#define MSR_UNC_V3_C22_PMON_CTL1 0xF62
+#define MSR_UNC_V3_C22_PMON_CTL2 0xF63
+#define MSR_UNC_V3_C22_PMON_CTL3 0xF64
+#define MSR_UNC_V3_C22_PMON_CTR0 0xF68
+#define MSR_UNC_V3_C22_PMON_CTR1 0xF69
+#define MSR_UNC_V3_C22_PMON_CTR2 0xF6A
+#define MSR_UNC_V3_C22_PMON_CTR3 0xF6B
+
+#define MSR_UNC_V3_C23_PMON_BOX_CTL 0xF70
+#define MSR_UNC_V3_C23_PMON_BOX_STATUS 0xF77
+#define MSR_UNC_V3_C23_PMON_BOX_FILTER0 0xF75
+#define MSR_UNC_V3_C23_PMON_BOX_FILTER1 0xF76
+#define MSR_UNC_V3_C23_PMON_CTL0 0xF71
+#define MSR_UNC_V3_C23_PMON_CTL1 0xF72
+#define MSR_UNC_V3_C23_PMON_CTL2 0xF73
+#define MSR_UNC_V3_C23_PMON_CTL3 0xF74
+#define MSR_UNC_V3_C23_PMON_CTR0 0xF78
+#define MSR_UNC_V3_C23_PMON_CTR1 0xF79
+#define MSR_UNC_V3_C23_PMON_CTR2 0xF7A
+#define MSR_UNC_V3_C23_PMON_CTR3 0xF7B
+
+/* Sbox */
+#define MSR_UNC_V3_S0_PMON_BOX_CTL 0x720
+#define MSR_UNC_V3_S0_PMON_BOX_STATUS 0x725
+#define MSR_UNC_V3_S0_PMON_CTL_0 0x721
+#define MSR_UNC_V3_S0_PMON_CTL_1 0x722
+#define MSR_UNC_V3_S0_PMON_CTL_2 0x723
+#define MSR_UNC_V3_S0_PMON_CTL_3 0x724
+#define MSR_UNC_V3_S0_PMON_CTR_0 0x726
+#define MSR_UNC_V3_S0_PMON_CTR_1 0x727
+#define MSR_UNC_V3_S0_PMON_CTR_2 0x728
+#define MSR_UNC_V3_S0_PMON_CTR_3 0x729
+
+#define MSR_UNC_V3_S1_PMON_BOX_CTL 0x72A
+#define MSR_UNC_V3_S1_PMON_BOX_STATUS 0x72F
+#define MSR_UNC_V3_S1_PMON_CTL_0 0x72B
+#define MSR_UNC_V3_S1_PMON_CTL_1 0x72C
+#define MSR_UNC_V3_S1_PMON_CTL_2 0x72D
+#define MSR_UNC_V3_S1_PMON_CTL_3 0x72E
+#define MSR_UNC_V3_S1_PMON_CTR_0 0x730
+#define MSR_UNC_V3_S1_PMON_CTR_1 0x731
+#define MSR_UNC_V3_S1_PMON_CTR_2 0x732
+#define MSR_UNC_V3_S1_PMON_CTR_3 0x733
+
+#define MSR_UNC_V3_S2_PMON_BOX_CTL 0x734
+#define MSR_UNC_V3_S2_PMON_BOX_STATUS 0x739
+#define MSR_UNC_V3_S2_PMON_CTL_0 0x735
+#define MSR_UNC_V3_S2_PMON_CTL_1 0x736
+#define MSR_UNC_V3_S2_PMON_CTL_2 0x737
+#define MSR_UNC_V3_S2_PMON_CTL_3 0x738
+#define MSR_UNC_V3_S2_PMON_CTR_0 0x73A
+#define MSR_UNC_V3_S2_PMON_CTR_1 0x73B
+#define MSR_UNC_V3_S2_PMON_CTR_2 0x73C
+#define MSR_UNC_V3_S2_PMON_CTR_3 0x73D
+
+#define MSR_UNC_V3_S3_PMON_BOX_CTL 0x73E
+#define MSR_UNC_V3_S3_PMON_BOX_STATUS 0x743
+#define MSR_UNC_V3_S3_PMON_CTL_0 0x73F
+#define MSR_UNC_V3_S3_PMON_CTL_1 0x740
+#define MSR_UNC_V3_S3_PMON_CTL_2 0x741
+#define MSR_UNC_V3_S3_PMON_CTL_3 0x742
+#define MSR_UNC_V3_S3_PMON_CTR_0 0x744
+#define MSR_UNC_V3_S3_PMON_CTR_1 0x745
+#define MSR_UNC_V3_S3_PMON_CTR_2 0x746
+#define MSR_UNC_V3_S3_PMON_CTR_3 0x747
+
+/* V3 HA similar to V1/V2 */
+/* V3 iMC similar to V1/V2 */
+
+
+/* PCU (Power Control) Performance Monitoring */
+
+#define MSR_UNC_V3_PCU_PMON_CTR0 0x717
+#define MSR_UNC_V3_PCU_PMON_CTR1 0x718
+#define MSR_UNC_V3_PCU_PMON_CTR2 0x719
+#define MSR_UNC_V3_PCU_PMON_CTR3 0x71A
+#define MSR_UNC_V3_PCU_PMON_CTL0 0x711
+#define MSR_UNC_V3_PCU_PMON_CTL1 0x712
+#define MSR_UNC_V3_PCU_PMON_CTL2 0x713
+#define MSR_UNC_V3_PCU_PMON_CTL3 0x714
+#define MSR_UNC_V3_PCU_PMON_BOX_FILTER 0x715
+#define MSR_UNC_V3_PCU_PMON_BOX_CTL 0x710
+#define MSR_UNC_V3_PCU_PMON_BOX_STATUS 0x716
+#define MSR_UNC_V3_PCU_CC6_CTR 0x3FD
+#define MSR_UNC_V3_PCU_CC3_CTR 0x3FC
+#define MSR_UNC_V3_PCU_PC2_CTR 0x60D
+#define MSR_UNC_V3_PCU_PC3_CTR 0x3F8
+
+/* V3 QPI Box Performance Monitoring, mostly similar to V1/V2 */
+
+#define PCI_UNC_V3_QPI_PMON_BOX_CTL 0xF4
+#define PCI_UNC_V3_QPI_PMON_BOX_STATUS 0xF8
+#define PCI_UNC_V3_QPI_PMON_CTL_0 0xD8
+#define PCI_UNC_V3_QPI_PMON_CTL_1 0xDC
+#define PCI_UNC_V3_QPI_PMON_CTL_2 0xE0
+#define PCI_UNC_V3_QPI_PMON_CTL_3 0xE4
+#define PCI_UNC_V3_QPI_PMON_CTR_0_A 0xA4
+#define PCI_UNC_V3_QPI_PMON_CTR_1_A 0xAC
+#define PCI_UNC_V3_QPI_PMON_CTR_2_A 0xB4
+#define PCI_UNC_V3_QPI_PMON_CTR_3_A 0xBC
+#define PCI_UNC_V3_QPI_PMON_CTR_0_B 0xA0
+#define PCI_UNC_V3_QPI_PMON_CTR_1_B 0xA8
+#define PCI_UNC_V3_QPI_PMON_CTR_2_B 0xB0
+#define PCI_UNC_V3_QPI_PMON_CTR_3_B 0xB8
+#define PCI_UNC_V3_QPI_PMON_RX_MASK_0 0x238
+#define PCI_UNC_V3_QPI_PMON_RX_MASK_1 0x23C
+#define PCI_UNC_V3_QPI_PMON_RX_MATCH_0 0x228
+#define PCI_UNC_V3_QPI_PMON_RX_MATCH_1 0x22C
+#define PCI_UNC_V3_QPI_PMON_TX_MASK_0 0x210
+#define PCI_UNC_V3_QPI_PMON_TX_MASK_1 0x214
+#define PCI_UNC_V3_QPI_PMON_TX_MATCH_0 0x200
+#define PCI_UNC_V3_QPI_PMON_TX_MATCH_1 0x204
+#define PCI_UNC_V3_QPI_RATE_STATUS 0xD4
+#define PCI_UNC_V3_QPI_LINK_LLR 0xD0
+#define PCI_UNC_V3_QPI_LINK_IDLE 0xC8
+
+
+/* V3 R2PCIE Box Performance Monitoring similar to V1/V2 */
+
+/* V3 R3QPI Box Performance Monitoring similar to V1/V2 */
+
+/* ########################################################## */
/* EX type uncore */
/* U box - System Config Controller */
@@ -774,6 +1268,7 @@
/* Match/Mask MSRs */
#define MSR_B0_PMON_MATCH 0xE45
#define MSR_B0_PMON_MASK 0xE46
+#define MSR_S0_PMON_MM_CFG 0xE49
#define MSR_S0_PMON_MATCH 0xE49
#define MSR_S0_PMON_MASK 0xE4A
#define MSR_B1_PMON_MATCH 0xE4D
@@ -781,6 +1276,7 @@
#define MSR_M0_PMON_MM_CONFIG 0xE54
#define MSR_M0_PMON_ADDR_MATCH 0xE55
#define MSR_M0_PMON_ADDR_MASK 0xE56
+#define MSR_S1_PMON_MM_CFG 0xE58
#define MSR_S1_PMON_MATCH 0xE59
#define MSR_S1_PMON_MASK 0xE5A
#define MSR_M1_PMON_MM_CONFIG 0xE5C
@@ -803,6 +1299,11 @@
#define MSR_DRAM_ENERGY_STATUS 0x619
#define MSR_DRAM_PERF_STATUS 0x61B
#define MSR_DRAM_POWER_INFO 0x61C
+#define MSR_PLATFORM_ENERGY_STATUS 0x64D
+#define MSR_PLATFORM_POWER_LIMIT 0x65C
+
+/* Intel Silvermont's RAPL registers */
+#define MSR_PKG_POWER_INFO_SILVERMONT 0x66E
/* TM/TM2 interface */
#define IA32_THERM_STATUS 0x19C
@@ -811,12 +1312,22 @@
/* Turbo Boost Interface */
#define MSR_IA32_MISC_ENABLE 0x1A0
+#define MSR_PREFETCH_ENABLE 0x1A4
#define MSR_PLATFORM_INFO 0x0CE
#define MSR_TURBO_POWER_CURRENT_LIMIT 0x1AC
#define MSR_TURBO_RATIO_LIMIT 0x1AD
+#define MSR_TURBO_RATIO_LIMIT1 0x1AE
+#define MSR_TURBO_RATIO_LIMIT2 0x1AF
+#define MSR_TURBO_RATIO_LIMIT3 0x1AC
-/* Intel Silvermont's RAPL registers */
-#define MSR_PKG_POWER_INFO_SILVERMONT 0x66E
+/* MISC Intel register */
+#define MSR_MPERF 0xE7
+#define MSR_APERF 0xE8
+#define MSR_PPERF 0x64E
+#define MSR_WEIGHTED_CORE_C0 0x658
+#define MSR_ANY_CORE_C0 0x659
+#define MSR_ANY_GFXE_C0 0x65A
+#define MSR_CORE_GFXE_OVERLAP_C0 0x65B
/*
* AMD
*/
@@ -866,6 +1377,15 @@
#define MSR_AMD16_PMC2 0xC0010006
#define MSR_AMD16_PMC3 0xC0010007
+#define MSR_AMD16_L2_PERFEVTSEL0 0xC0010230
+#define MSR_AMD16_L2_PERFEVTSEL1 0xC0010232
+#define MSR_AMD16_L2_PERFEVTSEL2 0xC0010234
+#define MSR_AMD16_L2_PERFEVTSEL3 0xC0010236
+#define MSR_AMD16_L2_PMC0 0xC0010231
+#define MSR_AMD16_L2_PMC1 0xC0010233
+#define MSR_AMD16_L2_PMC2 0xC0010235
+#define MSR_AMD16_L2_PMC3 0xC0010237
+
#define MSR_AMD16_NB_PERFEVTSEL0 0xC0010240
#define MSR_AMD16_NB_PERFEVTSEL1 0xC0010242
#define MSR_AMD16_NB_PERFEVTSEL2 0xC0010244
diff --git a/src/includes/registers_types.h b/src/includes/registers_types.h
new file mode 100644
index 0000000..e588e3e
--- /dev/null
+++ b/src/includes/registers_types.h
@@ -0,0 +1,209 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: registers_types.h
+ *
+ * Description: Header File of registers.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef REGISTERS_TYPES_H
+#define REGISTERS_TYPES_H
+
+#include <pci_types.h>
+
+typedef enum {
+ PMC0 = 0,
+ PMC1, PMC2, PMC3, PMC4, PMC5, PMC6,
+ PMC7, PMC8, PMC9, PMC10, PMC11, PMC12,
+ PMC13, PMC14, PMC15, PMC16, PMC17, PMC18,
+ PMC19, PMC20, PMC21, PMC22, PMC23, PMC24,
+ PMC25, PMC26, PMC27, PMC28, PMC29, PMC30,
+ PMC31, PMC32, PMC33, PMC34, PMC35, PMC36,
+ PMC37, PMC38, PMC39, PMC40, PMC41, PMC42,
+ PMC43, PMC44, PMC45, PMC46, PMC47, PMC48,
+ PMC49, PMC50, PMC51, PMC52, PMC53, PMC54,
+ PMC55, PMC56, PMC57, PMC58, PMC59, PMC60,
+ PMC61, PMC62, PMC63, PMC64, PMC65, PMC66,
+ PMC67, PMC68, PMC69, PMC70, PMC71, PMC72,
+ PMC73, PMC74, PMC75, PMC76, PMC77, PMC78,
+ PMC79, PMC80, PMC81, PMC82, PMC83, PMC84,
+ PMC85, PMC86, PMC87, PMC88, PMC89, PMC90,
+ PMC91, PMC92, PMC93, PMC94, PMC95, PMC96,
+ PMC97, PMC98, PMC99, PMC100, PMC101, PMC102,
+ PMC103, PMC104, PMC105, PMC106, PMC107, PMC108,
+ PMC109, PMC110, PMC111, PMC112, PMC113, PMC114,
+ PMC115, PMC116, PMC117, PMC118, PMC119, PMC120,
+ PMC121, PMC122, PMC123, PMC124, PMC125, PMC126,
+ PMC127, PMC128, PMC129, PMC130, PMC131, PMC132,
+ PMC133, PMC134, PMC135, PMC136, PMC137, PMC138,
+ PMC139, PMC140, PMC141, PMC142, PMC143, PMC144,
+ PMC145, PMC146, PMC147, PMC148, PMC149, PMC150,
+ PMC151, PMC152, PMC153, PMC154, PMC155, PMC156,
+ PMC157, PMC158, PMC159, PMC160, PMC161, PMC162,
+ PMC163, PMC164, PMC165, PMC166, PMC167, PMC168,
+ PMC169, PMC170, PMC171, PMC172, PMC173, PMC174,
+ PMC175, PMC176, PMC177, PMC178, PMC179, PMC180,
+ PMC181, PMC182, PMC183, PMC184, PMC185, PMC186,
+ PMC187, PMC188, PMC189, PMC190, PMC191, PMC192,
+ PMC193, PMC194, PMC195, PMC196, PMC197, PMC198,
+ PMC199, PMC200, PMC201, PMC202, PMC203, PMC204,
+ PMC205, PMC206, PMC207, PMC208, PMC209, PMC210,
+ PMC211, PMC212, PMC213, PMC214, PMC215, PMC216,
+ NUM_PMC
+} RegisterIndex;
+
+typedef enum {
+ PMC = 0, FIXED, THERMAL,
+ POWER, UNCORE, MBOX0,
+ MBOX1, MBOX2, MBOX3,
+ MBOX4, MBOX5, MBOX6, MBOX7,
+ MBOX0FIX, MBOX1FIX, MBOX2FIX,
+ MBOX3FIX, MBOX4FIX, MBOX5FIX,
+ MBOX6FIX, MBOX7FIX,
+ BBOX0, BBOX1,
+ RBOX0, RBOX1, RBOX2,
+ WBOX,
+ WBOX0FIX, WBOX1FIX,
+ SBOX0, SBOX1, SBOX2, SBOX3,
+ SBOX0FIX, SBOX1FIX, SBOX2FIX, SBOX3FIX,
+ CBOX0, CBOX1, CBOX2,
+ CBOX3, CBOX4, CBOX5,
+ CBOX6, CBOX7, CBOX8,
+ CBOX9, CBOX10, CBOX11,
+ CBOX12, CBOX13, CBOX14,
+ CBOX15, CBOX16, CBOX17,
+ CBOX18, CBOX19, CBOX20,
+ CBOX21, CBOX22, CBOX23,
+ PBOX,
+ UBOX,
+ UBOXFIX,
+ IBOX0, IBOX1,
+ QBOX0, QBOX1, QBOX2,
+ QBOX0FIX, QBOX1FIX, QBOX2FIX,
+ NUM_UNITS, NOTYPE, MAX_UNITS
+} RegisterType;
+
+static char* RegisterTypeNames[MAX_UNITS] = {
+ [PMC] = "Core-local general purpose counters",
+ [FIXED] = "Fixed counters",
+ [THERMAL] = "Thermal",
+ [POWER] = "Energy/Power counters (RAPL)",
+ [UNCORE] = "Socket-local general/fixed purpose counters",
+ [MBOX0] = "Memory Controller 0 Channel 0",
+ [MBOX1] = "Memory Controller 0 Channel 1",
+ [MBOX2] = "Memory Controller 0 Channel 2",
+ [MBOX3] = "Memory Controller 0 Channel 3",
+ [MBOX4] = "Memory Controller 1 Channel 0",
+ [MBOX5] = "Memory Controller 1 Channel 1",
+ [MBOX6] = "Memory Controller 1 Channel 2",
+ [MBOX7] = "Memory Controller 1 Channel 3",
+ [MBOX0FIX] = "Memory Controller 0 Channel 0 Fixed Counter",
+ [MBOX1FIX] = "Memory Controller 0 Channel 1 Fixed Counter",
+ [MBOX2FIX] = "Memory Controller 0 Channel 2 Fixed Counter",
+ [MBOX3FIX] = "Memory Controller 0 Channel 3 Fixed Counter",
+ [MBOX4FIX] = "Memory Controller 1 Channel 0 Fixed Counter",
+ [MBOX5FIX] = "Memory Controller 1 Channel 1 Fixed Counter",
+ [MBOX6FIX] = "Memory Controller 1 Channel 2 Fixed Counter",
+ [MBOX7FIX] = "Memory Controller 1 Channel 3 Fixed Counter",
+ [BBOX0] = "Home Agent box 0",
+ [BBOX1] = "Home Agent box 1",
+ [RBOX0] = "Routing box 0",
+ [RBOX1] = "Routing box 1",
+ [RBOX2] = "Routing box 2",
+ [WBOX] = "Power control box",
+ [WBOX0FIX] = "Power control box fixed counter 0",
+ [WBOX1FIX] = "Power control box fixed counter 1",
+ [SBOX0] = "QPI Link Layer box 0",
+ [SBOX1] = "QPI Link Layer box 1",
+ [SBOX2] = "QPI Link Layer box 2",
+ [SBOX3] = "QPI Link Layer box 3",
+ [SBOX0FIX] = "QPI Link Layer box fixed 0",
+ [SBOX1FIX] = "QPI Link Layer box fixed 1",
+ [SBOX2FIX] = "QPI Link Layer box fixed 2",
+ [SBOX3FIX] = "QPI Link Layer box fixed 3",
+ [CBOX0] = "Caching Agent box 0",
+ [CBOX1] = "Caching Agent box 1",
+ [CBOX2] = "Caching Agent box 2",
+ [CBOX3] = "Caching Agent box 3",
+ [CBOX4] = "Caching Agent box 4",
+ [CBOX5] = "Caching Agent box 5",
+ [CBOX6] = "Caching Agent box 6",
+ [CBOX7] = "Caching Agent box 7",
+ [CBOX8] = "Caching Agent box 8",
+ [CBOX9] = "Caching Agent box 9",
+ [CBOX10] = "Caching Agent box 10",
+ [CBOX11] = "Caching Agent box 11",
+ [CBOX12] = "Caching Agent box 12",
+ [CBOX13] = "Caching Agent box 13",
+ [CBOX14] = "Caching Agent box 14",
+ [CBOX15] = "Caching Agent box 15",
+ [CBOX16] = "Caching Agent box 16",
+ [CBOX17] = "Caching Agent box 17",
+ [CBOX18] = "Caching Agent box 18",
+ [CBOX19] = "Caching Agent box 19",
+ [CBOX20] = "Caching Agent box 20",
+ [CBOX21] = "Caching Agent box 21",
+ [CBOX22] = "Caching Agent box 22",
+ [CBOX23] = "Caching Agent box 23",
+ [PBOX] = "Physical Layer box",
+ [UBOX] = "System Configuration box",
+ [UBOXFIX] = "System Configuration box fixed counter",
+ [IBOX0] = "Coherency Maintainer for IIO traffic",
+ [IBOX1] = "Coherency Maintainer for IIO traffic",
+ [QBOX0] = "QPI Link Layer 0",
+ [QBOX1] = "QPI Link Layer 1",
+ [QBOX0FIX] = "QPI Link Layer rate status 0",
+ [QBOX1FIX] = "QPI Link Layer rate status 1",
+ [NUM_UNITS] = "Maximally usable register types",
+ [NOTYPE] = "No Type, used for skipping unavailable counters"
+};
+
+#define REG_TYPE_MASK(type) (type < NUM_UNITS ? (0x1ULL<<type) : 0x0ULL)
+
+typedef struct {
+ char* key;
+ RegisterIndex index;
+ RegisterType type;
+ uint64_t configRegister;
+ uint64_t counterRegister;
+ uint64_t counterRegister2;
+ PciDeviceIndex device;
+ uint64_t optionMask;
+} RegisterMap;
+
+typedef struct {
+ uint32_t ctrlRegister;
+ uint32_t statusRegister;
+ uint32_t ovflRegister;
+ int ovflOffset;
+ uint8_t isPci;
+ PciDeviceIndex device;
+ uint32_t regWidth;
+ uint32_t filterRegister1;
+ uint32_t filterRegister2;
+} BoxMap;
+
+#endif
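
Note: the RegisterMap/BoxMap structures and the REG_TYPE_MASK macro added here carry the per-architecture counter maps. An illustrative sketch, not part of the patch, of how an entry and a type mask fit together; register addresses come from registers.h, and MSR_DEV is assumed to be the MSR device enumerator from pci_types.h:

    /* Illustrative sketch, not part of the patch: one counter-map entry in
     * the shape RegisterMap prescribes, plus a REG_TYPE_MASK capability test. */
    static RegisterMap example_counter = {
        .key = "PMC0",
        .index = PMC0,
        .type = PMC,
        .configRegister = MSR_PERFEVTSEL0,
        .counterRegister = MSR_PMC0,
        .counterRegister2 = 0x0,
        .device = MSR_DEV,          /* assumed MSR device index from pci_types.h */
        .optionMask = 0x0ULL,
    };

    static int unit_supported(uint64_t mask, RegisterType t)
    {
        /* Types >= NUM_UNITS (e.g. NOTYPE) map to an empty mask by design. */
        return (mask & REG_TYPE_MASK(t)) != 0ULL;
    }
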
diff --git a/src/includes/strUtil.h b/src/includes/strUtil.h
deleted file mode 100644
index 18236b6..0000000
--- a/src/includes/strUtil.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: strUtil.h
- *
- * Description: Header File strUtil Module.
- * Helper routines for bstrlib and command line parsing
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef STRUTIL_H
-#define STRUTIL_H
-
-#include <bstrlib.h>
-#include <types.h>
-#include <time.h>
-
-#define CHECK_OPTION_STRING \
-if (! (argString = bSecureInput(400,optarg))) { \
- ERROR_PLAIN_PRINT(Failed to read argument string!); \
-}
-
-extern int str2int(const char* str);
-extern uint32_t bstr_to_cpuset_physical(uint32_t* threads, const_bstring q);
-extern int bstr_to_cpuset(int* threads, const_bstring str);
-extern void bstr_to_eventset(StrUtilEventSet* set, const_bstring str);
-extern bstring bSecureInput (int maxlen, char* vgcCtx);
-extern int bJustifyCenter (bstring b, int width);
-extern void bstr_to_workgroup(Workgroup* threads, const_bstring str, DataType type, int numberOfStreams);
-extern FILE* bstr_to_outstream(const_bstring argString, bstring filter);
-extern uint64_t bstr_to_doubleSize(const_bstring str, DataType type);
-extern void bstr_to_interval(const_bstring str, struct timespec* interval);
-
-#endif /*STRUTIL_H*/
diff --git a/src/includes/strUtil_types.h b/src/includes/strUtil_types.h
deleted file mode 100644
index 25766ff..0000000
--- a/src/includes/strUtil_types.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: strUtil_types.h
- *
- * Description: Types file for strUtil module.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef STRUTIL_TYPES_H
-#define STRUTIL_TYPES_H
-
-#include <bstrlib.h>
-
-
-typedef struct {
- bstring eventName;
- bstring counterName;
-} StrUtilEvent;
-
-typedef struct {
- StrUtilEvent* events;
- int numberOfEvents;
-} StrUtilEventSet;
-
-typedef struct {
- bstring domain;
- int offset;
- void* ptr;
-} Stream;
-
-typedef struct {
- uint32_t numberOfThreads;
- int* processorIds;
- uint64_t size;
- Stream* streams;
-} Workgroup;
-
-
-#endif /*STRUTIL_TYPES_H*/
diff --git a/src/includes/test_types.h b/src/includes/test_types.h
deleted file mode 100644
index 45c0932..0000000
--- a/src/includes/test_types.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: test_types.h
- *
- * Description: Type definitions for benchmarking framework
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef TEST_TYPES_H
-#define TEST_TYPES_H
-
-#include <stdint.h>
-#include <bstrlib.h>
-
-typedef void (*FuncPrototype)();
-
-typedef enum {
- SINGLE = 0,
- DOUBLE,
- SINGLE_RAND,
- DOUBLE_RAND
-} DataType;
-
-typedef enum {
- STREAM_1 = 1,
- STREAM_2,
- STREAM_3,
- STREAM_4,
- STREAM_5,
- STREAM_6,
- STREAM_7,
- STREAM_8,
- STREAM_9,
- STREAM_10,
- STREAM_11,
- STREAM_12,
- STREAM_13,
- STREAM_14,
- STREAM_15,
- STREAM_16,
- STREAM_17,
- STREAM_18,
- STREAM_19,
- STREAM_20,
- STREAM_21,
- STREAM_22,
- STREAM_23,
- STREAM_24,
- STREAM_25,
- STREAM_26,
- STREAM_27,
- STREAM_28,
- STREAM_29,
- STREAM_30,
- STREAM_31,
- STREAM_32,
- STREAM_33,
- STREAM_34,
- STREAM_35,
- STREAM_36,
- STREAM_37,
- STREAM_38,
- MAX_STREAMS
-} Pattern;
-
-typedef struct {
- char* name;
- Pattern streams;
- DataType type ;
- int stride;
- FuncPrototype kernel;
- double flops;
- int bytes;
-} TestCase;
-
-typedef struct {
- uint64_t size;
- uint32_t iter;
- const TestCase* test;
- uint64_t cycles;
- uint32_t numberOfThreads;
- int* processors;
- void** streams;
-} ThreadUserData;
-
-#endif /*TEST_TYPES_H*/
diff --git a/src/includes/textcolor.h b/src/includes/textcolor.h
index 4c1b7b1..d0a3e10 100644
--- a/src/includes/textcolor.h
+++ b/src/includes/textcolor.h
@@ -7,13 +7,13 @@
* Allows toggling of terminal escape sequences for
* colored text.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/thermal.h b/src/includes/thermal.h
index 3153386..ac37261 100644
--- a/src/includes/thermal.h
+++ b/src/includes/thermal.h
@@ -6,13 +6,13 @@
* Description: Header File Thermal Module.
* Implements Intel TM/TM2 Interface.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -34,20 +34,43 @@
#include <types.h>
#include <registers.h>
#include <bitUtil.h>
-#include <msr.h>
+#include <error.h>
+#include <access.h>
-extern ThermalInfo thermal_info;
-extern void thermal_init(int cpuId);
-static inline uint32_t thermal_read(int cpuId);
-static uint32_t
-thermal_read(int cpuId)
+int
+thermal_read(int cpuId, uint32_t *data)
{
- uint32_t readout = extractBitField(msr_read(cpuId, IA32_THERM_STATUS),7,16);
- return (readout == 0 ?
- thermal_info.activationT - thermal_info.offset :
- (thermal_info.activationT-thermal_info.offset) - readout );
+ uint64_t result = 0;
+ uint32_t readout = 0;
+ if (HPMread(cpuId, MSR_DEV, IA32_THERM_STATUS, &result))
+ {
+ *data = 0;
+ return -EIO;
+ }
+ readout = extractBitField(result,7,16);
+ *data = (readout == 0 ?
+ thermal_info.activationT - thermal_info.offset :
+ (thermal_info.activationT - thermal_info.offset) - readout );
+ return 0;
+}
+
+int
+thermal_tread(int socket_fd, int cpuId, uint32_t *data)
+{
+ uint64_t result = 0;
+ uint32_t readout = 0;
+ if (HPMread(cpuId, MSR_DEV, IA32_THERM_STATUS, &result))
+ {
+ *data = 0;
+ return -EIO;
+ }
+ readout = extractBitField(result,7,16);
+ *data = (readout == 0 ?
+ thermal_info.activationT - thermal_info.offset :
+ (thermal_info.activationT - thermal_info.offset) - readout );
+ return 0;
}
#endif /*THERMAL_H*/
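
Note: thermal_read() above now reports the temperature directly as the activation threshold minus offset minus the digital readout, through the HPM access layer. A caller-side sketch, illustrative only, assuming thermal_info has already been filled by the usual thermal initialization:

    #include <stdio.h>

    /* Illustrative sketch, not part of the patch: print the current
     * temperature of CPU 0 in degrees Celsius via thermal_read() above. */
    static void print_cpu0_temperature(void)
    {
        uint32_t degreesC = 0;
        if (thermal_read(0, &degreesC) == 0)
            printf("CPU 0 temperature: %u C\n", degreesC);
        else
            fprintf(stderr, "reading IA32_THERM_STATUS failed\n");
    }
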
diff --git a/src/includes/thermal_types.h b/src/includes/thermal_types.h
index a619180..feb17fa 100644
--- a/src/includes/thermal_types.h
+++ b/src/includes/thermal_types.h
@@ -5,13 +5,13 @@
*
* Description: Types file for thermal module.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -33,7 +33,9 @@
#include <stdint.h>
-
+/** \addtogroup ThermalMon
+ * @{
+ */
typedef struct {
uint16_t highT;
uint32_t resolution;
@@ -41,5 +43,11 @@ typedef struct {
uint32_t offset;
} ThermalInfo;
+/** \brief Pointer for exporting the ThermalInfo data structure */
+typedef ThermalInfo* ThermalInfo_t;
+/** @}*/
+
+extern ThermalInfo thermal_info;
+
#endif /*THERMAL_TYPES_H*/
diff --git a/src/includes/threads.h b/src/includes/threads.h
deleted file mode 100644
index 6e00191..0000000
--- a/src/includes/threads.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: threads.h
- *
- * Description: Header file of pthread interface module
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef THREADS_H
-#define THREADS_H
-
-#include <types.h>
-#include <pthread.h>
-#include <threads_types.h>
-#include <stdio.h>
-
-#define THREADS_BARRIER pthread_barrier_wait(&threads_barrier)
-
-extern pthread_barrier_t threads_barrier;
-extern ThreadData* threads_data;
-extern ThreadGroup* threads_groups;
-
-
-/**
- * @brief Initialization of the thread module
- * @param numberOfThreads The total number of threads
- */
-extern void threads_init(FILE* OUTSTREAM, int numberOfThreads);
-
-/**
- * @brief Create all threads
- * @param startRoutine thread entry function pointer
- */
-extern void threads_create(void *(*startRoutine)(void*));
-
-/**
- * @brief Register User thread data for all threads
- * @param data Reference to the user data structo
- * @param func Optional function pointer to copy data
- */
-extern void threads_registerDataAll(
- ThreadUserData* data,
- threads_copyDataFunc func);
-
-/**
- * @brief Register User thread data for one thread
- * @param threadId thread Id
- * @param data Reference to the user data structo
- * @param func Optional function pointer to copy data
- */
-extern void threads_registerDataThread(
- int threadId,
- ThreadUserData* data,
- threads_copyDataFunc func);
-
-/**
- * @brief Register User thread data for a thread group
- * @param groupId group Id
- * @param data Reference to the user data structo
- * @param func Optional function pointer to copy data
- */
-extern void threads_registerDataGroup(
- int groupId,
- ThreadUserData* data,
- threads_copyDataFunc func);
-
-/**
- * @brief Join the threads and free pthread related data structures
- * @param
- */
-extern void threads_join(void);
-
-/**
- * @brief Free memory of thread data structures
- * @param numberOfGroups The number of groups to destroy
- */
-extern void threads_destroy(int numberOfGroups);
-
-/**
- * @brief Create Thread groups
- * @param numberOfGroups The number of groups to create
- */
-extern void threads_createGroups(int numberOfGroups);
-
-#endif /* THREADS_H */
diff --git a/src/includes/threads_types.h b/src/includes/threads_types.h
deleted file mode 100644
index dfa13f3..0000000
--- a/src/includes/threads_types.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: threads_types.h
- *
- * Description: Types file for threads module.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef THREADS_TYPES_H
-#define THREADS_TYPES_H
-
-#include <stdio.h>
-#include <stdint.h>
-
-typedef struct {
- int globalNumberOfThreads;
- int numberOfThreads;
- int globalThreadId;
- int threadId;
- int numberOfGroups;
- int groupId;
- double time;
- uint64_t cycles;
- FILE* output;
- ThreadUserData data;
-} ThreadData;
-
-typedef struct {
- int numberOfThreads;
- int* threadIds;
-} ThreadGroup;
-
-typedef void (*threads_copyDataFunc)(ThreadUserData* src,ThreadUserData* dst);
-
-#endif /*THREADS_TYPES_H*/
diff --git a/src/includes/timer.h b/src/includes/timer.h
index b97f4ac..a7ea870 100644
--- a/src/includes/timer.h
+++ b/src/includes/timer.h
@@ -10,13 +10,13 @@
* with rdtsc of 100 cycles in the worst case. Therefore sensible
* measurements should be over 1000 cycles.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -37,80 +37,20 @@
#include <types.h>
-#define RDTSC(cpu_c) \
- __asm__ volatile("xor %%eax,%%eax\n\t" \
- "cpuid\n\t" \
- "rdtsc\n\t" \
- "movl %%eax, %0\n\t" \
- "movl %%edx, %1\n\t" \
- : "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \
- : : "%eax","%ebx","%ecx","%edx")
-
-#define RDTSC_CR(cpu_c) \
- __asm__ volatile("rdtsc\n\t" \
- "movl %%eax, %0\n\t" \
- "movl %%edx, %1\n\t" \
- : "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \
- : : "%eax","%ebx","%ecx","%edx")
-
-#define RDTSCP(cpu_c) \
- __asm__ volatile("rdtscp\n\t" \
- "movl %%eax, %0\n\t" \
- "movl %%edx, %1\n\t" \
- "cpuid\n\t" \
- : "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \
- : : "%eax","%ebx","%ecx","%edx")
-
-#ifdef HAS_RDTSCP
-#define RDTSC_STOP(cpu_c) RDTSCP(cpu_c);
-#else
-#define RDTSC_STOP(cpu_c) RDTSC_CR(cpu_c);
-#endif
-
extern void timer_init( void );
extern double timer_print( TimerData* );
extern uint64_t timer_printCycles( TimerData* );
extern uint64_t timer_getCpuClock( void );
+extern uint64_t timer_getCpuClockCurrent( int cpu_id );
+extern uint64_t timer_getCycleClock( void );
extern uint64_t timer_getBaseline( void );
-static inline void timer_start( TimerData* );
-static inline void timer_stop ( TimerData* );
-
-void timer_start( TimerData* time )
-{
-#ifdef __x86_64
- RDTSC(time->start);
-#endif
-#ifdef _ARCH_PPC
- uint32_t tbl, tbu0, tbu1;
-
- do {
- __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
- __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
- __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
- } while (tbu0 != tbu1);
+extern void timer_start( TimerData* );
+extern void timer_stop ( TimerData* );
- time->start.int64 = (((uint64_t)tbu0) << 32) | tbl;
-#endif
-}
-void timer_stop( TimerData* time )
-{
-#ifdef __x86_64
- RDTSC_STOP(time->stop)
-#endif
-#ifdef _ARCH_PPC
- uint32_t tbl, tbu0, tbu1;
- do {
- __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
- __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
- __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
- } while (tbu0 != tbu1);
- time->stop.int64 = (((uint64_t)tbu0) << 32) | tbl;
-#endif
-}
#endif /* TIMER_H */
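
Note: with the inline RDTSC/RDTSCP macros removed, timing now goes exclusively through the extern functions declared above. A minimal usage sketch, illustrative only; TimerData is defined in timer_types.h:

    #include <stdio.h>

    /* Illustrative sketch, not part of the patch: time a code region with
     * the extern timer interface declared above. */
    static void time_region(void)
    {
        TimerData t;
        timer_init();                /* calibrate clock and baseline once */
        timer_start(&t);
        /* ... code to be measured ... */
        timer_stop(&t);
        printf("%lu cycles, %f s\n",
               (unsigned long)timer_printCycles(&t), timer_print(&t));
    }
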
diff --git a/src/includes/timer_types.h b/src/includes/timer_types.h
index 265d5c9..2dac362 100644
--- a/src/includes/timer_types.h
+++ b/src/includes/timer_types.h
@@ -5,13 +5,13 @@
*
* Description: Types file for timer module.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/tlb-info.h b/src/includes/tlb-info.h
new file mode 100644
index 0000000..1f322c9
--- /dev/null
+++ b/src/includes/tlb-info.h
@@ -0,0 +1,89 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: tlb-info.h
+ *
+ * Description: Header File of topology module that contains the TLB
+ * describing strings. Not used currently.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+
+#ifndef TLB_INFO_H
+#define TLB_INFO_H
+
+static char* intel_tlb_info[256] = {
+ [0] = NULL,
+ [1] = "Instruction TLB: 4 KByte pages, 4-way set associative, 32 entries",
+ [2] = "Instruction TLB: 4 MByte pages, fully associative, 2 entries",
+ [3] = "Data TLB: 4 KByte pages, 4-way set associative, 64 entries",
+ [4] = "Data TLB: 4 MByte pages, 4-way set associative, 8 entries",
+ [5] = "Data TLB1: 4 MByte pages, 4-way set associative, 32 entries",
+ [6 ... 10] = NULL,
+ [11] = "Instruction TLB: 4 MByte pages, 4-way set associative, 4 entries",
+ [12 ... 78] = NULL,
+ [79] = "Instruction TLB: 4 KByte pages, 32 entries",
+ [80] = "Instruction TLB: 4 KByte and 2-MByte or 4-MByte pages, 64 entries",
+ [81] = "Instruction TLB: 4 KByte and 2-MByte or 4-MByte pages, 128 entries",
+ [82] = "Instruction TLB: 4 KByte and 2-MByte or 4-MByte pages, 256 entries",
+ [83 ... 84] = NULL,
+ [85] = "Instruction TLB: 2-MByte or 4-MByte pages, fully associative, 7 entries",
+ [86] = "Data TLB0: 4 MByte pages, 4-way set associative, 16 entries",
+ [87] = "Data TLB0: 4 KByte pages, 4-way associative, 16 entries",
+ [88] = NULL,
+ [89] = "Data TLB0: 4 KByte pages, fully associative, 16 entries",
+ [90] = "Data TLB0: 2-MByte or 4 MByte pages, 4-way set associative, 32 entries",
+ [91] = "Data TLB: 4 KByte and 4 MByte pages, 64 entries",
+ [92] = "Data TLB: 4 KByte and 4 MByte pages,128 entries",
+ [93] = "Data TLB: 4 KByte and 4 MByte pages,256 entries",
+ [94 ... 96] = NULL,
+ [97] = "Instruction TLB: 4 KByte pages, fully associative, 48 entries",
+ [98] = NULL,
+ [99] = "Data TLB: 1 GByte pages, 4-way set associative, 4 entries",
+ [100 ... 117] = NULL,
+ [118] = "Instruction TLB: 2M/4M pages, fully associative, 8 entries",
+ [119 ... 159] = NULL,
+ [160] = "DTLB: 4k pages, fully associative, 32 entries",
+ [161 ... 175] = NULL,
+ [176] = "Instruction TLB: 4 KByte pages, 4-way set associative, 128 entries",
+ [177] = "Instruction TLB: 2M pages, 4-way, 8 entries or 4M pages, 4-way, 4 entries",
+ [178] = "Instruction TLB: 4KByte pages, 4-way set associative, 64 entries",
+ [179] = "Data TLB: 4 KByte pages, 4-way set associative, 128 entries",
+ [180] = "Data TLB1: 4 KByte pages, 4-way associative, 256 entries",
+ [181] = "Instruction TLB: 4KByte pages, 8-way set associative, 64 entries",
+ [182] = "Instruction TLB: 4KByte pages, 8-way set associative, 128 entries",
+ [183 ... 185] = NULL,
+ [186] = "Data TLB1: 4 KByte pages, 4-way associative, 64 entries",
+ [187 ... 191] = NULL,
+ [192] = "Data TLB: 4 KByte and 4 MByte pages, 4-way associative, 8 entries",
+ [193] = "Shared 2nd-Level TLB: 4 KByte/2MByte pages, 8-way associative, 1024 entries",
+ [194] = "DTLB: 4 KByte/2 MByte pages, 4-way associative, 16 entries",
+ [195 ... 201] = NULL,
+ [202] = "Shared 2nd-Level TLB: 4 KByte pages, 4-way associative, 512 entries",
+ [203 ... 239] = NULL,
+ [240] = "64-Byte prefetching",
+ [241] = "128-Byte prefetching",
+ [242 ... 255] = NULL
+};
+#endif
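
Note: intel_tlb_info maps the one-byte descriptors returned by CPUID leaf 0x02 to readable strings; as the header comment says, it is not wired up yet. An illustrative lookup, not part of the patch, for a descriptor byte obtained elsewhere:

    #include <stdint.h>

    /* Illustrative sketch, not part of the patch: translate a raw CPUID
     * leaf 0x02 descriptor byte using the intel_tlb_info table above. */
    static const char* tlb_describe(uint8_t descriptor)
    {
        const char* text = intel_tlb_info[descriptor];
        return text ? text : "reserved/unknown descriptor";
    }
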
diff --git a/src/includes/topology.h b/src/includes/topology.h
new file mode 100644
index 0000000..77129fb
--- /dev/null
+++ b/src/includes/topology.h
@@ -0,0 +1,144 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: topology.h
+ *
+ * Description: Header File of topology module.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef LIKWID_TOPOLOGY
+#define LIKWID_TOPOLOGY
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <topology_cpuid.h>
+#include <topology_proc.h>
+#ifdef LIKWID_USE_HWLOC
+#include <topology_hwloc.h>
+#endif
+#include <types.h>
+#include <tree.h>
+
+
+#define MAX_FEATURE_STRING_LENGTH 512
+#define MAX_MODEL_STRING_LENGTH 512
+
+extern int affinity_thread2tile_lookup[MAX_NUM_THREADS];
+
+struct topology_functions {
+ void (*init_cpuInfo) (cpu_set_t cpuSet);
+ void (*init_cpuFeatures) (void);
+ void (*init_nodeTopology) (cpu_set_t cpuSet);
+ void (*init_cacheTopology) (void);
+ void (*init_fileTopology) (FILE*);
+ void (*close_topology) (void);
+};
+
+/* Intel P6 */
+#define PENTIUM_M_BANIAS 0x09U
+#define PENTIUM_M_DOTHAN 0x0DU
+#define CORE_DUO 0x0EU
+#define CORE2_65 0x0FU
+#define CORE2_45 0x17U
+#define ATOM 0x1CU
+#define ATOM_45 0x26U
+#define ATOM_32 0x36U
+#define ATOM_22 0x27U
+#define ATOM_SILVERMONT_E 0x37U
+#define ATOM_SILVERMONT_C 0x4DU
+#define ATOM_SILVERMONT_Z1 0x4AU
+#define ATOM_SILVERMONT_Z2 0x5AU
+#define ATOM_SILVERMONT_F 0x5DU
+#define ATOM_SILVERMONT_AIR 0x4CU
+#define ATOM_SILVERMONT_GOLD 0x5CU
+#define NEHALEM 0x1AU
+#define NEHALEM_BLOOMFIELD 0x1AU
+#define NEHALEM_LYNNFIELD 0x1EU
+#define NEHALEM_LYNNFIELD_M 0x1FU
+#define NEHALEM_WESTMERE 0x2CU
+#define NEHALEM_WESTMERE_M 0x25U
+#define SANDYBRIDGE 0x2AU
+#define SANDYBRIDGE_EP 0x2DU
+#define HASWELL 0x3CU
+#define HASWELL_EP 0x3FU
+#define HASWELL_M1 0x45U
+#define HASWELL_M2 0x46U
+#define IVYBRIDGE 0x3AU
+#define IVYBRIDGE_EP 0x3EU
+#define NEHALEM_EX 0x2EU
+#define WESTMERE_EX 0x2FU
+#define XEON_MP 0x1DU
+#define BROADWELL 0x3DU
+#define BROADWELL_E 0x4FU
+#define BROADWELL_D 0x56U
+#define SKYLAKE1 0x4EU
+#define SKYLAKE2 0x5EU
+
+/* Intel MIC */
+#define XEON_PHI 0x01U
+#define XEON_PHI2 0x57U
+
+/* AMD K10 */
+#define BARCELONA 0x02U
+#define SHANGHAI 0x04U
+#define ISTANBUL 0x08U
+#define MAGNYCOURS 0x09U
+
+/* AMD K8 */
+#define OPTERON_SC_1MB 0x05U
+#define OPTERON_DC_E 0x21U
+#define OPTERON_DC_F 0x41U
+#define ATHLON64_X2 0x43U
+#define ATHLON64_X2_F 0x4BU
+#define ATHLON64_F1 0x4FU
+#define ATHLON64_F2 0x5FU
+#define ATHLON64_X2_G 0x6BU
+#define ATHLON64_G1 0x6FU
+#define ATHLON64_G2 0x7FU
+
+
+#define P6_FAMILY 0x6U
+#define MIC_FAMILY 0xBU
+#define NETBURST_FAMILY 0xFFU
+#define K15_FAMILY 0x15U
+#define K16_FAMILY 0x16U
+#define K10_FAMILY 0x10U
+#define K8_FAMILY 0xFU
+
+
+
+
+
+extern int cpu_count(cpu_set_t* set);
+
+static inline int cpuid_hasFeature(FeatureBit bit)
+{
+ return (cpuid_info.featureFlags & (1<<bit));
+}
+
+
+#endif
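
Editor's note: cpuid_hasFeature() tests a single bit of cpuid_info.featureFlags against a FeatureBit value (see topology_types.h below). A minimal usage sketch, assuming topology_init()/topology_finalize() from likwid.h populate cpuid_info:

    #include <stdio.h>
    #include <likwid.h>        /* topology_init(), topology_finalize() */
    #include <topology.h>      /* cpuid_hasFeature(), FeatureBit */

    int main(void)
    {
        topology_init();                       /* fills cpuid_info.featureFlags */
        if (cpuid_hasFeature(AVX))
            printf("AVX available\n");
        if (cpuid_hasFeature(FMA) && cpuid_hasFeature(AVX2))
            printf("AVX2 and FMA3 available\n");
        topology_finalize();
        return 0;
    }
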
diff --git a/src/includes/topology_cpuid.h b/src/includes/topology_cpuid.h
new file mode 100644
index 0000000..9e39641
--- /dev/null
+++ b/src/includes/topology_cpuid.h
@@ -0,0 +1,43 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: topology_cpuid.h
+ *
+ * Description: Header File of topology backend using cpuid instruction.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef LIKWID_TOPOLOGY_CPUID
+#define LIKWID_TOPOLOGY_CPUID
+
+#include <sched.h>
+
+void cpuid_init_cpuInfo(cpu_set_t cpuSet);
+void cpuid_init_cpuFeatures(void);
+void cpuid_init_nodeTopology(cpu_set_t cpuSet);
+void cpuid_init_cacheTopology(void);
+
+
+#endif
diff --git a/src/includes/topology_hwloc.h b/src/includes/topology_hwloc.h
new file mode 100644
index 0000000..4595a08
--- /dev/null
+++ b/src/includes/topology_hwloc.h
@@ -0,0 +1,52 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: topology_hwloc.h
+ *
+ * Description: Header File of topology backend using the hwloc library
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef LIKWID_TOPOLOGY_HWLOC
+#define LIKWID_TOPOLOGY_HWLOC
+
+
+#include <hwloc.h>
+#include <sched.h>
+
+
+extern hwloc_topology_t hwloc_topology;
+
+int likwid_hwloc_record_objs_of_type_below_obj(hwloc_topology_t t, hwloc_obj_t obj, hwloc_obj_type_t type, int* index, uint32_t **list);
+
+
+
+void hwloc_init_cpuInfo(cpu_set_t cpuSet);
+void hwloc_init_cpuFeatures(void);
+void hwloc_init_nodeTopology(cpu_set_t cpuSet);
+void hwloc_init_cacheTopology(void);
+void hwloc_close(void);
+
+
+#endif
diff --git a/src/includes/topology_proc.h b/src/includes/topology_proc.h
new file mode 100644
index 0000000..1efd81b
--- /dev/null
+++ b/src/includes/topology_proc.h
@@ -0,0 +1,51 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: topology_proc.h
+ *
+ * Description: Header File of topology backend using procfs/sysfs
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef LIKWID_TOPOLOGY_PROC
+#define LIKWID_TOPOLOGY_PROC
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sched.h>
+#include <unistd.h>
+#include <sched.h>
+
+#include <error.h>
+#include <tree.h>
+#include <bitUtil.h>
+#include <topology.h>
+
+void proc_init_cpuInfo(cpu_set_t cpuSet);
+void proc_init_cpuFeatures(void);
+void proc_init_nodeTopology(cpu_set_t cpuSet);
+void proc_init_cacheTopology(void);
+
+
+#endif
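
Editor's note: the cpuid, hwloc and procfs/sysfs backends declared in topology_cpuid.h, topology_hwloc.h and topology_proc.h export the same set of init functions, so the core can select one at runtime through struct topology_functions from topology.h. A sketch of how the procfs backend might be wired in (the selection logic itself is an assumption):

    #include <topology.h>
    #include <topology_proc.h>

    static struct topology_functions funcs;

    static void select_proc_backend(void)
    {
        funcs.init_cpuInfo       = proc_init_cpuInfo;
        funcs.init_cpuFeatures   = proc_init_cpuFeatures;
        funcs.init_nodeTopology  = proc_init_nodeTopology;
        funcs.init_cacheTopology = proc_init_cacheTopology;
        funcs.init_fileTopology  = NULL;   /* not provided by this backend */
        funcs.close_topology     = NULL;
    }
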
diff --git a/src/includes/topology_types.h b/src/includes/topology_types.h
new file mode 100644
index 0000000..82cf954
--- /dev/null
+++ b/src/includes/topology_types.h
@@ -0,0 +1,73 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: topology_types.h
+ *
+ * Description: Types file for topology module. External definitions are
+ * in likwid.h
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com,
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef CPUID_TYPES_H
+#define CPUID_TYPES_H
+
+/** \addtogroup CPUTopology CPU information module
+* @{
+*/
+/*! \brief Enum of possible CPU features
+
+CPUs implement different features that can improve application performance when the
+code is optimized to use them. The list contains all features that are currently
+supported by LIKWID. LIKWID does not perform any action based on these features;
+it gathers the data for output purposes only. The list is not complete.
+\extends CpuInfo
+*/
+typedef enum {
+ SSE3=0, /*!< \brief Streaming SIMD Extensions 3 */
+ MMX, /*!< \brief Multi Media Extension */
+ SSE, /*!< \brief Streaming SIMD Extensions */
+ SSE2, /*!< \brief Streaming SIMD Extensions 2 */
+ MONITOR, /*!< \brief MONITOR and MWAIT instructions (part of SSE3) */
+ ACPI, /*!< \brief Advanced Configuration and Power Interface */
+ RDTSCP, /*!< \brief Serializing Read of the Time Stamp Counter */
+ VMX, /*!< \brief Virtual Machine eXtensions (VT-x) */
+ EIST, /*!< \brief Enhanced Intel SpeedStep */
+ TM, /*!< \brief Thermal Monitor */
+ TM2, /*!< \brief Thermal Monitor 2 */
+ AES, /*!< \brief AES instruction set */
+ RDRAND, /*!< \brief Random numbers from an on-chip hardware random number generator */
+ SSSE3, /*!< \brief Supplemental Streaming SIMD Extensions 3 */
+ SSE41, /*!< \brief Streaming SIMD Extensions 4.1 */
+ SSE42, /*!< \brief Streaming SIMD Extensions 4.2 */
+ AVX, /*!< \brief Advanced Vector Extensions */
+ FMA, /*!< \brief Fused multiply-add (FMA3) */
+ AVX2, /*!< \brief Advanced Vector Extensions 2 */
+ RTM, /*!< \brief Restricted Transactional Memory */
+ HLE, /*!< \brief Hardware Lock Elision */
+ HTT, /*!< \brief Hyper-Threading Technology */
+ RDSEED, /*!< \brief Non-deterministic random bit generator */
+} FeatureBit;
+/** @}*/
+#endif /*CPUID_TYPES_H*/
diff --git a/src/includes/tree.h b/src/includes/tree.h
index 9816cf7..66cfa97 100644
--- a/src/includes/tree.h
+++ b/src/includes/tree.h
@@ -6,13 +6,13 @@
* Description: Header File tree Module.
* Implements a simple tree data structure.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -36,6 +36,7 @@
extern void tree_init(TreeNode** root, int id);
extern void tree_print(TreeNode* nodePtr);
+extern void tree_destroy(TreeNode* nodePtr);
extern void tree_insertNode(TreeNode* nodePtr, int id);
extern int tree_nodeExists(TreeNode* nodePtr, int id);
extern int tree_countChildren(TreeNode* nodePtr);
diff --git a/src/includes/tree_types.h b/src/includes/tree_types.h
index b449e39..d2eb7d5 100644
--- a/src/includes/tree_types.h
+++ b/src/includes/tree_types.h
@@ -5,13 +5,13 @@
*
* Description: Types file for tree module.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -31,14 +31,24 @@
#ifndef TREE_TYPES_H
#define TREE_TYPES_H
-/* For arbitrary trees llink are the children and
- * rlink are the neighbours
- */
-typedef struct treeNode {
- int id;
- struct treeNode* llink;
- struct treeNode* rlink;
-} TreeNode;
+/** \addtogroup CPUTopology
+* @{
+*/
+/*! \brief Structure of a tree node
+
+This structure is used to form the tree of the system topology. The information
+describing each node is stored in other places; therefore, an ID is enough.
+\extends CpuTopology
+*/
+struct treeNode {
+ int id; /*!< \brief ID of the node */
+ struct treeNode* llink; /*!< \brief List of children of the current node */
+ struct treeNode* rlink; /*!< \brief List of neighbors of the current node */
+};
+
+/** \brief Shorter name for struct treeNode */
+typedef struct treeNode TreeNode;
+/** @}*/
#endif /*TREE_TYPES_H*/
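
Editor's note: the tree module stores the topology as an id-only tree; llink points to a node's children, rlink to its siblings. A minimal sketch using only the functions declared in tree.h above:

    #include <stdio.h>
    #include <tree_types.h>    /* TreeNode */
    #include <tree.h>

    int main(void)
    {
        TreeNode* root = NULL;

        tree_init(&root, 0);           /* node 0 becomes the root */
        tree_insertNode(root, 1);      /* two children below the root */
        tree_insertNode(root, 2);

        printf("root has %d children\n", tree_countChildren(root));
        printf("node 2 exists: %d\n", tree_nodeExists(root, 2));

        tree_print(root);
        tree_destroy(root);
        return 0;
    }
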
diff --git a/src/includes/types.h b/src/includes/types.h
index 2b0745a..c32d870 100644
--- a/src/includes/types.h
+++ b/src/includes/types.h
@@ -5,13 +5,14 @@
*
* Description: Global Types file
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -34,27 +35,17 @@
/* ##### HEADER FILE INCLUDES ######################################### */
#include <stdint.h>
+#include <bstrlib.h>
-#include <accessClient_types.h>
+#include <access_client_types.h>
+#include <registers_types.h>
#include <pci_types.h>
#include <power_types.h>
#include <thermal_types.h>
-#include <strUtil_types.h>
-#include <test_types.h>
-#include <barrier_types.h>
-#include <timer_types.h>
#include <tree_types.h>
-#include <cpuid_types.h>
-#include <affinity_types.h>
-#include <threads_types.h>
-#include <cpuFeatures_types.h>
-#include <asciiBoxes_types.h>
-#include <asciiTable_types.h>
+#include <topology_types.h>
#include <perfmon_types.h>
#include <libperfctr_types.h>
-#include <multiplex_types.h>
-#include <numa_types.h>
-#include <pci_types.h>
typedef struct {
@@ -83,4 +74,7 @@ typedef struct {
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
#endif /*TYPES_H*/
diff --git a/src/libperfctr.c b/src/libperfctr.c
index a4b2158..6f0ff0f 100644
--- a/src/libperfctr.c
+++ b/src/libperfctr.c
@@ -5,13 +5,14 @@
*
* Description: Marker API interface of module perfmon
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Authors: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -38,51 +39,34 @@
#include <unistd.h>
#include <sched.h>
#include <pthread.h>
+#include <inttypes.h>
-#include <error.h>
-#include <types.h>
+#include <likwid.h>
#include <bitUtil.h>
-#include <bstrlib.h>
-#include <cpuid.h>
-#include <numa.h>
-#include <affinity.h>
#include <lock.h>
#include <tree.h>
-#include <accessClient.h>
-#include <msr.h>
-#include <pci.h>
-#include <power.h>
-#include <thermal.h>
#include <timer.h>
#include <hashTable.h>
#include <registers.h>
-#include <likwid.h>
-
-#include <perfmon_core2_counters.h>
-#include <perfmon_haswell_counters.h>
-#include <perfmon_interlagos_counters.h>
-#include <perfmon_kabini_counters.h>
-#include <perfmon_k10_counters.h>
-#include <perfmon_nehalem_counters.h>
-#include <perfmon_phi_counters.h>
-#include <perfmon_pm_counters.h>
-#include <perfmon_sandybridge_counters.h>
-#include <perfmon_ivybridge_counters.h>
-#include <perfmon_westmereEX_counters.h>
-#include <perfmon_silvermont_counters.h>
+#include <error.h>
+#include <access.h>
+#include <perfmon.h>
/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
-static int perfmon_numCounters=0; /* total number of counters */
-static int perfmon_numCountersCore=0; /* max index of core counters */
-static int perfmon_numCountersUncore=0; /* max index of conventional uncore counters */
-static PerfmonCounterMap* perfmon_counter_map = NULL;
-static int socket_lock[MAX_NUM_NODES];
-static int thread_socketFD[MAX_NUM_THREADS];
-static int hasPCICounters = 0;
+int socket_lock[MAX_NUM_NODES];
static int likwid_init = 0;
-static BitMask counterMask;
+static int numberOfGroups = 0;
+static int* groups;
+static int threads2Cpu[MAX_NUM_THREADS];
+static pthread_t threads2Pthread[MAX_NUM_THREADS];
+static int realThreads2Cpu[MAX_NUM_THREADS] = { [ 0 ... (MAX_NUM_THREADS-1)] = -1};
+static int num_cpus = 0;
+static int registered_cpus = 0;
+static pthread_mutex_t globalLock = PTHREAD_MUTEX_INITIALIZER;
+static int use_locks = 0;
+static pthread_mutex_t threadLocks[MAX_NUM_THREADS] = { [ 0 ... (MAX_NUM_THREADS-1)] = PTHREAD_MUTEX_INITIALIZER};
/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
@@ -90,61 +74,87 @@ static BitMask counterMask;
/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
-void str2BitMask(const char* str, BitMask* mask)
+
+static int getProcessorID(cpu_set_t* cpu_set)
{
- char* endptr;
- errno = 0;
- struct bstrList* tokens;
- bstring q = bfromcstralloc (60, str);
- tokens = bsplit(q,' ');
+ int processorId;
- for (int i=0; i<tokens->qty; i++)
+ for (processorId=0;processorId<MAX_NUM_THREADS;processorId++)
{
- uint64_t val = strtoull((char*) tokens->entry[i]->data, &endptr, 16);
-
- if ((errno == ERANGE && val == LONG_MAX ) || (errno != 0 && val == 0))
+ if (CPU_ISSET(processorId,cpu_set))
{
- ERROR;
+ break;
}
+ }
+ return processorId;
+}
- if (endptr == str)
+static int getThreadID(int cpu_id)
+{
+ int i;
+ for(i=0;i<groupSet->numberOfThreads;i++)
+ {
+ if (cpu_id == groupSet->threads[i].processorId)
{
- ERROR_PLAIN_PRINT(No digits were found);
+ return i;
}
-
- mask->mask[i] = val;
}
-
- bstrListDestroy(tokens);
- bdestroy(q);
+ return -1;
}
-static int getProcessorID(cpu_set_t* cpu_set)
+static double
+calculateMarkerResult(RegisterIndex index, uint64_t start, uint64_t stop, int overflows)
{
- int processorId;
+ double result = 0.0;
- for (processorId=0;processorId<MAX_NUM_THREADS;processorId++)
+ if (overflows == 0)
{
- if (CPU_ISSET(processorId,cpu_set))
- {
- break;
- }
+ result = (double) (stop - start);
}
- return processorId;
+ else if (overflows > 0)
+ {
+ result += (double) ((perfmon_getMaxCounterValue(counter_map[index].type) - start) + stop);
+ overflows--;
+ }
+ result += (double) (overflows * perfmon_getMaxCounterValue(counter_map[index].type));
+ if (counter_map[index].type == POWER)
+ {
+ result *= power_getEnergyUnit(getCounterTypeOffset(index));
+ }
+ else if (counter_map[index].type == THERMAL)
+ {
+ result = (double)stop;
+ }
+ return result;
}
/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
void likwid_markerInit(void)
{
- int cpuId = likwid_getProcessorId();
+ int i;
+ int verbosity;
+ bstring bThreadStr;
+ bstring bEventStr;
+ struct bstrList* threadTokens;
+ struct bstrList* eventStrings;
char* modeStr = getenv("LIKWID_MODE");
- char* maskStr = getenv("LIKWID_MASK");
-
- if ((modeStr != NULL) && (maskStr != NULL))
+ char* eventStr = getenv("LIKWID_EVENTS");
+ char* cThreadStr = getenv("LIKWID_THREADS");
+ char* filepath = getenv("LIKWID_FILEPATH");
+ /* Dirty hack to avoid nonnull warnings */
+ int (*ownatoi)(const char*);
+ ownatoi = &atoi;
+
+ if ((modeStr != NULL) && (filepath != NULL) && (eventStr != NULL) && (cThreadStr != NULL))
{
likwid_init = 1;
}
+ else if (likwid_init == 0)
+ {
+ fprintf(stderr, "Running without Marker API. Activate Marker API with -m on commandline.\n");
+ return;
+ }
else
{
return;
@@ -156,226 +166,130 @@ void likwid_markerInit(void)
exit(EXIT_FAILURE);
}
- cpuid_init();
+ topology_init();
numa_init();
affinity_init();
- timer_init();
hashTable_init();
- for(int i=0; i<MAX_NUM_THREADS; i++) thread_socketFD[i] = -1;
for(int i=0; i<MAX_NUM_NODES; i++) socket_lock[i] = LOCK_INIT;
- accessClient_mode = atoi(modeStr);
- str2BitMask(maskStr, &counterMask);
+ HPMmode(atoi(modeStr));
- if (accessClient_mode != DAEMON_AM_DIRECT)
+ if (getenv("LIKWID_DEBUG") != NULL)
{
- accessClient_init(&thread_socketFD[cpuId]);
+ perfmon_verbosity = atoi(getenv("LIKWID_DEBUG"));
+ verbosity = perfmon_verbosity;
}
- msr_init(thread_socketFD[cpuId]);
- thermal_init(cpuId);
-
- switch ( cpuid_info.family )
+ bThreadStr = bfromcstr(cThreadStr);
+ threadTokens = bsplit(bThreadStr,',');
+ num_cpus = threadTokens->qty;
+ for (i=0; i<num_cpus; i++)
{
- case P6_FAMILY:
-
- switch ( cpuid_info.model )
+ threads2Cpu[i] = ownatoi(bdata(threadTokens->entry[i]));
+ }
+ bdestroy(bThreadStr);
+ bstrListDestroy(threadTokens);
+
+ if (getenv("LIKWID_PIN") != NULL)
+ {
+ likwid_pinThread(threads2Cpu[0]);
+ if (getenv("OMP_NUM_THREADS") != NULL)
+ {
+ if (ownatoi(getenv("OMP_NUM_THREADS")) > num_cpus)
{
- case PENTIUM_M_BANIAS:
-
- case PENTIUM_M_DOTHAN:
-
- perfmon_counter_map = pm_counter_map;
- perfmon_numCounters = NUM_COUNTERS_PM;
- perfmon_numCountersCore = NUM_COUNTERS_CORE_PM;
- break;
-
- case ATOM_45:
-
- case ATOM_32:
-
- case ATOM_22:
-
- case ATOM:
-
- perfmon_counter_map = core2_counter_map;
- perfmon_numCounters = NUM_COUNTERS_CORE2;
- perfmon_numCountersCore = NUM_COUNTERS_CORE_CORE2;
- break;
-
- case ATOM_SILVERMONT_C:
- case ATOM_SILVERMONT_E:
- case ATOM_SILVERMONT_F1:
- case ATOM_SILVERMONT_F2:
- case ATOM_SILVERMONT_F3:
- power_init(0);
- perfmon_counter_map = silvermont_counter_map;
- perfmon_numCounters = NUM_COUNTERS_SILVERMONT;
- perfmon_numCountersCore = NUM_COUNTERS_CORE_SILVERMONT;
- break;
-
- case CORE_DUO:
- ERROR_PLAIN_PRINT(Unsupported Processor);
- break;
-
- case XEON_MP:
-
- case CORE2_65:
-
- case CORE2_45:
-
- perfmon_counter_map = core2_counter_map;
- perfmon_numCounters = NUM_COUNTERS_CORE2;
- perfmon_numCountersCore = NUM_COUNTERS_CORE_CORE2;
- break;
-
- case NEHALEM_EX:
-
- case WESTMERE_EX:
-
- perfmon_counter_map = westmereEX_counter_map;
- perfmon_numCounters = NUM_COUNTERS_WESTMEREEX;
- perfmon_numCountersCore = NUM_COUNTERS_CORE_WESTMEREEX;
- perfmon_numCountersUncore = NUM_COUNTERS_UNCORE_WESTMEREEX;
- break;
-
- case NEHALEM_BLOOMFIELD:
-
- case NEHALEM_LYNNFIELD:
-
- case NEHALEM_WESTMERE_M:
-
- case NEHALEM_WESTMERE:
-
- perfmon_counter_map = nehalem_counter_map;
- perfmon_numCounters = NUM_COUNTERS_NEHALEM;
- perfmon_numCountersCore = NUM_COUNTERS_CORE_NEHALEM;
- perfmon_numCountersUncore = NUM_COUNTERS_UNCORE_NEHALEM;
- break;
-
- case IVYBRIDGE:
-
- case IVYBRIDGE_EP:
-
- {
- int socket_fd = thread_socketFD[cpuId];
- hasPCICounters = 1;
- power_init(0); /* FIXME Static coreId is dangerous */
- pci_init(socket_fd);
- perfmon_counter_map = ivybridge_counter_map;
- perfmon_numCounters = NUM_COUNTERS_IVYBRIDGE;
- perfmon_numCountersCore = NUM_COUNTERS_CORE_IVYBRIDGE;
- perfmon_numCountersUncore = NUM_COUNTERS_UNCORE_IVYBRIDGE;
- }
- break;
-
- case HASWELL:
-
- case HASWELL_EX:
-
- case HASWELL_M1:
-
- case HASWELL_M2:
-
- power_init(0); /* FIXME Static coreId is dangerous */
-
- perfmon_counter_map = haswell_counter_map;
- perfmon_numCounters = NUM_COUNTERS_HASWELL;
- perfmon_numCountersCore = NUM_COUNTERS_CORE_HASWELL;
- break;
-
- case SANDYBRIDGE:
-
- case SANDYBRIDGE_EP:
-
- {
- int socket_fd = thread_socketFD[cpuId];
- hasPCICounters = 1;
- power_init(0); /* FIXME Static coreId is dangerous */
- pci_init(socket_fd);
- perfmon_counter_map = sandybridge_counter_map;
- perfmon_numCounters = NUM_COUNTERS_SANDYBRIDGE;
- perfmon_numCountersCore = NUM_COUNTERS_CORE_SANDYBRIDGE;
- perfmon_numCountersUncore = NUM_COUNTERS_UNCORE_SANDYBRIDGE;
- }
- break;
-
- default:
- ERROR_PLAIN_PRINT(Unsupported Processor);
- break;
+ use_locks = 1;
}
- break;
-
- case MIC_FAMILY:
-
- switch ( cpuid_info.model )
+ }
+ if (getenv("CILK_NWORKERS") != NULL)
+ {
+ if (ownatoi(getenv("CILK_NWORKERS")) > num_cpus)
{
- case XEON_PHI:
-
- perfmon_counter_map = phi_counter_map;
- perfmon_numCounters = NUM_COUNTERS_PHI;
- perfmon_numCountersCore = NUM_COUNTERS_CORE_PHI;
- break;
-
- default:
- ERROR_PLAIN_PRINT(Unsupported Processor);
- break;
+ use_locks = 1;
}
- break;
-
- case K8_FAMILY:
-
- perfmon_counter_map = k10_counter_map;
- perfmon_numCounters = NUM_COUNTERS_K10;
- perfmon_numCountersCore = NUM_COUNTERS_CORE_K10;
- break;
-
- case K10_FAMILY:
-
- perfmon_counter_map = k10_counter_map;
- perfmon_numCounters = NUM_COUNTERS_K10;
- perfmon_numCountersCore = NUM_COUNTERS_CORE_K10;
- break;
-
- case K15_FAMILY:
-
- perfmon_counter_map = interlagos_counter_map;
- perfmon_numCounters = NUM_COUNTERS_INTERLAGOS;
- perfmon_numCountersCore = NUM_COUNTERS_CORE_INTERLAGOS;
- break;
+ }
+ }
- case K16_FAMILY:
+ i = perfmon_init(num_cpus, threads2Cpu);
+ if (i<0)
+ {
+ fprintf(stderr,"Failed to initialize LIKWID perfmon library.\n");
+ return;
+ }
- perfmon_counter_map = kabini_counter_map;
- perfmon_numCounters = NUM_COUNTERS_KABINI;
- perfmon_numCountersCore = NUM_COUNTERS_CORE_KABINI;
- break;
+ bEventStr = bfromcstr(eventStr);
+ eventStrings = bsplit(bEventStr,'|');
+ numberOfGroups = eventStrings->qty;
+ groups = malloc(numberOfGroups * sizeof(int));
+ if (!groups)
+ {
+ fprintf(stderr,"Cannot allocate space for group handling.\n");
+ bstrListDestroy(eventStrings);
+ exit(EXIT_FAILURE);
+ }
+ for (i=0; i<eventStrings->qty; i++)
+ {
+ groups[i] = perfmon_addEventSet(bdata(eventStrings->entry[i]));
+ }
+ bstrListDestroy(eventStrings);
+ bdestroy(bEventStr);
- default:
- ERROR_PLAIN_PRINT(Unsupported Processor);
- break;
+ for (i=0; i<num_cpus; i++)
+ {
+ hashTable_initThread(threads2Cpu[i]);
+ for(int j=0; j<groupSet->groups[groups[0]].numberOfEvents;j++)
+ {
+ groupSet->groups[groups[0]].events[j].threadCounter[i].init = TRUE;
+ groupSet->groups[groups[0]].state = STATE_START;
+ }
}
+
+ groupSet->activeGroup = 0;
}
void likwid_markerThreadInit(void)
{
- if ( ! likwid_init )
+ int myID;
+ if ( !likwid_init )
{
return;
}
+
+ pthread_mutex_lock(&globalLock);
+ myID = registered_cpus++;
+ pthread_mutex_unlock(&globalLock);
- int cpuId = likwid_getProcessorId();
-
- if (accessClient_mode != DAEMON_AM_DIRECT)
+ if (getenv("LIKWID_PIN") != NULL)
{
- if (thread_socketFD[cpuId] == -1)
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+ sched_getaffinity(gettid(), sizeof(cpu_set_t), &cpuset);
+ if ((CPU_COUNT(&cpuset) > 1) || (likwid_getProcessorId() != threads2Cpu[myID % num_cpus]))
{
- accessClient_init(&thread_socketFD[cpuId]);
+ likwid_pinThread(threads2Cpu[myID % num_cpus]);
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, "Pin thread %lu to CPU %d\n", gettid(), threads2Cpu[myID % num_cpus]);
}
}
}
+void likwid_markerNextGroup(void)
+{
+ int i;
+ int next_group;
+
+ if (!likwid_init)
+ {
+ return;
+ }
+
+ next_group = (groupSet->activeGroup + 1) % numberOfGroups;
+ if (next_group != groupSet->activeGroup)
+ {
+ i = perfmon_switchActiveGroup(next_group);
+ }
+ return;
+}
+
/* File format
* 1 numberOfThreads numberOfRegions
* 2 regionID:regionTag0
@@ -387,45 +301,67 @@ void likwid_markerClose(void)
{
FILE *file = NULL;
LikwidResults* results = NULL;
- int numberOfThreads;
- int numberOfRegions;
+ int numberOfThreads = 0;
+ int numberOfRegions = 0;
+ char* markerfile = NULL;
+ int lineidx = 0;
+ char line[1024];
if ( ! likwid_init )
{
return;
}
-
hashTable_finalize(&numberOfThreads, &numberOfRegions, &results);
-
- file = fopen(getenv("LIKWID_FILEPATH"),"w");
+ if ((numberOfThreads == 0)||(numberOfRegions == 0))
+ {
+ fprintf(stderr, "No threads or regions defined in hash table\n");
+ return;
+ }
+ markerfile = getenv("LIKWID_FILEPATH");
+ if (markerfile == NULL)
+ {
+ fprintf(stderr, "Is the application executed with LIKWID wrapper? No file path for the Marker API output defined.\n");
+ return;
+ }
+ file = fopen(markerfile,"w");
if (file != NULL)
{
- fprintf(file,"%d %d\n",numberOfThreads,numberOfRegions);
-
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating Marker file %s with %d regions %d groups and %d threads, markerfile, numberOfRegions, numberOfGroups, numberOfThreads);
+ fprintf(file,"%d %d %d\n",numberOfThreads, numberOfRegions, numberOfGroups);
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, %d %d %d, numberOfThreads, numberOfRegions, numberOfGroups);
for (int i=0; i<numberOfRegions; i++)
{
fprintf(file,"%d:%s\n",i,bdata(results[i].tag));
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, %d:%s, i,bdata(results[i].tag));
}
-
for (int i=0; i<numberOfRegions; i++)
{
for (int j=0; j<numberOfThreads; j++)
{
fprintf(file,"%d ",i);
- fprintf(file,"%d ",j);
+ fprintf(file,"%d ",results[i].groupID);
+ fprintf(file,"%d ",results[i].cpulist[j]);
fprintf(file,"%u ",results[i].count[j]);
fprintf(file,"%e ",results[i].time[j]);
-
- for (int k=0; k<NUM_PMC; k++)
+ fprintf(file,"%d ",groupSet->groups[results[i].groupID].numberOfEvents);
+ lineidx = sprintf(&(line[0]), "%d %d %d %u %e %d ", i, results[i].groupID,results[i].cpulist[j],results[i].count[j],results[i].time[j],groupSet->groups[results[i].groupID].numberOfEvents);
+ for (int k=0; k<groupSet->groups[results[i].groupID].numberOfEvents; k++)
{
fprintf(file,"%e ",results[i].counters[j][k]);
+ lineidx += sprintf(&(line[lineidx]), "%e ", results[i].counters[j][k]);
}
fprintf(file,"\n");
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, %s,line);
}
}
fclose(file);
}
+ else
+ {
+ fprintf(stderr, "Cannot open file %s\n", markerfile);
+ fprintf(stderr, "%s", strerror(errno));
+ }
for (int i=0;i<numberOfRegions; i++)
{
@@ -436,6 +372,7 @@ void likwid_markerClose(void)
free(results[i].time);
bdestroy(results[i].tag);
free(results[i].count);
+ free(results[i].cpulist);
free(results[i].counters);
}
@@ -443,282 +380,179 @@ void likwid_markerClose(void)
{
free(results);
}
-
- msr_finalize();
- pci_finalize();
-
- for (int i=0; i<MAX_NUM_THREADS; i++)
- {
- accessClient_finalize(thread_socketFD[i]);
- thread_socketFD[i] = -1;
- }
+ likwid_init = 0;
+ HPMfinalize();
}
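
Editor's note: reconstructed from the fprintf calls above, the marker result file starts with a header line "numberOfThreads numberOfRegions numberOfGroups", followed by one "regionID:regionTag-groupID" line per region and then, per region and thread, a data line of the form parsed in this sketch (the values are illustrative only):

    #include <stdio.h>

    int main(void)
    {
        /* regionID groupID cpuID count time numberOfEvents event values ... */
        const char* line = "0 0 3 10 1.234560e-02 2 1.000000e+06 2.000000e+06";
        int region, group, cpu, nevents;
        unsigned int count;
        double time, ev[2];

        sscanf(line, "%d %d %d %u %le %d %le %le",
               &region, &group, &cpu, &count, &time, &nevents, &ev[0], &ev[1]);
        printf("region %d on CPU %d: %u calls, %e s, %d events\n",
               region, cpu, count, time, nevents);
        return 0;
    }
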
-
-void likwid_markerStartRegion(const char* regionTag)
+int likwid_markerRegisterRegion(const char* regionTag)
{
if ( ! likwid_init )
{
- return;
+ return -EFAULT;
}
-
+ TimerData timer;
bstring tag = bfromcstralloc(100, regionTag);
LikwidThreadResults* results;
- uint64_t res;
+ char groupSuffix[10];
+ sprintf(groupSuffix, "-%d", groupSet->activeGroup);
+ bcatcstr(tag, groupSuffix);
int cpu_id = hashTable_get(tag, &results);
bdestroy(tag);
- int socket_fd = thread_socketFD[cpu_id];
+ return 0;
+}
- if (accessClient_mode != DAEMON_AM_DIRECT)
+
+int likwid_markerStartRegion(const char* regionTag)
+{
+ if ( ! likwid_init )
{
- if (socket_fd == -1)
- {
- printf("ERROR: Invalid socket file handle on processor %d. \
- Did you call likwid_markerThreadInit() ?\n", cpu_id);
- }
+ return -EFAULT;
}
-
- results->count++;
-
- /* Core specific counters */
- for ( int i=0; i<perfmon_numCountersCore; i++ )
+ int myCPU = likwid_getProcessorId();
+ if (getThreadID(myCPU) < 0)
{
- bitMask_test(res,counterMask,i);
- if ( res )
- {
- if (perfmon_counter_map[i].type != THERMAL)
- {
- results->StartPMcounters[i] =
- (double) msr_tread(
- socket_fd,
- cpu_id,
- perfmon_counter_map[i].counterRegister);
- }
- }
+ return -EFAULT;
}
- /* Uncore specific counters */
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
- lock_acquire((int*)
- &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
+ bstring tag = bfromcstralloc(100, regionTag);
+ LikwidThreadResults* results;
+ char groupSuffix[10];
+ sprintf(groupSuffix, "-%d", groupSet->activeGroup);
+ bcatcstr(tag, groupSuffix);
+
+ int cpu_id = hashTable_get(tag, &results);
+ int thread_id = getThreadID(cpu_id);
+ perfmon_readCountersCpu(cpu_id);
+ results->cpuID = cpu_id;
+ for(int i=0;i<groupSet->groups[groupSet->activeGroup].numberOfEvents;i++)
{
- /* Conventional Uncore counters */
- for ( int i=perfmon_numCountersCore; i<perfmon_numCountersUncore; i++ )
- {
- bitMask_test(res,counterMask,i);
- if ( res )
- {
- if (perfmon_counter_map[i].type != POWER)
- {
- results->StartPMcounters[i] =
- (double) msr_tread(
- socket_fd,
- cpu_id,
- perfmon_counter_map[i].counterRegister);
- }
- else
- {
- results->StartPMcounters[i] =
- (double) power_tread(
- socket_fd,
- cpu_id,
- perfmon_counter_map[i].counterRegister);
- }
- }
- }
-
- /* PCI Uncore counters */
- if ( hasPCICounters && (accessClient_mode != DAEMON_AM_DIRECT) )
- {
- for ( int i=perfmon_numCountersUncore; i<perfmon_numCounters; i++ )
- {
- bitMask_test(res,counterMask,i);
- if ( res )
- {
- uint64_t counter_result =
- pci_tread(
- socket_fd,
- cpu_id,
- perfmon_counter_map[i].device,
- perfmon_counter_map[i].counterRegister);
-
- counter_result = (counter_result<<32) +
- pci_tread(
- socket_fd,
- cpu_id,
- perfmon_counter_map[i].device,
- perfmon_counter_map[i].counterRegister2);
-
- results->StartPMcounters[perfmon_counter_map[i].index] =
- (double) counter_result;
- }
- }
- }
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, START [%s] READ EVENT [%d=%d] EVENT %d VALUE %llu , regionTag, thread_id, cpu_id, i,
+ LLU_CAST groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].counterData);
+ //groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].startData =
+ // groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].counterData;
+
+ results->StartPMcounters[i] = groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].counterData;
+ results->StartOverflows[i] = groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].overflows;
}
-
+
+ bdestroy(tag);
timer_start(&(results->startTime));
+ return 0;
}
-#define READ_END_MEM_CHANNEL(channel, reg, cid) \
- counter_result = pci_tread(socket_fd, cpu_id, channel, reg##_A); \
- counter_result = (counter_result<<32) + \
- pci_tread(socket_fd, cpu_id, channel, reg##_B); \
- results->PMcounters[cid] += (double) counter_result - results->StartPMcounters[cid]
-
-/* TODO: Readout hash at the end. Compute result at the end of the function to
- * keep overhead in region low */
-void likwid_markerStopRegion(const char* regionTag)
+int likwid_markerStopRegion(const char* regionTag)
{
if (! likwid_init)
{
- return;
+ return -EFAULT;
}
TimerData timestamp;
timer_stop(×tamp);
- int cpu_id = likwid_getProcessorId();
- uint64_t res;
- int socket_fd = thread_socketFD[cpu_id];
- double PMcounters[NUM_PMC];
-
- /* Core specific counters */
- for ( int i=0; i<perfmon_numCountersCore; i++ )
+ double result = 0.0;
+ int cpu_id;
+ int myCPU = likwid_getProcessorId();
+ if (getThreadID(myCPU) < 0)
{
- bitMask_test(res,counterMask,i);
- if ( res )
- {
- if (perfmon_counter_map[i].type != THERMAL)
- {
- PMcounters[i] = (double) msr_tread(
- socket_fd,
- cpu_id,
- perfmon_counter_map[i].counterRegister);
- }
- else
- {
- PMcounters[i] = (double) thermal_read(cpu_id);
- }
- }
+ return -EFAULT;
}
+ int thread_id;
+ bstring tag = bfromcstr(regionTag);
+ char groupSuffix[100];
+ LikwidThreadResults* results;
+ sprintf(groupSuffix, "-%d", groupSet->activeGroup);
+ bcatcstr(tag, groupSuffix);
+ if (use_locks == 1)
+ {
+ pthread_mutex_lock(&threadLocks[myCPU]);
+ }
+
+ cpu_id = hashTable_get(tag, &results);
+ thread_id = getThreadID(cpu_id);
+ results->groupID = groupSet->activeGroup;
+ results->startTime.stop.int64 = timestamp.stop.int64;
+ results->time += timer_print(&(results->startTime));
+ results->count++;
+ bdestroy(tag);
+
+ perfmon_readCountersCpu(cpu_id);
- /* Uncore specific counters */
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ for(int i=0;i<groupSet->groups[groupSet->activeGroup].numberOfEvents;i++)
{
- /* Conventional Uncore counters */
- for ( int i=perfmon_numCountersCore; i<perfmon_numCountersUncore; i++ )
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, STOP [%s] READ EVENT [%d=%d] EVENT %d VALUE %llu, regionTag, thread_id, cpu_id, i,
+ LLU_CAST groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].counterData);
+ result = calculateMarkerResult(groupSet->groups[groupSet->activeGroup].events[i].index, results->StartPMcounters[i],
+ groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].counterData,
+ groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].overflows -
+ results->StartOverflows[i]);
+ if (counter_map[groupSet->groups[groupSet->activeGroup].events[i].index].type != THERMAL)
{
- bitMask_test(res,counterMask,i);
- if ( res )
- {
- if (perfmon_counter_map[i].type != POWER)
- {
- PMcounters[i] = (double) msr_tread(
- socket_fd,
- cpu_id,
- perfmon_counter_map[i].counterRegister);
- }
- else
- {
- PMcounters[i] = (double) power_tread(
- socket_fd,
- cpu_id,
- perfmon_counter_map[i].counterRegister);
- }
- }
+ results->PMcounters[i] += result;
}
-
- /* PCI Uncore counters */
- if ( hasPCICounters && (accessClient_mode != DAEMON_AM_DIRECT) )
+ else
{
- for ( int i=perfmon_numCountersUncore; i<perfmon_numCounters; i++ )
- {
- bitMask_test(res,counterMask,i);
- if ( res )
- {
- uint64_t counter_result =
- pci_tread(
- socket_fd,
- cpu_id,
- perfmon_counter_map[i].device,
- perfmon_counter_map[i].counterRegister);
-
- counter_result = (counter_result<<32) +
- pci_tread(
- socket_fd,
- cpu_id,
- perfmon_counter_map[i].device,
- perfmon_counter_map[i].counterRegister2);
-
- PMcounters[i] = (double) counter_result;
- }
- }
+ results->PMcounters[i] = result;
}
}
+ if (use_locks == 1)
+ {
+ pthread_mutex_unlock(&threadLocks[myCPU]);
+ }
+ return 0;
+}
- bstring tag = bfromcstralloc(100, regionTag);
- LikwidThreadResults* results;
- hashTable_get(tag, &results);
- results->startTime.stop = timestamp.stop;
- results->time += timer_print(&(results->startTime));
- bdestroy(tag);
- /* Accumulate the results */
- /* Core counters */
- for ( int i=0; i<perfmon_numCountersCore; i++ )
+void likwid_markerGetRegion(const char* regionTag, int* nr_events, double* events, double *time, int *count)
+{
+ if (! likwid_init)
{
- bitMask_test(res,counterMask,i);
- if ( res )
- {
- if (perfmon_counter_map[i].type != THERMAL)
- {
- results->PMcounters[i] += (PMcounters[i] - results->StartPMcounters[i]);
- }
- else
- {
- results->PMcounters[i] = PMcounters[i];
- }
- }
+ *nr_events = 0;
+ *time = 0;
+ *count = 0;
+ return;
}
-
- /* Uncore counters */
- if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+ int length = 0;
+ int cpu_id;
+ int myCPU = likwid_getProcessorId();
+ int thread_id;
+ bstring tag = bfromcstr(regionTag);
+ char groupSuffix[100];
+ LikwidThreadResults* results;
+ sprintf(groupSuffix, "-%d", groupSet->activeGroup);
+ bcatcstr(tag, groupSuffix);
+
+ cpu_id = hashTable_get(tag, &results);
+ thread_id = getThreadID(myCPU);
+ *count = results->count;
+ *time = results->time;
+ length = MIN(groupSet->groups[groupSet->activeGroup].numberOfEvents, *nr_events);
+ for(int i=0;i<length;i++)
{
- for ( int i=perfmon_numCountersCore; i<perfmon_numCounters; i++ )
- {
- bitMask_test(res,counterMask,i);
- if ( res )
- {
- if ( perfmon_counter_map[i].type == POWER )
- {
- if (PMcounters[i] >= results->StartPMcounters[i])
- {
- results->PMcounters[i] += power_info.energyUnit *
- (PMcounters[i] - results->StartPMcounters[i]);
- }
- else
- {
- results->PMcounters[i] += power_info.energyUnit *
- (((double)0xFFFFFFFF) - results->StartPMcounters[i] + PMcounters[i]);
- }
- }
- else
- {
- results->PMcounters[i] += (PMcounters[i] - results->StartPMcounters[i]);
- }
- }
- }
+ events[i] = results->PMcounters[i];
}
+ *nr_events = length;
+ bdestroy(tag);
+ return;
}
+
int likwid_getProcessorId()
{
+ int i;
cpu_set_t cpu_set;
CPU_ZERO(&cpu_set);
sched_getaffinity(gettid(),sizeof(cpu_set_t), &cpu_set);
-
- return getProcessorID(&cpu_set);
+ if (CPU_COUNT(&cpu_set) > 1)
+ {
+ return sched_getcpu();
+ }
+ else
+ {
+ return getProcessorID(&cpu_set);
+ }
+ return -1;
}
#ifdef HAS_SCHEDAFFINITY
@@ -735,7 +569,7 @@ int likwid_pinThread(int processorId)
if (ret != 0)
{
- ERROR;
+ ERROR_PRINT("ERROR: Pinning of thread to CPU %d failed\n", processorId);
return FALSE;
}
@@ -755,7 +589,7 @@ int likwid_pinProcess(int processorId)
if (ret < 0)
{
- ERROR;
+ ERROR_PRINT("ERROR: Pinning of process to CPU %d failed\n", processorId);
return FALSE;
}
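
Editor's note: the functions above make up the C Marker API. A minimal usage sketch; the required environment variables (LIKWID_MODE, LIKWID_EVENTS, LIKWID_THREADS, LIKWID_FILEPATH) are normally exported by likwid-perfctr -m, which also evaluates the file written by likwid_markerClose():

    #include <likwid.h>

    int main(void)
    {
        double s = 0.0;

        likwid_markerInit();                      /* once, in a serial region */
        likwid_markerThreadInit();                /* once per thread */
        likwid_markerRegisterRegion("compute");   /* optional: pre-populate the hash table */

        likwid_markerStartRegion("compute");
        for (int i = 0; i < 10000000; i++)        /* work to be measured */
            s += (double)i * 0.5;
        likwid_markerStopRegion("compute");

        likwid_markerClose();                     /* write results for likwid-perfctr */
        return (s > 0.0) ? 0 : 1;
    }
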
diff --git a/src/likwid.f90 b/src/likwid.f90
index 1215dd4..f7096e5 100644
--- a/src/likwid.f90
+++ b/src/likwid.f90
@@ -4,13 +4,14 @@
!
! Description: Marker API f90 module
!
-! Version: 3.1.3
-! Released: 4.11.2014
+! Version: 4.1
+! Released: 19.5.2016
!
-! Author: Jan Treibig (jt), jan.treibig at gmail.com
+! Authors: Jan Treibig (jt), jan.treibig at gmail.com,
+! Thomas Roehl (tr), thomas.roehl at googlemail.com
! Project: likwid
!
-! Copyright (C) 2014 Jan Treibig
+! Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
!
! This program is free software: you can redistribute it and/or modify it under
! the terms of the GNU General Public License as published by the Free Software
@@ -26,28 +27,95 @@
!
! =======================================================================================
+!> \defgroup Fortran_Interface Likwid Fortran90 Module
-
+!> \ingroup Fortran_Interface
+!> Likwid Fortran90 Module for embedding the Marker API into Fortran applications
+!> In the basic configuration, the module is compiled with the Intel Fortran Compiler.
module likwid
interface
- subroutine likwid_markerInit()
- end subroutine likwid_markerInit
+!> \ingroup Fortran_Interface
+!> \brief Initialize the Likwid Marker API
+!! This routine initializes the Marker API for Fortran. It reads some
+!! environment variables commonly set by likwid-perfctr.
+!! \note Must be called once in a serial region.
+ subroutine likwid_markerInit()
+ end subroutine likwid_markerInit
+
+!> \ingroup Fortran_Interface
+!> \brief Add current thread to Likwid for Marker API measurements
+!! This routine adds the current thread to Likwid that it performs measurements
+!! for this thread. If using the daemon access mode, it starts a deamon for the
+!! current thread.
+!! \note Must be called once in a parallel region.
+ subroutine likwid_markerThreadInit()
+ end subroutine likwid_markerThreadInit
+
+!> \ingroup Fortran_Interface
+!> \brief Setup performance counters for the next event set
+!> If multiple groups should be measured, this function switches to the
+!> next group in a round-robin fashion.
+!> Each call reprograms the performance counters for the current CPU.
+!> \note Do not call it while measuring a code region.
+ subroutine likwid_markerNextGroup()
+ end subroutine likwid_markerNextGroup
+
+!> \ingroup Fortran_Interface
+!> \brief Close the Likwid Marker API
+!> Close the Likwid Marker API and write the measured results to a temporary
+!> file for evaluation by likwid-perfctr.
+!> \note Must be called once in a serial region and no further
+!> Likwid calls should be used
+ subroutine likwid_markerClose()
+ end subroutine likwid_markerClose
+
+!> \ingroup Fortran_Interface
+!> \brief Register a code region
+!> Initializes the hash table with an empty entry to reduce the overhead
+!> at likwid_markerStartRegion()
+ subroutine likwid_markerRegisterRegion( regionTag )
+!> \param regionTag Name for the code region for later identification
+ character(*) :: regionTag
+ end subroutine likwid_markerRegisterRegion
- subroutine likwid_markerThreadInit()
- end subroutine likwid_markerThreadInit
- subroutine likwid_markerClose()
- end subroutine likwid_markerClose
+!> \ingroup Fortran_Interface
+!> \brief Start the measurement for a code region
+!> Reads the currently running event set and stores the results as start values
+!> for the measurement group identified by regionTag.
+ subroutine likwid_markerStartRegion( regionTag )
+!> \param regionTag Name for the code region for later identification
+ character(*) :: regionTag
+ end subroutine likwid_markerStartRegion
- subroutine likwid_markerStartRegion( regionTag )
- character(*) :: regionTag
- end subroutine likwid_markerStartRegion
+!> \ingroup Fortran_Interface
+!> \brief Stop the measurement for a code region
+!> Reads the currently running event set and accumulates the difference between
+!> stop and start data in the measurement group identified by regionTag.
+ subroutine likwid_markerStopRegion( regionTag )
+!> \param regionTag Name for the code region for later identification
+ character(*) :: regionTag
+ end subroutine likwid_markerStopRegion
- subroutine likwid_markerStopRegion( regionTag )
- character(*) :: regionTag
- end subroutine likwid_markerStopRegion
+!> \ingroup Fortran_Interface
+!> \brief Get accumulated measurement results for a code region
+!> Get the accumulated data in the measurement group identified by regionTag
+!> for the current thread.
+!> \warning Experimental
+ subroutine likwid_markerGetRegion( regionTag, nr_events, events, time, count )
+!> \param regionTag [in] Name for the code region for later identification
+!> \param nr_events [in,out] Length of the events array
+!> \param events [out] Events array to store intermediate results
+!> \param time [out] Accumulated measurement time
+!> \param count [out] Call count of the region
+ character(*) :: regionTag
+ INTEGER :: nr_events
+ DOUBLE PRECISION, DIMENSION(*) :: events
+ DOUBLE PRECISION :: time
+ INTEGER :: count
+ end subroutine likwid_markerGetRegion
end interface
diff --git a/src/likwid_f90_interface.c b/src/likwid_f90_interface.c
index 31bad92..51285ec 100644
--- a/src/likwid_f90_interface.c
+++ b/src/likwid_f90_interface.c
@@ -5,13 +5,14 @@
*
* Description: F90 interface for marker API
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com,
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -33,22 +34,44 @@
#include <likwid.h>
-void likwid_markerinit_(void)
+void __attribute__ ((visibility ("default") )) likwid_markerinit_(void)
{
likwid_markerInit();
}
-void likwid_markerthreadinit_(void)
+void __attribute__ ((visibility ("default") )) likwid_markerthreadinit_(void)
{
likwid_markerThreadInit();
}
-void likwid_markerclose_(void)
+void __attribute__ ((visibility ("default") )) likwid_markerclose_(void)
{
likwid_markerClose();
}
-void likwid_markerstartregion_(char* regionTag, int len)
+void __attribute__ ((visibility ("default") )) likwid_markernextgroup_(void)
+{
+ likwid_markerNextGroup();
+}
+
+void __attribute__ ((visibility ("default") )) likwid_markerregisterregion_(char* regionTag, int len)
+{
+ char* tmp = (char*) malloc((len+1) * sizeof(char) );
+ strncpy(tmp, regionTag, len * sizeof(char) );
+ tmp[len] = '\0';
+
+ for (int i=(len-1); i >= 0; i--)
+ {
+ if (tmp[i] != ' ') {
+ tmp[i+1] = 0;
+ break;
+ }
+ }
+
+ likwid_markerRegisterRegion( tmp );
+ free(tmp);
+}
+
+void __attribute__ ((visibility ("default") )) likwid_markerstartregion_(char* regionTag, int len)
{
char* tmp = (char*) malloc((len+1) * sizeof(char) );
strncpy(tmp, regionTag, len * sizeof(char) );
@@ -65,7 +88,7 @@ void likwid_markerstartregion_(char* regionTag, int len)
free(tmp);
}
-void likwid_markerstopregion_(char* regionTag, int len)
+void __attribute__ ((visibility ("default") )) likwid_markerstopregion_(char* regionTag, int len)
{
char* tmp = (char*) malloc((len+1) * sizeof(char));
strncpy(tmp, regionTag, len * sizeof(char) );
@@ -82,3 +105,19 @@ void likwid_markerstopregion_(char* regionTag, int len)
free(tmp);
}
+void __attribute__ ((visibility ("default") )) likwid_markergetregion_(char* regionTag, int* nr_events, double* events, double *time, int *count, int len)
+{
+ char* tmp = (char*) malloc((len+1) * sizeof(char));
+ strncpy(tmp, regionTag, len * sizeof(char) );
+ tmp[len] = '\0';
+
+ for (int i=(len-1); i >= 0; i--)
+ {
+ if (tmp[i] != ' ') {
+ tmp[i+1] = 0;
+ break;
+ }
+ }
+ likwid_markerGetRegion( tmp, nr_events, events, time, count);
+ free(tmp);
+}
+
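
Editor's note: the Fortran wrappers above receive CHARACTER arguments as a pointer plus a hidden length, and the strings are blank-padded, so each wrapper copies the bytes and cuts the trailing spaces before calling the C Marker API. A standalone sketch of that conversion step:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static char* fortran_to_cstring(const char* ftn, int len)
    {
        char* tmp = malloc((size_t)len + 1);
        if (tmp == NULL)
            return NULL;
        memcpy(tmp, ftn, (size_t)len);
        tmp[len] = '\0';
        for (int i = len - 1; i >= 0 && tmp[i] == ' '; i--)
            tmp[i] = '\0';             /* strip the blank padding */
        return tmp;
    }

    int main(void)
    {
        char* tag = fortran_to_cstring("compute   ", 10);
        printf("'%s'\n", tag);         /* prints 'compute' */
        free(tag);
        return 0;
    }
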
diff --git a/src/loadData.S b/src/loadData.S
new file mode 100644
index 0000000..86de4d6
--- /dev/null
+++ b/src/loadData.S
@@ -0,0 +1,44 @@
+.intel_syntax noprefix
+
+.text
+.globl _loadData
+.type _loadData, @function
+_loadData :
+#ifdef __x86_64
+xor rax, rax
+.align 16
+1:
+mov r8, [rsi + rax]
+mov r9, [rsi + rax + 64]
+mov r10, [rsi + rax + 128]
+mov r11, [rsi + rax + 192]
+add rax, 256
+cmp rax, rdi
+jb 1b
+
+ret
+#else
+#ifdef __i386__
+push ebp
+mov ebp, esp
+push edi
+push esi
+xor eax, eax
+1:
+mov edi, DWORD PTR [ebp + eax + 12]
+mov esi, DWORD PTR [ebp + eax + 76]
+mov ecx, DWORD PTR [ebp + eax + 140]
+mov edx, DWORD PTR [ebp + eax + 204]
+add eax, 256
+cmp eax, DWORD PTR [ebp+8]
+jb 1b
+pop esi
+pop edi
+mov esp, ebp
+pop ebp
+ret
+#endif
+#endif
+.size _loadData, .-_loadData
+
+
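
Editor's note: from the x86-64 body of _loadData, rdi carries the byte count and rsi the buffer address, and the loop issues one 64-bit load per 64-byte cache line (four lines per 256-byte iteration), so the routine can be used to pull a buffer into the cache hierarchy. A C-side sketch with an assumed prototype (not taken from a header in this patch):

    #include <stdint.h>
    #include <stdlib.h>

    extern void _loadData(uint64_t size, const void* buffer);  /* assumed prototype */

    int main(void)
    {
        size_t size = 4096 * 64;       /* keep it a multiple of the 256-byte step */
        void* buf = malloc(size);
        if (buf == NULL)
            return 1;
        _loadData(size, buf);          /* touch every cache line of buf */
        free(buf);
        return 0;
    }
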
diff --git a/src/loadData.s b/src/loadData.s
deleted file mode 100644
index e176c53..0000000
--- a/src/loadData.s
+++ /dev/null
@@ -1,22 +0,0 @@
-.intel_syntax noprefix
-
-.text
-.globl _loadData
-.type _loadData, @function
-_loadData :
-
-xor rax, rax
-.align 16
-1:
-mov r8, [rsi + rax]
-mov r9, [rsi + rax + 64]
-mov r10, [rsi + rax + 128]
-mov r11, [rsi + rax + 192]
-add rax, 256
-cmp rax, rdi
-jb 1b
-
-ret
-.size _loadData, .-_loadData
-
-
diff --git a/src/loadData.s.tmp b/src/loadData.s.tmp
deleted file mode 100644
index e69de29..0000000
diff --git a/src/luawid.c b/src/luawid.c
new file mode 100644
index 0000000..6e5ced8
--- /dev/null
+++ b/src/luawid.c
@@ -0,0 +1,2334 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: luawid.c
+ *
+ * Description: C part of the Likwid Lua interface
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <time.h>
+#include <sched.h>
+
+#include <lua.h> /* Always include this */
+#include <lauxlib.h> /* Always include this */
+#include <lualib.h> /* Always include this */
+
+#include <likwid.h>
+#include <tree.h>
+#include <access.h>
+#include <bstrlib.h>
+
+#ifdef COLOR
+#include <textcolor.h>
+#endif
+
+static int topology_isInitialized = 0;
+CpuInfo_t cpuinfo = NULL;
+CpuTopology_t cputopo = NULL;
+
+static int numa_isInitialized = 0;
+NumaTopology_t numainfo = NULL;
+static int affinity_isInitialized = 0;
+AffinityDomains_t affinity = NULL;
+static int perfmon_isInitialized = 0;
+static int timer_isInitialized = 0;
+static int power_isInitialized = 0;
+PowerInfo_t power;
+static int power_hasRAPL = 0;
+static int config_isInitialized = 0;
+Configuration_t configfile = NULL;
+
+
+static int lua_likwid_getConfiguration(lua_State* L)
+{
+ int ret = 0;
+ if (config_isInitialized == 0)
+ {
+ ret = init_configuration();
+ if (ret == 0)
+ {
+ config_isInitialized = 1;
+ configfile = get_configuration();
+ }
+ else
+ {
+ lua_newtable(L);
+ lua_pushstring(L, "configFile");
+ lua_pushnil(L);
+ lua_settable(L,-3);
+ lua_pushstring(L, "topologyFile");
+ lua_pushnil(L);
+ lua_settable(L,-3);
+ lua_pushstring(L, "daemonPath");
+ lua_pushnil(L);
+ lua_settable(L,-3);
+ lua_pushstring(L, "groupPath");
+ lua_pushnil(L);
+ lua_settable(L,-3);
+ lua_pushstring(L, "daemonMode");
+ lua_pushinteger(L, -1);
+ lua_settable(L,-3);
+ lua_pushstring(L, "maxNumThreads");
+ lua_pushinteger(L, 0);
+ lua_settable(L,-3);
+ lua_pushstring(L, "maxNumNodes");
+ lua_pushinteger(L, 0);
+ lua_settable(L,-3);
+ return 1;
+ }
+ }
+ if ((config_isInitialized) && (configfile == NULL))
+ {
+ configfile = get_configuration();
+ }
+ lua_newtable(L);
+ lua_pushstring(L, "configFile");
+ lua_pushstring(L, configfile->configFileName);
+ lua_settable(L,-3);
+ lua_pushstring(L, "topologyFile");
+ lua_pushstring(L, configfile->topologyCfgFileName);
+ lua_settable(L,-3);
+ lua_pushstring(L, "daemonPath");
+ lua_pushstring(L, configfile->daemonPath);
+ lua_settable(L,-3);
+ lua_pushstring(L, "groupPath");
+ lua_pushstring(L, configfile->groupPath);
+ lua_settable(L,-3);
+ lua_pushstring(L, "daemonMode");
+ lua_pushinteger(L, (int)configfile->daemonMode);
+ lua_settable(L,-3);
+ lua_pushstring(L, "maxNumThreads");
+ lua_pushinteger(L, configfile->maxNumThreads);
+ lua_settable(L,-3);
+ lua_pushstring(L, "maxNumNodes");
+ lua_pushinteger(L, configfile->maxNumNodes);
+ lua_settable(L,-3);
+ return 1;
+}
+
+static int lua_likwid_putConfiguration(lua_State* L)
+{
+ if (config_isInitialized == 1)
+ {
+ destroy_configuration();
+ config_isInitialized = 0;
+ configfile = NULL;
+ }
+ return 0;
+}
+
+static int lua_likwid_setGroupPath(lua_State* L)
+{
+ int ret;
+ const char* tmpString;
+ if (config_isInitialized == 0)
+ {
+ ret = init_configuration();
+ if (ret == 0)
+ {
+ config_isInitialized = 1;
+ }
+ }
+ tmpString = luaL_checkstring(L, 1);
+ ret = config_setGroupPath((char*)tmpString);
+ if (ret < 0)
+ {
+ lua_pushstring(L,"Cannot set group path");
+ lua_error(L);
+ }
+ return 0;
+}
+
+static int lua_likwid_setAccessMode(lua_State* L)
+{
+ int flag;
+ flag = luaL_checknumber(L,1);
+ luaL_argcheck(L, flag >= 0 && flag <= 1, 1, "invalid access mode, only 0 (direct) and 1 (accessdaemon) allowed");
+ HPMmode(flag);
+ lua_pushinteger(L,0);
+ return 1;
+}
+
+static int lua_likwid_init(lua_State* L)
+{
+ int ret;
+ int nrThreads = luaL_checknumber(L,1);
+ luaL_argcheck(L, nrThreads > 0, 1, "CPU count must be greater than 0");
+ int cpus[nrThreads];
+ if (!lua_istable(L, -1)) {
+ lua_pushstring(L,"No table given as second argument");
+ lua_error(L);
+ }
+ for (ret = 1; ret<=nrThreads; ret++)
+ {
+ lua_rawgeti(L,-1,ret);
+ cpus[ret-1] = ((lua_Unsigned)lua_tointegerx(L,-1, NULL));
+ lua_pop(L,1);
+ }
+ if (topology_isInitialized == 0)
+ {
+ topology_init();
+ topology_isInitialized = 1;
+ cpuinfo = get_cpuInfo();
+ cputopo = get_cpuTopology();
+ }
+ if ((topology_isInitialized) && (cpuinfo == NULL))
+ {
+ cpuinfo = get_cpuInfo();
+ }
+ if ((topology_isInitialized) && (cputopo == NULL))
+ {
+ cputopo = get_cpuTopology();
+ }
+ if (numa_isInitialized == 0)
+ {
+ numa_init();
+ numa_isInitialized = 1;
+ numainfo = get_numaTopology();
+ }
+ if ((numa_isInitialized) && (numainfo == NULL))
+ {
+ numainfo = get_numaTopology();
+ }
+ if (timer_isInitialized == 0)
+ {
+ timer_init();
+ timer_isInitialized = 1;
+ }
+ if (perfmon_isInitialized == 0)
+ {
+ ret = perfmon_init(nrThreads, &(cpus[0]));
+ if (ret != 0)
+ {
+ lua_pushstring(L,"Cannot initialize likwid perfmon");
+ perfmon_finalize();
+ lua_pushinteger(L,ret);
+ return 1;
+ }
+ perfmon_isInitialized = 1;
+ timer_isInitialized = 1;
+ lua_pushinteger(L,ret);
+ }
+ return 1;
+}
+
+
+static int lua_likwid_addEventSet(lua_State* L)
+{
+ int groupId, n;
+ const char* tmpString;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ n = lua_gettop(L);
+ tmpString = luaL_checkstring(L, n);
+ luaL_argcheck(L, strlen(tmpString) > 0, n, "Event string must not be empty");

+
+ groupId = perfmon_addEventSet((char*)tmpString);
+ lua_pushinteger(L, groupId+1);
+ return 1;
+}
+
+static int lua_likwid_setupCounters(lua_State* L)
+{
+ int ret;
+ int groupId = lua_tonumber(L,1);
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ ret = perfmon_setupCounters(groupId-1);
+ lua_pushinteger(L,ret);
+ return 1;
+}
+
+
+static int lua_likwid_startCounters(lua_State* L)
+{
+ int ret;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ ret = perfmon_startCounters();
+ lua_pushinteger(L,ret);
+ return 1;
+}
+
+static int lua_likwid_stopCounters(lua_State* L)
+{
+ int ret;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ ret = perfmon_stopCounters();
+ lua_pushinteger(L,ret);
+ return 1;
+}
+
+static int lua_likwid_readCounters(lua_State* L)
+{
+ int ret;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ ret = perfmon_readCounters();
+ lua_pushinteger(L,ret);
+ return 1;
+}
+
+static int lua_likwid_switchGroup(lua_State* L)
+{
+ int ret = -1;
+ int newgroup = lua_tonumber(L,1)-1;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ if (newgroup >= perfmon_getNumberOfGroups())
+ {
+ newgroup = 0;
+ }
+ if (newgroup == perfmon_getIdOfActiveGroup())
+ {
+ lua_pushinteger(L, ret);
+ return 1;
+ }
+ ret = perfmon_switchActiveGroup(newgroup);
+ lua_pushinteger(L, ret);
+ return 1;
+}
+
+static int lua_likwid_finalize(lua_State* L)
+{
+ if (perfmon_isInitialized == 1)
+ {
+ perfmon_finalize();
+ perfmon_isInitialized = 0;
+ }
+ if (affinity_isInitialized == 1)
+ {
+ affinity_finalize();
+ affinity_isInitialized = 0;
+ affinity = NULL;
+ }
+ if (numa_isInitialized == 1)
+ {
+ numa_finalize();
+ numa_isInitialized = 0;
+ numainfo = NULL;
+ }
+ if (topology_isInitialized == 1)
+ {
+ topology_finalize();
+ topology_isInitialized = 0;
+ cputopo = NULL;
+ cpuinfo = NULL;
+ }
+ if (timer_isInitialized == 1)
+ {
+ timer_finalize();
+ timer_isInitialized = 0;
+ }
+ if (config_isInitialized == 1)
+ {
+ destroy_configuration();
+ config_isInitialized = 0;
+ configfile = NULL;
+ }
+ return 0;
+}
+
+static int lua_likwid_getResult(lua_State* L)
+{
+ int groupId, eventId, threadId;
+ double result = 0;
+ groupId = lua_tonumber(L,1);
+ eventId = lua_tonumber(L,2);
+ threadId = lua_tonumber(L,3);
+ result = perfmon_getResult(groupId-1, eventId-1, threadId-1);
+ lua_pushnumber(L,result);
+ return 1;
+}
+
+static int lua_likwid_getLastResult(lua_State* L)
+{
+ int groupId, eventId, threadId;
+ double result = 0;
+ groupId = lua_tonumber(L,1);
+ eventId = lua_tonumber(L,2);
+ threadId = lua_tonumber(L,3);
+ result = perfmon_getLastResult(groupId-1, eventId-1, threadId-1);
+ lua_pushnumber(L,result);
+ return 1;
+}
+
+static int lua_likwid_getMetric(lua_State* L)
+{
+ int groupId, metricId, threadId;
+ double result = 0;
+ groupId = lua_tonumber(L,1);
+ metricId = lua_tonumber(L,2);
+ threadId = lua_tonumber(L,3);
+ result = perfmon_getMetric(groupId-1, metricId-1, threadId-1);
+ lua_pushnumber(L,result);
+ return 1;
+}
+
+static int lua_likwid_getLastMetric(lua_State* L)
+{
+ int groupId, metricId, threadId;
+ double result = 0;
+ groupId = lua_tonumber(L,1);
+ metricId = lua_tonumber(L,2);
+ threadId = lua_tonumber(L,3);
+ result = perfmon_getLastMetric(groupId-1, metricId-1, threadId-1);
+ lua_pushnumber(L,result);
+ return 1;
+}
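
The result accessors above follow one index convention: the Lua side uses 1-based group, event, metric and thread indices, while the perfmon C API is 0-based, so every wrapper subtracts 1 on the way in (and functions such as likwid_addEventSet or likwid_getIdOfActiveGroup add 1 on the way out). Two hypothetical helpers that do nothing but name this convention:

    /* Purely illustrative; the wrappers above inline the +/-1 instead. */
    static inline int lua_to_c_index(int luaIndex) { return luaIndex - 1; }
    static inline int c_to_lua_index(int cIndex)   { return cIndex + 1; }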
+
+static int lua_likwid_getNumberOfGroups(lua_State* L)
+{
+ int number;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ number = perfmon_getNumberOfGroups();
+ lua_pushinteger(L,number);
+ return 1;
+}
+
+static int lua_likwid_getIdOfActiveGroup(lua_State* L)
+{
+ int number;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ number = perfmon_getIdOfActiveGroup();
+ lua_pushinteger(L,number+1);
+ return 1;
+}
+
+static int lua_likwid_getRuntimeOfGroup(lua_State* L)
+{
+ double time;
+ int groupId;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ groupId = lua_tonumber(L,1);
+ time = perfmon_getTimeOfGroup(groupId-1);
+ lua_pushnumber(L, time);
+ return 1;
+}
+
+static int lua_likwid_getNumberOfEvents(lua_State* L)
+{
+ int number, groupId;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ groupId = lua_tonumber(L,1);
+ number = perfmon_getNumberOfEvents(groupId-1);
+ lua_pushinteger(L,number);
+ return 1;
+}
+
+static int lua_likwid_getNumberOfThreads(lua_State* L)
+{
+ int number;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ number = perfmon_getNumberOfThreads();
+ lua_pushinteger(L,number);
+ return 1;
+}
+
+static int lua_likwid_getNameOfEvent(lua_State* L)
+{
+ int eventId, groupId;
+ char* tmp;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ groupId = lua_tonumber(L,1);
+ eventId = lua_tonumber(L,2);
+ tmp = perfmon_getEventName(groupId-1, eventId-1);
+ lua_pushstring(L,tmp);
+ return 1;
+}
+
+static int lua_likwid_getNameOfCounter(lua_State* L)
+{
+ int eventId, groupId;
+ char* tmp;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ groupId = lua_tonumber(L,1);
+ eventId = lua_tonumber(L,2);
+ tmp = perfmon_getCounterName(groupId-1, eventId-1);
+ lua_pushstring(L,tmp);
+ return 1;
+}
+
+static int lua_likwid_getNumberOfMetrics(lua_State* L)
+{
+ int number, groupId;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ groupId = lua_tonumber(L,1);
+ number = perfmon_getNumberOfMetrics(groupId-1);
+ lua_pushinteger(L,number);
+ return 1;
+}
+
+static int lua_likwid_getNameOfMetric(lua_State* L)
+{
+ int metricId, groupId;
+ char* tmp;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ groupId = lua_tonumber(L,1);
+ metricId = lua_tonumber(L,2);
+ tmp = perfmon_getMetricName(groupId-1, metricId-1);
+ lua_pushstring(L,tmp);
+ return 1;
+}
+
+static int lua_likwid_getNameOfGroup(lua_State* L)
+{
+ int groupId;
+ char* tmp;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ groupId = lua_tonumber(L,1);
+ tmp = perfmon_getGroupName(groupId-1);
+ lua_pushstring(L,tmp);
+ return 1;
+}
+
+static int lua_likwid_getShortInfoOfGroup(lua_State* L)
+{
+ int groupId;
+ char* tmp;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ groupId = lua_tonumber(L,1);
+ tmp = perfmon_getGroupInfoShort(groupId-1);
+ lua_pushstring(L,tmp);
+ return 1;
+}
+
+static int lua_likwid_getLongInfoOfGroup(lua_State* L)
+{
+ int groupId;
+ char* tmp;
+ if (perfmon_isInitialized == 0)
+ {
+ return 0;
+ }
+ groupId = lua_tonumber(L,1);
+ tmp = perfmon_getGroupInfoLong(groupId-1);
+ lua_pushstring(L,tmp);
+ return 1;
+}
+
+static int lua_likwid_getGroups(lua_State* L)
+{
+ int i, ret;
+ char** tmp, **infos, **longs;
+ if (topology_isInitialized == 0)
+ {
+ topology_init();
+ }
+ ret = perfmon_getGroups(&tmp, &infos, &longs);
+ if (ret > 0)
+ {
+ lua_newtable(L);
+ for (i=0;i<ret;i++)
+ {
+ lua_pushinteger(L, (lua_Integer)( i+1));
+ lua_newtable(L);
+ lua_pushstring(L, "Name");
+ lua_pushstring(L, tmp[i]);
+ lua_settable(L,-3);
+ lua_pushstring(L, "Info");
+ lua_pushstring(L, infos[i]);
+ lua_settable(L,-3);
+ lua_pushstring(L, "Long");
+ lua_pushstring(L, longs[i]);
+ lua_settable(L,-3);
+ lua_settable(L,-3);
+ }
+ perfmon_returnGroups(ret, tmp, infos, longs);
+ return 1;
+ }
+ return 0;
+}
+
+
+static int lua_likwid_printSupportedCPUs(lua_State* L)
+{
+ print_supportedCPUs();
+ return 0;
+}
+
+static int lua_likwid_getCpuInfo(lua_State* L)
+{
+ if (topology_isInitialized == 0)
+ {
+ topology_init();
+ topology_isInitialized = 1;
+ cpuinfo = get_cpuInfo();
+ }
+ if ((topology_isInitialized) && (cpuinfo == NULL))
+ {
+ cpuinfo = get_cpuInfo();
+ }
+ lua_newtable(L);
+ lua_pushstring(L,"family");
+ lua_pushinteger(L, (lua_Integer)(cpuinfo->family));
+ lua_settable(L,-3);
+ lua_pushstring(L,"model");
+ lua_pushinteger(L, (lua_Integer)(cpuinfo->model));
+ lua_settable(L,-3);
+ lua_pushstring(L,"stepping");
+ lua_pushinteger(L, (lua_Integer)(cpuinfo->stepping));
+ lua_settable(L,-3);
+ lua_pushstring(L,"clock");
+ lua_pushinteger(L, (lua_Integer)(cpuinfo->clock));
+ lua_settable(L,-3);
+ lua_pushstring(L,"turbo");
+ lua_pushinteger(L,cpuinfo->turbo);
+ lua_settable(L,-3);
+ lua_pushstring(L,"name");
+ lua_pushstring(L,cpuinfo->name);
+ lua_settable(L,-3);
+ lua_pushstring(L,"osname");
+ lua_pushstring(L,cpuinfo->osname);
+ lua_settable(L,-3);
+ lua_pushstring(L,"short_name");
+ lua_pushstring(L,cpuinfo->short_name);
+ lua_settable(L,-3);
+ lua_pushstring(L,"features");
+ lua_pushstring(L,cpuinfo->features);
+ lua_settable(L,-3);
+ lua_pushstring(L,"isIntel");
+ lua_pushinteger(L,cpuinfo->isIntel);
+ lua_settable(L,-3);
+ lua_pushstring(L,"featureFlags");
+ lua_pushinteger(L, (lua_Integer)(cpuinfo->featureFlags));
+ lua_settable(L,-3);
+ lua_pushstring(L,"perf_version");
+ lua_pushinteger(L, (lua_Integer)( cpuinfo->perf_version));
+ lua_settable(L,-3);
+ lua_pushstring(L,"perf_num_ctr");
+ lua_pushinteger(L, (lua_Integer)(cpuinfo->perf_num_ctr));
+ lua_settable(L,-3);
+ lua_pushstring(L,"perf_width_ctr");
+ lua_pushinteger(L, (lua_Integer)(cpuinfo->perf_width_ctr));
+ lua_settable(L,-3);
+ lua_pushstring(L,"perf_num_fixed_ctr");
+ lua_pushinteger(L, (lua_Integer)(cpuinfo->perf_num_fixed_ctr));
+ lua_settable(L,-3);
+ return 1;
+}
+
+static int lua_likwid_getCpuTopology(lua_State* L)
+{
+ int i;
+ TreeNode* socketNode;
+ int socketCount = 0;
+ TreeNode* coreNode;
+ int coreCount = 0;
+ TreeNode* threadNode;
+ int threadCount = 0;
+ if (topology_isInitialized == 0)
+ {
+ topology_init();
+ topology_isInitialized = 1;
+ cputopo = get_cpuTopology();
+ }
+ if ((topology_isInitialized) && (cputopo == NULL))
+ {
+ cputopo = get_cpuTopology();
+ }
+ if (numa_isInitialized == 0)
+ {
+ if (numa_init() == 0)
+ {
+ numa_isInitialized = 1;
+ numainfo = get_numaTopology();
+ }
+ }
+ if ((numa_isInitialized) && (numainfo == NULL))
+ {
+ numainfo = get_numaTopology();
+ }
+
+ lua_newtable(L);
+
+ lua_pushstring(L,"numHWThreads");
+ lua_pushinteger(L, (lua_Integer)(cputopo->numHWThreads));
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"activeHWThreads");
+ lua_pushinteger(L, (lua_Integer)(cputopo->activeHWThreads));
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"numSockets");
+ lua_pushinteger(L, (lua_Integer)(cputopo->numSockets));
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"numCoresPerSocket");
+ lua_pushinteger(L, (lua_Integer)(cputopo->numCoresPerSocket));
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"numThreadsPerCore");
+ lua_pushinteger(L, (lua_Integer)(cputopo->numThreadsPerCore));
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"numCacheLevels");
+ lua_pushinteger(L,cputopo->numCacheLevels);
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"threadPool");
+ lua_newtable(L);
+ for(i=0;i<cputopo->numHWThreads;i++)
+ {
+ lua_pushnumber(L,i);
+ lua_newtable(L);
+ lua_pushstring(L,"threadId");
+ lua_pushinteger(L, (lua_Integer)(cputopo->threadPool[i].threadId));
+ lua_settable(L,-3);
+ lua_pushstring(L,"coreId");
+ lua_pushinteger(L, (lua_Integer)(cputopo->threadPool[i].coreId));
+ lua_settable(L,-3);
+ lua_pushstring(L,"packageId");
+ lua_pushinteger(L, (lua_Integer)(cputopo->threadPool[i].packageId));
+ lua_settable(L,-3);
+ lua_pushstring(L,"apicId");
+ lua_pushinteger(L, (lua_Integer)(cputopo->threadPool[i].apicId));
+ lua_settable(L,-3);
+ lua_pushstring(L,"inCpuSet");
+ lua_pushinteger(L, (lua_Integer)(cputopo->threadPool[i].inCpuSet));
+ lua_settable(L,-3);
+ lua_settable(L,-3);
+ }
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"cacheLevels");
+ lua_newtable(L);
+ for(i=0;i<cputopo->numCacheLevels;i++)
+ {
+ lua_pushnumber(L,i+1);
+ lua_newtable(L);
+
+ lua_pushstring(L,"level");
+ lua_pushinteger(L, (lua_Integer)(cputopo->cacheLevels[i].level));
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"associativity");
+ lua_pushinteger(L, (lua_Integer)(cputopo->cacheLevels[i].associativity));
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"sets");
+ lua_pushinteger(L, (lua_Integer)(cputopo->cacheLevels[i].sets));
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"lineSize");
+ lua_pushinteger(L, (lua_Integer)(cputopo->cacheLevels[i].lineSize));
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"size");
+ lua_pushinteger(L, (lua_Integer)(cputopo->cacheLevels[i].size));
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"threads");
+ lua_pushinteger(L, (lua_Integer)(cputopo->cacheLevels[i].threads));
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"inclusive");
+ lua_pushinteger(L, (lua_Integer)(cputopo->cacheLevels[i].inclusive));
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"type");
+ switch (cputopo->cacheLevels[i].type)
+ {
+ case DATACACHE:
+ lua_pushstring(L,"DATACACHE");
+ break;
+ case INSTRUCTIONCACHE:
+ lua_pushstring(L,"INSTRUCTIONCACHE");
+ break;
+ case UNIFIEDCACHE:
+ lua_pushstring(L,"UNIFIEDCACHE");
+ break;
+ case ITLB:
+ lua_pushstring(L,"ITLB");
+ break;
+ case DTLB:
+ lua_pushstring(L,"DTLB");
+ break;
+ case NOCACHE:
+ default:
+ lua_pushstring(L,"NOCACHE");
+ break;
+ }
+ lua_settable(L,-3);
+ lua_settable(L,-3);
+ }
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"topologyTree");
+ lua_newtable(L);
+
+ socketNode = tree_getChildNode(cputopo->topologyTree);
+ while (socketNode != NULL)
+ {
+ lua_pushinteger(L, socketCount);
+ lua_newtable(L);
+ lua_pushstring(L, "ID");
+ lua_pushinteger(L, (lua_Integer)(socketNode->id));
+ lua_settable(L, -3);
+ lua_pushstring(L, "Childs");
+ lua_newtable(L);
+ coreCount = 0;
+ coreNode = tree_getChildNode(socketNode);
+ while (coreNode != NULL)
+ {
+ lua_pushinteger(L, coreCount);
+ lua_newtable(L);
+ lua_pushstring(L, "ID");
+ lua_pushinteger(L, (lua_Integer)(coreNode->id));
+ lua_settable(L,-3);
+ lua_pushstring(L, "Childs");
+ lua_newtable(L);
+ threadNode = tree_getChildNode(coreNode);
+ threadCount = 0;
+ while (threadNode != NULL)
+ {
+ lua_pushinteger(L, (lua_Integer)(threadCount));
+ lua_pushinteger(L, (lua_Integer)(threadNode->id));
+ lua_settable(L,-3);
+ threadNode = tree_getNextNode(threadNode);
+ threadCount++;
+ }
+ lua_settable(L,-3);
+ coreNode = tree_getNextNode(coreNode);
+ coreCount++;
+ lua_settable(L,-3);
+ }
+ lua_settable(L,-3);
+ socketNode = tree_getNextNode(socketNode);
+ socketCount++;
+ lua_settable(L,-3);
+ }
+ lua_settable(L,-3);
+ return 1;
+}
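
lua_likwid_getCpuTopology() nests tables up to three levels deep; each lua_settable(L,-3) closes the innermost open key/value pair, so the calls appear in inside-out order. Note also that the threadPool entries are keyed from 0 while the cacheLevels entries are keyed from 1. A short two-level sketch with an invented field name:

    /* Builds outer[i+1] = { id = i } for i = 0..n-1 (illustration only). */
    static void push_two_level_table(lua_State* L, int n)
    {
        lua_newtable(L);                   /* outer table                */
        for (int i = 0; i < n; i++)
        {
            lua_pushinteger(L, i + 1);     /* outer key (1-based)        */
            lua_newtable(L);               /* inner table                */
            lua_pushstring(L, "id");
            lua_pushinteger(L, i);
            lua_settable(L, -3);           /* closes the inner pair      */
            lua_settable(L, -3);           /* closes the outer pair      */
        }
    }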
+
+static int lua_likwid_putTopology(lua_State* L)
+{
+ if (topology_isInitialized == 1)
+ {
+ topology_finalize();
+ topology_isInitialized = 0;
+ cpuinfo = NULL;
+ cputopo = NULL;
+ }
+ return 0;
+}
+
+
+static int lua_likwid_getEventsAndCounters(lua_State* L)
+{
+ int i;
+
+ if (topology_isInitialized == 0)
+ {
+ topology_init();
+ topology_isInitialized = 1;
+ cpuinfo = get_cpuInfo();
+ }
+ if ((topology_isInitialized) && (cpuinfo == NULL))
+ {
+ cpuinfo = get_cpuInfo();
+ }
+ perfmon_init_maps();
+ lua_newtable(L);
+ lua_pushstring(L,"Counters");
+ lua_newtable(L);
+ for(i=1;i<=perfmon_numCounters;i++)
+ {
+ bstring optString = bfromcstr("");
+ lua_pushinteger(L, (lua_Integer)(i));
+ lua_newtable(L);
+ lua_pushstring(L,"Name");
+ lua_pushstring(L,counter_map[i-1].key);
+ lua_settable(L,-3);
+ lua_pushstring(L,"Options");
+ for(int j=1; j<NUM_EVENT_OPTIONS; j++)
+ {
+ if (counter_map[i-1].optionMask & REG_TYPE_MASK(j))
+ {
+ bstring tmp = bformat("%s|", eventOptionTypeName[j]);
+ bconcat(optString, tmp);
+ bdestroy(tmp);
+ }
+ }
+ lua_pushstring(L,bdata(optString));
+ lua_settable(L,-3);
+ lua_pushstring(L,"Type");
+ lua_pushinteger(L, (lua_Integer)( counter_map[i-1].type));
+ lua_settable(L,-3);
+ lua_pushstring(L,"TypeName");
+ lua_pushstring(L, RegisterTypeNames[counter_map[i-1].type]);
+ lua_settable(L,-3);
+ lua_pushstring(L,"Index");
+ lua_pushinteger(L, (lua_Integer)(counter_map[i-1].index));
+ lua_settable(L,-3);
+ lua_settable(L,-3);
+ bdestroy(optString);
+ }
+ lua_settable(L,-3);
+ lua_pushstring(L,"Events");
+ lua_newtable(L);
+ for(i=1;i<=perfmon_numArchEvents;i++)
+ {
+ bstring optString = bfromcstr("");
+ lua_pushinteger(L, (lua_Integer)(i));
+ lua_newtable(L);
+ lua_pushstring(L,"Name");
+ lua_pushstring(L,eventHash[i-1].name);
+ lua_settable(L,-3);
+ lua_pushstring(L,"ID");
+ lua_pushinteger(L, (lua_Integer)(eventHash[i-1].eventId));
+ lua_settable(L,-3);
+ lua_pushstring(L,"UMask");
+ lua_pushinteger(L, (lua_Integer)(eventHash[i-1].umask));
+ lua_settable(L,-3);
+ lua_pushstring(L,"Limit");
+ lua_pushstring(L,eventHash[i-1].limit);
+ lua_settable(L,-3);
+ lua_pushstring(L,"Options");
+ for(int j=1; j<NUM_EVENT_OPTIONS; j++)
+ {
+ if (eventHash[i-1].optionMask & REG_TYPE_MASK(j))
+ {
+ bstring tmp = bformat("%s|", eventOptionTypeName[j]);
+ bconcat(optString, tmp);
+ bdestroy(tmp);
+ }
+ }
+ lua_pushstring(L,bdata(optString));
+ lua_settable(L,-3);
+ lua_settable(L,-3);
+ bdestroy(optString);
+ }
+ lua_settable(L,-3);
+ return 1;
+}
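
The "Options" strings above are assembled with likwid's bundled bstrlib: one temporary bstring per matching option name, appended as "NAME|" and destroyed again. A stand-alone sketch of that concatenation pattern (the names array is a placeholder):

    #include <bstrlib.h>

    /* Joins n names as "A|B|...|"; caller releases the result with bdestroy(). */
    static bstring build_option_string(const char** names, int n)
    {
        bstring opts = bfromcstr("");
        for (int i = 0; i < n; i++)
        {
            bstring tmp = bformat("%s|", names[i]);
            bconcat(opts, tmp);
            bdestroy(tmp);
        }
        return opts;
    }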
+
+static int lua_likwid_getOnlineDevices(lua_State* L)
+{
+ int i;
+ lua_newtable(L);
+ for(i=0;i<MAX_NUM_PCI_DEVICES;i++)
+ {
+ if (pci_devices[i].online)
+ {
+ lua_pushstring(L,pci_devices[i].likwid_name);
+ lua_newtable(L);
+ lua_pushstring(L, "Name");
+ lua_pushstring(L,pci_devices[i].name);
+ lua_settable(L,-3);
+ lua_pushstring(L, "Path");
+ lua_pushstring(L,pci_devices[i].path);
+ lua_settable(L,-3);
+ lua_pushstring(L, "Type");
+ lua_pushstring(L,pci_types[pci_devices[i].type].name);
+ lua_settable(L,-3);
+ lua_pushstring(L, "TypeDescription");
+ lua_pushstring(L,pci_types[pci_devices[i].type].desc);
+ lua_settable(L,-3);
+ lua_settable(L,-3);
+ }
+ }
+ return 1;
+}
+
+static int lua_likwid_getNumaInfo(lua_State* L)
+{
+ uint32_t i,j;
+ if (topology_isInitialized == 0)
+ {
+ topology_init();
+ topology_isInitialized = 1;
+ cpuinfo = get_cpuInfo();
+ cputopo = get_cpuTopology();
+ }
+ if ((topology_isInitialized) && (cpuinfo == NULL))
+ {
+ cpuinfo = get_cpuInfo();
+ }
+ if ((topology_isInitialized) && (cputopo == NULL))
+ {
+ cputopo = get_cpuTopology();
+ }
+ if (numa_isInitialized == 0)
+ {
+ if (numa_init() == 0)
+ {
+ numa_isInitialized = 1;
+ numainfo = get_numaTopology();
+ }
+ else
+ {
+ lua_newtable(L);
+ lua_pushstring(L,"numberOfNodes");
+ lua_pushinteger(L, (lua_Integer)(0));
+ lua_settable(L,-3);
+ lua_pushstring(L,"nodes");
+ lua_newtable(L);
+ lua_settable(L,-3);
+ return 1;
+ }
+ }
+ if ((numa_isInitialized) && (numainfo == NULL))
+ {
+ numainfo = get_numaTopology();
+ }
+ if (affinity_isInitialized == 0)
+ {
+ affinity_init();
+ affinity_isInitialized = 1;
+ affinity = get_affinityDomains();
+ }
+ if ((affinity_isInitialized) && (affinity == NULL))
+ {
+ affinity = get_affinityDomains();
+ }
+ lua_newtable(L);
+ lua_pushstring(L,"numberOfNodes");
+ lua_pushinteger(L, (lua_Integer)(numainfo->numberOfNodes));
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"nodes");
+ lua_newtable(L);
+ for(i=0;i<numainfo->numberOfNodes;i++)
+ {
+ lua_pushinteger(L, i+1);
+ lua_newtable(L);
+
+ lua_pushstring(L,"id");
+ lua_pushinteger(L, (lua_Integer)(numainfo->nodes[i].id));
+ lua_settable(L,-3);
+ lua_pushstring(L,"totalMemory");
+ lua_pushinteger(L, (lua_Integer)(numainfo->nodes[i].totalMemory));
+ lua_settable(L,-3);
+ lua_pushstring(L,"freeMemory");
+ lua_pushinteger(L, (lua_Integer)(numainfo->nodes[i].freeMemory));
+ lua_settable(L,-3);
+ lua_pushstring(L,"numberOfProcessors");
+ lua_pushinteger(L, (lua_Integer)(numainfo->nodes[i].numberOfProcessors));
+ lua_settable(L,-3);
+ lua_pushstring(L,"numberOfDistances");
+ lua_pushinteger(L, (lua_Integer)(numainfo->nodes[i].numberOfDistances));
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"processors");
+ lua_newtable(L);
+ for(j=0;j<numainfo->nodes[i].numberOfProcessors;j++)
+ {
+ lua_pushinteger(L, (lua_Integer)(j+1));
+ lua_pushinteger(L, (lua_Integer)(numainfo->nodes[i].processors[j]));
+ lua_settable(L,-3);
+ }
+ lua_settable(L,-3);
+
+ /*lua_pushstring(L,"processorsCompact");
+ lua_newtable(L);
+ for(j=0;j<numa->nodes[i].numberOfProcessors;j++)
+ {
+ lua_pushinteger(L, (lua_Integer)(j);
+ lua_pushinteger(L, (lua_Integer)(numa->nodes[i].processorsCompact[j]);
+ lua_settable(L,-3);
+ }
+ lua_settable(L,-3);*/
+
+ lua_pushstring(L,"distances");
+ lua_newtable(L);
+ for(j=0;j<numainfo->nodes[i].numberOfDistances;j++)
+ {
+ lua_pushinteger(L,j+1);
+ lua_newtable(L);
+ lua_pushinteger(L,j);
+ lua_pushinteger(L, (lua_Integer)(numainfo->nodes[i].distances[j]));
+ lua_settable(L,-3);
+ lua_settable(L,-3);
+ }
+ lua_settable(L,-3);
+
+ lua_settable(L,-3);
+ }
+ lua_settable(L,-3);
+ return 1;
+}
+
+static int lua_likwid_putNumaInfo(lua_State* L)
+{
+ if (numa_isInitialized)
+ {
+ numa_finalize();
+ numa_isInitialized = 0;
+ numainfo = NULL;
+ }
+ return 0;
+}
+
+static int lua_likwid_setMemInterleaved(lua_State* L)
+{
+ int ret;
+ int nrThreads = luaL_checknumber(L,1);
+ luaL_argcheck(L, nrThreads > 0, 1, "Thread count must be greater than 0");
+ int cpus[nrThreads];
+ if (!lua_istable(L, -1)) {
+ lua_pushstring(L,"No table given as second argument");
+ lua_error(L);
+ }
+ for (ret = 1; ret<=nrThreads; ret++)
+ {
+ lua_rawgeti(L,-1,ret);
+ cpus[ret-1] = ((lua_Unsigned)lua_tointegerx(L,-1, NULL));
+ lua_pop(L,1);
+ }
+ numa_setInterleaved(cpus, nrThreads);
+ return 0;
+}
+
+static int lua_likwid_getAffinityInfo(lua_State* L)
+{
+ int i,j;
+
+ if (topology_isInitialized == 0)
+ {
+ topology_init();
+ topology_isInitialized = 1;
+ cpuinfo = get_cpuInfo();
+ cputopo = get_cpuTopology();
+ }
+ if ((topology_isInitialized) && (cpuinfo == NULL))
+ {
+ cpuinfo = get_cpuInfo();
+ }
+ if ((topology_isInitialized) && (cputopo == NULL))
+ {
+ cputopo = get_cpuTopology();
+ }
+ if (numa_isInitialized == 0)
+ {
+ if (numa_init() == 0)
+ {
+ numa_isInitialized = 1;
+ numainfo = get_numaTopology();
+ }
+ }
+ if ((numa_isInitialized) && (numainfo == NULL))
+ {
+ numainfo = get_numaTopology();
+ }
+ if (affinity_isInitialized == 0)
+ {
+ affinity_init();
+ affinity_isInitialized = 1;
+ affinity = get_affinityDomains();
+ }
+ if ((affinity_isInitialized) && (affinity == NULL))
+ {
+ affinity = get_affinityDomains();
+ }
+
+ if (!affinity)
+ {
+ lua_pushstring(L,"Cannot initialize affinity groups");
+ lua_error(L);
+ }
+ lua_newtable(L);
+ lua_pushstring(L,"numberOfAffinityDomains");
+ lua_pushinteger(L, (lua_Integer)(affinity->numberOfAffinityDomains));
+ lua_settable(L,-3);
+ lua_pushstring(L,"numberOfSocketDomains");
+ lua_pushinteger(L, (lua_Integer)(affinity->numberOfSocketDomains));
+ lua_settable(L,-3);
+ lua_pushstring(L,"numberOfNumaDomains");
+ lua_pushinteger(L, (lua_Integer)(affinity->numberOfNumaDomains));
+ lua_settable(L,-3);
+ lua_pushstring(L,"numberOfProcessorsPerSocket");
+ lua_pushinteger(L, (lua_Integer)(affinity->numberOfProcessorsPerSocket));
+ lua_settable(L,-3);
+ lua_pushstring(L,"numberOfCacheDomains");
+ lua_pushinteger(L, (lua_Integer)(affinity->numberOfCacheDomains));
+ lua_settable(L,-3);
+ lua_pushstring(L,"numberOfCoresPerCache");
+ lua_pushinteger(L, (lua_Integer)(affinity->numberOfCoresPerCache));
+ lua_settable(L,-3);
+ lua_pushstring(L,"numberOfProcessorsPerCache");
+ lua_pushinteger(L, (lua_Integer)(affinity->numberOfProcessorsPerCache));
+ lua_settable(L,-3);
+ lua_pushstring(L,"domains");
+ lua_newtable(L);
+ for(i=0;i<affinity->numberOfAffinityDomains;i++)
+ {
+ lua_pushinteger(L, (lua_Integer)( i+1));
+ lua_newtable(L);
+ lua_pushstring(L,"tag");
+ lua_pushstring(L,bdata(affinity->domains[i].tag));
+ lua_settable(L,-3);
+ lua_pushstring(L,"numberOfProcessors");
+ lua_pushinteger(L, (lua_Integer)(affinity->domains[i].numberOfProcessors));
+ lua_settable(L,-3);
+ lua_pushstring(L,"numberOfCores");
+ lua_pushinteger(L, (lua_Integer)(affinity->domains[i].numberOfCores));
+ lua_settable(L,-3);
+ lua_pushstring(L,"processorList");
+ lua_newtable(L);
+ for(j=0;j<affinity->domains[i].numberOfProcessors;j++)
+ {
+ lua_pushinteger(L, (lua_Integer)(j+1));
+ lua_pushinteger(L, (lua_Integer)(affinity->domains[i].processorList[j]));
+ lua_settable(L,-3);
+ }
+ lua_settable(L,-3);
+ lua_settable(L,-3);
+ }
+ lua_settable(L,-3);
+ return 1;
+}
+
+static int lua_likwid_cpustr_to_cpulist(lua_State* L)
+{
+ int ret = 0;
+ char* cpustr = (char *)luaL_checkstring(L, 1);
+ int* cpulist = (int*) malloc(MAX_NUM_THREADS * sizeof(int));
+ if (cpulist == NULL)
+ {
+ lua_pushstring(L,"Cannot allocate data for the CPU list");
+ lua_error(L);
+ }
+ ret = cpustr_to_cpulist(cpustr, cpulist, MAX_NUM_THREADS);
+ if (ret <= 0)
+ {
+ lua_pushstring(L,"Cannot parse cpustring");
+ lua_error(L);
+ }
+ lua_pushnumber(L, ret);
+ lua_newtable(L);
+ for (int i=0;i<ret;i++)
+ {
+ lua_pushinteger(L, (lua_Integer)( i+1));
+ lua_pushinteger(L, (lua_Integer)( cpulist[i]));
+ lua_settable(L,-3);
+ }
+ free(cpulist);
+ return 2;
+}
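
lua_likwid_cpustr_to_cpulist() (and the node/socket variants below) return two values to Lua: the element count is pushed first, then the 1-indexed list table, and the C function returns 2. A minimal sketch of this multi-return pattern (helper name invented):

    /* On the Lua side this arrives as: local n, list = ... */
    static int return_count_and_list(lua_State* L, const int* vals, int n)
    {
        lua_pushinteger(L, n);
        lua_newtable(L);
        for (int i = 0; i < n; i++)
        {
            lua_pushinteger(L, i + 1);
            lua_pushinteger(L, vals[i]);
            lua_settable(L, -3);
        }
        return 2;                      /* count and table */
    }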
+
+static int lua_likwid_nodestr_to_nodelist(lua_State* L)
+{
+ int ret = 0;
+ char* nodestr = (char *)luaL_checkstring(L, 1);
+ int* nodelist = (int*) malloc(MAX_NUM_NODES * sizeof(int));
+ if (nodelist == NULL)
+ {
+ lua_pushstring(L,"Cannot allocate data for the node list");
+ lua_error(L);
+ }
+ ret = nodestr_to_nodelist(nodestr, nodelist, MAX_NUM_NODES);
+ if (ret <= 0)
+ {
+ lua_pushstring(L,"Cannot parse node string");
+ lua_error(L);
+ }
+ lua_pushnumber(L, ret);
+ lua_newtable(L);
+ for (int i=0;i<ret;i++)
+ {
+ lua_pushinteger(L, (lua_Integer)( i+1));
+ lua_pushinteger(L, (lua_Integer)( nodelist[i]));
+ lua_settable(L,-3);
+ }
+ free(nodelist);
+ return 2;
+}
+
+static int lua_likwid_sockstr_to_socklist(lua_State* L)
+{
+ int ret = 0;
+ char* sockstr = (char *)luaL_checkstring(L, 1);
+ int* socklist = (int*) malloc(MAX_NUM_NODES * sizeof(int));
+ if (socklist == NULL)
+ {
+ lua_pushstring(L,"Cannot allocate data for the socket list");
+ lua_error(L);
+ }
+ ret = nodestr_to_nodelist(sockstr, socklist, MAX_NUM_NODES);
+ if (ret <= 0)
+ {
+ lua_pushstring(L,"Cannot parse socket string");
+ lua_error(L);
+ }
+ lua_pushnumber(L, ret);
+ lua_newtable(L);
+ for (int i=0;i<ret;i++)
+ {
+ lua_pushinteger(L, (lua_Integer)( i+1));
+ lua_pushinteger(L, (lua_Integer)( socklist[i]));
+ lua_settable(L,-3);
+ }
+ free(socklist);
+ return 2;
+}
+
+static int lua_likwid_putAffinityInfo(lua_State* L)
+{
+ if (affinity_isInitialized)
+ {
+ affinity_finalize();
+ affinity_isInitialized = 0;
+ affinity = NULL;
+ }
+ return 0;
+}
+
+static int lua_likwid_getPowerInfo(lua_State* L)
+{
+
+ int i;
+ if (topology_isInitialized == 0)
+ {
+ topology_init();
+ topology_isInitialized = 1;
+ cpuinfo = get_cpuInfo();
+ cputopo = get_cpuTopology();
+ }
+ if ((topology_isInitialized) && (cpuinfo == NULL))
+ {
+ cpuinfo = get_cpuInfo();
+ }
+ if ((topology_isInitialized) && (cputopo == NULL))
+ {
+ cputopo = get_cpuTopology();
+ }
+ if (affinity_isInitialized == 0)
+ {
+ affinity_init();
+ affinity_isInitialized = 1;
+ affinity = get_affinityDomains();
+ }
+ if ((affinity_isInitialized) && (affinity == NULL))
+ {
+ affinity = get_affinityDomains();
+ }
+
+ if (power_isInitialized == 0)
+ {
+ power_hasRAPL = power_init(0);
+ for(i=0;i<affinity->numberOfAffinityDomains;i++)
+ {
+ if (bstrchrp(affinity->domains[i].tag, 'S', 0) != BSTR_ERR)
+ {
+ HPMaddThread(affinity->domains[i].processorList[0]);
+ }
+ }
+ if (power_hasRAPL)
+ {
+ power_isInitialized = 1;
+ power = get_powerInfo();
+ }
+ else
+ {
+ return 0;
+ }
+ }
+
+
+ lua_newtable(L);
+ lua_pushstring(L,"hasRAPL");
+ lua_pushboolean(L,power_hasRAPL);
+ lua_settable(L,-3);
+ lua_pushstring(L,"baseFrequency");
+ lua_pushnumber(L,power->baseFrequency);
+ lua_settable(L,-3);
+ lua_pushstring(L,"minFrequency");
+ lua_pushnumber(L,power->minFrequency);
+ lua_settable(L,-3);
+ lua_pushstring(L,"powerUnit");
+ lua_pushnumber(L,power->powerUnit);
+ lua_settable(L,-3);
+ lua_pushstring(L,"timeUnit");
+ lua_pushnumber(L,power->timeUnit);
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"turbo");
+ lua_newtable(L);
+ lua_pushstring(L,"numSteps");
+ lua_pushinteger(L, (lua_Integer)(power->turbo.numSteps));
+ lua_settable(L,-3);
+ lua_pushstring(L,"steps");
+ lua_newtable(L);
+ for(i=0;i<power->turbo.numSteps;i++)
+ {
+ lua_pushinteger(L, (lua_Integer)(i+1));
+ lua_pushnumber(L,power->turbo.steps[i]);
+ lua_settable(L,-3);
+ }
+ lua_settable(L,-3);
+ lua_settable(L,-3);
+
+ lua_pushstring(L,"domains");
+ lua_newtable(L);
+ for(i=0;i<NUM_POWER_DOMAINS;i++)
+ {
+ lua_pushstring(L,power_names[i]);
+ lua_newtable(L);
+
+ lua_pushstring(L, "ID");
+ lua_pushnumber(L, power->domains[i].type);
+ lua_settable(L,-3);
+ lua_pushstring(L, "energyUnit");
+ lua_pushnumber(L, power->domains[i].energyUnit);
+ lua_settable(L,-3);
+ lua_pushstring(L,"supportStatus");
+ if (power->domains[i].supportFlags & POWER_DOMAIN_SUPPORT_STATUS)
+ {
+ lua_pushboolean(L, 1);
+ }
+ else
+ {
+ lua_pushboolean(L, 0);
+ }
+ lua_settable(L,-3);
+ lua_pushstring(L,"supportPerf");
+ if (power->domains[i].supportFlags & POWER_DOMAIN_SUPPORT_PERF)
+ {
+ lua_pushboolean(L, 1);
+ }
+ else
+ {
+ lua_pushboolean(L, 0);
+ }
+ lua_settable(L,-3);
+ lua_pushstring(L,"supportPolicy");
+ if (power->domains[i].supportFlags & POWER_DOMAIN_SUPPORT_POLICY)
+ {
+ lua_pushboolean(L, 1);
+ }
+ else
+ {
+ lua_pushboolean(L, 0);
+ }
+ lua_settable(L,-3);
+ lua_pushstring(L,"supportLimit");
+ if (power->domains[i].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT)
+ {
+ lua_pushboolean(L, 1);
+ }
+ else
+ {
+ lua_pushboolean(L, 0);
+ }
+ lua_settable(L,-3);
+ if (power->domains[i].supportFlags & POWER_DOMAIN_SUPPORT_INFO)
+ {
+ lua_pushstring(L,"supportInfo");
+ lua_pushboolean(L, 1);
+ lua_settable(L,-3);
+ lua_pushstring(L,"tdp");
+ lua_pushnumber(L, power->domains[i].tdp);
+ lua_settable(L,-3);
+ lua_pushstring(L,"minPower");
+ lua_pushnumber(L, power->domains[i].minPower);
+ lua_settable(L,-3);
+ lua_pushstring(L,"maxPower");
+ lua_pushnumber(L, power->domains[i].maxPower);
+ lua_settable(L,-3);
+ lua_pushstring(L,"maxTimeWindow");
+ lua_pushnumber(L, power->domains[i].maxTimeWindow);
+ lua_settable(L,-3);
+ }
+ else
+ {
+ lua_pushstring(L,"supportInfo");
+ lua_pushboolean(L, 0);
+ lua_settable(L,-3);
+ }
+
+ lua_settable(L,-3);
+ }
+ lua_settable(L,-3);
+
+
+ return 1;
+}
+
+static int lua_likwid_putPowerInfo(lua_State* L)
+{
+ if (power_isInitialized)
+ {
+ power_finalize();
+ power_isInitialized = 0;
+ power = NULL;
+ }
+ return 0;
+}
+
+static int lua_likwid_startPower(lua_State* L)
+{
+ PowerData pwrdata;
+ int cpuId = lua_tonumber(L,1);
+ luaL_argcheck(L, cpuId >= 0, 1, "CPU ID must be greater or equal 0");
+ PowerType type = (PowerType) ((lua_Unsigned)lua_tointegerx(L,2, NULL));
+ luaL_argcheck(L, type >= PKG+1 && type <= DRAM+1, 2, "Type not valid");
+ power_start(&pwrdata, cpuId, type-1);
+ lua_pushnumber(L,pwrdata.before);
+ return 1;
+}
+
+static int lua_likwid_stopPower(lua_State* L)
+{
+ PowerData pwrdata;
+ int cpuId = lua_tonumber(L,1);
+ luaL_argcheck(L, cpuId >= 0, 1, "CPU ID must be greater or equal 0");
+ PowerType type = (PowerType) ((lua_Unsigned)lua_tointegerx(L,2, NULL));
+ luaL_argcheck(L, type >= PKG+1 && type <= DRAM+1, 2, "Type not valid");
+ power_stop(&pwrdata, cpuId, type-1);
+ lua_pushnumber(L,pwrdata.after);
+ return 1;
+}
+
+static int lua_likwid_printEnergy(lua_State* L)
+{
+ PowerData pwrdata;
+ pwrdata.before = lua_tonumber(L,1);
+ pwrdata.after = lua_tonumber(L,2);
+ pwrdata.domain = lua_tonumber(L,3);
+ lua_pushnumber(L,power_printEnergy(&pwrdata));
+ return 1;
+}
+
+static int lua_likwid_power_limitGet(lua_State* L)
+{
+ int err;
+ int cpuId = lua_tonumber(L,1);
+ int domain = lua_tonumber(L,2);
+ double power = 0.0;
+ double time = 0.0;
+ err = power_limitGet(cpuId, domain, &power, &time);
+ if (err < 0)
+ {
+ lua_pushnumber(L,err);
+ return 1;
+ }
+ lua_pushnumber(L,power);
+ lua_pushnumber(L,time);
+ return 2;
+}
+
+static int lua_likwid_power_limitSet(lua_State* L)
+{
+ int cpuId = lua_tonumber(L,1);
+ int domain = lua_tonumber(L,2);
+ double power = lua_tonumber(L,3);
+ double time = lua_tonumber(L,4);
+ int clamp = lua_tonumber(L,5);
+ lua_pushinteger(L, power_limitSet(cpuId, domain, power, time, clamp));
+ return 1;
+}
+
+static int lua_likwid_power_limitState(lua_State* L)
+{
+ int cpuId = lua_tonumber(L,1);
+ int domain = lua_tonumber(L,2);
+ lua_pushnumber(L,power_limitState(cpuId, domain));
+ return 1;
+}
+
+static int lua_likwid_getCpuClock(lua_State* L)
+{
+ if (timer_isInitialized == 0)
+ {
+ timer_init();
+ timer_isInitialized = 1;
+ }
+ lua_pushnumber(L,timer_getCpuClock());
+ return 1;
+}
+
+static int lua_likwid_getCycleClock(lua_State* L)
+{
+ if (timer_isInitialized == 0)
+ {
+ timer_init();
+ timer_isInitialized = 1;
+ }
+ lua_pushnumber(L,timer_getCycleClock());
+ return 1;
+}
+
+static int lua_sleep(lua_State* L)
+{
+ lua_pushnumber(L, timer_sleep(((lua_Unsigned)lua_tointegerx(L,-1, NULL))));
+ return 1;
+}
+
+static int lua_likwid_startClock(lua_State* L)
+{
+ TimerData timer;
+ double value;
+ if (timer_isInitialized == 0)
+ {
+ timer_init();
+ timer_isInitialized = 1;
+ }
+ timer_start(&timer);
+ value = (double)timer.start.int64;
+ lua_pushnumber(L, value);
+ return 1;
+}
+
+static int lua_likwid_stopClock(lua_State* L)
+{
+ TimerData timer;
+ double value;
+ if (timer_isInitialized == 0)
+ {
+ timer_init();
+ timer_isInitialized = 1;
+ }
+ timer_stop(&timer);
+ value = (double)timer.stop.int64;
+ lua_pushnumber(L, value);
+ return 1;
+}
+
+static int lua_likwid_getClockCycles(lua_State* L)
+{
+ TimerData timer;
+ double start, stop;
+ start = lua_tonumber(L,1);
+ stop = lua_tonumber(L,2);
+ timer.start.int64 = (uint64_t)start;
+ timer.stop.int64 = (uint64_t)stop;
+ if (timer_isInitialized == 0)
+ {
+ timer_init();
+ timer_isInitialized = 1;
+ }
+ lua_pushnumber(L, (double)timer_printCycles(&timer));
+ return 1;
+}
+
+static int lua_likwid_getClock(lua_State* L)
+{
+ TimerData timer;
+ double runtime, start, stop;
+ if (timer_isInitialized == 0)
+ {
+ timer_init();
+ timer_isInitialized = 1;
+ }
+ start = lua_tonumber(L,1);
+ stop = lua_tonumber(L,2);
+ timer.start.int64 = (uint64_t)start;
+ timer.stop.int64 = (uint64_t)stop;
+ runtime = timer_print(&timer);
+ lua_pushnumber(L, runtime);
+ return 1;
+}
+
+static int lua_likwid_initTemp(lua_State* L)
+{
+ int cpuid = ((lua_Unsigned)lua_tointegerx(L,-1, NULL));
+ thermal_init(cpuid);
+ return 0;
+}
+
+static int lua_likwid_readTemp(lua_State* L)
+{
+ int cpuid = ((lua_Unsigned)lua_tointegerx(L,-1, NULL));
+ uint32_t data;
+
+ if (thermal_read(cpuid, &data)) {
+ lua_pushstring(L,"Cannot read thermal data");
+ lua_error(L);
+ }
+ lua_pushnumber(L, data);
+ return 1;
+}
+
+
+static volatile int recv_sigint = 0;
+
+static void signal_catcher(int signo)
+{
+ if (signo == SIGINT)
+ {
+ recv_sigint++;
+ }
+ return;
+}
+
+static int lua_likwid_catch_signal(lua_State* L)
+{
+ signal(SIGINT,signal_catcher);
+ return 0;
+}
+
+static int lua_likwid_return_signal_state(lua_State* L)
+{
+ lua_pushnumber(L, recv_sigint);
+ return 1;
+}
+
+void parse(char *line, char **argv)
+{
+ while (*line != '\0') { /* if not the end of line ....... */
+ while (*line == ' ' || *line == '\t' || *line == '\n')
+ *line++ = '\0'; /* replace white spaces with 0 */
+ *argv++ = line; /* save the argument position */
+ while (*line != '\0' && *line != ' ' &&
+ *line != '\t' && *line != '\n')
+ line++; /* skip the argument until ... */
+ }
+ *argv = '\0'; /* mark the end of argument list */
+}
+
+static volatile int program_running = 0;
+
+static void catch_sigchild(int signo) {
+ program_running = 0;
+}
+
+static int lua_likwid_startProgram(lua_State* L)
+{
+ pid_t pid, ppid;
+ int status;
+ char *exec;
+ char *argv[4096];
+ exec = (char *)luaL_checkstring(L, 1);
+ int nrThreads = luaL_checknumber(L,2);
+ int cpus[MAX_NUM_THREADS];
+ cpu_set_t cpuset;
+ if (nrThreads > 0)
+ {
+ if (!lua_istable(L, -1)) {
+ lua_pushstring(L,"No table given as second argument");
+ lua_error(L);
+ }
+ for (status = 1; status<=nrThreads; status++)
+ {
+ lua_rawgeti(L,-1,status);
+ cpus[status-1] = ((lua_Unsigned)lua_tointegerx(L,-1, NULL));
+ lua_pop(L,1);
+ }
+ }
+ else
+ {
+ for (nrThreads = 0; nrThreads < cpuid_topology.numHWThreads; nrThreads++)
+ cpus[nrThreads] = cpuid_topology.threadPool[nrThreads].apicId;
+ nrThreads = cpuid_topology.numHWThreads;
+ }
+ parse(exec, argv);
+ ppid = getpid();
+ program_running = 1;
+ pid = fork();
+ if (pid < 0)
+ {
+ return 0;
+ }
+ else if ( pid == 0)
+ {
+ if (nrThreads > 0)
+ {
+ affinity_pinProcesses(nrThreads, cpus);
+ }
+ timer_sleep(10);
+ status = execvp(*argv, argv);
+ if (status < 0)
+ {
+ kill(ppid, SIGCHLD);
+ }
+ return 0;
+ }
+ else
+ {
+ signal(SIGCHLD, catch_sigchild);
+ lua_pushnumber(L, pid);
+ }
+ return 1;
+}
+static int lua_likwid_checkProgram(lua_State* L)
+{
+ if (lua_gettop(L) == 1)
+ {
+ int status;
+ pid_t retpid;
+ pid_t pid = lua_tonumber(L, 1);
+ retpid = waitpid(pid, &status, WNOHANG);
+ if (retpid == pid)
+ program_running = 0;
+ }
+ lua_pushboolean(L, program_running);
+ return 1;
+}
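
lua_likwid_checkProgram() polls the forked child without blocking: waitpid() with WNOHANG returns 0 while the child is still running and the child's PID once it has exited; together with the SIGCHLD handler installed in lua_likwid_startProgram() this keeps program_running up to date. A minimal sketch of the non-blocking poll:

    #include <sys/types.h>
    #include <sys/wait.h>

    /* Returns 1 while the child is still running, 0 otherwise. */
    static int child_still_running(pid_t pid)
    {
        int status = 0;
        return (waitpid(pid, &status, WNOHANG) == 0);
    }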
+
+static int lua_likwid_killProgram(lua_State* L)
+{
+ pid_t pid = lua_tonumber(L, 1);
+ kill(pid, SIGTERM);
+ program_running = 0;
+ return 0;
+}
+
+static int lua_likwid_waitwid(lua_State* L)
+{
+ int status;
+ pid_t pid = lua_tonumber(L, 1);
+ waitpid(pid, &status, 0);
+ return 0;
+}
+
+static int lua_likwid_memSweep(lua_State* L)
+{
+ int i;
+ int nrThreads = luaL_checknumber(L,1);
+ luaL_argcheck(L, nrThreads > 0, 1, "Thread count must be greater than 0");
+ int cpus[nrThreads];
+ if (!lua_istable(L, -1)) {
+ lua_pushstring(L,"No table given as second argument");
+ lua_error(L);
+ }
+ for (i = 1; i <= nrThreads; i++)
+ {
+ lua_rawgeti(L,-1,i);
+ cpus[i-1] = ((lua_Unsigned)lua_tointegerx(L,-1, NULL));
+ lua_pop(L,1);
+ }
+ memsweep_threadGroup(cpus, nrThreads);
+ return 0;
+}
+
+static int lua_likwid_memSweepDomain(lua_State* L)
+{
+ int domain = luaL_checknumber(L,1);
+ luaL_argcheck(L, domain >= 0, 1, "Domain ID must be greater or equal 0");
+ memsweep_domain(domain);
+ return 0;
+}
+
+static int lua_likwid_pinProcess(lua_State* L)
+{
+ int cpuID = luaL_checknumber(L,-2);
+ int silent = luaL_checknumber(L,-1);
+ luaL_argcheck(L, cpuID >= 0, 1, "CPU ID must be greater or equal 0");
+ if (affinity_isInitialized == 0)
+ {
+ affinity_init();
+ affinity_isInitialized = 1;
+ affinity = get_affinityDomains();
+ }
+ affinity_pinProcess(cpuID);
+ if (!silent)
+ {
+#ifdef COLOR
+ color_on(BRIGHT, COLOR);
+#endif
+ printf("[likwid-pin] Main PID -> core %d - OK", cpuID);
+#ifdef COLOR
+ color_reset();
+#endif
+ printf("\n");
+ }
+ return 0;
+}
+
+static int lua_likwid_setenv(lua_State* L)
+{
+ const char* element = (const char*)luaL_checkstring(L, -2);
+ const char* value = (const char*)luaL_checkstring(L, -1);
+ setenv(element, value, 1);
+ return 0;
+}
+
+static int lua_likwid_getpid(lua_State* L)
+{
+ lua_pushinteger(L, (lua_Integer)(getpid()));
+ return 1;
+}
+
+static int lua_likwid_setVerbosity(lua_State* L)
+{
+ int verbosity = lua_tointeger(L,-1);
+ luaL_argcheck(L, (verbosity >= 0 && verbosity <= DEBUGLEV_DEVELOP), -1,
+ "Verbosity must be between 0 (only errors) and 3 (developer)");
+ perfmon_verbosity = verbosity;
+ return 0;
+}
+
+static int lua_likwid_access(lua_State* L)
+{
+ int flags = 0;
+ const char* file = (const char*)luaL_checkstring(L, 1);
+ const char* perm = (const char*)luaL_checkstring(L, 2);
+ if (!perm)
+ {
+ flags = F_OK;
+ }
+ else
+ {
+ for (int i=0;i<strlen(perm);i++)
+ {
+ if (perm[i] == 'r') {
+ flags |= R_OK;
+ } else if (perm[i] == 'w') {
+ flags |= W_OK;
+ } else if (perm[i] == 'x') {
+ flags |= X_OK;
+ } else if (perm[i] == 'e') {
+ flags |= F_OK;
+ }
+ }
+ }
+ if (file)
+ {
+ lua_pushinteger(L, access(file, flags));
+ return 1;
+ }
+ lua_pushinteger(L, -1);
+ return 1;
+}
+
+static int lua_likwid_markerInit(lua_State* L)
+{
+ likwid_markerInit();
+ return 0;
+}
+
+static int lua_likwid_markerThreadInit(lua_State* L)
+{
+ likwid_markerThreadInit();
+ return 0;
+}
+
+static int lua_likwid_markerClose(lua_State* L)
+{
+ likwid_markerClose();
+ return 0;
+}
+
+static int lua_likwid_markerNext(lua_State* L)
+{
+ likwid_markerNextGroup();
+ return 0;
+}
+
+static int lua_likwid_registerRegion(lua_State* L)
+{
+ const char* tag = (const char*)luaL_checkstring(L, -1);
+ lua_pushinteger(L, likwid_markerRegisterRegion(tag));
+ return 1;
+}
+
+static int lua_likwid_startRegion(lua_State* L)
+{
+ const char* tag = (const char*)luaL_checkstring(L, -1);
+ lua_pushinteger(L, likwid_markerStartRegion(tag));
+ return 1;
+}
+
+static int lua_likwid_stopRegion(lua_State* L)
+{
+ const char* tag = (const char*)luaL_checkstring(L, -1);
+ lua_pushinteger(L, likwid_markerStopRegion(tag));
+ return 1;
+}
+
+static int lua_likwid_getRegion(lua_State* L)
+{
+ int i = 0;
+ const char* tag = (const char*)luaL_checkstring(L, -2);
+ int nr_events = perfmon_getNumberOfEvents(perfmon_getIdOfActiveGroup());
+ double* events = NULL;
+ double time = 0.0;
+ int count = 0;
+
+ events = (double*) malloc(nr_events * sizeof(double));
+ if (events == NULL)
+ {
+ lua_pushstring(L,"Cannot allocate memory for event data\n");
+ lua_error(L);
+ }
+ for (i = 0; i < nr_events; i++)
+ {
+ events[i] = 0.0;
+ }
+ likwid_markerGetRegion(tag, &nr_events, events, &time, &count);
+
+ lua_pushinteger(L, nr_events);
+ lua_newtable(L);
+ for (i=0;i<nr_events;i++)
+ {
+ lua_pushinteger(L, i+1);
+ lua_pushnumber(L, events[i]);
+ lua_settable(L, -3);
+ }
+ lua_pushnumber(L, time);
+ lua_pushinteger(L, count);
+ free(events);
+ return 4;
+}
+
+static int lua_likwid_cpuFeatures_init(lua_State* L)
+{
+ cpuFeatures_init();
+ return 0;
+}
+
+static int lua_likwid_cpuFeatures_print(lua_State* L)
+{
+ int cpu = lua_tointeger(L,-1);
+ cpuFeatures_print(cpu);
+ return 0;
+}
+
+static int lua_likwid_cpuFeatures_get(lua_State* L)
+{
+ int cpu = lua_tointeger(L,-2);
+ CpuFeature feature = lua_tointeger(L,-1);
+ lua_pushinteger(L, cpuFeatures_get(cpu, feature));
+ return 1;
+}
+
+static int lua_likwid_cpuFeatures_name(lua_State* L)
+{
+ char* name = NULL;
+ CpuFeature feature = ((lua_Unsigned)lua_tointegerx(L,-1, NULL));
+ name = cpuFeatures_name(feature);
+ if (name != NULL)
+ {
+ lua_pushstring(L, name);
+ return 1;
+ }
+ return 0;
+}
+
+static int lua_likwid_cpuFeatures_enable(lua_State* L)
+{
+ int cpu = lua_tointeger(L,-3);
+ CpuFeature feature = lua_tointeger(L,-2);
+ int verbose = lua_tointeger(L,-1);
+ lua_pushinteger(L, cpuFeatures_enable(cpu, feature, verbose));
+ return 1;
+}
+
+static int lua_likwid_cpuFeatures_disable(lua_State* L)
+{
+ int cpu = lua_tointeger(L,-3);
+ CpuFeature feature = lua_tointeger(L,-2);
+ int verbose = lua_tointeger(L,-1);
+ lua_pushinteger(L, cpuFeatures_disable(cpu, feature, verbose));
+ return 1;
+}
+
+static int lua_likwid_markerFile_read(lua_State* L)
+{
+ const char* filename = (const char*)luaL_checkstring(L, -1);
+ perfmon_readMarkerFile(filename);
+ return 0;
+}
+
+static int lua_likwid_markerFile_destroy(lua_State* L)
+{
+ perfmon_destroyMarkerResults();
+ return 0;
+}
+
+static int lua_likwid_markerNumRegions(lua_State* L)
+{
+ lua_pushinteger(L, perfmon_getNumberOfRegions());
+ return 1;
+}
+
+static int lua_likwid_markerRegionGroup(lua_State* L)
+{
+ int region = lua_tointeger(L,-1);
+ lua_pushinteger(L, perfmon_getGroupOfRegion(region-1)+1);
+ return 1;
+}
+
+static int lua_likwid_markerRegionTag(lua_State* L)
+{
+ int region = lua_tointeger(L,-1);
+ lua_pushstring(L, perfmon_getTagOfRegion(region-1));
+ return 1;
+}
+
+static int lua_likwid_markerRegionEvents(lua_State* L)
+{
+ int region = lua_tointeger(L,-1);
+ lua_pushinteger(L, perfmon_getEventsOfRegion(region-1));
+ return 1;
+}
+
+static int lua_likwid_markerRegionThreads(lua_State* L)
+{
+ int region = lua_tointeger(L,-1);
+ lua_pushinteger(L, perfmon_getThreadsOfRegion(region-1));
+ return 1;
+}
+
+static int lua_likwid_markerRegionCpulist(lua_State* L)
+{
+ int i = 0;
+ int region = lua_tointeger(L,-1);
+ int* cpulist;
+ int regionCPUs = 0;
+ if (topology_isInitialized == 0)
+ {
+ topology_init();
+ topology_isInitialized = 1;
+ cpuinfo = get_cpuInfo();
+ cputopo = get_cpuTopology();
+ }
+ if ((topology_isInitialized) && (cpuinfo == NULL))
+ {
+ cpuinfo = get_cpuInfo();
+ }
+ if ((topology_isInitialized) && (cputopo == NULL))
+ {
+ cputopo = get_cpuTopology();
+ }
+ cpulist = (int*)malloc(cputopo->numHWThreads * sizeof(int));
+ if (cpulist == NULL)
+ {
+ return 0;
+ }
+ regionCPUs = perfmon_getCpulistOfRegion(region-1, cputopo->numHWThreads, cpulist);
+ if (regionCPUs > 0)
+ {
+ lua_newtable(L);
+ for (i=0; i < regionCPUs; i++)
+ {
+ lua_pushinteger(L, i+1);
+ lua_pushinteger(L, cpulist[i]);
+ lua_settable(L, -3);
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int lua_likwid_markerRegionTime(lua_State* L)
+{
+ int region = lua_tointeger(L,-2);
+ int thread = lua_tointeger(L,-1);
+ lua_pushnumber(L, perfmon_getTimeOfRegion(region-1, thread-1));
+ return 1;
+}
+
+static int lua_likwid_markerRegionCount(lua_State* L)
+{
+ int region = lua_tointeger(L,-2);
+ int thread = lua_tointeger(L,-1);
+ lua_pushinteger(L, perfmon_getCountOfRegion(region-1, thread-1));
+ return 1;
+}
+
+static int lua_likwid_markerRegionResult(lua_State* L)
+{
+ int region = lua_tointeger(L,-3);
+ int event = lua_tointeger(L,-2);
+ int thread = lua_tointeger(L,-1);
+ lua_pushnumber(L, perfmon_getResultOfRegionThread(region-1, event-1, thread-1));
+ return 1;
+}
+
+static int lua_likwid_markerRegionMetric(lua_State* L)
+{
+ int region = lua_tointeger(L,-3);
+ int metric = lua_tointeger(L,-2);
+ int thread = lua_tointeger(L,-1);
+ lua_pushnumber(L, perfmon_getMetricOfRegionThread(region-1, metric-1, thread-1));
+ return 1;
+}
+
+int __attribute__ ((visibility ("default") )) luaopen_liblikwid(lua_State* L){
+ // Configuration functions
+ lua_register(L, "likwid_getConfiguration", lua_likwid_getConfiguration);
+ lua_register(L, "likwid_setGroupPath", lua_likwid_setGroupPath);
+ lua_register(L, "likwid_putConfiguration", lua_likwid_putConfiguration);
+ // Perfmon functions
+ //lua_register(L, "accessClient_setaccessmode",lua_accessClient_setaccessmode);
+ lua_register(L, "likwid_setAccessClientMode",lua_likwid_setAccessMode);
+ lua_register(L, "likwid_init",lua_likwid_init);
+ lua_register(L, "likwid_addEventSet", lua_likwid_addEventSet);
+ lua_register(L, "likwid_setupCounters",lua_likwid_setupCounters);
+ lua_register(L, "likwid_startCounters",lua_likwid_startCounters);
+ lua_register(L, "likwid_stopCounters",lua_likwid_stopCounters);
+ lua_register(L, "likwid_readCounters",lua_likwid_readCounters);
+ lua_register(L, "likwid_switchGroup",lua_likwid_switchGroup);
+ lua_register(L, "likwid_finalize",lua_likwid_finalize);
+ lua_register(L, "likwid_getEventsAndCounters", lua_likwid_getEventsAndCounters);
+ // Perfmon results functions
+ lua_register(L, "likwid_getResult",lua_likwid_getResult);
+ lua_register(L, "likwid_getLastResult",lua_likwid_getLastResult);
+ lua_register(L, "likwid_getMetric",lua_likwid_getMetric);
+ lua_register(L, "likwid_getLastMetric",lua_likwid_getLastMetric);
+ lua_register(L, "likwid_getNumberOfGroups",lua_likwid_getNumberOfGroups);
+ lua_register(L, "likwid_getRuntimeOfGroup", lua_likwid_getRuntimeOfGroup);
+ lua_register(L, "likwid_getIdOfActiveGroup",lua_likwid_getIdOfActiveGroup);
+ lua_register(L, "likwid_getNumberOfEvents",lua_likwid_getNumberOfEvents);
+ lua_register(L, "likwid_getNumberOfMetrics",lua_likwid_getNumberOfMetrics);
+ lua_register(L, "likwid_getNumberOfThreads",lua_likwid_getNumberOfThreads);
+ lua_register(L, "likwid_getNameOfEvent",lua_likwid_getNameOfEvent);
+ lua_register(L, "likwid_getNameOfCounter",lua_likwid_getNameOfCounter);
+ lua_register(L, "likwid_getNameOfMetric",lua_likwid_getNameOfMetric);
+ lua_register(L, "likwid_getNameOfGroup",lua_likwid_getNameOfGroup);
+ lua_register(L, "likwid_getGroups",lua_likwid_getGroups);
+ lua_register(L, "likwid_getShortInfoOfGroup",lua_likwid_getShortInfoOfGroup);
+ lua_register(L, "likwid_getLongInfoOfGroup",lua_likwid_getLongInfoOfGroup);
+ // Topology functions
+ lua_register(L, "likwid_getCpuInfo",lua_likwid_getCpuInfo);
+ lua_register(L, "likwid_getCpuTopology",lua_likwid_getCpuTopology);
+ lua_register(L, "likwid_putTopology",lua_likwid_putTopology);
+ lua_register(L, "likwid_getNumaInfo",lua_likwid_getNumaInfo);
+ lua_register(L, "likwid_putNumaInfo",lua_likwid_putNumaInfo);
+ lua_register(L, "likwid_setMemInterleaved", lua_likwid_setMemInterleaved);
+ lua_register(L, "likwid_getAffinityInfo",lua_likwid_getAffinityInfo);
+ lua_register(L, "likwid_putAffinityInfo",lua_likwid_putAffinityInfo);
+ lua_register(L, "likwid_getPowerInfo",lua_likwid_getPowerInfo);
+ lua_register(L, "likwid_putPowerInfo",lua_likwid_putPowerInfo);
+ lua_register(L, "likwid_getOnlineDevices", lua_likwid_getOnlineDevices);
+ lua_register(L, "likwid_printSupportedCPUs", lua_likwid_printSupportedCPUs);
+ // CPU string parse functions
+ lua_register(L, "likwid_cpustr_to_cpulist",lua_likwid_cpustr_to_cpulist);
+ lua_register(L, "likwid_nodestr_to_nodelist",lua_likwid_nodestr_to_nodelist);
+ lua_register(L, "likwid_sockstr_to_socklist",lua_likwid_sockstr_to_socklist);
+ // Timer functions
+ lua_register(L, "likwid_getCpuClock",lua_likwid_getCpuClock);
+ lua_register(L, "likwid_getCycleClock",lua_likwid_getCycleClock);
+ lua_register(L, "likwid_startClock",lua_likwid_startClock);
+ lua_register(L, "likwid_stopClock",lua_likwid_stopClock);
+ lua_register(L, "likwid_getClockCycles",lua_likwid_getClockCycles);
+ lua_register(L, "likwid_getClock",lua_likwid_getClock);
+ lua_register(L, "sleep",lua_sleep);
+ // Power functions
+ lua_register(L, "likwid_startPower",lua_likwid_startPower);
+ lua_register(L, "likwid_stopPower",lua_likwid_stopPower);
+ lua_register(L, "likwid_printEnergy",lua_likwid_printEnergy);
+ lua_register(L, "likwid_powerLimitGet",lua_likwid_power_limitGet);
+ lua_register(L, "likwid_powerLimitSet",lua_likwid_power_limitSet);
+ lua_register(L, "likwid_powerLimitState",lua_likwid_power_limitState);
+ // Temperature functions
+ lua_register(L, "likwid_initTemp",lua_likwid_initTemp);
+ lua_register(L, "likwid_readTemp",lua_likwid_readTemp);
+ // MemSweep functions
+ lua_register(L, "likwid_memSweep", lua_likwid_memSweep);
+ lua_register(L, "likwid_memSweepDomain", lua_likwid_memSweepDomain);
+ // Pinning functions
+ lua_register(L, "likwid_pinProcess", lua_likwid_pinProcess);
+ // Helper functions
+ lua_register(L, "likwid_setenv", lua_likwid_setenv);
+ lua_register(L, "likwid_getpid", lua_likwid_getpid);
+ lua_register(L, "likwid_access", lua_likwid_access);
+ lua_register(L, "likwid_startProgram", lua_likwid_startProgram);
+ lua_register(L, "likwid_checkProgram", lua_likwid_checkProgram);
+ lua_register(L, "likwid_killProgram", lua_likwid_killProgram);
+ lua_register(L, "likwid_catchSignal", lua_likwid_catch_signal);
+ lua_register(L, "likwid_getSignalState", lua_likwid_return_signal_state);
+ lua_register(L, "likwid_waitwid", lua_likwid_waitwid);
+ // Verbosity functions
+ lua_register(L, "likwid_setVerbosity", lua_likwid_setVerbosity);
+ // Marker API functions
+ lua_register(L, "likwid_markerInit", lua_likwid_markerInit);
+ lua_register(L, "likwid_markerThreadInit", lua_likwid_markerThreadInit);
+ lua_register(L, "likwid_markerNextGroup", lua_likwid_markerNext);
+ lua_register(L, "likwid_markerClose", lua_likwid_markerClose);
+ lua_register(L, "likwid_registerRegion", lua_likwid_registerRegion);
+ lua_register(L, "likwid_startRegion", lua_likwid_startRegion);
+ lua_register(L, "likwid_stopRegion", lua_likwid_stopRegion);
+ lua_register(L, "likwid_getRegion", lua_likwid_getRegion);
+ // CPU feature manipulation functions
+ lua_register(L, "likwid_cpuFeaturesInit", lua_likwid_cpuFeatures_init);
+ lua_register(L, "likwid_cpuFeaturesGet", lua_likwid_cpuFeatures_get);
+ lua_register(L, "likwid_cpuFeaturesEnable", lua_likwid_cpuFeatures_enable);
+ lua_register(L, "likwid_cpuFeaturesDisable", lua_likwid_cpuFeatures_disable);
+ // Marker API related functions
+ lua_register(L, "likwid_readMarkerFile", lua_likwid_markerFile_read);
+ lua_register(L, "likwid_destroyMarkerFile", lua_likwid_markerFile_destroy);
+ lua_register(L, "likwid_markerNumRegions", lua_likwid_markerNumRegions);
+ lua_register(L, "likwid_markerRegionGroup", lua_likwid_markerRegionGroup);
+ lua_register(L, "likwid_markerRegionTag", lua_likwid_markerRegionTag);
+ lua_register(L, "likwid_markerRegionEvents", lua_likwid_markerRegionEvents);
+ lua_register(L, "likwid_markerRegionThreads", lua_likwid_markerRegionThreads);
+ lua_register(L, "likwid_markerRegionCpulist", lua_likwid_markerRegionCpulist);
+ lua_register(L, "likwid_markerRegionTime", lua_likwid_markerRegionTime);
+ lua_register(L, "likwid_markerRegionCount", lua_likwid_markerRegionCount);
+ lua_register(L, "likwid_markerRegionResult", lua_likwid_markerRegionResult);
+ lua_register(L, "likwid_markerRegionMetric", lua_likwid_markerRegionMetric);
+#ifdef __MIC__
+ setuid(0);
+ seteuid(0);
+#endif
+ return 0;
+}
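
luaopen_liblikwid() registers all likwid_* bindings as Lua globals. In the shipped tools the module is loaded from the Lua side, but a hedged sketch of a stand-alone C host shows how the entry point could be exercised (linking against the Lua library and liblikwid is assumed):

    #include <lua.h>
    #include <lualib.h>
    #include <lauxlib.h>

    int luaopen_liblikwid(lua_State* L);   /* provided by this file */

    int main(void)
    {
        lua_State* L = luaL_newstate();
        luaL_openlibs(L);
        luaopen_liblikwid(L);              /* registers the globals */
        luaL_dostring(L, "print('pid', likwid_getpid())");
        lua_close(L);
        return 0;
    }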
diff --git a/src/memsweep.c b/src/memsweep.c
index 8abf796..012c000 100644
--- a/src/memsweep.c
+++ b/src/memsweep.c
@@ -5,13 +5,13 @@
*
* Description: Implementation of sweeper module.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -37,13 +37,12 @@
#include <error.h>
#include <types.h>
#include <memsweep.h>
-#include <cpuid.h>
+#include <topology.h>
#include <numa.h>
#include <affinity.h>
extern void _loadData(uint32_t size, void* ptr);
-
/* ##### EXPORTED VARIABLES ########################################### */
@@ -57,14 +56,14 @@ static uint64_t memoryFraction = 80ULL;
/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
-static void*
+static void*
allocateOnNode(size_t size, int domainId)
{
- char *ptr;
+ char *ptr;
- ptr = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
+ ptr = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
- if (ptr == (char *)-1)
+ if (ptr == (char *)-1)
{
ERROR;
}
@@ -74,7 +73,7 @@ allocateOnNode(size_t size, int domainId)
return ptr;
}
-static void
+static void
initMemory(size_t size, char* ptr, int domainId)
{
affinity_pinProcess(numa_info.nodes[domainId].processors[0]);
@@ -101,20 +100,18 @@ findProcessor(uint32_t nodeId, uint32_t coreId)
}
/* evict all dirty cachelines from last level cache */
-static void cleanupCache(FILE* OUTSTREAM, char* ptr)
+static void cleanupCache(char* ptr)
{
-#ifdef __x86_64
+#if defined(__x86_64__) || defined(__i386__)
uint32_t cachesize = 2 * cpuid_topology.cacheLevels[cpuid_topology.numCacheLevels-1].size;
- if (OUTSTREAM != NULL)
- {
- fprintf(OUTSTREAM, "Cleanup LLC using %u MB\n", cachesize / (1000000));
- }
+ printf("Cleaning LLC with %g MB\n", (double)cachesize/(1024.0 * 1024.0));
_loadData(cachesize,ptr);
#else
- ERROR_PLAIN_PRINT(Cleanup cache is currently only available on 64bit X86 systems.);
+ ERROR_PLAIN_PRINT(Cleanup cache is currently only available on X86 systems.);
#endif
}
+
/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
void
@@ -125,35 +122,32 @@ memsweep_setMemoryFraction(uint64_t fraction)
void
-memsweep_node(FILE* OUTSTREAM)
+memsweep_node(void)
{
for ( uint32_t i=0; i < numa_info.numberOfNodes; i++)
{
- memsweep_domain(OUTSTREAM, i);
+ memsweep_domain(i);
}
}
void
-memsweep_domain(FILE* OUTSTREAM, int domainId)
+memsweep_domain(int domainId)
{
char* ptr = NULL;
size_t size = numa_info.nodes[domainId].totalMemory * 1024ULL * memoryFraction / 100ULL;
- if (OUTSTREAM != NULL)
- {
- fprintf(OUTSTREAM, "Sweeping domain %d: Using %g MB of %g MB\n",
- domainId,
- size / (1000.0 * 1000.0),
- numa_info.nodes[domainId].totalMemory/ 1000.0);
- }
+ printf("Sweeping domain %d: Using %g MB of %g MB\n",
+ domainId,
+ size / (1024.0 * 1024.0),
+ numa_info.nodes[domainId].totalMemory/ 1024.0);
ptr = (char*) allocateOnNode(size, domainId);
initMemory(size, ptr, domainId);
- cleanupCache(OUTSTREAM, ptr);
+ cleanupCache(ptr);
munmap(ptr, size);
}
void
-memsweep_threadGroup(FILE* OUTSTREAM, int* processorList, int numberOfProcessors)
+memsweep_threadGroup(int* processorList, int numberOfProcessors)
{
for (uint32_t i=0; i<numa_info.numberOfNodes; i++)
{
@@ -161,10 +155,13 @@ memsweep_threadGroup(FILE* OUTSTREAM, int* processorList, int numberOfProcessors
{
if (findProcessor(i,processorList[j]))
{
- memsweep_domain(OUTSTREAM, i);
+ memsweep_domain(i);
break;
}
}
}
}
+
+
+
diff --git a/src/msr.c b/src/msr.c
deleted file mode 100644
index cb867f2..0000000
--- a/src/msr.c
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: msr.c
- *
- * Description: Implementation of msr module.
- * Provides API to read and write values to the model
- * specific registers on x86 processors using the msr
- * sys interface of the Linux 2.6 kernel. This module
- * is based on the msr-util tools.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* ##### HEADER FILE INCLUDES ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <fcntl.h>
-#include <string.h>
-#include <unistd.h>
-#include <signal.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <sys/wait.h>
-
-#include <types.h>
-#include <error.h>
-#include <cpuid.h>
-#include <accessClient.h>
-#include <msr.h>
-#include <registers.h>
-
-/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
-#define MAX_LENGTH_MSR_DEV_NAME 20
-#define STRINGIFY(x) #x
-#define TOSTRING(x) STRINGIFY(x)
-
-/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
-static int FD[MAX_NUM_THREADS];
-static int socket_fd = -1;
-static int rdpmc_works = 0;
-
-/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
-static inline int __rdpmc(int counter, uint64_t* value)
-{
- unsigned low, high;
- __asm__ volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
- *value = ((low) | ((uint64_t )(high) << 32));
- return 0;
-}
-//Needed for rdpmc check
-void segfault_sigaction(int signal, siginfo_t *si, void *arg)
-{
- exit(1);
-}
-
-int test_rdpmc(int flag)
-{
- int ret, waiting;
- int pid;
- int status = 0;
- uint64_t tmp;
- struct sigaction sa;
- memset(&sa, 0, sizeof(struct sigaction));
- sigemptyset(&sa.sa_mask);
- sa.sa_sigaction = segfault_sigaction;
- sa.sa_flags = SA_SIGINFO;
-
- pid = fork();
-
- if (pid < 0)
- {
- return -1;
- }
- if (!pid)
- {
- sigaction(SIGSEGV, &sa, NULL);
- if (flag == 0)
- {
- __rdpmc(0, &tmp);
- }
- exit(0);
- } else {
-
- waiting = waitpid(pid, &status, 0);
- if (waiting < 0 || status)
- {
- ret = 0;
- } else
- {
- ret = 1;
- }
- }
- return ret;
-}
-
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-
-
-void
-msr_init(int initSocket_fd)
-{
- if (accessClient_mode == DAEMON_AM_DIRECT)
- {
- char* msr_file_name = (char*) malloc(MAX_LENGTH_MSR_DEV_NAME * sizeof(char));
-
- sprintf(msr_file_name,"/dev/msr0");
- if( access( msr_file_name, F_OK ) == -1 )
- {
- sprintf(msr_file_name,"/dev/cpu/0/msr");
- }
-
- if (access(msr_file_name, R_OK|W_OK))
- {
- ERROR_PRINT(Cannot access MSR device file %s: %s.\n
- Please check if 'msr' module is loaded and device files have correct permissions\n
- Alternatively you might want to look into (sys)daemonmode\n,msr_file_name , strerror(errno));
- free(msr_file_name);
- exit(127);
- }
- rdpmc_works = test_rdpmc(0);
-
- /* NOTICE: This assumes consecutive processor Ids! */
- for ( uint32_t i=0; i < cpuid_topology.numHWThreads; i++ )
- {
- sprintf(msr_file_name,"/dev/msr%d",i);
- if( access( msr_file_name, F_OK ) == -1 )
- {
- sprintf(msr_file_name,"/dev/cpu/%d/msr",i);
- }
- FD[i] = open(msr_file_name, O_RDWR);
- if ( FD[i] < 0 )
- {
- ERROR_PRINT(Cannot access MSR device file %s: %s\n,
- msr_file_name , strerror(errno));
- free(msr_file_name);
- ERROR;
- }
- }
- free(msr_file_name);
- }
- else
- {
- socket_fd = initSocket_fd;
- }
-}
-
-void
-msr_finalize(void)
-{
- if (accessClient_mode == DAEMON_AM_DIRECT)
- {
- for ( uint32_t i=0; i < cpuid_topology.numHWThreads; i++ )
- {
- close(FD[i]);
- }
- rdpmc_works = 0;
- }
- else
- {
- socket_fd = -1;
- }
-}
-
-
-uint64_t
-msr_tread(const int tsocket_fd, const int cpu, uint32_t reg)
-{
- if (accessClient_mode == DAEMON_AM_DIRECT)
- {
- uint64_t data;
-
- if (rdpmc_works && reg >= MSR_PMC0 && reg <=MSR_PMC3)
- {
- if (__rdpmc(reg - MSR_PMC0, &data) )
- {
- ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d\n,
- reg,cpu);
- }
- }
- else if (rdpmc_works && reg >= MSR_PERF_FIXED_CTR0 && reg <= MSR_PERF_FIXED_CTR2)
- {
- if (__rdpmc(0x4000000ULL + (reg - MSR_PERF_FIXED_CTR0), &data) )
- {
- ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d\n,
- reg,cpu);
- }
- }
- else
- {
- if ( pread(FD[cpu], &data, sizeof(data), reg) != sizeof(data) )
- {
- ERROR_PRINT(Cannot read MSR reg 0x%x with RDMSR instruction on CPU %d\n,
- reg, cpu);
- }
- }
-
- return data;
- }
- else
- { /* daemon or sysdaemon-mode */
- return accessClient_read(tsocket_fd, cpu, DAEMON_AD_MSR, reg);
- }
-}
-
-
-void
-msr_twrite(const int tsocket_fd, const int cpu, uint32_t reg, uint64_t data)
-{
- if (accessClient_mode == DAEMON_AM_DIRECT)
- {
- if (pwrite(FD[cpu], &data, sizeof(data), reg) != sizeof(data))
- {
- ERROR_PRINT(Cannot write MSR reg 0x%x with WRMSR instruction on CPU %d\n,
- reg, cpu);
- }
- }
- else
- { /* daemon or sysdaemon-mode */
- accessClient_write(tsocket_fd, cpu, DAEMON_AD_MSR, reg, data);
- }
-}
-
-
-uint64_t
-msr_read( const int cpu, uint32_t reg)
-{
- if (accessClient_mode == DAEMON_AM_DIRECT)
- {
- uint64_t data;
-
- if (rdpmc_works && reg >= MSR_PMC0 && reg <=MSR_PMC3)
- {
- if (__rdpmc(reg - MSR_PMC0, &data) )
- {
- ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d\n,
- reg,cpu);
- }
- }
- else if (rdpmc_works && reg >= MSR_PERF_FIXED_CTR0 && reg <= MSR_PERF_FIXED_CTR2)
- {
- if (__rdpmc(0x4000000ULL + (reg - MSR_PERF_FIXED_CTR0), &data) )
- {
- ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d\n,
- reg,cpu);
- }
- }
- else
- {
- if ( pread(FD[cpu], &data, sizeof(data), reg) != sizeof(data) )
- {
- ERROR_PRINT(Cannot read MSR reg 0x%x with RDMSR instruction on CPU %d\n,
- reg, cpu);
- }
- }
-
- return data;
- }
- else
- { /* daemon or sysdaemon-mode */
- return accessClient_read(socket_fd, cpu, DAEMON_AD_MSR, reg);
- }
-}
-
-
-void
-msr_write( const int cpu, uint32_t reg, uint64_t data)
-{
- if (accessClient_mode == DAEMON_AM_DIRECT)
- {
- if (pwrite(FD[cpu], &data, sizeof(data), reg) != sizeof(data))
- {
- ERROR_PRINT(Cannot write MSR reg 0x%x with WRMSR instruction on CPU %d\n,
- reg, cpu);
- }
- }
- else
- { /* daemon or sysdaemon-mode */
- accessClient_write(socket_fd, cpu, DAEMON_AD_MSR, reg, data);
- }
-}
-
-
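
src/msr.c is deleted by this patch. For reference while reading the rest of the series, the direct-access technique it implemented is simple: an MSR is read by pread()ing eight bytes from /dev/cpu/<cpu>/msr at the offset given by the register number, with an optional RDPMC fast path for counter registers. A standalone sketch, assuming the msr kernel module is loaded and the caller has the necessary permissions:

    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>

    /* read one MSR on one CPU through the msr device file */
    static int read_msr(int cpu, uint32_t reg, uint64_t *value)
    {
        char path[64];
        snprintf(path, sizeof(path), "/dev/cpu/%d/msr", cpu);
        int fd = open(path, O_RDONLY);
        if (fd < 0)
            return -1;
        ssize_t n = pread(fd, value, sizeof(*value), reg);
        close(fd);
        return (n == (ssize_t)sizeof(*value)) ? 0 : -1;
    }

    int main(void)
    {
        uint64_t tsc;
        if (read_msr(0, 0x10, &tsc) == 0)   /* 0x10 = IA32_TIME_STAMP_COUNTER */
            printf("TSC on CPU 0: %llu\n", (unsigned long long)tsc);
        return 0;
    }
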
diff --git a/src/multiplex.c b/src/multiplex.c
deleted file mode 100644
index 68a6b88..0000000
--- a/src/multiplex.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: multiplex.c
- *
- * Description:
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <signal.h>
-#include <sys/time.h>
-
-#include <timer.h>
-#include <perfmon.h>
-#include <multiplex.h>
-
-#if 0
-static int currentCollection = -1;
-static MultiplexCollections* multiplex_set = NULL;
-static TimerData timeData;
-static int multiplex_useMarker = 0;
-
-void
-multiplex_printCounters ()
-{
-
-
-
-}
-
-
-
-void
-multiplex_swapEventSet ()
-{
- int threadId;
- PerfmonEventSet* collection;
-
- /* collection from last run */
- collection = multiplex_set->collections + currentCollection;
-
- for (threadId = 0; threadId < perfmon_numThreads; threadId++)
- {
- /* Stop counters */
- if (!multiplex_useMarker) perfmon_stopCountersThread(threadId);
- /* Accumulate counters */
- for (int i=0; i<collection->numberOfEvents; i++)
- {
-// collection->events[i].result[threadId] +=
- // (double) perfmon_threadData[threadId].counters[collection->events[i].index].counterData;
- }
- }
-
- /* switch to next collection */
- if( currentCollection == multiplex_set->numberOfCollections-1)
- {
- currentCollection = 0;
- }
- else
- {
- currentCollection++;
- }
- collection = multiplex_set->collections + currentCollection;
-
- for (threadId = 0; threadId < perfmon_numThreads; threadId++)
- {
- /* Reconfigure counters */
- for (int i=0; i<collection->numberOfEvents; i++)
- {
- perfmon_setupCounterThread(threadId,
- collection->events[i].event.eventId,
- collection->events[i].event.umask,
- collection->events[i].index);
- }
-
- /* Start counters */
- if (!multiplex_useMarker) perfmon_startCountersThread(threadId);
- }
-}
-
-void
-multiplex_init(MultiplexCollections* set)
-{
- int i;
-
- multiplex_set = set;
-
- for (i=0;i<multiplex_set->numberOfCollections; i++)
- {
-// perfmon_initEventset(multiplex_set->collections+i);
- }
-}
-
-void
-multiplex_start()
-{
- struct itimerval val;
- struct sigaction sa;
-
-// multiplex_useMarker = useMarker;
-
- val.it_interval.tv_sec = 0;
- val.it_interval.tv_usec = 500;
- val.it_value.tv_sec = 0;
- val.it_value.tv_usec = 100;
-
- sa.sa_handler = multiplex_printCounters;
- sigemptyset(&sa.sa_mask);
- sa.sa_flags = SA_RESTART;
- if (sigaction(SIGALRM, &sa, NULL) == -1)
- {
- /* Handle error */;
- perror("sigaction");
- }
-
- perfmon_startCounters();
- setitimer(ITIMER_REAL, &val,0);
- timer_start(&timeData);
-}
-
-void
-multiplex_stop()
-{
- struct itimerval val;
-
- val.it_interval.tv_sec = 0;
- val.it_interval.tv_usec = 0;
- val.it_value.tv_sec = 0;
- val.it_value.tv_usec = 0;
-
- timer_stop(&timeData);
- setitimer(ITIMER_REAL, &val,0);
- perfmon_stopCounters();
-
- multiplex_set->time = timer_print(&timeData);
-}
-
-#endif
-
-
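
The removed multiplex.c was already compiled out (#if 0), but it documents the intended technique: a periodic ITIMER_REAL timer delivers SIGALRM and the handler swaps the active event set. A minimal sketch of that pattern, with a placeholder handler and a 100 ms interval chosen only for illustration:

    #include <stdio.h>
    #include <string.h>
    #include <signal.h>
    #include <sys/time.h>
    #include <unistd.h>

    static volatile sig_atomic_t switches = 0;

    /* stand-in for the real handler, which would stop, reprogram and
     * restart the hardware counters */
    static void swap_eventset(int sig)
    {
        (void)sig;
        switches++;
    }

    int main(void)
    {
        struct sigaction sa;
        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = swap_eventset;
        sa.sa_flags = SA_RESTART;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGALRM, &sa, NULL);

        struct itimerval val;
        val.it_interval.tv_sec = 0;
        val.it_interval.tv_usec = 100000;   /* fire every 100 ms */
        val.it_value = val.it_interval;
        setitimer(ITIMER_REAL, &val, NULL);

        while (switches < 5)
            pause();                        /* stand-in for the measured region */
        printf("event set swapped %d times\n", (int)switches);
        return 0;
    }
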
diff --git a/src/numa.c b/src/numa.c
index 2f72765..09459d3 100644
--- a/src/numa.c
+++ b/src/numa.c
@@ -3,15 +3,17 @@
*
* Filename: numa.c
*
- * Description: Implementation of Linux NUMA interface
+ * Description: Implementation of Linux NUMA interface. Selects between hwloc and
+ * procfs/sysfs backends.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -37,352 +39,194 @@
#include <sched.h>
#include <sys/syscall.h>
#include <sys/types.h>
+#include <error.h>
#include <dirent.h>
#ifdef HAS_MEMPOLICY
#include <linux/mempolicy.h>
#endif
+#include <topology.h>
+
+#include <configuration.h>
#include <error.h>
#include <bstrlib.h>
+
#include <numa.h>
-#include <strUtil.h>
+#include <numa_proc.h>
-/* ##### EXPORTED VARIABLES ########################################### */
+#ifdef LIKWID_USE_HWLOC
+#include <hwloc.h>
+#include <topology_hwloc.h>
+#include <numa_hwloc.h>
+#endif
-NumaTopology numa_info;
+/* ##### EXPORTED VARIABLES ########################################### */
+NumaTopology numa_info = {0,NULL};
+static int numaInitialized = 0;
/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
-
-#ifdef HAS_MEMPOLICY
-#define get_mempolicy(policy,nmask,maxnode,addr,flags) syscall(SYS_get_mempolicy,policy,nmask,maxnode,addr,flags)
-#define set_mempolicy(mode,nmask,maxnode) syscall(SYS_set_mempolicy,mode,nmask,maxnode)
-#define mbind(start, len, nmask, maxnode, flags) syscall(SYS_mbind,(start),len,MPOL_BIND,(nmask),maxnode,flags)
-#endif
-
/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
-
-static int maxIdConfiguredNode = 0;
-
/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
-static void
-setConfiguredNodes(void)
+int str2int(const char* str)
{
- DIR *dir;
- struct dirent *de;
-
- dir = opendir("/sys/devices/system/node");
+ char* endptr;
+ errno = 0;
+ unsigned long val;
+ val = strtoul(str, &endptr, 10);
- if (!dir)
+ if ((errno == ERANGE && val == LONG_MAX)
+ || (errno != 0 && val == 0))
{
- maxIdConfiguredNode = 0;
+ fprintf(stderr, "Value in string out of range\n");
+ return -EINVAL;
}
- else
- {
- while ((de = readdir(dir)) != NULL)
- {
- int nd;
- if (strncmp(de->d_name, "node", 4))
- {
- continue;
- }
- nd = str2int(de->d_name+4);
-
- if (maxIdConfiguredNode < nd)
- {
- maxIdConfiguredNode = nd;
- }
- }
- closedir(dir);
- }
-}
-
-
-static void
-nodeMeminfo(int node, uint64_t* totalMemory, uint64_t* freeMemory)
-{
- FILE *fp;
- bstring filename;
- bstring totalString = bformat("MemTotal:");
- bstring freeString = bformat("MemFree:");
- int i;
-
- filename = bformat("/sys/devices/system/node/node%d/meminfo", node);
-
- if (NULL != (fp = fopen (bdata(filename), "r")))
- {
- bstring src = bread ((bNread) fread, fp);
- struct bstrList* tokens = bsplit(src,(char) '\n');
-
- for (i=0;i<tokens->qty;i++)
- {
- if (binstr(tokens->entry[i],0,totalString) != BSTR_ERR)
- {
- bstring tmp = bmidstr (tokens->entry[i], 18, blength(tokens->entry[i])-18 );
- bltrimws(tmp);
- struct bstrList* subtokens = bsplit(tmp,(char) ' ');
- *totalMemory = str2int(bdata(subtokens->entry[0]));
- }
- else if (binstr(tokens->entry[i],0,freeString) != BSTR_ERR)
- {
- bstring tmp = bmidstr (tokens->entry[i], 18, blength(tokens->entry[i])-18 );
- bltrimws(tmp);
- struct bstrList* subtokens = bsplit(tmp,(char) ' ');
- *freeMemory = str2int(bdata(subtokens->entry[0]));
- }
- }
- }
- else
+ if (endptr == str)
{
- ERROR;
+ fprintf(stderr, "No digits were found\n");
+ return -EINVAL;
}
- fclose(fp);
+ return (int) val;
}
-static int
-nodeProcessorList(int node, uint32_t** list)
-{
- FILE *fp;
- bstring filename;
- int count = 0;
- bstring src;
- int i,j;
- struct bstrList* tokens;
- unsigned long val;
- char* endptr;
- int cursor=0;
-// int unitSize = (int) (sizeof(unsigned long)*8);
- int unitSize = (int) 32; /* 8 nibbles */
-
- *list = (uint32_t*) malloc(MAX_NUM_THREADS * sizeof(uint32_t));
-
- /* the cpumap interface should be always there */
- filename = bformat("/sys/devices/system/node/node%d/cpumap", node);
-
- if (NULL != (fp = fopen (bdata(filename), "r")))
- {
-
- src = bread ((bNread) fread, fp);
- tokens = bsplit(src,',');
-
- for (i=(tokens->qty-1); i>=0 ;i--)
- {
- val = strtoul((char*) tokens->entry[i]->data, &endptr, 16);
-
- if ((errno != 0 && val == LONG_MAX )
- || (errno != 0 && val == 0))
- {
- ERROR;
- }
-
- if (endptr == (char*) tokens->entry[i]->data)
- {
- ERROR_PLAIN_PRINT(No digits were found);
- }
-
- if (val != 0UL)
- {
- for (j=0; j<unitSize; j++)
- {
- if (val&(1UL<<j))
- {
- if (count < MAX_NUM_THREADS)
- {
- (*list)[count] = (j+cursor);
- }
- else
- {
- ERROR_PRINT(Number Of threads %d too large,count);
- }
- count++;
- }
- }
- }
- cursor += unitSize;
- }
-
- bstrListDestroy(tokens);
- bdestroy(src);
- bdestroy(filename);
- fclose(fp);
-
- /* FIXME: CPU list here is not physical cores first but numerical sorted */
-
- return count;
- }
+/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
- /* something went wrong */
- return -1;
-}
-
-static int
-nodeDistanceList(int node, int numberOfNodes, uint32_t** list)
+int
+empty_numa_init()
{
- FILE *fp;
- bstring filename;
- int count = 0;
- bstring src;
- struct bstrList* tokens;
-
- *list = (uint32_t*) malloc(numberOfNodes * sizeof(uint32_t));
-
- /* the distance interface should be always there */
- filename = bformat("/sys/devices/system/node/node%d/distance", node);
-
- if (NULL != (fp = fopen (bdata(filename), "r")))
- {
-
- src = bread ((bNread) fread, fp);
- tokens = bsplit(src,' ');
-
- for (int i=0; i<(tokens->qty); i++)
- {
- if (count < numberOfNodes)
- {
- (*list)[count] = (uint32_t)strtoul((char*) (tokens->entry[i]->data), NULL, 10);
- }
- else
- {
- ERROR_PRINT(Number Of nodes %d too large,count);
- }
- count++;
- }
-
- bstrListDestroy(tokens);
- bdestroy(src);
- bdestroy(filename);
- fclose(fp);
- return count;
- }
-
- /* something went wrong */
- return -1;
+ printf("MEMPOLICY NOT supported in kernel!\n");
+ return 0;
}
-
-
-static int
-findProcessor(uint32_t nodeId, uint32_t coreId)
+void
+empty_numa_setInterleaved(int* processorList, int numberOfProcessors)
{
- int i;
+ printf("MEMPOLICY NOT supported in kernel!\n");
+ return;
+}
- for (i=0; i<numa_info.nodes[nodeId].numberOfProcessors; i++)
- {
- if (numa_info.nodes[nodeId].processors[i] == coreId)
- {
- return 1;
- }
- }
- return 0;
+void
+empty_numa_membind(void* ptr, size_t size, int domainId)
+{
+ printf("MBIND NOT supported in kernel!\n");
+ return;
}
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
+const struct numa_functions numa_funcs = {
+#ifndef HAS_MEMPOLICY
+ .numa_init = empty_numa_init,
+ .numa_setInterleaved = empty_numa_setInterleaved,
+ .numa_membind = empty_numa_membind
+#else
+#ifdef LIKWID_USE_HWLOC
+ .numa_init = hwloc_numa_init,
+#else
+ .numa_init = proc_numa_init,
+#endif
+ .numa_setInterleaved = proc_numa_setInterleaved,
+ .numa_membind = proc_numa_membind
+#endif
+};
-#ifdef HAS_MEMPOLICY
-int
-numa_init()
-{
- int errno;
- uint32_t i;
- if (get_mempolicy(NULL, NULL, 0, 0, 0) < 0 && errno == ENOSYS)
+int numa_init(void)
+{
+ const struct numa_functions funcs = numa_funcs;
+ int ret = 0;
+ if (init_config == 0)
{
- return -1;
+ init_configuration();
}
-
- /* First determine maximum number of nodes */
- setConfiguredNodes();
- numa_info.numberOfNodes = maxIdConfiguredNode+1;
- numa_info.nodes = (NumaNode*) malloc(numa_info.numberOfNodes * sizeof(NumaNode));
-
- for (i=0; i<numa_info.numberOfNodes; i++)
+ if (numaInitialized == 1)
{
- nodeMeminfo(i, &numa_info.nodes[i].totalMemory, &numa_info.nodes[i].freeMemory);
- numa_info.nodes[i].id = i;
- numa_info.nodes[i].numberOfProcessors = nodeProcessorList(i,&numa_info.nodes[i].processors);
- numa_info.nodes[i].numberOfDistances = nodeDistanceList(i, numa_info.numberOfNodes, &numa_info.nodes[i].distances);
+ return 0;
}
- if (numa_info.nodes[0].numberOfProcessors < 0)
+ if ((config.topologyCfgFileName != NULL) && (!access(config.topologyCfgFileName, R_OK)) && (numa_info.nodes != NULL))
{
- return -1;
+ /* If we read in the topology file, the NUMA related stuff is already initialized */
+ numaInitialized = 1;
+ return 0;
}
else
{
- return 0;
+ cpu_set_t cpuSet;
+ CPU_ZERO(&cpuSet);
+ sched_getaffinity(0,sizeof(cpu_set_t), &cpuSet);
+ if (cpuid_topology.activeHWThreads < cpuid_topology.numHWThreads)
+ {
+ ret = proc_numa_init();
+ }
+ else
+ {
+ ret = funcs.numa_init();
+ }
+ if (ret == 0)
+ numaInitialized = 1;
}
+ return ret;
}
-void
-numa_setInterleaved(int* processorList, int numberOfProcessors)
+void numa_setInterleaved(int* processorList, int numberOfProcessors)
{
- long i;
- int j;
- int ret=0;
- unsigned long numberOfNodes = 65;
- unsigned long mask = 0UL;
+ const struct numa_functions funcs = numa_funcs;
+ return funcs.numa_setInterleaved(processorList, numberOfProcessors);
+}
+
+void numa_membind(void* ptr, size_t size, int domainId)
+{
+ const struct numa_functions funcs = numa_funcs;
+ return funcs.numa_membind(ptr, size, domainId);
+}
- for (i=0; i<numa_info.numberOfNodes; i++)
+#ifndef HAS_MEMPOLICY
+void numa_finalize(void)
+{
+ return;
+}
+#else
+void numa_finalize(void)
+{
+ int i;
+ if (!numaInitialized)
{
- for (j=0; j<numberOfProcessors; j++)
+ return;
+ }
+ for(i=0;i<numa_info.numberOfNodes;i++)
+ {
+ if (numa_info.nodes[i].processors)
+ {
+ free(numa_info.nodes[i].processors);
+ }
+ if (numa_info.nodes[i].distances)
{
- if (findProcessor(i,processorList[j]))
- {
- mask |= (1UL<<i);
- break;
- }
+ free(numa_info.nodes[i].distances);
}
+ numa_info.nodes[i].id = 0;
+ numa_info.nodes[i].totalMemory = 0;
+ numa_info.nodes[i].freeMemory = 0;
+ numa_info.nodes[i].numberOfProcessors = 0;
+ numa_info.nodes[i].numberOfDistances = 0;
}
-
- ret = set_mempolicy(MPOL_INTERLEAVE,&mask,numberOfNodes);
-
- if (ret < 0)
+ if (numa_info.nodes)
{
- ERROR;
+ free(numa_info.nodes);
}
+ numa_info.numberOfNodes = 0;
+ numaInitialized = 0;
+ return;
}
-void
-numa_membind(void* ptr, size_t size, int domainId)
+int likwid_getNumberOfNodes()
{
- int ret=0;
- unsigned long mask = 0UL;
- unsigned int flags = 0U;
-
- flags |= MPOL_MF_STRICT;
- mask |= (1UL<<domainId);
-
- ret = mbind(ptr, size, &mask, numa_info.numberOfNodes+1, flags);
-
- if (ret < 0)
+ if (numaInitialized)
{
- ERROR;
+ return numa_info.numberOfNodes;
}
+ return 0;
}
-
-#else
-int
-numa_init()
-{
- printf("MEMPOLICY NOT supported in kernel!\n");
-}
-
-void
-numa_setInterleaved(int* processorList, int numberOfProcessors)
-{
- printf("MEMPOLICY NOT supported in kernel!\n");
-}
-
-void
-numa_membind(void* ptr, size_t size, int domainId)
-{
- printf("MBIND NOT supported in kernel!\n");
-}
-
#endif
-
-
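
The rewritten numa.c no longer hard-wires the procfs implementation: it dispatches through a constant function table filled at compile time with either the hwloc or the procfs/sysfs backend (or no-op stubs when HAS_MEMPOLICY is missing) and guards against repeated initialization. A reduced sketch of that pattern with illustrative names, not the likwid ones:

    #include <stdio.h>

    struct backend { int (*init)(void); };

    static int init_a(void) { puts("backend A"); return 0; }   /* e.g. hwloc        */
    static int init_b(void) { puts("backend B"); return 0; }   /* e.g. procfs/sysfs */

    static const struct backend backends[] = {
        { .init = init_a },
        { .init = init_b },
    };

    #ifdef USE_BACKEND_A
    static const struct backend *selected = &backends[0];
    #else
    static const struct backend *selected = &backends[1];
    #endif

    int backend_init(void)
    {
        static int initialized = 0;        /* mirrors the numaInitialized guard */
        if (initialized)
            return 0;
        int ret = selected->init();
        if (ret == 0)
            initialized = 1;
        return ret;
    }

    int main(void) { return backend_init(); }
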
diff --git a/src/numa_hwloc.c b/src/numa_hwloc.c
new file mode 100644
index 0000000..94639fc
--- /dev/null
+++ b/src/numa_hwloc.c
@@ -0,0 +1,415 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: numa_hwloc.c
+ *
+ * Description: Interface to hwloc for NUMA topology
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <error.h>
+
+#include <numa.h>
+#include <topology.h>
+#ifdef LIKWID_USE_HWLOC
+#include <hwloc.h>
+#include <topology_hwloc.h>
+#endif
+
+
+
+/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
+#ifdef LIKWID_USE_HWLOC
+uint64_t getFreeNodeMem(int nodeId)
+{
+ FILE *fp;
+ bstring filename;
+ uint64_t free = 0;
+ bstring freeString = bformat("MemFree:");
+ int i;
+
+ filename = bformat("/sys/devices/system/node/node%d/meminfo", nodeId);
+
+ if (NULL != (fp = fopen (bdata(filename), "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ struct bstrList* tokens = bsplit(src,(char) '\n');
+
+ for (i=0;i<tokens->qty;i++)
+ {
+ if (binstr(tokens->entry[i],0,freeString) != BSTR_ERR)
+ {
+ bstring tmp = bmidstr (tokens->entry[i], 18, blength(tokens->entry[i])-18 );
+ bltrimws(tmp);
+ struct bstrList* subtokens = bsplit(tmp,(char) ' ');
+ free = str2int(bdata(subtokens->entry[0]));
+ bdestroy(tmp);
+ bstrListDestroy(subtokens);
+ }
+ }
+ bstrListDestroy(tokens);
+ bdestroy(src);
+ fclose(fp);
+ }
+ else if (!access("/proc/meminfo", R_OK))
+ {
+ bdestroy(filename);
+ filename = bfromcstr("/proc/meminfo");
+ if (NULL != (fp = fopen (bdata(filename), "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ struct bstrList* tokens = bsplit(src,(char) '\n');
+ for (i=0;i<tokens->qty;i++)
+ {
+ if (binstr(tokens->entry[i],0,freeString) != BSTR_ERR)
+ {
+ bstring tmp = bmidstr (tokens->entry[i], 10, blength(tokens->entry[i])-10 );
+ bltrimws(tmp);
+ struct bstrList* subtokens = bsplit(tmp,(char) ' ');
+ free = str2int(bdata(subtokens->entry[0]));
+ bdestroy(tmp);
+ bstrListDestroy(subtokens);
+ }
+ }
+ bstrListDestroy(tokens);
+ bdestroy(src);
+ fclose(fp);
+ }
+ }
+ else
+ {
+ bdestroy(freeString);
+ bdestroy(filename);
+ ERROR;
+ }
+ bdestroy(freeString);
+ bdestroy(filename);
+ return free;
+
+}
+
+uint64_t getTotalNodeMem(int nodeId)
+{
+ int i;
+ FILE *fp;
+ uint64_t total = 0;
+ bstring totalString = bformat("MemTotal:");
+ bstring sysfilename = bformat("/sys/devices/system/node/node%d/meminfo", nodeId);
+ bstring procfilename = bformat("/proc/meminfo");
+
+ if (NULL != (fp = fopen (bdata(sysfilename), "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ struct bstrList* tokens = bsplit(src,(char) '\n');
+
+ for (i=0;i<tokens->qty;i++)
+ {
+ if (binstr(tokens->entry[i],0,totalString) != BSTR_ERR)
+ {
+ bstring tmp = bmidstr (tokens->entry[i], 18, blength(tokens->entry[i])-18 );
+ bltrimws(tmp);
+ struct bstrList* subtokens = bsplit(tmp,(char) ' ');
+ total = str2int(bdata(subtokens->entry[0]));
+ bdestroy(tmp);
+ bstrListDestroy(subtokens);
+ }
+ }
+ bstrListDestroy(tokens);
+ bdestroy(src);
+ fclose(fp);
+ }
+ else if (!access(bdata(procfilename), R_OK))
+ {
+ if (NULL != (fp = fopen (bdata(procfilename), "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ struct bstrList* tokens = bsplit(src,(char) '\n');
+ for (i=0;i<tokens->qty;i++)
+ {
+ if (binstr(tokens->entry[i],0,totalString) != BSTR_ERR)
+ {
+ bstring tmp = bmidstr (tokens->entry[i], 10, blength(tokens->entry[i])-10 );
+ bltrimws(tmp);
+ struct bstrList* subtokens = bsplit(tmp,(char) ' ');
+ total = str2int(bdata(subtokens->entry[0]));
+ bdestroy(tmp);
+ bstrListDestroy(subtokens);
+ }
+ }
+ bstrListDestroy(tokens);
+ bdestroy(src);
+ fclose(fp);
+ }
+ }
+ else
+ {
+ bdestroy(totalString);
+ bdestroy(sysfilename);
+ bdestroy(procfilename);
+ ERROR;
+ }
+
+ bdestroy(totalString);
+ bdestroy(sysfilename);
+ bdestroy(procfilename);
+ return total;
+}
+
+int likwid_hwloc_findProcessor(int nodeID, int cpuID)
+{
+ hwloc_obj_t obj;
+ int i;
+ int pu_count = likwid_hwloc_get_nbobjs_by_type(hwloc_topology, HWLOC_OBJ_PU);
+
+ for (i=0; i<pu_count; i++)
+ {
+ obj = likwid_hwloc_get_obj_by_type(hwloc_topology, HWLOC_OBJ_PU, i);
+ if (!obj)
+ {
+ continue;
+ }
+ else
+ {
+ if (obj->os_index == cpuID)
+ {
+ return 1;
+ }
+ }
+ }
+ return 0;
+
+}
+
+/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
+int hwloc_numa_init(void)
+{
+ int errno;
+ uint32_t i;
+ int d;
+ int depth;
+ int cores_per_socket;
+ hwloc_obj_t obj;
+ const struct hwloc_distances_s* distances;
+ hwloc_obj_type_t hwloc_type = HWLOC_OBJ_NODE;
+
+ if (!hwloc_topology)
+ {
+ likwid_hwloc_topology_init(&hwloc_topology);
+ likwid_hwloc_topology_load(hwloc_topology);
+ }
+
+ numa_info.numberOfNodes = likwid_hwloc_get_nbobjs_by_type(hwloc_topology, hwloc_type);
+
+ /* If the amount of NUMA nodes == 0, there is actually no NUMA node, hence
+ aggregate all sockets in the system into the single virtually created NUMA node */
+ if (numa_info.numberOfNodes == 0)
+ {
+ hwloc_type = HWLOC_OBJ_SOCKET;
+ numa_info.numberOfNodes = 1;
+
+ numa_info.nodes = (NumaNode*) malloc(sizeof(NumaNode));
+ if (!numa_info.nodes)
+ {
+ fprintf(stderr,"No memory to allocate %ld byte for nodes array\n",sizeof(NumaNode));
+ return -1;
+ }
+
+ numa_info.nodes[0].id = 0;
+ numa_info.nodes[0].numberOfProcessors = 0;
+ numa_info.nodes[0].totalMemory = getTotalNodeMem(0);
+ numa_info.nodes[0].freeMemory = getFreeNodeMem(0);
+ numa_info.nodes[0].processors = (uint32_t*) malloc(MAX_NUM_THREADS * sizeof(uint32_t));
+ if (!numa_info.nodes[0].processors)
+ {
+ fprintf(stderr,"No memory to allocate %ld byte for processors array of NUMA node %d\n",MAX_NUM_THREADS * sizeof(uint32_t),0);
+ return -1;
+ }
+ numa_info.nodes[0].distances = (uint32_t*) malloc(sizeof(uint32_t));
+ if (!numa_info.nodes[0].distances)
+ {
+ fprintf(stderr,"No memory to allocate %ld byte for distances array of NUMA node %d\n",sizeof(uint32_t),0);
+ return -1;
+ }
+ numa_info.nodes[0].distances[0] = 10;
+ numa_info.nodes[0].numberOfDistances = 1;
+ cores_per_socket = cpuid_topology.numHWThreads/cpuid_topology.numSockets;
+
+ for (d=0; d<likwid_hwloc_get_nbobjs_by_type(hwloc_topology, hwloc_type); d++)
+ {
+ obj = likwid_hwloc_get_obj_by_type(hwloc_topology, hwloc_type, d);
+ /* depth is here used as index in the processors array */
+ depth = d * cores_per_socket;
+ numa_info.nodes[0].numberOfProcessors += likwid_hwloc_record_objs_of_type_below_obj(
+ likwid_hwloc_topology, obj, HWLOC_OBJ_PU, &depth, &numa_info.nodes[0].processors);
+ }
+ }
+ else
+ {
+ numa_info.nodes = (NumaNode*) malloc(numa_info.numberOfNodes * sizeof(NumaNode));
+ if (!numa_info.nodes)
+ {
+ fprintf(stderr,"No memory to allocate %ld byte for nodes array\n",numa_info.numberOfNodes * sizeof(NumaNode));
+ return -1;
+ }
+ depth = likwid_hwloc_get_type_depth(hwloc_topology, hwloc_type);
+ distances = likwid_hwloc_get_whole_distance_matrix_by_type(hwloc_topology, hwloc_type);
+ for (i=0; i<numa_info.numberOfNodes; i++)
+ {
+ obj = likwid_hwloc_get_obj_by_depth(hwloc_topology, depth, i);
+
+ numa_info.nodes[i].id = obj->os_index;
+
+ if (obj->memory.local_memory != 0)
+ {
+ numa_info.nodes[i].totalMemory = (uint64_t)(obj->memory.local_memory/1024);
+ }
+ else if (obj->memory.total_memory != 0)
+ {
+ numa_info.nodes[i].totalMemory = (uint64_t)(obj->memory.total_memory/1024);
+ }
+ else
+ {
+ numa_info.nodes[i].totalMemory = getTotalNodeMem(numa_info.nodes[i].id);
+ }
+
+ /* freeMemory not detected by hwloc, do it the native way */
+ numa_info.nodes[i].freeMemory = getFreeNodeMem(numa_info.nodes[i].id);
+ numa_info.nodes[i].processors = (uint32_t*) malloc(MAX_NUM_THREADS * sizeof(uint32_t));
+ if (!numa_info.nodes[i].processors)
+ {
+ fprintf(stderr,"No memory to allocate %ld byte for processors array of NUMA node %d\n",MAX_NUM_THREADS * sizeof(uint32_t), i);
+ return -1;
+ }
+ d = 0;
+ numa_info.nodes[i].numberOfProcessors = likwid_hwloc_record_objs_of_type_below_obj(
+ hwloc_topology, obj, HWLOC_OBJ_PU, &d, &numa_info.nodes[i].processors);
+
+ numa_info.nodes[i].distances = (uint32_t*) malloc(numa_info.numberOfNodes * sizeof(uint32_t));
+ if (!numa_info.nodes[i].distances)
+ {
+ fprintf(stderr,"No memory to allocate %ld byte for distances array of NUMA node %d\n",numa_info.numberOfNodes*sizeof(uint32_t),i);
+ return -1;
+ }
+ if (distances)
+ {
+ numa_info.nodes[i].numberOfDistances = distances->nbobjs;
+ for(d=0;d<distances->nbobjs;d++)
+ {
+ numa_info.nodes[i].distances[d] = distances->latency[i*distances->nbobjs + d] * distances->latency_base;
+ }
+ }
+ else
+ {
+ numa_info.nodes[i].numberOfDistances = numa_info.numberOfNodes;
+ for(d=0;d<numa_info.numberOfNodes;d++)
+ {
+ numa_info.nodes[i].distances[d] = 10;
+ }
+ }
+
+ }
+
+ }
+
+ if (numa_info.nodes[0].numberOfProcessors == 0)
+ {
+ return -1;
+ }
+ else
+ {
+ return 0;
+ }
+}
+
+void hwloc_numa_membind(void* ptr, size_t size, int domainId)
+{
+ int ret = 0;
+ hwloc_membind_flags_t flags = HWLOC_MEMBIND_STRICT|HWLOC_MEMBIND_PROCESS;
+ hwloc_nodeset_t nodeset = likwid_hwloc_bitmap_alloc();
+
+ likwid_hwloc_bitmap_zero(nodeset);
+ likwid_hwloc_bitmap_set(nodeset, domainId);
+
+ ret = likwid_hwloc_set_area_membind_nodeset(hwloc_topology, ptr, size, nodeset, HWLOC_MEMBIND_BIND, flags);
+
+ likwid_hwloc_bitmap_free(nodeset);
+
+ if (ret < 0)
+ {
+ ERROR;
+ }
+}
+
+
+
+void hwloc_numa_setInterleaved(int* processorList, int numberOfProcessors)
+{
+ int i,j;
+ int ret = 0;
+ likwid_hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
+ likwid_hwloc_membind_flags_t flags = HWLOC_MEMBIND_STRICT|HWLOC_MEMBIND_PROCESS;
+
+ likwid_hwloc_bitmap_zero(cpuset);
+
+ for (i=0; i<numa_info.numberOfNodes; i++)
+ {
+ for (j=0; j<numberOfProcessors; j++)
+ {
+ if (likwid_hwloc_findProcessor(i,processorList[j]))
+ {
+ likwid_hwloc_bitmap_set(cpuset, i);
+ }
+ }
+ }
+
+
+ ret = likwid_hwloc_set_membind(hwloc_topology, cpuset, HWLOC_MEMBIND_INTERLEAVE, flags);
+
+ likwid_hwloc_bitmap_free(cpuset);
+
+ if (ret < 0)
+ {
+ ERROR;
+ }
+}
+#else
+int hwloc_numa_init(void)
+{
+ return 1;
+}
+
+void hwloc_numa_membind(void* ptr, size_t size, int domainId)
+{
+ return;
+}
+
+void hwloc_numa_setInterleaved(int* processorList, int numberOfProcessors)
+{
+ return;
+}
+
+#endif
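
The new hwloc backend binds memory with nodesets instead of raw syscalls. A self-contained sketch of the same call against the plain hwloc 1.x API targeted by the code above (the likwid_hwloc_* wrappers appear to be prefixed copies of these entry points); node 0 and the 64 MB size are arbitrary choices for illustration:

    #include <stdio.h>
    #include <stdlib.h>
    #include <hwloc.h>

    int main(void)
    {
        hwloc_topology_t topo;
        hwloc_topology_init(&topo);
        hwloc_topology_load(topo);

        size_t size = 64UL * 1024 * 1024;
        char *buf = malloc(size);

        hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
        hwloc_bitmap_zero(nodeset);
        hwloc_bitmap_set(nodeset, 0);      /* bind to NUMA node 0 */

        int ret = hwloc_set_area_membind_nodeset(topo, buf, size, nodeset,
                                                 HWLOC_MEMBIND_BIND,
                                                 HWLOC_MEMBIND_STRICT);
        printf("membind %s\n", ret == 0 ? "ok" : "failed");

        hwloc_bitmap_free(nodeset);
        free(buf);
        hwloc_topology_destroy(topo);
        return 0;
    }
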
diff --git a/src/numa_proc.c b/src/numa_proc.c
new file mode 100644
index 0000000..a17d824
--- /dev/null
+++ b/src/numa_proc.c
@@ -0,0 +1,383 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: numa_proc.c
+ *
+ * Description: Get NUMA topology from procfs and sysfs
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <dirent.h>
+#include <error.h>
+//#include <strUtil.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#ifdef HAS_MEMPOLICY
+#include <linux/mempolicy.h>
+#endif
+
+#include <numa.h>
+#include <topology.h>
+
+/* ##### EXPORTED VARIABLES ########################################### */
+
+
+
+
+/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
+
+#ifdef HAS_MEMPOLICY
+#define get_mempolicy(policy,nmask,maxnode,addr,flags) syscall(SYS_get_mempolicy,policy,nmask,maxnode,addr,flags)
+#define set_mempolicy(mode,nmask,maxnode) syscall(SYS_set_mempolicy,mode,nmask,maxnode)
+#define mbind(start, len, nmask, maxnode, flags) syscall(SYS_mbind,(start),len,MPOL_BIND,(nmask),maxnode,flags)
+#endif
+
+/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
+int
+proc_findProcessor(uint32_t nodeId, uint32_t coreId)
+{
+ int i;
+
+ for (i=0; i<numa_info.nodes[nodeId].numberOfProcessors; i++)
+ {
+ if (numa_info.nodes[nodeId].processors[i] == coreId)
+ {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
+static int
+setConfiguredNodes(void)
+{
+ DIR *dir;
+ struct dirent *de;
+ int maxIdConfiguredNode = 0;
+
+ dir = opendir("/sys/devices/system/node");
+
+ if (!dir)
+ {
+ maxIdConfiguredNode = 0;
+ }
+ else
+ {
+ while ((de = readdir(dir)) != NULL)
+ {
+ int nd;
+ if (strncmp(de->d_name, "node", 4))
+ {
+ continue;
+ }
+
+ nd = str2int(de->d_name+4);
+
+ if (maxIdConfiguredNode < nd)
+ {
+ maxIdConfiguredNode = nd;
+ }
+ }
+ closedir(dir);
+ }
+ return maxIdConfiguredNode;
+}
+
+
+static void
+nodeMeminfo(int node, uint64_t* totalMemory, uint64_t* freeMemory)
+{
+ FILE *fp;
+ bstring filename;
+ bstring totalString = bformat("MemTotal:");
+ bstring freeString = bformat("MemFree:");
+ int i;
+
+ filename = bformat("/sys/devices/system/node/node%d/meminfo", node);
+
+ if (NULL != (fp = fopen (bdata(filename), "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ struct bstrList* tokens = bsplit(src,(char) '\n');
+
+ for (i=0;i<tokens->qty;i++)
+ {
+ if (binstr(tokens->entry[i],0,totalString) != BSTR_ERR)
+ {
+ bstring tmp = bmidstr (tokens->entry[i], 18, blength(tokens->entry[i])-18 );
+ bltrimws(tmp);
+ struct bstrList* subtokens = bsplit(tmp,(char) ' ');
+ *totalMemory = str2int(bdata(subtokens->entry[0]));
+ bstrListDestroy(subtokens);
+ bdestroy(tmp);
+ }
+ else if (binstr(tokens->entry[i],0,freeString) != BSTR_ERR)
+ {
+ bstring tmp = bmidstr (tokens->entry[i], 18, blength(tokens->entry[i])-18 );
+ bltrimws(tmp);
+ struct bstrList* subtokens = bsplit(tmp,(char) ' ');
+ *freeMemory = str2int(bdata(subtokens->entry[0]));
+ bstrListDestroy(subtokens);
+ bdestroy(tmp);
+ }
+ }
+ bdestroy(src);
+ bstrListDestroy(tokens);
+ }
+ else
+ {
+ bdestroy(filename);
+ bdestroy(totalString);
+ bdestroy(freeString);
+ ERROR;
+ }
+ bdestroy(filename);
+ bdestroy(totalString);
+ bdestroy(freeString);
+ fclose(fp);
+}
+
+static int
+nodeProcessorList(int node, uint32_t** list)
+{
+ FILE *fp;
+ bstring filename;
+ int count = 0;
+ bstring src;
+ int i,j;
+ struct bstrList* tokens;
+ unsigned long val;
+ char* endptr;
+ int cursor=0;
+// int unitSize = (int) (sizeof(unsigned long)*8);
+ int unitSize = (int) 32; /* 8 nibbles */
+
+ *list = (uint32_t*) malloc(MAX_NUM_THREADS * sizeof(uint32_t));
+ if (!(*list))
+ {
+ return -ENOMEM;
+ }
+
+ /* the cpumap interface should be always there */
+ filename = bformat("/sys/devices/system/node/node%d/cpumap", node);
+
+ if (NULL != (fp = fopen (bdata(filename), "r")))
+ {
+
+ src = bread ((bNread) fread, fp);
+ tokens = bsplit(src,',');
+
+ for (i=(tokens->qty-1); i>=0 ;i--)
+ {
+ val = strtoul((char*) tokens->entry[i]->data, &endptr, 16);
+
+ if ((errno != 0 && val == LONG_MAX )
+ || (errno != 0 && val == 0))
+ {
+ return -EFAULT;
+ }
+
+ if (endptr == (char*) tokens->entry[i]->data)
+ {
+ ERROR_PLAIN_PRINT(No digits were found);
+ return -EFAULT;
+ }
+
+ if (val != 0UL)
+ {
+ for (j=0; j<unitSize; j++)
+ {
+ if (val&(1UL<<j))
+ {
+ if (count < MAX_NUM_THREADS)
+ {
+ (*list)[count] = (j+cursor);
+ }
+ else
+ {
+ ERROR_PRINT(Number Of threads %d too large,count);
+ return -EFAULT;
+ }
+ count++;
+ }
+ }
+ }
+ cursor += unitSize;
+ }
+
+ bstrListDestroy(tokens);
+ bdestroy(src);
+ bdestroy(filename);
+ fclose(fp);
+
+ /* FIXME: CPU list here is not physical cores first but numerical sorted */
+
+
+ return count;
+ }
+
+ /* something went wrong */
+ return -1;
+}
+
+static int
+nodeDistanceList(int node, int numberOfNodes, uint32_t** list)
+{
+ FILE *fp;
+ bstring filename;
+ int count = 0;
+ bstring src;
+ struct bstrList* tokens;
+
+ *list = (uint32_t*) malloc(numberOfNodes * sizeof(uint32_t));
+ if (!(*list))
+ {
+ return -ENOMEM;
+ }
+
+ /* the distance interface should be always there */
+ filename = bformat("/sys/devices/system/node/node%d/distance", node);
+
+ if (NULL != (fp = fopen (bdata(filename), "r")))
+ {
+
+ src = bread ((bNread) fread, fp);
+ tokens = bsplit(src,' ');
+
+ for (int i=0; i<(tokens->qty); i++)
+ {
+ if (count < numberOfNodes)
+ {
+ (*list)[count] = (uint32_t)strtoul((char*) (tokens->entry[i]->data), NULL, 10);
+ }
+ else
+ {
+ ERROR_PRINT(Number Of nodes %d too large,count);
+ return -EFAULT;
+ }
+ count++;
+ }
+
+ bstrListDestroy(tokens);
+ bdestroy(src);
+ bdestroy(filename);
+ fclose(fp);
+ return count;
+ }
+
+ /* something went wrong */
+ return -1;
+}
+
+/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
+
+int proc_numa_init(void)
+{
+ int errno;
+ uint32_t i;
+
+ if (get_mempolicy(NULL, NULL, 0, 0, 0) < 0 && errno == ENOSYS)
+ {
+ numa_info.numberOfNodes = 0;
+ numa_info.nodes = NULL;
+ return -1;
+ }
+ /* First determine maximum number of nodes */
+ numa_info.numberOfNodes = setConfiguredNodes()+1;
+ numa_info.nodes = (NumaNode*) malloc(numa_info.numberOfNodes * sizeof(NumaNode));
+ if (!numa_info.nodes)
+ {
+ return -ENOMEM;
+ }
+
+ for (i=0; i<numa_info.numberOfNodes; i++)
+ {
+ numa_info.nodes[i].id = i;
+ nodeMeminfo(i, &numa_info.nodes[i].totalMemory, &numa_info.nodes[i].freeMemory);
+ numa_info.nodes[i].numberOfProcessors = nodeProcessorList(i,&numa_info.nodes[i].processors);
+ if (numa_info.nodes[i].numberOfProcessors == 0)
+ {
+ return -EFAULT;
+ }
+ numa_info.nodes[i].numberOfDistances = nodeDistanceList(i, numa_info.numberOfNodes, &numa_info.nodes[i].distances);
+ if (numa_info.nodes[i].numberOfDistances == 0)
+ {
+ return -EFAULT;
+ }
+ }
+
+ return 0;
+}
+
+void
+proc_numa_setInterleaved(int* processorList, int numberOfProcessors)
+{
+ long i;
+ int j;
+ int ret=0;
+ unsigned long numberOfNodes = 65;
+ unsigned long mask = 0UL;
+
+ for (i=0; i<numa_info.numberOfNodes; i++)
+ {
+ for (j=0; j<numberOfProcessors; j++)
+ {
+ if (proc_findProcessor(i,processorList[j]))
+ {
+ mask |= (1UL<<i);
+ break;
+ }
+ }
+ }
+
+ ret = set_mempolicy(MPOL_INTERLEAVE,&mask,numberOfNodes);
+
+ if (ret < 0)
+ {
+ ERROR;
+ }
+}
+
+void
+proc_numa_membind(void* ptr, size_t size, int domainId)
+{
+ int ret=0;
+ unsigned long mask = 0UL;
+ unsigned int flags = 0U;
+
+ flags |= MPOL_MF_STRICT;
+ mask |= (1UL<<domainId);
+
+ ret = mbind(ptr, size, &mask, numa_info.numberOfNodes+1, flags);
+
+ if (ret < 0)
+ {
+ ERROR;
+ }
+}
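
proc_numa_membind() above wraps the raw mbind syscall; the essential sequence is an anonymous mapping plus SYS_mbind with a one-bit node mask. A standalone sketch for NUMA node 0, assuming a MEMPOLICY-enabled kernel; illustration only:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/mempolicy.h>

    int main(void)
    {
        size_t size = 16UL * 1024 * 1024;
        void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (ptr == MAP_FAILED)
            return 1;

        unsigned long mask = 1UL << 0;     /* node 0 */
        long ret = syscall(SYS_mbind, ptr, size, MPOL_BIND, &mask,
                           sizeof(mask) * 8, MPOL_MF_STRICT);
        printf("mbind %s\n", ret == 0 ? "ok" : "failed");

        munmap(ptr, size);
        return 0;
    }
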
diff --git a/src/pci.c b/src/pci.c
deleted file mode 100644
index 2e8a22f..0000000
--- a/src/pci.c
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: pci.c
- *
- * Description: Implementation of pci module.
- * Provides API to read and write values to the hardware
- * performance monitoring registers in PCI Cfg space
- * for Intel Sandy Bridge Processors.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* ##### HEADER FILE INCLUDES ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <fcntl.h>
-#include <string.h>
-#include <unistd.h>
-#include <signal.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-
-#include <types.h>
-#include <accessClient.h>
-#include <bstrlib.h>
-#include <error.h>
-#include <pci.h>
-#include <cpuid.h>
-#include <affinity.h>
-
-/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
-#define STRINGIFY(x) #x
-#define TOSTRING(x) STRINGIFY(x)
-
-#define PCI_ROOT_PATH "/proc/bus/pci/"
-
-/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
-
-static int socket_fd = -1;
-static int FD[MAX_NUM_NODES][MAX_NUM_DEVICES];
-
-static char* pci_DevicePath[MAX_NUM_DEVICES] = {
- "13.5", /* PCI_R3QPI_DEVICE_LINK_0 */
- "13.6", /* PCI_R3QPI_DEVICE_LINK_1 */
- "13.1", /* PCI_R2PCIE_DEVICE */
- "10.0", /* PCI_IMC_DEVICE_CH_0 */
- "10.1", /* PCI_IMC_DEVICE_CH_1 */
- "10.4", /* PCI_IMC_DEVICE_CH_2 */
- "10.5", /* PCI_IMC_DEVICE_CH_3 */
- "0e.1", /* PCI_HA_DEVICE */
- "08.2", /* PCI_QPI_DEVICE_PORT_0 */
- "09.2", /* PCI_QPI_DEVICE_PORT_1 */
- "08.6", /* PCI_QPI_MASK_DEVICE_PORT_0 */
- "09.6", /* PCI_QPI_MASK_DEVICE_PORT_1 */
- "08.0", /* PCI_QPI_MISC_DEVICE_PORT_0 */
- "09.0" }; /* PCI_QPI_MISC_DEVICE_PORT_1 */
-
-/* Socket to bus mapping -- will be determined at runtime;
- * typical mappings are:
- * Socket Bus (2S) Bus (4s)
- * 0 0xff 0x3f
- * 1 0x7f 0x7f
- * 2 0xbf
- * 3 0xff
- */
-static char* socket_bus[MAX_NUM_NODES];
-static int socket_count = 0;
-
-/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
-
-
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-
-void
-pci_init(int initSocket_fd)
-{
- FILE *fptr;
- char buf[1024];
- uint32_t testDevice;
- uint32_t sbus, sdevfn, svend;
- int cntr = 0;
- int active_devs = 0;
-
- for ( int j=0; j<MAX_NUM_NODES; j++ )
- {
- socket_bus[j] = "N-A";
- for (int i=0; i<MAX_NUM_DEVICES; i++)
- {
- FD[j][i] = 0;
- }
- }
-
- if (cpuid_info.model == SANDYBRIDGE_EP)
- {
- testDevice = 0x80863c44;
- }
- else if (cpuid_info.model == IVYBRIDGE_EP)
- {
- testDevice = 0x80860e36;
- }
- else
- {
- /*
- fprintf(stderr, "Unsupported architecture for pci based uncore. \
- Thus, no support for PCI based Uncore counters.\n");
- */
- return;
- }
-
- if ( (fptr = fopen( "/proc/bus/pci/devices", "r")) == NULL )
- {
- fprintf(stderr, "Unable to open /proc/bus/pci/devices. \
- Thus, no support for PCI based Uncore counters.\n");
- return;
- }
-
- while( fgets(buf, sizeof(buf)-1, fptr) )
- {
- if ( sscanf(buf, "%2x%2x %8x", &sbus, &sdevfn, &svend) == 3 &&
- svend == testDevice )
- {
- socket_bus[cntr] = (char*)malloc(4);
- sprintf(socket_bus[cntr++], "%02x/", sbus);
- }
- }
- fclose(fptr);
-
- if ( cntr == 0 )
- {
- fprintf(stderr, "Uncore not supported on this system\n");
- return;
- }
-
- socket_count = cntr;
-
- bstring filepath = bfromcstr ( PCI_ROOT_PATH );
- bcatcstr(filepath, socket_bus[0]);
- bcatcstr(filepath, pci_DevicePath[0] );
-
-
- if (access(bdata(filepath),F_OK))
- {
- fprintf(stderr, "INFO\n");
- fprintf(stderr, "This system has no support for PCI based Uncore counters.\n");
- fprintf(stderr, "This means you cannot use performance groups as MEM, which require Uncore counters.\n\n");
- return;
- }
- bdestroy(filepath);
-
- for (int j=0; j<socket_count; j++)
- {
- for (int i=0; i<MAX_NUM_DEVICES; i++)
- {
-
- bstring filepath = bfromcstr ( PCI_ROOT_PATH );
- bcatcstr(filepath, socket_bus[j]);
- bcatcstr(filepath, pci_DevicePath[i] );
-
- if (!access(bdata(filepath),F_OK))
- {
- FD[j][i] = 0;
- }
- else
- {
- FD[j][i] = -2;
- }
- bdestroy(filepath);
- }
- }
-
- if (accessClient_mode == DAEMON_AM_DIRECT)
- {
- if(geteuid() != 0)
- {
- fprintf(stderr, "WARNING\n");
- fprintf(stderr, "Direct access to the PCI Cfg Adressspace is only allowed for uid root!\n");
- fprintf(stderr, "This means you can use performance groups as MEM only as root in direct mode.\n");
- fprintf(stderr, "Alternatively you might want to look into (sys)daemonmode.\n\n");
- }
- }
- else /* daemon or sysdaemon-mode */
- {
- socket_fd = initSocket_fd;
- }
-}
-
-
-void
-pci_finalize()
-{
- for (int j=0; j<socket_count; j++)
- {
- for (int i=0; i<MAX_NUM_DEVICES; i++)
- {
- if (FD[j][i] > 0)
- {
- close(FD[j][i]);
- }
- }
- }
-
- if (accessClient_mode != DAEMON_AM_DIRECT)
- {
- socket_fd = -1;
- }
-}
-
-
-uint32_t
-pci_read(int cpu, PciDeviceIndex device, uint32_t reg)
-{
- int socketId = affinity_core2node_lookup[cpu];
- if ( FD[socketId][device] == -2)
- {
- fprintf(stderr, "Trying to access non-existent PCI device (%s) for reading\n", pci_DevicePath[device]);
- return 0;
- }
-
- if (accessClient_mode == DAEMON_AM_DIRECT)
- {
- uint32_t data = 0;
- if ( !FD[socketId][device] )
- {
- bstring filepath = bfromcstr ( PCI_ROOT_PATH );
- bcatcstr(filepath, socket_bus[socketId]);
- bcatcstr(filepath, pci_DevicePath[device] );
- FD[socketId][device] = open( bdata(filepath), O_RDWR);
-
- if ( FD[socketId][device] < 0)
- {
- fprintf(stderr, "ERROR in pci_read: failed to open pci device %s: %s!\n",
- bdata(filepath), strerror(errno));
- }
- bdestroy(filepath);
- }
-
- if ( FD[socketId][device] > 0 &&
- pread(FD[socketId][device], &data, sizeof data, reg) != sizeof data )
- {
- ERROR_PRINT("ERROR in pci_read: failed on CPU %d Register 0x%x", cpu, reg);
- }
-
- return data;
- }
- else
- { /* daemon or sysdaemon-mode */
- return (uint32_t) accessClient_read(socket_fd, socketId, device, reg);
- }
-}
-
-
-
-void
-pci_write(int cpu, PciDeviceIndex device, uint32_t reg, uint32_t data)
-{
- int socketId = affinity_core2node_lookup[cpu];
-
- if ( FD[socketId][device] == -2)
- {
- fprintf(stderr, "Trying to access non-existent PCI device (%s) for writing\n", pci_DevicePath[device]);
- return;
- }
- if (accessClient_mode == DAEMON_AM_DIRECT)
- {
- if ( !FD[socketId][device] )
- {
- bstring filepath = bfromcstr ( PCI_ROOT_PATH );
- bcatcstr(filepath, socket_bus[socketId]);
- bcatcstr(filepath, pci_DevicePath[device] );
- FD[socketId][device] = open( bdata(filepath), O_RDWR);
-
- if ( FD[socketId][device] < 0)
- {
- fprintf(stderr, "ERROR in pci_write: failed to open pci device %s: %s!\n",
- bdata(filepath), strerror(errno));
- }
- bdestroy(filepath);
- }
-
- if ( FD[socketId][device] > 0 &&
- pwrite(FD[socketId][device], &data, sizeof data, reg) != sizeof data)
- {
- ERROR_PRINT("ERROR in pci_write: failed on CPU %d Register 0x%x", cpu, reg);
- }
- }
- else
- { /* daemon or sysdaemon-mode */
- accessClient_write(socket_fd, socketId, device, reg, (uint64_t) data);
- }
-}
-
-uint32_t
-pci_tread(const int tsocket_fd, const int cpu, PciDeviceIndex device, uint32_t reg)
-{
- int socketId = affinity_core2node_lookup[cpu];
- if ( FD[socketId][device] == -2)
- {
- return 0;
- }
-
- if (accessClient_mode == DAEMON_AM_DIRECT)
- {
- uint32_t data = 0;
- if ( !FD[socketId][device] )
- {
- bstring filepath = bfromcstr ( PCI_ROOT_PATH );
- bcatcstr(filepath, socket_bus[socketId]);
- bcatcstr(filepath, pci_DevicePath[device] );
-
- FD[socketId][device] = open( bdata(filepath), O_RDWR);
-
- if ( FD[socketId][device] < 0)
- {
- fprintf(stderr, "ERROR in pci_tread:\n failed to open pci device %s: %s!\n",
- bdata(filepath), strerror(errno));
- }
- bdestroy(filepath);
- }
-
- if ( FD[socketId][device] > 0 &&
- pread(FD[socketId][device], &data, sizeof data, reg) != sizeof data )
- {
- ERROR_PRINT("ERROR in pci_tread: failed on CPU %d Register 0x%x", cpu, reg);
- }
-
- return data;
- }
- else
- { /* daemon or sysdaemon-mode */
- return accessClient_read(tsocket_fd, socketId, device, reg);
- }
-}
-
-void
-pci_twrite( const int tsocket_fd, const int cpu, PciDeviceIndex device, uint32_t reg, uint32_t data)
-{
- int socketId = affinity_core2node_lookup[cpu];
- if ( FD[socketId][device] == -2)
- {
- return;
- }
- if (accessClient_mode == DAEMON_AM_DIRECT)
- {
- if ( !FD[socketId][device] )
- {
- bstring filepath = bfromcstr ( PCI_ROOT_PATH );
- bcatcstr(filepath, socket_bus[socketId]);
- bcatcstr(filepath, pci_DevicePath[device] );
-
- FD[socketId][device] = open( bdata(filepath), O_RDWR);
-
- if ( FD[socketId][device] < 0)
- {
- fprintf(stderr, "ERROR in pci_twrite: failed to open pci device %s: %s!\n",
- bdata(filepath), strerror(errno));
- }
- bdestroy(filepath);
- }
-
- if ( FD[socketId][device] > 0 &&
- pwrite(FD[socketId][device], &data, sizeof data, reg) != sizeof data)
- {
- ERROR_PRINT("ERROR in pci_twrite: failed on CPU %d Register 0x%x", cpu, reg);
- }
- }
- else
- { /* daemon or sysdaemon-mode */
- accessClient_write(tsocket_fd, socketId, device, reg, data);
- }
-}
-
-
-
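
src/pci.c is removed by this patch. For reference, the direct-access path it implemented reads uncore registers by pread()ing four bytes from /proc/bus/pci/<bus>/<dev.fn> at the register offset. A standalone sketch that only reads the vendor/device id word of a placeholder device (wider config-space access generally needs root):

    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>

    /* read a 32-bit register from a PCI config-space device file */
    static int pci_cfg_read(const char *path, uint32_t reg, uint32_t *data)
    {
        int fd = open(path, O_RDONLY);
        if (fd < 0)
            return -1;
        ssize_t n = pread(fd, data, sizeof(*data), reg);
        close(fd);
        return (n == (ssize_t)sizeof(*data)) ? 0 : -1;
    }

    int main(void)
    {
        uint32_t id = 0;
        /* offset 0x0: vendor id in the low, device id in the high 16 bits */
        if (pci_cfg_read("/proc/bus/pci/00/00.0", 0x0, &id) == 0)
            printf("vendor 0x%04x device 0x%04x\n", id & 0xffff, id >> 16);
        return 0;
    }
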
diff --git a/src/pci_hwloc.c b/src/pci_hwloc.c
new file mode 100644
index 0000000..217e447
--- /dev/null
+++ b/src/pci_hwloc.c
@@ -0,0 +1,81 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: pci_hwloc.c
+ *
+ * Description: Interface to hwloc for PCI device lookup
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+
+
+#include <hwloc.h>
+#include <types.h>
+#include <bstrlib.h>
+#include <affinity.h>
+#include <topology.h>
+#include <topology_hwloc.h>
+#include <error.h>
+
+int
+hwloc_pci_init(uint16_t testDevice, char** socket_bus, int* nrSockets)
+{
+ int cntr = 0;
+ uint16_t testVendor = 0x8086;
+ hwloc_obj_t obj;
+ int flags;
+ int i;
+
+ if (!hwloc_topology)
+ {
+ likwid_hwloc_topology_init(&hwloc_topology);
+ likwid_hwloc_topology_set_flags(hwloc_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_IO );
+ likwid_hwloc_topology_load(hwloc_topology);
+ }
+
+ for(i = 0; i < likwid_hwloc_get_nbobjs_by_type(hwloc_topology, HWLOC_OBJ_PCI_DEVICE); i++)
+ {
+ obj = likwid_hwloc_get_obj_by_type(hwloc_topology, HWLOC_OBJ_PCI_DEVICE, i);
+ if (obj->attr->pcidev.vendor_id != testVendor)
+ {
+ continue;
+ }
+ if ((obj->attr->pcidev.vendor_id == testVendor) && (obj->attr->pcidev.device_id == testDevice))
+ {
+ socket_bus[cntr] = (char*)malloc(4);
+ sprintf(socket_bus[cntr++], "%02x/", obj->attr->pcidev.bus);
+ }
+ }
+ *nrSockets = cntr;
+
+ if (cntr == 0)
+ {
+ return -ENODEV;
+ }
+
+ return 0;
+}
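
The new hwloc lookup above walks all PCI objects and records the bus of every matching Intel uncore device. The same walk with the unprefixed hwloc 1.x API (newer hwloc versions replace the WHOLE_IO flag with per-type I/O filters); the vendor filter 0x8086 is taken from the code above:

    #include <stdio.h>
    #include <hwloc.h>

    int main(void)
    {
        hwloc_topology_t topo;
        hwloc_topology_init(&topo);
        hwloc_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_WHOLE_IO);
        hwloc_topology_load(topo);

        int n = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PCI_DEVICE);
        for (int i = 0; i < n; i++)
        {
            hwloc_obj_t obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_PCI_DEVICE, i);
            if (obj->attr->pcidev.vendor_id == 0x8086)
            {
                printf("device 0x%04x on bus %02x\n",
                       obj->attr->pcidev.device_id, obj->attr->pcidev.bus);
            }
        }
        hwloc_topology_destroy(topo);
        return 0;
    }
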
diff --git a/src/pci_proc.c b/src/pci_proc.c
new file mode 100644
index 0000000..cee436f
--- /dev/null
+++ b/src/pci_proc.c
@@ -0,0 +1,125 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: pci_proc.c
+ *
+ * Description: Interface to procfs/sysfs for PCI device lookup
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <fcntl.h>
+
+
+#include <types.h>
+#include <bstrlib.h>
+#include <affinity.h>
+#include <topology.h>
+#include <error.h>
+
+int getBusFromSocket(const uint32_t socket)
+{
+ int cur_bus = 0;
+ uint32_t cur_socket = 0;
+ char pci_filepath[1024];
+ int fp;
+ int ret = 0;
+ while(cur_socket <= socket)
+ {
+ sprintf(pci_filepath, "/proc/bus/pci/%02x/05.0", cur_bus);
+ fp = open(pci_filepath, O_RDONLY);
+ if (fp < 0)
+ {
+ return -1;
+ }
+ uint32_t cpubusno = 0;
+ ret = pread(fp, &cpubusno, sizeof(uint32_t), 0x108);
+ if (ret != sizeof(uint32_t))
+ {
+ close(fp);
+ return -1;
+ }
+ cur_bus = (cpubusno >> 8) & 0x0ff;
+ close(fp);
+ if(socket == cur_socket)
+ return cur_bus;
+ ++cur_socket;
+ ++cur_bus;
+ if(cur_bus > 0x0ff)
+ return -1;
+ }
+
+ return -1;
+}
+
+int
+proc_pci_init(uint16_t testDevice, char** socket_bus, int* nrSockets)
+{
+ FILE *fptr;
+ char buf[1024];
+ int cntr = 0;
+ uint16_t testVendor = 0x8086;
+ uint32_t sbus, sdevfn, svend, sdev;
+ int busID;
+
+
+ if ( (fptr = fopen( "/proc/bus/pci/devices", "r")) == NULL )
+ {
+ fprintf(stderr, "Unable to open /proc/bus/pci/devices. \
+ Thus, no support for PCI based Uncore counters.\n");
+ return -ENODEV;
+ }
+
+ while( fgets(buf, sizeof(buf)-1, fptr) )
+ {
+ if ( sscanf(buf, "%2x%2x %4x%4x", &sbus, &sdevfn, &svend, &sdev) == 4 &&
+ svend == testVendor && sdev == testDevice )
+ {
+ socket_bus[cntr] = (char*)malloc(4);
+ busID = getBusFromSocket(cntr);
+ if (busID == sbus)
+ {
+ sprintf(socket_bus[cntr], "%02x/", sbus);
+ }
+ else
+ {
+ sprintf(socket_bus[cntr], "%02x/", busID);
+ }
+ cntr++;
+ }
+ }
+ fclose(fptr);
+
+ *nrSockets = cntr;
+
+ if ( cntr == 0 )
+ {
+ //fprintf(stderr, "Uncore not supported on this system\n");
+ return -ENODEV;
+ }
+
+ return 0;
+}
diff --git a/src/perfgroup.c b/src/perfgroup.c
new file mode 100644
index 0000000..166790e
--- /dev/null
+++ b/src/perfgroup.c
@@ -0,0 +1,1285 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: perfgroup.c
+ *
+ * Description: Handler for performance groups and event sets
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at gmail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <dirent.h>
+
+#include <error.h>
+#include <perfgroup.h>
+#include <calculator.h>
+#include <likwid.h>
+
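+/* Return 1 if dirname is a readable directory, 0 otherwise. */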
+int isdir(char* dirname)
+{
+ struct stat st;
+ if (NULL == dirname) {
+ return 0;
+ }
+ if (access(dirname, R_OK) != 0)
+ return 0;
+ stat(dirname, &st);
+ return S_ISDIR(st.st_mode) ? 1 : 0;
+}
+
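+/*
+ * Collect name, SHORT and LONG description of every performance group available
+ * for the given architecture. Both the system group path and the user's
+ * $HOME/.likwid/groups/<arch> directory are scanned for *.txt files.
+ * Returns the number of groups found or a negative error code.
+ */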
+int get_groups(char* grouppath, char* architecture, char*** groupnames, char*** groupshort, char*** grouplong)
+{
+ int i = 0, j = 0, s = 0;
+ int fsize = 0, hsize = 0;
+ DIR *dp = NULL;
+ FILE* fp = NULL;
+ char buf[256] = { [0 ... 255] = '\0' };
+ struct dirent *ep = NULL;
+ *groupnames = NULL;
+ *groupshort = NULL;
+ *grouplong = NULL;
+ int search_home = 0;
+ bstring SHORT = bformat("SHORT");
+ bstring LONG = bformat("LONG");
+ int read_long = 0;
+ if ((grouppath == NULL)||(architecture == NULL)||(groupnames == NULL))
+ return -EINVAL;
+ char* fullpath = malloc((strlen(grouppath)+strlen(architecture)+50) * sizeof(char));
+ if (fullpath == NULL)
+ {
+ bdestroy(SHORT);
+ bdestroy(LONG);
+ return -ENOMEM;
+ }
+ char* homepath = malloc((strlen(getenv("HOME"))+strlen(architecture)+50) * sizeof(char));
+ if (homepath == NULL)
+ {
+ free(fullpath);
+ bdestroy(SHORT);
+ bdestroy(LONG);
+ return -ENOMEM;
+ }
+ fsize = sprintf(fullpath, "%s/%s", grouppath, architecture);
+ if (isdir(fullpath))
+ {
+ dp = opendir(fullpath);
+ if (dp == NULL)
+ {
+ printf("Cannot open directory %s\n", fullpath);
+ free(fullpath);
+ free(homepath);
+ bdestroy(SHORT);
+ bdestroy(LONG);
+ return -EACCES;
+ }
+ }
+ else
+ {
+ printf("Cannot access directory %s\n", fullpath);
+ free(fullpath);
+ free(homepath);
+ bdestroy(SHORT);
+ bdestroy(LONG);
+ return -EACCES;
+ }
+ i = 0;
+ s = 0;
+ while ((ep = readdir(dp)) != NULL)
+ {
+ if ((strlen(ep->d_name) > 4) && (strncmp(&(ep->d_name[strlen(ep->d_name)-4]), ".txt", 4) == 0))
+ {
+ i++;
+ if (strlen(ep->d_name)-4 > s)
+ s = strlen(ep->d_name)-4;
+ }
+ }
+ closedir(dp);
+ hsize = sprintf(homepath, "%s/.likwid/groups/%s", getenv("HOME"), architecture);
+ if (isdir(homepath))
+ {
+ search_home = 1;
+ dp = opendir(homepath);
+ if (dp == NULL)
+ {
+ search_home = 0;
+ }
+ if (search_home)
+ {
+ while ((ep = readdir(dp)) != NULL)
+ {
+ if ((strlen(ep->d_name) > 4) && (strncmp(&(ep->d_name[strlen(ep->d_name)-4]), ".txt", 4) == 0))
+ {
+ i++;
+ if (strlen(ep->d_name)-4 > s)
+ s = strlen(ep->d_name)-4;
+ }
+ }
+ closedir(dp);
+ }
+ }
+
+ *groupnames = malloc(i * sizeof(char**));
+ if (*groupnames == NULL)
+ {
+ free(fullpath);
+ free(homepath);
+ bdestroy(SHORT);
+ bdestroy(LONG);
+ return -ENOMEM;
+ }
+ *groupshort = malloc(i * sizeof(char**));
+ if (*groupshort == NULL)
+ {
+ free(*groupnames);
+ *groupnames = NULL;
+ free(fullpath);
+ free(homepath);
+ bdestroy(SHORT);
+ bdestroy(LONG);
+ return -ENOMEM;
+ }
+ *grouplong = malloc(i * sizeof(char**));
+ if (*grouplong == NULL)
+ {
+ free(*groupnames);
+ *groupnames = NULL;
+ free(*groupshort);
+ *groupshort = NULL;
+ free(fullpath);
+ free(homepath);
+ bdestroy(SHORT);
+ bdestroy(LONG);
+ return -ENOMEM;
+ }
+ for (j=0; j < i; j++)
+ {
+ (*grouplong)[j] = NULL;
+ (*groupshort)[j] = NULL;
+ (*groupnames)[j] = malloc((s+1) * sizeof(char));
+ if ((*groupnames)[j] == NULL)
+ {
+ free(*groupnames);
+ *groupnames = NULL;
+ free(*groupshort);
+ *groupshort = NULL;
+ free(*grouplong);
+ *grouplong = NULL;
+ free(fullpath);
+ free(homepath);
+ bdestroy(SHORT);
+ bdestroy(LONG);
+ return -ENOMEM;
+ }
+ }
+ dp = opendir(fullpath);
+ i = 0;
+
+ while ((ep = readdir(dp)) != NULL)
+ {
+ if ((strlen(ep->d_name) > 4) && (strncmp(&(ep->d_name[strlen(ep->d_name)-4]), ".txt", 4) == 0))
+ {
+ read_long = 0;
+ bstring long_info = bfromcstr("");;
+ sprintf(&(fullpath[fsize]), "/%s", ep->d_name);
+ if (!access(fullpath, R_OK))
+ {
+ (*grouplong)[i] = NULL;
+ s = sprintf((*groupnames)[i], "%.*s", (int)(strlen(ep->d_name)-4), ep->d_name);
+ (*groupnames)[i][s] = '\0';
+ fp = fopen(fullpath,"r");
+
+ while (fgets (buf, sizeof(buf), fp)) {
+ bstring bbuf = bfromcstr(buf);
+ btrimws(bbuf);
+ if ((blength(bbuf) == 0) || (buf[0] == '#'))
+ {
+ bdestroy(bbuf);
+ continue;
+ }
+ if (bstrncmp(bbuf, SHORT, 5) == 0)
+ {
+ struct bstrList * linelist = bsplit(bbuf, ' ');
+ bstring sinfo;
+ if (linelist->qty == 1)
+ {
+ fprintf(stderr,"Cannot read SHORT section in groupfile %s",fullpath);
+ bdestroy(bbuf);
+ bstrListDestroy(linelist);
+ continue;
+ }
+ s = 1;
+ for (j=s;j<linelist->qty; j++)
+ {
+ btrimws(linelist->entry[j]);
+ if (blength(linelist->entry[j]) == 0)
+ s += 1;
+ else
+ break;
+ }
+ btrimws(linelist->entry[s]);
+ sinfo = bformat("%s", bdata(linelist->entry[s]));
+ for (j=s+1;j<linelist->qty; j++)
+ {
+ btrimws(linelist->entry[j]);
+ bstring tmp = bformat(" %s", bdata(linelist->entry[j]));
+ bconcat(sinfo, tmp);
+ bdestroy(tmp);
+ }
+
+ (*groupshort)[i] = malloc((blength(sinfo)+1) * sizeof(char));
+ if ((*groupshort)[i] == NULL)
+ {
+ bdestroy(SHORT);
+ bdestroy(LONG);
+ bdestroy(bbuf);
+ bdestroy(sinfo);
+ free(homepath);
+ free(fullpath);
+ bstrListDestroy(linelist);
+ return -ENOMEM;
+ }
+ s = sprintf((*groupshort)[i], "%s", bdata(sinfo));
+ (*groupshort)[i][s] = '\0';
+ bstrListDestroy(linelist);
+ bdestroy(sinfo);
+ }
+ else if (bstrncmp(bbuf, LONG, 4) == 0)
+ {
+ read_long = 1;
+ }
+ else if ((read_long == 1) && (bstrncmp(bbuf, LONG, 4) != 0))
+ {
+ bstring tmp = bfromcstr(buf);
+ bconcat(long_info, tmp);
+ bdestroy(tmp);
+ }
+ bdestroy(bbuf);
+ }
+ if (read_long)
+ {
+ (*grouplong)[i] = malloc((blength(long_info) + 1) * sizeof(char) );
+ if ((*grouplong)[i] != NULL)
+ {
+ j = sprintf((*grouplong)[i], "%s", bdata(long_info));
+ (*grouplong)[i][j] = '\0';
+ }
+ }
+ fclose(fp);
+
+ i++;
+ }
+ bdestroy(long_info);
+ }
+ }
+ closedir(dp);
+ if (!search_home)
+ {
+ free(homepath);
+ free(fullpath);
+ bdestroy(SHORT);
+ bdestroy(LONG);
+ return i;
+ }
+ else
+ {
+ dp = opendir(homepath);
+ while ((ep = readdir(dp)) != NULL)
+ {
+ if ((strlen(ep->d_name) > 4) && (strncmp(&(ep->d_name[strlen(ep->d_name)-4]), ".txt", 4) == 0))
+ {
+ read_long = 0;
+ bstring long_info = bfromcstr("");;
+ sprintf(&(homepath[hsize]), "/%s", ep->d_name);
+ if (!access(homepath, R_OK))
+ {
+ (*grouplong)[i] = NULL;
+ s = sprintf((*groupnames)[i], "%.*s", (int)(strlen(ep->d_name)-4), ep->d_name);
+ (*groupnames)[i][s] = '\0';
+ fp = fopen(homepath,"r");
+ while (fgets (buf, sizeof(buf), fp)) {
+
+ bstring bbuf = bfromcstr(buf);
+ btrimws(bbuf);
+ if ((blength(bbuf) == 0) || (buf[0] == '#'))
+ {
+ bdestroy(bbuf);
+ continue;
+ }
+ if (bstrncmp(bbuf, SHORT, 5) == 0)
+ {
+ struct bstrList * linelist = bsplit(bbuf, ' ');
+ bstring sinfo;
+ if (linelist->qty == 1)
+ {
+ fprintf(stderr,"Cannot read SHORT section in groupfile %s",fullpath);
+ bdestroy(bbuf);
+ bstrListDestroy(linelist);
+ continue;
+ }
+ s = 1;
+ for (j=s;j<linelist->qty; j++)
+ {
+ btrimws(linelist->entry[j]);
+ if (blength(linelist->entry[j]) == 0)
+ s += 1;
+ else
+ break;
+ }
+ btrimws(linelist->entry[s]);
+ sinfo = bformat("%s", bdata(linelist->entry[s]));
+ for (j=s+1;j<linelist->qty; j++)
+ {
+ btrimws(linelist->entry[j]);
+ bstring tmp = bformat(" %s", bdata(linelist->entry[j]));
+ bconcat(sinfo, tmp);
+ bdestroy(tmp);
+ }
+
+ (*groupshort)[i] = malloc((blength(sinfo)+1) * sizeof(char));
+ if ((*groupshort)[i] == NULL)
+ {
+ bdestroy(SHORT);
+ bdestroy(LONG);
+ bdestroy(bbuf);
+ bdestroy(sinfo);
+ free(homepath);
+ free(fullpath);
+ bstrListDestroy(linelist);
+ return -ENOMEM;
+ }
+ s = sprintf((*groupshort)[i], "%s", bdata(sinfo));
+ (*groupshort)[i][s] = '\0';
+ bstrListDestroy(linelist);
+ bdestroy(sinfo);
+ }
+ else if (bstrncmp(bbuf, LONG, 4) == 0)
+ {
+ read_long = 1;
+ }
+ else if ((read_long == 1) && (bstrncmp(bbuf, LONG, 4) != 0))
+ {
+ bstring tmp = bfromcstr(buf);
+ bconcat(long_info, tmp);
+ bdestroy(tmp);
+ }
+ bdestroy(bbuf);
+ }
+ if (read_long)
+ {
+ (*grouplong)[i] = malloc((blength(long_info) + 1) * sizeof(char) );
+ if ((*grouplong)[i] != NULL)
+ {
+ j = sprintf((*grouplong)[i], "%s", bdata(long_info));
+ (*grouplong)[i][j] = '\0';
+ }
+ }
+ fclose(fp);
+ i++;
+ }
+ bdestroy(long_info);
+ }
+ }
+ closedir(dp);
+ }
+ bdestroy(SHORT);
+ bdestroy(LONG);
+ free(fullpath);
+ free(homepath);
+ return i;
+}
+
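+/* Free the name/short/long string arrays filled by get_groups(). */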
+void return_groups(int groups, char** groupnames, char** groupshort, char** grouplong)
+{
+ int i;
+ for (i = 0; i <groups; i++)
+ {
+ if (groupnames[i])
+ free(groupnames[i]);
+ if (groupshort[i])
+ free(groupshort[i]);
+ if (grouplong[i])
+ free(grouplong[i]);
+ }
+ if (groupnames)
+ free(groupnames);
+ if (groupshort)
+ free(groupshort);
+ if (grouplong)
+ free(grouplong);
+}
+
+
+
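+/*
+ * Build a GroupInfo from a custom event string of the form
+ * EVENT:COUNTER[:OPTIONS][,EVENT:COUNTER...]. The fixed counters FIXC0-FIXC2
+ * are appended automatically if they are not part of the event string.
+ */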
+int custom_group(char* eventStr, GroupInfo* ginfo)
+{
+ int i, j;
+ int err = 0;
+ char delim = ',';
+ bstring edelim = bformat(":");
+ int has_fix0 = 0;
+ int has_fix1 = 0;
+ int has_fix2 = 0;
+ ginfo->shortinfo = NULL;
+ ginfo->nevents = 0;
+ ginfo->events = NULL;
+ ginfo->counters = NULL;
+ ginfo->nmetrics = 0;
+ ginfo->metricformulas = NULL;
+ ginfo->metricnames = NULL;
+ ginfo->longinfo = NULL;
+ bstring eventBstr;
+ struct bstrList * eventList;
+ bstring fix0 = bformat("FIXC0");
+ bstring fix1 = bformat("FIXC1");
+ bstring fix2 = bformat("FIXC2");
+ DEBUG_PRINT(DEBUGLEV_INFO, Creating custom group for event string %s, eventStr);
+
+ ginfo->shortinfo = malloc(7 * sizeof(char));
+ if (ginfo->shortinfo == NULL)
+ {
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ sprintf(ginfo->shortinfo, "%s", "Custom");
+ ginfo->longinfo = malloc(7 * sizeof(char));
+ if (ginfo->longinfo == NULL)
+ {
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ sprintf(ginfo->longinfo, "%s", "Custom");
+ ginfo->groupname = malloc(7 * sizeof(char));
+ if (ginfo->groupname == NULL)
+ {
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ sprintf(ginfo->groupname, "%s", "Custom");
+
+ eventBstr = bfromcstr(eventStr);
+ eventList = bsplit(eventBstr, delim);
+ ginfo->nevents = eventList->qty;
+
+ if (binstr(eventBstr, 0, fix0) > 0)
+ {
+ has_fix0 = 1;
+ }
+ else
+ {
+ ginfo->nevents++;
+ }
+ if (binstr(eventBstr, 0, fix1) > 0)
+ {
+ has_fix1 = 1;
+ }
+ else
+ {
+ ginfo->nevents++;
+ }
+ if (binstr(eventBstr, 0, fix2) > 0)
+ {
+ has_fix2 = 1;
+ }
+ else
+ {
+ ginfo->nevents++;
+ }
+ bdestroy(eventBstr);
+
+ ginfo->events = malloc(ginfo->nevents * sizeof(char*));
+ if (ginfo->events == NULL)
+ {
+ err = -ENOMEM;
+ bstrListDestroy(eventList);
+ goto cleanup;
+ }
+ ginfo->counters = malloc(ginfo->nevents * sizeof(char*));
+ if (ginfo->counters == NULL)
+ {
+ err = -ENOMEM;
+ bstrListDestroy(eventList);
+ goto cleanup;
+ }
+ for (i = 0; i< eventList->qty; i++)
+ {
+ int s;
+ struct bstrList * elist;
+ elist = bsplit(eventList->entry[i], ':');
+ ginfo->events[i] = malloc((blength(elist->entry[0]) + 1) * sizeof(char));
+ if (ginfo->events[i] == NULL)
+ {
+ bstrListDestroy(elist);
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ bstring ctr = bstrcpy(elist->entry[1]);
+ if (elist->qty > 2)
+ {
+ for (j = 2; j < elist->qty; j++)
+ {
+ bconcat(ctr, edelim);
+ bconcat(ctr, elist->entry[j]);
+ }
+ }
+ ginfo->counters[i] = malloc((blength(ctr) + 1) * sizeof(char));
+ if (ginfo->counters[i] == NULL)
+ {
+ bstrListDestroy(elist);
+ bdestroy(ctr);
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ sprintf(ginfo->events[i], "%s", bdata(elist->entry[0]));
+ snprintf(ginfo->counters[i], blength(ctr)+1, "%s", bdata(ctr));
+ bdestroy(ctr);
+ bstrListDestroy(elist);
+ }
+ i = eventList->qty;
+ if (!has_fix0)
+ {
+ ginfo->events[i] = malloc(18 * sizeof(char));
+ ginfo->counters[i] = malloc(6 * sizeof(char));
+ sprintf(ginfo->events[i], "%s", "INSTR_RETIRED_ANY");
+ sprintf(ginfo->counters[i], "%s", "FIXC0");
+ i++;
+ }
+ if (!has_fix1)
+ {
+ ginfo->events[i] = malloc(22 * sizeof(char));
+ ginfo->counters[i] = malloc(6 * sizeof(char));
+ sprintf(ginfo->events[i], "%s", "CPU_CLK_UNHALTED_CORE");
+ sprintf(ginfo->counters[i], "%s", "FIXC1");
+ i++;
+ }
+ if (!has_fix2)
+ {
+ ginfo->events[i] = malloc(21 * sizeof(char));
+ ginfo->counters[i] = malloc(6 * sizeof(char));
+ sprintf(ginfo->events[i], "%s", "CPU_CLK_UNHALTED_REF");
+ sprintf(ginfo->counters[i], "%s", "FIXC2");
+ i++;
+ }
+
+ bstrListDestroy(eventList);
+ bdestroy(fix0);
+ bdestroy(fix1);
+ bdestroy(fix2);
+ bdestroy(edelim);
+ return 0;
+cleanup:
+ bstrListDestroy(eventList);
+ bdestroy(fix0);
+ bdestroy(fix1);
+ bdestroy(fix2);
+ bdestroy(edelim);
+ if (ginfo->shortinfo != NULL)
+ free(ginfo->shortinfo);
+ if (ginfo->events != NULL)
+ free(ginfo->events);
+ if (ginfo->counters != NULL)
+ free(ginfo->counters);
+ return err;
+}
+
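+/*
+ * Parse a performance group file and fill the SHORT, EVENTSET, METRICS and LONG
+ * sections into ginfo. The system group path is tried first, then the user's
+ * $HOME/.likwid/groups/<arch>/<group>.txt.
+ */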
+int read_group(char* grouppath, char* architecture, char* groupname, GroupInfo* ginfo)
+{
+ FILE* fp;
+ int i, s, e, err = 0;
+ char buf[512];
+ GroupFileSections sec = GROUP_NONE;
+ if ((grouppath == NULL)||(architecture == NULL)||(groupname == NULL)||(ginfo == NULL))
+ return -EINVAL;
+
+ bstring fullpath = bformat("%s/%s/%s.txt", grouppath,architecture, groupname);
+ bstring homepath = bformat("%s/.likwid/groups/%s/%s.txt", getenv("HOME"),architecture, groupname);
+
+ if (access(bdata(fullpath), R_OK))
+ {
+ DEBUG_PRINT(DEBUGLEV_INFO, Cannot read group file %s. Trying %s, bdata(fullpath), bdata(homepath));
+ if (access(bdata(homepath), R_OK))
+ {
+ ERROR_PRINT(Cannot read group file %s.txt. Searched in %s and %s, groupname, bdata(fullpath), bdata(homepath));
+ bdestroy(fullpath);
+ bdestroy(homepath);
+ return -EACCES;
+ }
+ else
+ {
+ fullpath = bstrcpy(homepath);
+ }
+ }
+
+ DEBUG_PRINT(DEBUGLEV_INFO, Reading group %s from %s, groupname, bdata(fullpath));
+
+ ginfo->shortinfo = NULL;
+ ginfo->nevents = 0;
+ ginfo->events = NULL;
+ ginfo->counters = NULL;
+ ginfo->nmetrics = 0;
+ ginfo->metricformulas = NULL;
+ ginfo->metricnames = NULL;
+ ginfo->longinfo = NULL;
+ ginfo->groupname = (char*)malloc((strlen(groupname)+10)*sizeof(char));
+ if (ginfo->groupname == NULL)
+ {
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ //strncpy(ginfo->groupname, groupname, strlen(groupname));
+ i = sprintf(ginfo->groupname, "%s", groupname);
+ ginfo->groupname[i] = '\0';
+
+ fp = fopen(bdata(fullpath), "r");
+ if (fp == NULL)
+ {
+ free(ginfo->groupname);
+ bdestroy(fullpath);
+ bdestroy(homepath);
+ return -EACCES;
+ }
+ struct bstrList * linelist;
+ while (fgets (buf, sizeof(buf), fp)) {
+ if ((strlen(buf) == 0) || (buf[0] == '#'))
+ continue;
+
+ if (strncmp(groupFileSectionNames[GROUP_SHORT], buf, strlen(groupFileSectionNames[GROUP_SHORT])) == 0)
+ {
+ sec = GROUP_SHORT;
+ for (i=strlen(groupFileSectionNames[GROUP_SHORT]); i < strlen(buf); i++)
+ {
+ if (buf[i] == ' ')
+ continue;
+ break;
+ }
+ ginfo->shortinfo = malloc(strlen(&(buf[i])) * sizeof(char));
+ sprintf(ginfo->shortinfo, "%.*s", (int)strlen(&(buf[i]))-1, &(buf[i]));
+ continue;
+ }
+ else if (strncmp(groupFileSectionNames[GROUP_EVENTSET], buf, strlen(groupFileSectionNames[GROUP_EVENTSET])) == 0)
+ {
+ sec = GROUP_EVENTSET;
+ continue;
+ }
+ else if (strncmp(groupFileSectionNames[GROUP_METRICS], buf, strlen(groupFileSectionNames[GROUP_METRICS])) == 0)
+ {
+ sec = GROUP_METRICS;
+ continue;
+ }
+ else if (strncmp(groupFileSectionNames[GROUP_LONG], buf, strlen(groupFileSectionNames[GROUP_LONG])) == 0)
+ {
+ sec = GROUP_LONG;
+ continue;
+ }
+ if (sec == GROUP_NONE)
+ continue;
+ if (sec == GROUP_EVENTSET)
+ {
+ i = 0;
+ bstring bbuf = bfromcstr(buf);
+ btrimws(bbuf);
+ if (blength(bbuf) == 0)
+ {
+ bdestroy(bbuf);
+ sec = GROUP_NONE;
+ continue;
+ }
+ linelist = bsplit(bbuf, ' ');
+ for (i=0; i<linelist->qty; i++)
+ btrimws(linelist->entry[i]);
+ bdestroy(bbuf);
+ bbuf = bstrcpy(linelist->entry[0]);
+ for (i=1; i<linelist->qty; i++)
+ {
+ if (blength(linelist->entry[i]) > 0)
+ {
+ bstring tmp = bformat(" %s", bdata(linelist->entry[i]));
+ bconcat(bbuf, tmp);
+ bdestroy(tmp);
+ }
+ }
+ if (ginfo->events == NULL)
+ {
+ ginfo->events = (char**)malloc(sizeof(char*));
+ if (ginfo->events == NULL)
+ {
+ err = -ENOMEM;
+ bdestroy(bbuf);
+ goto cleanup;
+ }
+ }
+ else
+ {
+ char** tmp = NULL;
+ tmp = realloc(ginfo->events, (ginfo->nevents + 1) * sizeof(char*));
+ if (tmp == NULL)
+ {
+ free(ginfo->events);
+ bdestroy(bbuf);
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ else
+ {
+ ginfo->events = tmp;
+ tmp = NULL;
+ }
+ }
+ if (ginfo->counters == NULL)
+ {
+ ginfo->counters = (char**)malloc(sizeof(char*));
+ if (ginfo->counters == NULL)
+ {
+ err = -ENOMEM;
+ bdestroy(bbuf);
+ goto cleanup;
+ }
+ }
+ else
+ {
+ char** tmp = NULL;
+ tmp = realloc(ginfo->counters, (ginfo->nevents + 1) * sizeof(char*));
+ if (tmp == NULL)
+ {
+ free(ginfo->counters);
+ bdestroy(bbuf);
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ else
+ {
+ ginfo->counters = tmp;
+ tmp = NULL;
+ }
+ }
+ bstrListDestroy(linelist);
+
+
+ linelist = bsplit(bbuf, ' ');
+ bdestroy(bbuf);
+ for (i=0; i<linelist->qty; i++)
+ btrimws(linelist->entry[i]);
+ ginfo->counters[ginfo->nevents] = malloc((blength(linelist->entry[0])+1) * sizeof(char));
+ if (ginfo->counters[ginfo->nevents] == NULL)
+ {
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ ginfo->events[ginfo->nevents] = malloc((blength(linelist->entry[1])+1) * sizeof(char));
+ if (ginfo->events[ginfo->nevents] == NULL)
+ {
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ sprintf(ginfo->counters[ginfo->nevents], "%s", bdata(linelist->entry[0]));
+ sprintf(ginfo->events[ginfo->nevents], "%s", bdata(linelist->entry[1]));
+
+ ginfo->nevents++;
+ bstrListDestroy(linelist);
+ continue;
+ }
+ else if (sec == GROUP_METRICS)
+ {
+ i = 0;
+ bstring bbuf = bfromcstr(buf);
+ btrimws(bbuf);
+ if (blength(bbuf) == 0)
+ {
+ bdestroy(bbuf);
+ sec = GROUP_NONE;
+ continue;
+ }
+ linelist = bsplit(bbuf, ' ');
+ for (i=0; i<linelist->qty; i++)
+ btrimws(linelist->entry[i]);
+ bdestroy(bbuf);
+ bbuf = bstrcpy(linelist->entry[0]);
+ for (i=1; i<linelist->qty; i++)
+ {
+ if (blength(linelist->entry[i]) > 0)
+ {
+ bstring tmp = bformat(" %s", bdata(linelist->entry[i]));
+ bconcat(bbuf, tmp);
+ bdestroy(tmp);
+ }
+ }
+ char** tmp;
+ tmp = realloc(ginfo->metricformulas, (ginfo->nmetrics + 1) * sizeof(char*));
+ if (tmp == NULL)
+ {
+ free(ginfo->metricformulas);
+ bdestroy(bbuf);
+ bstrListDestroy(linelist);
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ else
+ {
+ ginfo->metricformulas = tmp;
+ }
+ tmp = realloc(ginfo->metricnames, (ginfo->nmetrics + 1) * sizeof(char*));
+ if (tmp == NULL)
+ {
+ free(ginfo->metricnames);
+ bdestroy(bbuf);
+ bstrListDestroy(linelist);
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ else
+ {
+ ginfo->metricnames = tmp;
+ }
+ bstrListDestroy(linelist);
+ linelist = bsplit(bbuf, ' ');
+ ginfo->metricformulas[ginfo->nmetrics] = malloc((blength(linelist->entry[linelist->qty - 1])+1) * sizeof(char));
+ if (ginfo->metricformulas[ginfo->nmetrics] == NULL)
+ {
+ err = -ENOMEM;
+ bdestroy(bbuf);
+ bstrListDestroy(linelist);
+ goto cleanup;
+ }
+ ginfo->metricnames[ginfo->nmetrics] = malloc(((blength(bbuf)-blength(linelist->entry[linelist->qty - 1]))+1) * sizeof(char));
+ if (ginfo->metricnames[ginfo->nmetrics] == NULL)
+ {
+ err = -ENOMEM;
+ bdestroy(bbuf);
+ bstrListDestroy(linelist);
+ goto cleanup;
+ }
+ bdestroy(bbuf);
+ sprintf(ginfo->metricformulas[ginfo->nmetrics], "%s", bdata(linelist->entry[linelist->qty - 1]));
+ bbuf = bstrcpy(linelist->entry[0]);
+ for (i=1; i<linelist->qty - 1; i++)
+ {
+ if (blength(linelist->entry[i]) > 0)
+ {
+ bstring tmp = bformat(" %s", bdata(linelist->entry[i]));
+ bconcat(bbuf, tmp);
+ bdestroy(tmp);
+ }
+ }
+ sprintf(ginfo->metricnames[ginfo->nmetrics], "%s", bdata(bbuf));
+ bdestroy(bbuf);
+ bstrListDestroy(linelist);
+ ginfo->nmetrics++;
+ continue;
+ }
+ else if (sec == GROUP_LONG)
+ {
+ s = (ginfo->longinfo == NULL ? 0 : strlen(ginfo->longinfo));
+ char *tmp;
+ tmp = realloc(ginfo->longinfo, (s + strlen(buf) + 3) * sizeof(char));
+ if (tmp == NULL)
+ {
+ free(ginfo->longinfo);
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ else
+ {
+ ginfo->longinfo = tmp;
+ }
+ sprintf(&(ginfo->longinfo[s]), "%.*s", (int)strlen(buf), buf);
+ continue;
+ }
+ }
+ //bstrListDestroy(linelist);
+ fclose(fp);
+ bdestroy(homepath);
+ bdestroy(fullpath);
+ return 0;
+cleanup:
+ bdestroy(homepath);
+ bdestroy(fullpath);
+ if (ginfo->groupname)
+ free(ginfo->groupname);
+ if (ginfo->shortinfo)
+ free(ginfo->shortinfo);
+ if (ginfo->longinfo)
+ free(ginfo->longinfo);
+ if (ginfo->nevents > 0)
+ {
+ for(i=0;i<ginfo->nevents; i++)
+ {
+ if (ginfo->counters[i])
+ free(ginfo->counters[i]);
+ if (ginfo->events[i])
+ free(ginfo->events[i]);
+ }
+ }
+ if (ginfo->nmetrics > 0)
+ {
+ for(i=0;i<ginfo->nmetrics; i++)
+ {
+ if (ginfo->metricformulas[i])
+ free(ginfo->metricformulas[i]);
+ if (ginfo->metricnames[i])
+ free(ginfo->metricnames[i]);
+ }
+ }
+ return err;
+}
+
+int new_group(GroupInfo* ginfo)
+{
+ if (!ginfo)
+ return -EINVAL;
+ ginfo->groupname = NULL;
+ ginfo->shortinfo = NULL;
+ ginfo->nevents = 0;
+ ginfo->events = NULL;
+ ginfo->counters = NULL;
+ ginfo->nmetrics = 0;
+ ginfo->metricformulas = NULL;
+ ginfo->metricnames = NULL;
+ ginfo->longinfo = NULL;
+ return 0;
+}
+
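+/* Assemble the comma-separated EVENT:COUNTER string of a group. The returned
+ * string is heap-allocated and should be released with put_eventStr(). */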
+char* get_eventStr(GroupInfo* ginfo)
+{
+ int i;
+ char* string;
+ int size = 0;
+ if (!ginfo)
+ return NULL;
+ if (ginfo->nevents == 0)
+ return NULL;
+ for(i=0;i<ginfo->nevents-1; i++)
+ {
+ size += strlen(ginfo->events[i]) + strlen(ginfo->counters[i]) + 2;
+ }
+ size += strlen(ginfo->events[ginfo->nevents-1]) + strlen(ginfo->counters[ginfo->nevents-1]) + 1 + 1;
+ size++;
+ string = malloc(size * sizeof(char));
+ if (string == NULL)
+ return NULL;
+ size = 0;
+ for(i=0;i<ginfo->nevents-1; i++)
+ {
+ size += sprintf(&(string[size]), "%s:%s,", ginfo->events[i], ginfo->counters[i]);
+ }
+ size += sprintf(&(string[size]), "%s:%s", ginfo->events[ginfo->nevents-1], ginfo->counters[ginfo->nevents-1]);
+ string[size] = '\0';
+ return string;
+}
+
+void put_eventStr(char* eventset)
+{
+ if (eventset != NULL)
+ {
+ free(eventset);
+ eventset = NULL;
+ }
+}
+
+int add_event(GroupInfo* ginfo, char* event, char* counter)
+{
+ if ((!ginfo) || (!event) || (!counter))
+ return -EINVAL;
+ ginfo->events = realloc(ginfo->events, (ginfo->nevents + 1) * sizeof(char*));
+ if (!ginfo->events)
+ return -ENOMEM;
+ ginfo->counters = realloc(ginfo->counters, (ginfo->nevents + 1) * sizeof(char*));
+ if (!ginfo->counters)
+ return -ENOMEM;
+ ginfo->events[ginfo->nevents] = malloc((strlen(event) + 1) * sizeof(char));
+ if (!ginfo->events[ginfo->nevents])
+ return -ENOMEM;
+ ginfo->counters[ginfo->nevents] = malloc((strlen(counter) + 1) * sizeof(char));
+ if (!ginfo->counters[ginfo->nevents])
+ return -ENOMEM;
+ sprintf(ginfo->events[ginfo->nevents], "%s", event);
+ sprintf(ginfo->counters[ginfo->nevents], "%s", counter);
+ ginfo->nevents++;
+ return 0;
+}
+
+int add_metric(GroupInfo* ginfo, char* mname, char* mcalc)
+{
+ if ((!ginfo) || (!mname) || (!mcalc))
+ return -EINVAL;
+ ginfo->metricnames = realloc(ginfo->metricnames, (ginfo->nmetrics + 1) * sizeof(char*));
+ if (!ginfo->metricnames)
+ return -ENOMEM;
+ ginfo->metricformulas = realloc(ginfo->metricformulas, (ginfo->nmetrics + 1) * sizeof(char*));
+ if (!ginfo->metricformulas)
+ return -ENOMEM;
+ ginfo->metricnames[ginfo->nmetrics] = malloc((strlen(mname) + 1) * sizeof(char));
+ if (!ginfo->metricnames[ginfo->nmetrics])
+ return -ENOMEM;
+ ginfo->metricformulas[ginfo->nmetrics] = malloc((strlen(mcalc) + 1) * sizeof(char));
+ if (!ginfo->metricformulas[ginfo->nmetrics])
+ return -ENOMEM;
+ sprintf(ginfo->metricnames[ginfo->nmetrics], "%s", mname);
+ sprintf(ginfo->metricformulas[ginfo->nmetrics], "%s", mcalc);
+ ginfo->nmetrics++;
+ return 0;
+}
+
+
+char* get_groupName(GroupInfo* ginfo)
+{
+ if ((ginfo != NULL) && (ginfo->groupname != NULL))
+ {
+ int size = strlen(ginfo->groupname)+1;
+ char* gstr = malloc(size * sizeof(char));
+ sprintf(gstr, "%s", ginfo->groupname);
+ return gstr;
+ }
+ return NULL;
+}
+
+int set_groupName(GroupInfo* ginfo, char* groupName)
+{
+ if ((ginfo == NULL) || (groupName == NULL))
+ return -EINVAL;
+ int size = strlen(groupName)+1;
+ ginfo->groupname = realloc(ginfo->groupname, size * sizeof(char));
+ if (ginfo->groupname == NULL)
+ return -ENOMEM;
+ sprintf(ginfo->groupname, "%s", groupName);
+ return 0;
+}
+
+char* get_shortInfo(GroupInfo* ginfo)
+{
+ if ((ginfo != NULL) && (ginfo->shortinfo != NULL))
+ {
+ int size = strlen(ginfo->shortinfo)+1;
+ char* sstr = malloc(size * sizeof(char));
+ sprintf(sstr, "%s", ginfo->shortinfo);
+ return sstr;
+ }
+ return NULL;
+}
+
+void put_shortInfo(char* sinfo)
+{
+ if (sinfo != NULL)
+ {
+ free(sinfo);
+ sinfo = NULL;
+ }
+}
+
+int set_shortInfo(GroupInfo* ginfo, char* shortInfo)
+{
+ if ((ginfo == NULL) || (shortInfo == NULL))
+ return -EINVAL;
+ int size = strlen(shortInfo)+1;
+ ginfo->shortinfo = realloc(ginfo->shortinfo, size * sizeof(char));
+ if (ginfo->shortinfo == NULL)
+ return -ENOMEM;
+ sprintf(ginfo->shortinfo, "%s", shortInfo);
+ return 0;
+}
+
+char* get_longInfo(GroupInfo* ginfo)
+{
+ if ((ginfo != NULL) && (ginfo->longinfo != NULL))
+ {
+ int size = strlen(ginfo->longinfo)+1;
+ char* lstr = malloc(size * sizeof(char));
+ sprintf(lstr, "%s", ginfo->longinfo);
+ return lstr;
+ }
+ return NULL;
+}
+
+void put_longInfo(char* linfo)
+{
+ if (linfo != NULL)
+ {
+ free(linfo);
+ linfo = NULL;
+ }
+}
+
+int set_longInfo(GroupInfo* ginfo, char* longInfo)
+{
+ if ((ginfo == NULL) || (longInfo == NULL))
+ return -EINVAL;
+ int size = strlen(longInfo)+1;
+ ginfo->longinfo = realloc(ginfo->longinfo, size * sizeof(char));
+ if (ginfo->longinfo == NULL)
+ return -ENOMEM;
+ sprintf(ginfo->longinfo, "%s", longInfo);
+ return 0;
+}
+
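+/* Release all strings referenced by a GroupInfo and reset it to an empty state. */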
+void return_group(GroupInfo* ginfo)
+{
+ int i;
+ if (ginfo->groupname)
+ free(ginfo->groupname);
+ if (ginfo->shortinfo)
+ free(ginfo->shortinfo);
+ if (ginfo->longinfo)
+ free(ginfo->longinfo);
+ if (ginfo->nevents > 0)
+ {
+ for(i=0;i<ginfo->nevents; i++)
+ {
+ if (ginfo->counters[i])
+ free(ginfo->counters[i]);
+ if (ginfo->events[i])
+ free(ginfo->events[i]);
+ }
+ free(ginfo->counters);
+ free(ginfo->events);
+ }
+ if (ginfo->nmetrics > 0)
+ {
+ for(i=0;i<ginfo->nmetrics; i++)
+ {
+ if (ginfo->metricformulas[i])
+ free(ginfo->metricformulas[i]);
+ if (ginfo->metricnames[i])
+ free(ginfo->metricnames[i]);
+ }
+ free(ginfo->metricformulas);
+ free(ginfo->metricnames);
+ }
+ ginfo->groupname = NULL;
+ ginfo->shortinfo = NULL;
+ ginfo->longinfo = NULL;
+ ginfo->counters = NULL;
+ ginfo->events = NULL;
+ ginfo->metricformulas = NULL;
+ ginfo->metricnames = NULL;
+ ginfo->nevents = 0;
+ ginfo->nmetrics = 0;
+}
+
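+/*
+ * CounterList helpers: init_clist() initializes an empty list, add_to_clist()
+ * appends a counter name/value pair and destroy_clist() releases all memory.
+ */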
+void init_clist(CounterList* clist)
+{
+ clist->counters = 0;
+ clist->cnames = NULL;
+ clist->cvalues = NULL;
+}
+
+int add_to_clist(CounterList* clist, char* counter, double result)
+{
+ char** tmpnames;
+ double* tmpvalues;
+ if ((clist == NULL)||(counter == NULL))
+ return -EINVAL;
+ tmpnames = realloc(clist->cnames, (clist->counters + 1) * sizeof(char*));
+ if (tmpnames == NULL)
+ {
+ return -ENOMEM;
+ }
+ clist->cnames = tmpnames;
+ tmpvalues = realloc(clist->cvalues, (clist->counters + 1) * sizeof(double));
+ if (tmpvalues == NULL)
+ {
+ return -ENOMEM;
+ }
+ clist->cvalues = tmpvalues;
+ clist->cnames[clist->counters] = malloc((strlen(counter)+2)*sizeof(char));
+ if (clist->cnames[clist->counters] == NULL)
+ {
+ return -ENOMEM;
+ }
+ sprintf(clist->cnames[clist->counters],"%s", counter);
+ clist->cvalues[clist->counters] = result;
+ clist->counters++;
+ return 0;
+}
+
+void destroy_clist(CounterList* clist)
+{
+ int i;
+ if (clist != NULL)
+ {
+ for (i=0;i<clist->counters;i++)
+ {
+ free(clist->cnames[i]);
+ }
+ free(clist->cnames);
+ free(clist->cvalues);
+ }
+}
+
+
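+/*
+ * Evaluate a derived-metric formula: every counter name in clist is replaced by
+ * its measured value (longest names first to avoid partial replacements) and
+ * the resulting arithmetic expression is evaluated with calculate_infix().
+ */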
+int calc_metric(char* formula, CounterList* clist, double *result)
+{
+ int i=0;
+ *result = 0.0;
+ int fail = 0;
+ int maxstrlen = 0, minstrlen = 10000;
+
+ if ((formula == NULL) || (clist == NULL))
+ return -EINVAL;
+
+ bstring f = bfromcstr(formula);
+ for(i=0;i<clist->counters;i++)
+ {
+ if (strlen(clist->cnames[i]) > maxstrlen)
+ maxstrlen = strlen(clist->cnames[i]);
+ if (strlen(clist->cnames[i]) < minstrlen)
+ minstrlen = strlen(clist->cnames[i]);
+ }
+
+ // try to replace each counter name in clist
+ while (maxstrlen >= minstrlen)
+ {
+ for(i=0;i<clist->counters;i++)
+ {
+ if (strlen(clist->cnames[i]) != maxstrlen)
+ continue;
+ // if we find the counter name, replace it with the value
+ bstring c = bfromcstr(clist->cnames[i]);
+ bstring v = bformat("%.20f", clist->cvalues[i]);
+ bfindreplace(f, c, v, 0);
+ bdestroy(c);
+ bdestroy(v);
+ }
+ maxstrlen--;
+ }
+ bstring test = bfromcstr("aAbBcCdDfFgGhHiIjJkKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ,_:;!'§$&=?°´`#<>");
+ if (binchr(f, 0, test) != BSTR_ERR)
+ {
+ fprintf(stderr, "Not all counter names in formula can be substituted\n");
+ fprintf(stderr, "%s\n", bdata(f));
+ i = -EINVAL;
+ fail = 1;
+ }
+ bdestroy(test);
+ // now we can calculate the formula
+ if (!fail)
+ i = calculate_infix(bdata(f), result);
+ bdestroy(f);
+ return i;
+}
diff --git a/src/perfmon.c b/src/perfmon.c
index 30cacba..ee4f80f 100644
--- a/src/perfmon.c
+++ b/src/perfmon.c
@@ -3,15 +3,16 @@
*
* Filename: perfmon.c
*
- * Description: Implementation of perfmon Module.
+ * Description: Main implementation of the performance monitoring module
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -28,8 +29,6 @@
* =======================================================================================
*/
-/* ##### HEADER FILE INCLUDES ######################################### */
-
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
@@ -37,100 +36,18 @@
#include <float.h>
#include <unistd.h>
#include <sys/types.h>
-#include <assert.h>
+
#include <types.h>
+#include <likwid.h>
#include <bitUtil.h>
-#include <bstrlib.h>
-#include <strUtil.h>
-#include <bitUtil.h>
-#include <error.h>
#include <timer.h>
-#include <accessClient.h>
-#include <msr.h>
-#include <pci.h>
#include <lock.h>
-#include <cpuid.h>
-#include <affinity.h>
-#include <tree.h>
-#include <power.h>
-#include <thermal.h>
#include <perfmon.h>
-#include <asciiTable.h>
#include <registers.h>
-
-
-/* ##### EXPORTED VARIABLES ########################################### */
-
-int perfmon_verbose = 0;
-int perfmon_csvoutput = 0;
-
-/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
-
-static PerfmonGroup groupSet = _NOGROUP;
-static PerfmonEvent* eventHash;
-static PerfmonCounterMap* counter_map;
-static PerfmonGroupMap* group_map;
-static PerfmonGroupHelp* group_help;
-static EventSetup * eventSetup;
-
-static TimerData timeData;
-static double rdtscTime;
-static PerfmonEventSet perfmon_set;
-static int perfmon_numGroups;
-static int perfmon_numCounters;
-static int perfmon_numArchEvents;
-static int perfmon_numThreads;
-static int perfmon_numRegions;
-static FILE* OUTSTREAM;
-static double** perfmon_threadState;
-static PerfmonThread* perfmon_threadData;
-
-static int socket_fd = -1;
-static int socket_lock[MAX_NUM_NODES];
-
-/* ##### PROTOTYPES - LOCAL TO THIS SOURCE FILE ##################### */
-
-static void initResultTable(PerfmonResultTable* tableData,
- bstrList* firstColumn,
- int numRows,
- int numColumns);
-
-static void initStatisticTable(PerfmonResultTable* tableData,
- bstrList* firstColumn,
- int numRows);
-
-static void printResultTable(PerfmonResultTable* tableData);
-static void freeResultTable(PerfmonResultTable* tableData);
-static void initThread(int , int );
-
-/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
-
-#define CHECKERROR \
- if (ret == EOF) \
- { \
- fprintf (stderr, "sscanf: Failed to read marker file!\n" ); \
- exit (EXIT_FAILURE);}
-
-#define bstrListAdd(bl,id,name) \
- label = bfromcstr(#name); \
- (bl)->entry[id] = bstrcpy(label); \
- (bl)->qty++; \
- bdestroy(label);
-
-#define INIT_EVENTS \
- fc = bstrListCreate(); \
- bstrListAlloc(fc, numRows+1); \
- bstrListAdd(fc,0,Event); \
- for (i=0; i<numRows; i++) \
- { \
- fc->entry[1+i] = \
- bfromcstr(perfmon_set.events[i].event.name); }
-
-#define INIT_BASIC \
- fc = bstrListCreate(); \
- bstrListAlloc(fc, numRows+1); \
- bstrListAdd(fc,0,Metric);
+#include <topology.h>
+#include <access.h>
+#include <perfgroup.h>
#include <perfmon_pm.h>
#include <perfmon_atom.h>
@@ -148,318 +65,176 @@ static void initThread(int , int );
#include <perfmon_interlagos.h>
#include <perfmon_kabini.h>
#include <perfmon_silvermont.h>
+#include <perfmon_broadwell.h>
+#include <perfmon_skylake.h>
+
+
+PerfmonEvent* eventHash = NULL;
+RegisterMap* counter_map = NULL;
+BoxMap* box_map = NULL;
+PciDevice* pci_devices = NULL;
+int perfmon_numCounters = 0;
+int perfmon_numCoreCounters = 0;
+int perfmon_numArchEvents = 0;
+int perfmon_initialized = 0;
+int perfmon_verbosity = DEBUGLEV_ONLY_ERROR;
+uint64_t currentConfig[MAX_NUM_THREADS][NUM_PMC] = { 0 };
+
+PerfmonGroupSet* groupSet = NULL;
+LikwidResults* markerResults = NULL;
+int markerRegions = 0;
+
+int (*perfmon_startCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+int (*perfmon_stopCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+int (*perfmon_readCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+int (*perfmon_setupCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+int (*perfmon_finalizeCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+
+int (*initThreadArch) (int cpu_id);
+
+void perfmon_delEventSet(int groupID);
+
+char* eventOptionTypeName[NUM_EVENT_OPTIONS] = {
+ "NONE",
+ "OPCODE",
+ "MATCH0",
+ "MATCH1",
+ "MATCH2",
+ "MATCH3",
+ "MASK0",
+ "MASK1",
+ "MASK2",
+ "MASK3",
+ "NID",
+ "TID",
+ "STATE",
+ "EDGEDETECT",
+ "THRESHOLD",
+ "INVERT",
+ "KERNEL",
+ "ANYTHREAD",
+ "OCCUPANCY",
+ "OCCUPANCY_FILTER",
+ "OCCUPANCY_EDGEDETECT",
+ "OCCUPANCY_INVERT",
+ "IN_TRANSACTION",
+ "IN_TRANSACTION_ABORTED"
+};
-/* ##### EXPORTED FUNCTION POINTERS ################################### */
-void (*perfmon_startCountersThread) (int thread_id);
-void (*perfmon_stopCountersThread) (int thread_id);
-void (*perfmon_readCountersThread) (int thread_id);
-void (*perfmon_setupCounterThread) (int thread_id,
- PerfmonEvent* event, PerfmonCounterIndex index);
-void (*printDerivedMetrics) (PerfmonGroup group);
-void (*logDerivedMetrics) (PerfmonGroup group, double time, double timeStamp);
-void (*perfmon_getDerivedCounterValuesArch)(PerfmonGroup group, float * values, float * out_max, float * out_min);
-
-
-/* ##### FUNCTION POINTERS - LOCAL TO THIS SOURCE FILE ################ */
-
-static void (*initThreadArch) (PerfmonThread *thread);
-
-/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
-
-static int getIndex (bstring reg, PerfmonCounterIndex* index)
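+/*
+ * Resolve a counter register name to its RegisterIndex and RegisterType and
+ * check that the register is accessible. Registers that already contain a
+ * configuration are only cleared when the force flag is set.
+ */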
+static int
+getIndexAndType (bstring reg, RegisterIndex* index, RegisterType* type, int force)
{
- int ret = FALSE;
int err = 0;
- uint64_t tmp;
+ int ret = FALSE;
+ uint64_t tmp = 0x0ULL;
+ int (*ownstrcmp)(const char*, const char*);
+ ownstrcmp = &strcmp;
+ int testcpu = groupSet->threads[0].processorId;
for (int i=0; i< perfmon_numCounters; i++)
{
if (biseqcstr(reg, counter_map[i].key))
{
*index = counter_map[i].index;
+ *type = counter_map[i].type;
ret = TRUE;
+ break;
}
}
- if ((ret) && (counter_map[*index].type != THERMAL) && (counter_map[*index].type != POWER))
- {
- if (counter_map[*index].device == 0)
- {
- tmp = msr_read(0, counter_map[*index].configRegister);
- msr_write(0, counter_map[*index].configRegister,0x0ULL);
- }
- else
- {
- tmp = pci_read(0, counter_map[*index].device, counter_map[*index].configRegister);
- pci_write(0, counter_map[*index].device, counter_map[*index].configRegister, 0x0U);
- }
- }
- else if ((ret) && (counter_map[*index].type == POWER))
- {
- tmp = msr_read(0, counter_map[*index].counterRegister);
- }
-
- return ret;
-}
-
-
-static int
-getEvent(bstring event_str, PerfmonEvent* event)
-{
- for (int i=0; i< perfmon_numArchEvents; i++)
- {
- if (biseqcstr(event_str, eventHash[i].name))
- {
- *event = eventHash[i];
-
- if (perfmon_verbose)
- {
- fprintf(OUTSTREAM,"Found event %s : \
- Event_id 0x%02X Umask 0x%02X CfgBits 0x%02X Cmask 0x%02X \n",
- bdata( event_str),
- event->eventId,
- event->umask,
- event->cfgBits,
- event->cmask);
- }
- return TRUE;
- }
- }
-
- return FALSE;
-}
-
-static void
-initThread(int thread_id, int cpu_id)
-{
- for (int i=0; i<NUM_PMC; i++)
+ if (ret == FALSE)
{
- perfmon_threadData[thread_id].counters[i].init = FALSE;
+ fprintf(stderr, "ERROR: Counter %s not available\n",bdata(reg));
+ *type = NOTYPE;
+ return FALSE;
}
-
- perfmon_threadData[thread_id].processorId = cpu_id;
- initThreadArch(&perfmon_threadData[thread_id]);
-}
-
-struct cbsScan{
- /* Parse state */
- bstring src;
- int line;
- LikwidResults* results;
-};
-
-static int lineCb (void* parm, int ofs, int len)
-{
- int ret;
- struct cbsScan* st = (struct cbsScan*) parm;
- struct bstrList* strList;
- bstring line;
-
- if (!len) return 1;
- strList = bstrListCreate();
-
- line = blk2bstr (st->src->data + ofs, len);
-
- if (st->line < perfmon_numRegions)
+ if (ret && (ownstrcmp(bdata(reg), counter_map[*index].key) != 0))
{
- int id;
- strList = bsplit(line,':');
-
- if( strList->qty < 2 )
- {
- ERROR_PLAIN_PRINT(Failed to read marker file);
- }
- ret = sscanf (bdata(strList->entry[0]), "%d", &id); CHECKERROR;
- st->results[id].tag = bstrcpy(line);
- bdelete(st->results[id].tag, 0, blength(strList->entry[0])+1);
+ *type = NOTYPE;
+ return FALSE;
}
- else
+ err = HPMcheck(counter_map[*index].device, 0);
+ if (!err)
{
- int tagId;
- int threadId;
-
- strList = bsplit(line,32);
-
- if( strList->qty < (3+NUM_PMC))
- {
- ERROR_PLAIN_PRINT(Failed to read marker file);
- }
-
- ret = sscanf(bdata(strList->entry[0]), "%d", &tagId); CHECKERROR;
- ret = sscanf(bdata(strList->entry[1]), "%d", &threadId); CHECKERROR;
- ret = sscanf(bdata(strList->entry[2]), "%u", &st->results[tagId].count[threadId]); CHECKERROR;
- ret = sscanf(bdata(strList->entry[3]), "%lf", &st->results[tagId].time[threadId]); CHECKERROR;
-
- for (int i=0;i<NUM_PMC; i++)
- {
- ret = sscanf(bdata(strList->entry[4+i]), "%lf", &st->results[tagId].counters[threadId][i]); CHECKERROR;
- }
+ *type = NOTYPE;
+ return FALSE;
}
-
- bstrListDestroy(strList);
- st->line++;
- bdestroy(line);
- return 1;
-}
-
-static void
-readMarkerFile(bstring filename, LikwidResults** resultsRef)
-{
- int numberOfThreads=0;
- int ret;
- int i,j,k;
- struct cbsScan sl;
- FILE * fp;
- LikwidResults* results = *resultsRef;
-
- if (NULL != (fp = fopen (bdata(filename), "r")))
+ if ((ret) && (*type != THERMAL) && (*type != POWER) && (*type != WBOX0FIX))
{
- bstring src = bread ((bNread) fread, fp);
-
- /* read header info */
- ret = sscanf (bdata(src), "%d %d", &numberOfThreads, &perfmon_numRegions); CHECKERROR;
- results = (LikwidResults*) malloc(perfmon_numRegions * sizeof(LikwidResults));
-
- if (perfmon_numRegions == 0)
- {
- fprintf(OUTSTREAM,"ERROR: No region results are listed in marker file\n");
- ERROR_PLAIN_PRINT(No region results in marker file);
- }
- else if (numberOfThreads != perfmon_numThreads)
+ int check_settings = 1;
+ uint32_t reg = counter_map[*index].configRegister;
+ if (reg == 0x0)
{
- fprintf(OUTSTREAM,"ERROR: Is the number of threads for likwid-perfctr equal to the number in the measured application?\n");
- fprintf(OUTSTREAM,"likwid_markerInit and likwid_markerClose must be called in serial region.\n");
-
- ERROR_PRINT(Number of threads %d in marker file unequal to number of threads in likwid-perfCtr %d,numberOfThreads,perfmon_numThreads);
+ reg = counter_map[*index].counterRegister;
+ check_settings = 0;
}
-
- /* allocate LikwidResults struct */
- for (i=0;i<perfmon_numRegions; i++)
+ err = HPMread(testcpu, counter_map[*index].device, reg, &tmp);
+ if (err != 0)
{
- results[i].time = (double*) malloc(numberOfThreads * sizeof(double));
- results[i].count = (uint32_t*) malloc(numberOfThreads * sizeof(uint32_t));
- results[i].counters = (double**) malloc(numberOfThreads * sizeof(double*));
-
- for (j=0;j<numberOfThreads; j++)
+ if (err == -ENODEV)
{
- results[i].time[j] = 0.0;
- results[i].counters[j] = (double*) malloc(NUM_PMC * sizeof(double));
-
- for (k=0;k<NUM_PMC; k++)
- {
- results[i].counters[j][k] = 0.0;
- }
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Device %s not accessible on this machine,
+ pci_devices[box_map[*type].device].name);
}
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Counter %s not readable on this machine,
+ counter_map[*index].key);
+ }
+ *type = NOTYPE;
+ ret = FALSE;
}
-
- sl.src = src;
- sl.line = 0;
- sl.results = results;
- bsplitcb (src, (char) '\n', bstrchr(src,10)+1, lineCb, &sl);
-
- fclose (fp);
- bdestroy (src);
- }
- else
- {
- fprintf(OUTSTREAM,"ERROR: The marker result file could not be found!\n");
- fprintf(OUTSTREAM,"Did you call likwid_markerClose() at the end of your measurement?\n");
- ERROR;
- }
-
- *resultsRef = results;
- bstring exeString = bformat("rm -f %s",bdata(filename));
- ret = system(bdata(exeString));
-
- if (ret == EOF)
- {
- ERROR;
- }
-
- bdestroy(exeString);
-}
-
-static void
-printResultTable(PerfmonResultTable * tableData)
-{
- if (perfmon_csvoutput)
- {
- int r, c;
- for (c = 0; c < tableData->header->qty; c++)
- {
- fprintf(OUTSTREAM, "%s%s", ((c == 0) ? "\n" : ","), tableData->header->entry[c]->data);
- }
- fprintf(OUTSTREAM, "%s", "\n");
-
- for (r = 0; r < tableData->numRows; r++)
+ else if (tmp == 0x0ULL)
{
- fprintf(OUTSTREAM, "%s", tableData->rows[r].label->data);
-
- for (c = 0; c < tableData->numColumns; c++)
+ err = HPMwrite(testcpu, counter_map[*index].device, reg, 0x0ULL);
+ if (err != 0)
{
- if (!isnan(tableData->rows[r].value[c]))
+ if (err == -ENODEV)
{
- fprintf(OUTSTREAM, ",%lf", tableData->rows[r].value[c]);
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Device %s not accessible on this machine,
+ pci_devices[box_map[*type].device].name);
}
else
{
- fprintf(OUTSTREAM, ",%s", "nan");
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Counter %s not writeable on this machine,
+ counter_map[*index].key);
}
+ *type = NOTYPE;
+ ret = FALSE;
}
- fprintf(OUTSTREAM, "%s", "\n");
+ check_settings = 0;
}
- fprintf(OUTSTREAM, "%s", "\n");
- }
- else
- {
- int i,j;
- TableContainer* table;
- bstrList* labelStrings = NULL;
- bstring label = bfromcstr("NO");
-
- table = asciiTable_allocate(tableData->numRows,
- tableData->numColumns+1,
- tableData->header);
- asciiTable_setOutput(OUTSTREAM);
-
- labelStrings = bstrListCreate();
- bstrListAlloc(labelStrings, tableData->numColumns+1);
-
- for (i=0; i<tableData->numRows; i++)
+ if ((check_settings) && (tmp != 0x0ULL))
{
- labelStrings->qty = 0;
- labelStrings->entry[0] = bstrcpy(tableData->rows[i].label);
- labelStrings->qty++;
-
- for (j=0; j<(tableData->numColumns);j++)
+ if (force == 1)
+ {
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Counter %s has bits set (0x%llx) but we are forced to overwrite them,
+ counter_map[*index].key, tmp);
+ err = HPMwrite(testcpu, counter_map[*index].device, reg, 0x0ULL);
+ }
+ else if ((force == 0) && ((*type != FIXED)&&(*type != THERMAL)&&(*type != POWER)&&(*type != WBOX0FIX)))
{
- label = bformat("%g", tableData->rows[i].value[j]);
- labelStrings->entry[1+j] = bstrcpy(label);
- labelStrings->qty++;
+ fprintf(stderr, "ERROR: The selected register %s is in use.\n", counter_map[*index].key);
+ fprintf(stderr, "Please run likwid with force option (-f, --force) to overwrite settings\n");
+ exit(EXIT_FAILURE);
}
- asciiTable_appendRow(table,labelStrings);
}
-
- asciiTable_print(table);
- bdestroy(label);
- bstrListDestroy(labelStrings);
- asciiTable_free(table);
}
-}
-
-static int
-getGroupId(bstring groupStr,PerfmonGroup* group)
-{
- *group = _NOGROUP;
-
- for (int i=0; i<perfmon_numGroups; i++)
+ else if ((ret) && ((*type == POWER) || (*type == WBOX0FIX) || (*type == THERMAL)))
{
- if (biseqcstr(groupStr,group_map[i].key))
+ err = HPMread(testcpu, MSR_DEV, counter_map[*index].counterRegister, &tmp);
+ if (err != 0)
{
- *group = group_map[i].index;
- return i;
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Counter %s not readable on this machine,
+ counter_map[*index].key);
+ *type = NOTYPE;
+ ret = FALSE;
}
}
-
- return -1;
+ else
+ {
+ *type = NOTYPE;
+ ret = FALSE;
+ }
+ return ret;
}
static int
@@ -467,795 +242,377 @@ checkCounter(bstring counterName, const char* limit)
{
int i;
struct bstrList* tokens;
- int value = FALSE;
+ int ret = FALSE;
bstring limitString = bfromcstr(limit);
- tokens = bstrListCreate();
tokens = bsplit(limitString,'|');
-
for(i=0; i<tokens->qty; i++)
{
if(bstrncmp(counterName, tokens->entry[i], blength(tokens->entry[i])))
{
- value = FALSE;
+ ret = FALSE;
}
else
{
- value = TRUE;
+ ret = TRUE;
break;
}
}
-
bdestroy(limitString);
bstrListDestroy(tokens);
- return value;
+ return ret;
}
-static void
-freeResultTable(PerfmonResultTable* tableData)
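+/*
+ * Look up an event by name in the architecture's event table and verify that
+ * it may be programmed on the requested counter register.
+ */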
+static int
+getEvent(bstring event_str, bstring counter_str, PerfmonEvent* event)
{
- int i;
-
- bstrListDestroy(tableData->header);
-
- for (i=0; i<tableData->numRows; i++)
+ int ret = FALSE;
+ int (*ownstrncmp)(const char *, const char *, size_t);
+ ownstrncmp = &strncmp;
+ for (int i=0; i< perfmon_numArchEvents; i++)
{
- free(tableData->rows[i].value);
+ if (biseqcstr(event_str, eventHash[i].name))
+ {
+ if (!checkCounter(counter_str, eventHash[i].limit))
+ {
+ continue;
+ }
+ *event = eventHash[i];
+ ret = TRUE;
+ break;
+ }
}
- free(tableData->rows);
+ return ret;
}
-static void
-initResultTable(PerfmonResultTable* tableData,
- bstrList* firstColumn,
- int numRows,
- int numColumns)
+static int
+assignOption(PerfmonEvent* event, bstring entry, int index, EventOptionType type, int zero_value)
{
- int i;
- bstrList* header;
- bstring label;
-
- header = bstrListCreate();
- bstrListAlloc(header, numColumns+1);
- header->entry[0] = bstrcpy(firstColumn->entry[0]); header->qty++;
-
- for (i=0; i<perfmon_numThreads;i++)
+ int found_double = -1;
+ int return_index = index;
+ long long unsigned int value;
+ for (int k = 0; k < index; k++)
{
- label = bformat("core %d",perfmon_threadData[i].processorId);
- header->entry[1+i] = bstrcpy(label); header->qty++;
+ if (event->options[k].type == type)
+ {
+ found_double = k;
+ break;
+ }
}
-
- tableData->numRows = numRows;
- tableData->numColumns = numColumns;
- tableData->header = header;
- tableData->rows = (PerfmonResult*) malloc(numRows*sizeof(PerfmonResult));
-
- for (i=0; i<numRows; i++)
+ if (found_double >= 0)
{
- tableData->rows[i].label = firstColumn->entry[1+i];
- tableData->rows[i].value =
- (double*) malloc((numColumns)*sizeof(double));
+ index = found_double;
}
-}
-
-static void
-initStatisticTable(PerfmonResultTable* tableData,
- bstrList* firstColumn,
- int numRows)
-{
- int i;
- int numColumns = 4;
- bstrList* header;
- bstring label;
-
- header = bstrListCreate();
- bstrListAlloc(header, numColumns+1);
- header->entry[0] = bstrcpy(firstColumn->entry[0]); header->qty++;
-
- label = bformat("Sum");
- header->entry[1] = bstrcpy(label); header->qty++;
- label = bformat("Max");
- header->entry[2] = bstrcpy(label); header->qty++;
- label = bformat("Min");
- header->entry[3] = bstrcpy(label); header->qty++;
- label = bformat("Avg");
- header->entry[4] = bstrcpy(label); header->qty++;
-
- tableData->numRows = numRows;
- tableData->numColumns = numColumns;
- tableData->header = header;
- tableData->rows = (PerfmonResult*) malloc(numRows*sizeof(PerfmonResult));
-
- for (i=0; i<numRows; i++)
+ else
{
- tableData->rows[i].label = firstColumn->entry[1+i];
- bcatcstr(tableData->rows[i].label," STAT");
- tableData->rows[i].value =
- (double*) malloc((numColumns)*sizeof(double));
+ return_index++;
}
-}
-
-static void printDerivedMetricsFixed(void)
-{
- int threadId;
- double time = rdtscTime;
- double inverseClock = 1.0 /(double) timer_getCpuClock();
- PerfmonResultTable tableData;
- int numRows;
- int numColumns = perfmon_numThreads;
- bstring label;
- bstrList* fc;
- double tmpValue;
-
- numRows = 4;
- INIT_BASIC;
-
- bstrListAdd(fc,1,Runtime (RDTSC) [s]);
- bstrListAdd(fc,2,Runtime unhalted [s]);
- bstrListAdd(fc,3,Clock [MHz]);
- bstrListAdd(fc,4,CPI);
-
- initResultTable(&tableData, fc, numRows, numColumns);
-
- for(threadId=0; threadId < perfmon_numThreads; threadId++)
- {
- tmpValue = time;
- if (!isnan(tmpValue))
- {
- tableData.rows[0].value[threadId] = tmpValue;
- }
- else
- {
- tableData.rows[0].value[threadId] = 0.0;
- }
-
- tmpValue = perfmon_getResult(threadId,"FIXC1")*inverseClock;
- if (!isnan(tmpValue))
- {
- tableData.rows[1].value[threadId] = tmpValue;
- }
- else
- {
- tableData.rows[1].value[threadId] = 0.0;
- }
-
- tmpValue = 1.E-06*(perfmon_getResult(threadId,"FIXC1")/perfmon_getResult(threadId,"FIXC2"))/inverseClock;
- if (!isnan(tmpValue))
- {
- tableData.rows[2].value[threadId] = tmpValue;
- }
- else
- {
- tableData.rows[2].value[threadId] = 0.0;
- }
-
- tmpValue = perfmon_getResult(threadId,"FIXC1")/perfmon_getResult(threadId,"FIXC0");
- if (!isnan(tmpValue))
- {
- tableData.rows[3].value[threadId] = tmpValue;
- }
- else
- {
- tableData.rows[3].value[threadId] = 0.0;
- }
-
+ event->options[index].type = type;
+ if (zero_value)
+ {
+ event->options[index].value = 0;
}
- printResultTable(&tableData);
- freeResultTable(&tableData);
-}
-
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-
-void
-perfmon_setCSVMode(int v)
-{
- perfmon_csvoutput = v;
-}
-
-void
-perfmon_printCounters(void)
-{
- fprintf(OUTSTREAM,"This architecture has %d counters.\n", perfmon_numCounters);
- fprintf(OUTSTREAM,"Counters names: ");
-
- for (int i=0; i<perfmon_numCounters; i++)
+ else
{
- fprintf(OUTSTREAM,"%s\t",counter_map[i].key);
+ value = 0;
+ sscanf(bdata(entry), "%llx", &value);
+ event->options[index].value = value;
}
- fprintf(OUTSTREAM,".\n");
+ return return_index;
}
-void
-perfmon_printEvents(void)
+static int
+parseOptions(struct bstrList* tokens, PerfmonEvent* event, RegisterIndex index)
{
- int i;
+ int i,j;
+ struct bstrList* subtokens;
- fprintf(OUTSTREAM,"This architecture has %d events.\n", perfmon_numArchEvents);
- fprintf(OUTSTREAM,"Event tags (tag, id, umask, counters):\n");
-
- for (i=0; i<perfmon_numArchEvents; i++)
+ for (i = event->numberOfOptions; i < MAX_EVENT_OPTIONS; i++)
{
- fprintf(OUTSTREAM,"%s, 0x%X, 0x%X, %s \n",
- eventHash[i].name,
- eventHash[i].eventId,
- eventHash[i].umask,
- eventHash[i].limit);
+ event->options[i].type = EVENT_OPTION_NONE;
}
-}
-
-
-double
-perfmon_getResult(int threadId, char* counterString)
-{
- bstring counter = bfromcstr(counterString);
- PerfmonCounterIndex index;
-
- if (getIndex(counter,&index))
- {
- return perfmon_threadData[threadId].counters[index].counterData;
- }
-
- fprintf (stderr, "perfmon_getResult: Failed to get counter Index!\n" );
- return 0.0;
-}
-
-
-void
-perfmon_initEventSet(StrUtilEventSet* eventSetConfig, PerfmonEventSet* set)
-{
- set->numberOfEvents = eventSetConfig->numberOfEvents;
- set->events = (PerfmonEventSetEntry*)
- malloc(set->numberOfEvents * sizeof(PerfmonEventSetEntry));
- for (int i=0; i<set->numberOfEvents; i++)
+ if (tokens->qty-2 > MAX_EVENT_OPTIONS)
{
- /* get register index */
- if (!getIndex(eventSetConfig->events[i].counterName,
- &set->events[i].index))
- {
- ERROR_PRINT(Counter register %s not supported,bdata(
- eventSetConfig->events[i].counterName));
- }
-
- /* setup event */
- if (!getEvent(eventSetConfig->events[i].eventName,
- &set->events[i].event))
- {
- ERROR_PRINT(Event %s not found for current architecture,
- bdata(eventSetConfig->events[i].eventName));
- }
-
- /* is counter allowed for event */
- if (!checkCounter(eventSetConfig->events[i].counterName,
- set->events[i].event.limit))
- {
- ERROR_PRINT(Register not allowed for event %s,
- bdata(eventSetConfig->events[i].eventName));
- }
+ return -ERANGE;
}
-}
-void
-perfmon_printMarkerResults(bstring filepath)
-{
- int i;
- int j;
- int region;
- LikwidResults* results = NULL;
- PerfmonResultTable tableData;
- PerfmonResultTable regionData;
- int numRows = perfmon_set.numberOfEvents;
- int numColumns = perfmon_numThreads;
- bstrList* fc;
- bstrList* regionLabels;
- bstring label;
- INIT_EVENTS;
-
- readMarkerFile(filepath, &results);
- initResultTable(&tableData, fc, numRows, numColumns);
- regionLabels = bstrListCreate();
- bstrListAlloc(regionLabels, 3);
- bstrListAdd(regionLabels, 0, Region Info);
- bstrListAdd(regionLabels, 1, RDTSC Runtime [s]);
- bstrListAdd(regionLabels, 2, call count);
-
- for (region=0; region<perfmon_numRegions; region++)
- {
- initResultTable(&tableData, fc, numRows, numColumns);
- fprintf(OUTSTREAM,"\n=====================\n");
- fprintf(OUTSTREAM,"Region: %s \n", bdata(results[region].tag));
- fprintf(OUTSTREAM,"=====================\n");
- initResultTable(®ionData, regionLabels, 2, numColumns);
-
- for (j=0; j<numColumns; j++)
- {
- regionData.rows[0].value[j] = results[region].time[j];
- regionData.rows[1].value[j] = (double) results[region].count[j];
- }
- printResultTable(®ionData);
- for (i=0; i<numRows; i++)
+ for (i=2;i<tokens->qty;i++)
+ {
+ subtokens = bsplit(tokens->entry[i],'=');
+ btolower(subtokens->entry[0]);
+ if (subtokens->qty == 1)
{
- for (j=0; j<numColumns; j++)
+ if (biseqcstr(subtokens->entry[0], "edgedetect") == 1)
{
- tableData.rows[i].value[j] =
- results[region].counters[j][perfmon_set.events[i].index];
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_EDGE, 1);
}
- }
-
- printResultTable(&tableData);
-
- for (j=0; j<numColumns; j++)
- {
- for (i=0; i<numRows; i++)
+ else if (biseqcstr(subtokens->entry[0], "invert") == 1)
{
- perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData =
- results[region].counters[j][perfmon_set.events[i].index];
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_INVERT, 1);
}
+ else if (biseqcstr(subtokens->entry[0], "kernel") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_COUNT_KERNEL, 1);
+ }
+ else if (biseqcstr(subtokens->entry[0], "anythread") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_ANYTHREAD, 1);
+ }
+ else if (biseqcstr(subtokens->entry[0], "occ_edgedetect") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_OCCUPANCY_EDGE, 1);
+ }
+ else if (biseqcstr(subtokens->entry[0], "occ_invert") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_OCCUPANCY_INVERT, 1);
+ }
+ else if (biseqcstr(subtokens->entry[0], "in_trans") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_IN_TRANS, 1);
+ }
+ else if (biseqcstr(subtokens->entry[0], "in_trans_aborted") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_IN_TRANS_ABORT, 1);
+ }
+ else
+ {
+ continue;
+ }
+ event->options[event->numberOfOptions].value = 0;
}
- rdtscTime = results[region].time[0];
- if (groupSet != _NOGROUP)
- {
- printDerivedMetrics(groupSet);
- }
- else if ( cpuid_info.family == P6_FAMILY )
+ else if (subtokens->qty == 2)
{
- printDerivedMetricsFixed();
- }
- }
-
- for (i=0;i<perfmon_numRegions; i++)
- {
- for (j=0;j<perfmon_numThreads; j++)
- {
- free(results[i].counters[j]);
- }
-
- free(results[i].counters);
- free(results[i].time);
- }
-
- freeResultTable(&tableData);
- freeResultTable(®ionData);
- bstrListDestroy(fc);
- bstrListDestroy(regionLabels);
-}
-
-void
-perfmon_logCounterResults(double time)
-{
- int i;
- int j;
- double tmp;
- static double timeStamp = 0.0;
-
- timeStamp += time;
-
- for (i=0; i<perfmon_set.numberOfEvents; i++)
- {
- fprintf(OUTSTREAM, "%s %e ", perfmon_set.events[i].event.name, timeStamp);
- for (j=0; j<perfmon_numThreads; j++)
- {
- fprintf(OUTSTREAM, "%e ",
- (double) (perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData) - perfmon_threadState[j][perfmon_set.events[i].index]);
- perfmon_threadState[j][perfmon_set.events[i].index] = perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData;
- }
- fprintf(OUTSTREAM,"\n");
- }
-
- if (groupSet != _NOGROUP)
- {
- logDerivedMetrics(groupSet, time, timeStamp);
- }
-
- fflush(OUTSTREAM);
-}
-
-void
-perfmon_printCounterResults()
-{
- int i;
- int j;
- PerfmonResultTable tableData;
- int numRows = perfmon_set.numberOfEvents;
- int numColumns = perfmon_numThreads;
- double stat[perfmon_set.numberOfEvents][4]; /* 0:sum, 1:max, 2:min, 3:avg */
- bstrList* fc;
- bstring label;
- INIT_EVENTS;
-
- for (i=0; i<numRows; i++)
- {
- stat[i][0] = 0;
- stat[i][1] = 0;
- stat[i][2] = DBL_MAX;
- }
-
- initResultTable(&tableData, fc, numRows, numColumns);
-
- /* print raw event data */
- for (i=0; i<numRows; i++)
- {
- for (j=0; j<numColumns; j++)
- {
- tableData.rows[i].value[j] =
- (double) perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData;
- stat[i][0] +=
- (double) perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData;
- stat[i][1] = MAX(stat[i][1],
- (double) perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData);
- stat[i][2] = MIN(stat[i][2],
- (double) perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData);
- }
- }
- printResultTable(&tableData);
- freeResultTable(&tableData);
-
-
- /* for threaded results print sum, max, min and avg */
- if (perfmon_numThreads > 1)
- {
- initStatisticTable(&tableData, fc, numRows);
-
- for (i=0; i<numRows; i++)
- {
- stat[i][3] = stat[i][0]/perfmon_numThreads;
-
- for (j=0; j<4; j++)
+ if (biseqcstr(subtokens->entry[0], "opcode") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_OPCODE, 0);
+ }
+ else if (biseqcstr(subtokens->entry[0], "match0") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_MATCH0, 0);
+ }
+ else if (biseqcstr(subtokens->entry[0], "match1") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_MATCH1, 0);
+ }
+ else if (biseqcstr(subtokens->entry[0], "match2") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_MATCH2, 0);
+ }
+ else if (biseqcstr(subtokens->entry[0], "match3") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_MATCH3, 0);
+ }
+ else if (biseqcstr(subtokens->entry[0], "mask0") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_MASK0, 0);
+ }
+ else if (biseqcstr(subtokens->entry[0], "mask1") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_MASK1, 0);
+ }
+ else if (biseqcstr(subtokens->entry[0], "mask2") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_MASK2, 0);
+ }
+ else if (biseqcstr(subtokens->entry[0], "mask3") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_MASK3, 0);
+ }
+ else if (biseqcstr(subtokens->entry[0], "nid") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_NID, 0);
+ }
+ else if (biseqcstr(subtokens->entry[0], "tid") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_TID, 0);
+ }
+ else if (biseqcstr(subtokens->entry[0], "state") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_STATE, 0);
+ }
+ else if (biseqcstr(subtokens->entry[0], "threshold") == 1)
{
- tableData.rows[i].value[j] = stat[i][j];
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_THRESHOLD, 0);
+ }
+ else if (biseqcstr(subtokens->entry[0], "occupancy") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_OCCUPANCY, 0);
+ }
+ else if (biseqcstr(subtokens->entry[0], "occ_filter") == 1)
+ {
+ event->numberOfOptions = assignOption(event, subtokens->entry[1],
+ event->numberOfOptions, EVENT_OPTION_OCCUPANCY_FILTER, 0);
+ }
+ else
+ {
+ continue;
}
}
- printResultTable(&tableData);
- freeResultTable(&tableData);
- }
-
- if (groupSet != _NOGROUP)
- {
- /* print derived metrics */
- printDerivedMetrics(groupSet);
+ bstrListDestroy(subtokens);
}
- else if ( cpuid_info.family == P6_FAMILY )
+ for(i=event->numberOfOptions-1;i>=0;i--)
{
- printDerivedMetricsFixed();
- }
-}
-
-double
-perfmon_getEventResult(int thread, int index)
-{
- return (double) perfmon_threadData[thread].counters[perfmon_set.events[index].index].counterData;
-}
-
-EventSetup perfmon_prepareEventSetup(char* eventGroupString){
- EventSetup setup;
- bstring eventString = bfromcstr(eventGroupString);
-
- setup.eventSetConfig = malloc(sizeof(setup.eventSetConfig));
- setup.perfmon_set = malloc(sizeof(setup.perfmon_set));
-
- int groupId = getGroupId(eventString, & setup.groupSet);
- setup.groupName = strdup(eventGroupString);
- setup.groupIndex = groupId;
- if (setup.groupSet == _NOGROUP)
- {
- /* eventString is a custom eventSet */
- bstr_to_eventset(setup.eventSetConfig, eventString);
- }
- else
- {
- /* eventString is a group */
- eventString = bfromcstr(group_map[groupId].config);
- bstr_to_eventset(setup.eventSetConfig, eventString);
- }
-
- perfmon_initEventSet(setup.eventSetConfig, setup.perfmon_set);
- bdestroy(eventString);
-
- setup.eventNames = (const char**) malloc(setup.perfmon_set->numberOfEvents * sizeof(const char*));
-
- setup.numberOfEvents = setup.perfmon_set->numberOfEvents;
- for (int i=0; i< setup.perfmon_set->numberOfEvents; i++)
- {
- setup.eventNames[i] = setup.perfmon_set->events[i].event.name;
- }
-
- setup.numberOfDerivedCounters = group_map[groupId].derivedCounters;
- setup.derivedNames = (const char**) malloc(setup.numberOfDerivedCounters * sizeof(const char*));
-
- for(int i=0; i < group_map[groupId].derivedCounters; i++){
- setup.derivedNames[i] = group_map[groupId].derivedCounterNames[i];
- }
-
- return setup;
-}
-
-
-void perfmon_setupCountersForEventSet(EventSetup * setup){
- perfmon_set = *setup->perfmon_set;
- groupSet = setup->groupSet;
- eventSetup = setup;
- perfmon_setupCounters();
-}
-
-void perfmon_getEventCounterValues(uint64_t * values, uint64_t * out_max, uint64_t * out_min){
-
- for(int e = 0; e < perfmon_set.numberOfEvents; e++ ){
- uint64_t sum = 0;
- uint64_t min = (uint64_t) -1;
- uint64_t max = 0;
-
- for(int i = 0; i < perfmon_numThreads; i++){
- uint64_t cur = perfmon_threadData[i].counters[e].counterData;
- sum += cur;
- max = max > cur ? max : cur;
- min = min < cur ? min : cur;
+ if (!(OPTIONS_TYPE_MASK(event->options[i].type) & (counter_map[index].optionMask|event->optionMask)))
+ {
+ DEBUG_PRINT(DEBUGLEV_INFO,Removing Option %s not valid for register %s,
+ eventOptionTypeName[event->options[i].type],
+ counter_map[index].key);
+ event->options[i].type = EVENT_OPTION_NONE;
+ event->numberOfOptions--;
}
- values[e] = sum / perfmon_numThreads;
- out_min[e] = min;
- out_max[e] = max;
- }
-}
-
-void perfmon_getDerivedCounterValues(float * values, float * out_max, float * out_min){
- perfmon_getDerivedCounterValuesArch(eventSetup->groupSet, values, out_max, out_min);
-}
-
-int
-perfmon_setupEventSetC(char* eventCString, const char*** eventnames)
-{
- int i;
- bstring eventString = bfromcstr(eventCString);
- StrUtilEventSet eventSetConfig;
- int groupId;
-
- groupId = getGroupId(eventString, &groupSet);
- if (groupSet == _NOGROUP)
- {
- /* eventString is a custom eventSet */
- bstr_to_eventset(&eventSetConfig, eventString);
- }
- else
- {
- /* eventString is a group */
- eventString = bfromcstr(group_map[groupId].config);
- bstr_to_eventset(&eventSetConfig, eventString);
- }
-
- perfmon_initEventSet(&eventSetConfig, &perfmon_set);
- perfmon_setupCounters();
- bdestroy(eventString);
-
- (*eventnames) = (const char**) malloc(perfmon_set.numberOfEvents * sizeof(const char*));
-
- for (i=0; i<perfmon_set.numberOfEvents; i++)
- {
- (*eventnames)[i] = perfmon_set.events[i].event.name;
- }
-
- return perfmon_set.numberOfEvents;
-}
-
-void
-perfmon_setupEventSet(bstring eventString, BitMask* counterMask)
-{
- int groupId;
- int eventBool = FALSE;
- StrUtilEventSet eventSetConfig;
- PerfmonEvent eventSet;
- struct bstrList* subStr;
-
- groupId = getGroupId(eventString, &groupSet);
-
- if (groupSet == _NOGROUP)
- {
- subStr = bstrListCreate();
- subStr = bsplit(eventString,':');
- eventBool = getEvent(subStr->entry[0], &eventSet);
- bstrListDestroy(subStr);
}
- if (groupSet == _NOGROUP && eventBool != FALSE)
+ for(i=0;i<event->numberOfOptions;i++)
{
- /* eventString is a custom eventSet */
- /* append fixed counters for Intel processors */
- if ( cpuid_info.family == P6_FAMILY )
+ if (event->options[i].type == EVENT_OPTION_EDGE)
{
- if (cpuid_info.perf_num_fixed_ctr > 0)
+ int threshold_set = FALSE;
+ for (j=0;j<event->numberOfOptions;j++)
{
- bcatcstr(eventString,",INSTR_RETIRED_ANY:FIXC0");
+                    if (event->options[j].type == EVENT_OPTION_THRESHOLD)
+ {
+ threshold_set = TRUE;
+ break;
+ }
}
- if (cpuid_info.perf_num_fixed_ctr > 1)
+ if ((threshold_set == FALSE) && (event->numberOfOptions < MAX_EVENT_OPTIONS))
{
- bcatcstr(eventString,",CPU_CLK_UNHALTED_CORE:FIXC1");
+ event->options[event->numberOfOptions].type = EVENT_OPTION_THRESHOLD;
+ event->options[event->numberOfOptions].value = 0x1;
+ event->numberOfOptions++;
}
- if (cpuid_info.perf_num_fixed_ctr > 2)
+ else
{
- bcatcstr(eventString,",CPU_CLK_UNHALTED_REF:FIXC2");
+                    ERROR_PLAIN_PRINT(Cannot set threshold option to default. No more space in options list);
}
}
- bstr_to_eventset(&eventSetConfig, eventString);
- }
- else if (groupId < 0 && eventBool == FALSE)
- {
- ERROR_PLAIN_PRINT(Unsupported group or event for this architecture!);
- exit(EXIT_FAILURE);
- }
- else
- {
- if ( group_map[groupId].isUncore )
+ else if (event->options[i].type == EVENT_OPTION_OCCUPANCY)
{
- if ( (cpuid_info.model != SANDYBRIDGE_EP) &&
- (cpuid_info.model != IVYBRIDGE_EP) &&
- (cpuid_info.model != WESTMERE_EX) &&
- (cpuid_info.model != NEHALEM_EX))
+ int threshold_set = FALSE;
+ int edge_set = FALSE;
+ int invert_set = FALSE;
+ for (j=0;j<event->numberOfOptions;j++)
+ {
+                    if (event->options[j].type == EVENT_OPTION_THRESHOLD)
+ {
+ threshold_set = TRUE;
+ break;
+ }
+                    if (event->options[j].type == EVENT_OPTION_EDGE)
+ {
+ edge_set = TRUE;
+ break;
+ }
+                    if (event->options[j].type == EVENT_OPTION_INVERT)
+ {
+ invert_set = TRUE;
+ break;
+ }
+ }
+ if ((threshold_set == FALSE) && (event->numberOfOptions < MAX_EVENT_OPTIONS) &&
+ (edge_set == TRUE || invert_set == TRUE ))
{
- ERROR_PLAIN_PRINT(Uncore not supported on Desktop processors!);
- exit(EXIT_FAILURE);
+ event->options[event->numberOfOptions].type = EVENT_OPTION_THRESHOLD;
+ event->options[event->numberOfOptions].value = 0x1;
+ event->numberOfOptions++;
}
- }
-
- fprintf(OUTSTREAM,"Measuring group %s\n", group_map[groupId].key);
- /* eventString is a group */
- eventString = bfromcstr(group_map[groupId].config);
- bstr_to_eventset(&eventSetConfig, eventString);
- }
-
- perfmon_initEventSet(&eventSetConfig, &perfmon_set);
- perfmon_setupCounters();
-
- if ( counterMask != NULL )
- {
- bitMask_init((*counterMask));
- /* Extract counter mask from first thread */
- for (int index=0; index<perfmon_numCounters; index++)
- {
- if ( perfmon_threadData[0].counters[index].init == TRUE )
+ else
{
- bitMask_set((*counterMask),index);
+                    ERROR_PLAIN_PRINT(Cannot set threshold option to default. No more space in options list);
}
}
}
-}
+
+ return event->numberOfOptions;
+}
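
For readers following the new option parser: a minimal sketch of the event descriptor syntax that parseOptions() consumes, using the bstrlib calls already shown above. The event name EXAMPLE_EVENT and the option values are illustrative only, not taken from the patch.

    /* An event descriptor has the form EVENT:COUNTER[:OPTION[=VALUE]]...
     * and is split at ':' before parseOptions() walks the tokens from
     * index 2 onwards. */
    bstring descr = bfromcstr("EXAMPLE_EVENT:PMC0:EDGEDETECT:THRESHOLD=0x2");
    struct bstrList* tokens = bsplit(descr, ':');
    /* After the per-token '=' split:
     *   "EDGEDETECT"     -> subtokens->qty == 1, handled as a flag option
     *   "THRESHOLD=0x2"  -> subtokens->qty == 2, handled as a value option */
    bstrListDestroy(tokens);
    bdestroy(descr);
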
-void
-perfmon_setupCounters()
+static double
+calculateResult(int groupId, int eventId, int threadId)
{
- for (int j=0; j<perfmon_set.numberOfEvents; j++)
+ PerfmonEventSetEntry* event;
+ PerfmonCounter* counter;
+ int cpu_id;
+ double result = 0.0;
+ if (groupSet->groups[groupId].events[eventId].type == NOTYPE)
+ return result;
+
+ event = &(groupSet->groups[groupId].events[eventId]);
+ counter = &(event->threadCounter[threadId]);
+ if (counter->overflows == 0)
{
- for (int i=0; i<perfmon_numThreads; i++)
- {
- perfmon_setupCounterThread(i,
- &perfmon_set.events[j].event,
- perfmon_set.events[j].index);
- }
+ result = (double) (counter->counterData - counter->startData);
}
-}
-
-void
-perfmon_startCounters(void)
-{
- for (int i=0;i<perfmon_numThreads;i++)
+ else if (counter->overflows > 0)
{
- perfmon_startCountersThread(i);
+ result += (double) ((perfmon_getMaxCounterValue(counter_map[event->index].type) - counter->startData) + counter->counterData);
+ counter->overflows--;
}
-
- timer_start(&timeData);
-}
-
-void
-perfmon_stopCounters(void)
-{
- int i;
-
- timer_stop(&timeData);
-
- for (i=0;i<perfmon_numThreads;i++)
+ result += (double) (counter->overflows * perfmon_getMaxCounterValue(counter_map[event->index].type));
+ if (counter_map[event->index].type == POWER)
{
- perfmon_stopCountersThread(i);
+ result *= power_getEnergyUnit(getCounterTypeOffset(event->index));
}
-
- rdtscTime = timer_print(&timeData);
-}
-
-void
-perfmon_readCounters(void)
-{
- int i;
-
- for (i=0;i<perfmon_numThreads;i++)
+ else if (counter_map[event->index].type == THERMAL)
{
- perfmon_readCountersThread(i);
+ result = (double)counter->counterData;
}
+ return result;
}
-
-void
-perfmon_printAvailableGroups()
+int
+getCounterTypeOffset(int index)
{
- int i;
-
- fprintf(OUTSTREAM,"Available groups on %s:\n",cpuid_info.name);
-
- for(i=0; i<perfmon_numGroups; i++)
+ int off = 0;
+ for (int j=index-1;j>=0;j--)
{
- if ( group_map[i].isUncore )
+ if (counter_map[index].type == counter_map[j].type)
{
- if ( (cpuid_info.model == SANDYBRIDGE_EP) ||
- (cpuid_info.model == IVYBRIDGE_EP) ||
- (cpuid_info.model == WESTMERE_EX) ||
- (cpuid_info.model == NEHALEM_EX))
- {
- fprintf(OUTSTREAM,"%s: %s\n",group_map[i].key,
- group_map[i].info);
- }
+ off++;
}
else
{
- fprintf(OUTSTREAM,"%s: %s\n",group_map[i].key,
- group_map[i].info);
+ break;
}
}
+ return off;
}
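
A short illustration of getCounterTypeOffset(); the counter layout below is hypothetical and only meant to show how the offset is counted.

    /* With counter_map laid out as
     *   index 0: FIXC0 (fixed), 1: FIXC1 (fixed), 2: PMC0 (core), 3: PMC1 (core)
     * getCounterTypeOffset(3) == 1  (one preceding entry of the same type),
     * getCounterTypeOffset(2) == 0, getCounterTypeOffset(1) == 1.
     * The offset selects the per-type register instance, e.g. for
     * power_getEnergyUnit() in calculateResult() above. */
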
-void
-perfmon_printGroupHelp(bstring group)
+void perfmon_setVerbosity(int level)
{
- int i;
- PerfmonGroup groupDummy;
-
- if ((i = getGroupId(group,&groupDummy))<0)
- {
- ERROR_PLAIN_PRINT(Group not found);
- }
- else
- {
- fprintf(OUTSTREAM,"Group %s:\n",bdata(group));
- fprintf(OUTSTREAM,"%s",group_help[i].msg);
- }
+ if ((level >= DEBUGLEV_ONLY_ERROR) && (level <= DEBUGLEV_DEVELOP))
+ perfmon_verbosity = level;
}
-
-
void
-perfmon_init(int numThreads_local, int threads[], FILE* outstream)
+perfmon_init_maps(void)
{
- if (!lock_check())
- {
- fprintf(stderr,"Access to performance counters is locked.\n");
- exit(EXIT_FAILURE);
- }
-
- perfmon_numThreads = numThreads_local;
- perfmon_threadData = (PerfmonThread*)
- malloc(perfmon_numThreads * sizeof(PerfmonThread));
- /* This is specific for daemon mode. */
- perfmon_threadState = (double**)
- malloc(perfmon_numThreads * sizeof(double*));
-
- for (int i=0; i<perfmon_numThreads; i++)
- {
- perfmon_threadState[i] = (double*)
- malloc(NUM_PMC * sizeof(double));
- for(int j=0; j<NUM_PMC;j++)
- {
- perfmon_threadState[i][j] = 0.0;
- }
- }
-
- OUTSTREAM = outstream;
-
- for(int i=0; i<MAX_NUM_NODES; i++) socket_lock[i] = LOCK_INIT;
-
- if (accessClient_mode != DAEMON_AM_DIRECT)
- {
- accessClient_init(&socket_fd);
- }
-
- msr_init(socket_fd);
-
+ box_map = NULL;
switch ( cpuid_info.family )
{
case P6_FAMILY:
@@ -1263,75 +620,37 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
switch ( cpuid_info.model )
{
case PENTIUM_M_BANIAS:
-
case PENTIUM_M_DOTHAN:
-
eventHash = pm_arch_events;
perfmon_numArchEvents = perfmon_numArchEvents_pm;
-
- group_map = pm_group_map;
- // group_help = pm_group_help;
- perfmon_numGroups = perfmon_numGroups_pm;
-
counter_map = pm_counter_map;
+ box_map = pm_box_map;
perfmon_numCounters = perfmon_numCounters_pm;
-
- initThreadArch = perfmon_init_pm;
- printDerivedMetrics = perfmon_printDerivedMetrics_pm;
- assert(FALSE && "NOT SUPPORTED");
- perfmon_startCountersThread = perfmon_startCountersThread_pm;
- perfmon_stopCountersThread = perfmon_stopCountersThread_pm;
- perfmon_setupCounterThread = perfmon_setupCounterThread_pm;
break;
case ATOM_45:
-
case ATOM_32:
-
case ATOM_22:
-
case ATOM:
-
eventHash = atom_arch_events;
perfmon_numArchEvents = perfmon_numArchEventsAtom;
-
- group_map = atom_group_map;
- group_help = atom_group_help;
- perfmon_numGroups = perfmon_numGroupsAtom;
-
counter_map = core2_counter_map;
perfmon_numCounters = perfmon_numCountersCore2;
-
- initThreadArch = perfmon_init_core2;
- printDerivedMetrics = perfmon_printDerivedMetricsAtom;
- perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesAtom;
- perfmon_startCountersThread = perfmon_startCountersThread_core2;
- perfmon_stopCountersThread = perfmon_stopCountersThread_core2;
- perfmon_setupCounterThread = perfmon_setupCounterThread_core2;
+ box_map = core2_box_map;
break;
- case ATOM_SILVERMONT_C:
case ATOM_SILVERMONT_E:
- case ATOM_SILVERMONT_F1:
- case ATOM_SILVERMONT_F2:
- case ATOM_SILVERMONT_F3:
- power_init(0);
- thermal_init(0);
+ case ATOM_SILVERMONT_C:
+ case ATOM_SILVERMONT_Z1:
+ case ATOM_SILVERMONT_Z2:
+ case ATOM_SILVERMONT_F:
+ case ATOM_SILVERMONT_AIR:
eventHash = silvermont_arch_events;
perfmon_numArchEvents = perfmon_numArchEventsSilvermont;
-
- group_map = silvermont_group_map;
- group_help = silvermont_group_help;
- perfmon_numGroups = perfmon_numGroupsSilvermont;
-
counter_map = silvermont_counter_map;
+ box_map = silvermont_box_map;
perfmon_numCounters = perfmon_numCountersSilvermont;
-
- initThreadArch = perfmon_init_silvermont;
- printDerivedMetrics = perfmon_printDerivedMetricsSilvermont;
- perfmon_startCountersThread = perfmon_startCountersThread_silvermont;
- perfmon_stopCountersThread = perfmon_stopCountersThread_silvermont;
- perfmon_setupCounterThread = perfmon_setupCounterThread_silvermont;
+ perfmon_numCoreCounters = perfmon_numCoreCountersSilvermont;
break;
case CORE_DUO:
@@ -1339,216 +658,370 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
break;
case XEON_MP:
-
case CORE2_65:
-
case CORE2_45:
-
eventHash = core2_arch_events;
perfmon_numArchEvents = perfmon_numArchEventsCore2;
-
- group_map = core2_group_map;
- group_help = core2_group_help;
- perfmon_numGroups = perfmon_numGroupsCore2;
-
counter_map = core2_counter_map;
perfmon_numCounters = perfmon_numCountersCore2;
-
- initThreadArch = perfmon_init_core2;
- printDerivedMetrics = perfmon_printDerivedMetricsCore2;
- perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesCore2;
-
- logDerivedMetrics = perfmon_logDerivedMetricsCore2;
- perfmon_startCountersThread = perfmon_startCountersThread_core2;
- perfmon_stopCountersThread = perfmon_stopCountersThread_core2;
- perfmon_readCountersThread = perfmon_readCountersThread_core2;
- perfmon_setupCounterThread = perfmon_setupCounterThread_core2;
+ box_map = core2_box_map;
break;
case NEHALEM_EX:
-
eventHash = nehalemEX_arch_events;
perfmon_numArchEvents = perfmon_numArchEventsNehalemEX;
-
- group_map = nehalemEX_group_map;
- group_help = nehalemEX_group_help;
- perfmon_numGroups = perfmon_numGroupsNehalemEX;
-
- counter_map = westmereEX_counter_map;
- perfmon_numCounters = perfmon_numCountersWestmereEX;
-
- initThreadArch = perfmon_init_nehalemEX;
- printDerivedMetrics = perfmon_printDerivedMetricsNehalemEX;
- perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesNehalemEX;
- logDerivedMetrics = perfmon_logDerivedMetricsNehalemEX;
- perfmon_startCountersThread = perfmon_startCountersThread_nehalemEX;
- perfmon_stopCountersThread = perfmon_stopCountersThread_nehalemEX;
- perfmon_readCountersThread = perfmon_readCountersThread_nehalemEX;
- perfmon_setupCounterThread = perfmon_setupCounterThread_nehalemEX;
+ counter_map = nehalemEX_counter_map;
+ perfmon_numCounters = perfmon_numCountersNehalemEX;
+ box_map = nehalemEX_box_map;
break;
case WESTMERE_EX:
-
eventHash = westmereEX_arch_events;
perfmon_numArchEvents = perfmon_numArchEventsWestmereEX;
-
- group_map = westmereEX_group_map;
- group_help = westmereEX_group_help;
- perfmon_numGroups = perfmon_numGroupsWestmereEX;
-
counter_map = westmereEX_counter_map;
perfmon_numCounters = perfmon_numCountersWestmereEX;
-
- initThreadArch = perfmon_init_westmereEX;
- printDerivedMetrics = perfmon_printDerivedMetricsWestmereEX;
- perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesWestmereEX;
- logDerivedMetrics = perfmon_logDerivedMetricsWestmereEX;
- perfmon_startCountersThread = perfmon_startCountersThread_westmereEX;
- perfmon_stopCountersThread = perfmon_stopCountersThread_westmereEX;
- perfmon_readCountersThread = perfmon_readCountersThread_westmereEX;
- perfmon_setupCounterThread = perfmon_setupCounterThread_westmereEX;
+ box_map = westmereEX_box_map;
break;
case NEHALEM_BLOOMFIELD:
-
case NEHALEM_LYNNFIELD:
-
- thermal_init(0);
-
+ case NEHALEM_LYNNFIELD_M:
eventHash = nehalem_arch_events;
perfmon_numArchEvents = perfmon_numArchEventsNehalem;
-
- group_map = nehalem_group_map;
- group_help = nehalem_group_help;
- perfmon_numGroups = perfmon_numGroupsNehalem;
-
counter_map = nehalem_counter_map;
perfmon_numCounters = perfmon_numCountersNehalem;
+ box_map = nehalem_box_map;
+ break;
- initThreadArch = perfmon_init_nehalem;
- printDerivedMetrics = perfmon_printDerivedMetricsNehalem;
- perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesNehalem;
+ case NEHALEM_WESTMERE_M:
+ case NEHALEM_WESTMERE:
+ eventHash = westmere_arch_events;
+ perfmon_numArchEvents = perfmon_numArchEventsWestmere;
+ counter_map = nehalem_counter_map;
+ perfmon_numCounters = perfmon_numCountersNehalem;
+ box_map = nehalem_box_map;
+ break;
- logDerivedMetrics = perfmon_logDerivedMetricsNehalem;
- perfmon_startCountersThread = perfmon_startCountersThread_nehalem;
- perfmon_stopCountersThread = perfmon_stopCountersThread_nehalem;
- perfmon_readCountersThread = perfmon_readCountersThread_nehalem;
- perfmon_setupCounterThread = perfmon_setupCounterThread_nehalem;
+ case IVYBRIDGE_EP:
+ pci_devices = ivybridgeEP_pci_devices;
+ box_map = ivybridgeEP_box_map;
+ eventHash = ivybridgeEP_arch_events;
+ perfmon_numArchEvents = perfmon_numArchEventsIvybridgeEP;
+ counter_map = ivybridgeEP_counter_map;
+ perfmon_numCounters = perfmon_numCountersIvybridgeEP;
+ perfmon_numCoreCounters = perfmon_numCoreCountersIvybridgeEP;
+ break;
+ case IVYBRIDGE:
+ eventHash = ivybridge_arch_events;
+ box_map = ivybridge_box_map;
+ perfmon_numArchEvents = perfmon_numArchEventsIvybridge;
+ counter_map = ivybridge_counter_map;
+ perfmon_numCounters = perfmon_numCountersIvybridge;
+ perfmon_numCoreCounters = perfmon_numCoreCountersIvybridge;
break;
- case NEHALEM_WESTMERE_M:
+ case HASWELL_EP:
+ eventHash = haswellEP_arch_events;
+ perfmon_numArchEvents = perfmon_numArchEventsHaswellEP;
+ counter_map = haswellEP_counter_map;
+ perfmon_numCounters = perfmon_numCountersHaswellEP;
+ perfmon_numCoreCounters = perfmon_numCoreCountersHaswellEP;
+ box_map = haswellEP_box_map;
+ pci_devices = haswellEP_pci_devices;
+ break;
+ case HASWELL:
+ case HASWELL_M1:
+ case HASWELL_M2:
+ eventHash = haswell_arch_events;
+ perfmon_numArchEvents = perfmon_numArchEventsHaswell;
+ counter_map = haswell_counter_map;
+ perfmon_numCounters = perfmon_numCountersHaswell;
+ perfmon_numCoreCounters = perfmon_numCoreCountersHaswell;
+ box_map = haswell_box_map;
+ break;
- case NEHALEM_WESTMERE:
+ case SANDYBRIDGE_EP:
+ pci_devices = sandybridgeEP_pci_devices;
+ box_map = sandybridgeEP_box_map;
+ eventHash = sandybridgeEP_arch_events;
+ perfmon_numArchEvents = perfmon_numArchEventsSandybridgeEP;
+ counter_map = sandybridgeEP_counter_map;
+ perfmon_numCounters = perfmon_numCountersSandybridgeEP;
+ perfmon_numCoreCounters = perfmon_numCoreCountersSandybridgeEP;
+ break;
+ case SANDYBRIDGE:
+ box_map = sandybridge_box_map;
+ eventHash = sandybridge_arch_events;
+ perfmon_numArchEvents = perfmon_numArchEventsSandybridge;
+ counter_map = sandybridge_counter_map;
+ perfmon_numCounters = perfmon_numCountersSandybridge;
+ perfmon_numCoreCounters = perfmon_numCoreCountersSandybridge;
+ break;
- thermal_init(0);
+ case BROADWELL:
+ box_map = broadwell_box_map;
+ eventHash = broadwell_arch_events;
+ counter_map = broadwell_counter_map;
+ perfmon_numArchEvents = perfmon_numArchEventsBroadwell;
+ perfmon_numCounters = perfmon_numCountersBroadwell;
+ perfmon_numCoreCounters = perfmon_numCoreCountersBroadwell;
+ break;
+ case BROADWELL_D:
+ box_map = broadwelld_box_map;
+ eventHash = broadwelld_arch_events;
+ counter_map = broadwelld_counter_map;
+ perfmon_numArchEvents = perfmon_numArchEventsBroadwellD;
+ perfmon_numCounters = perfmon_numCountersBroadwellD;
+ perfmon_numCoreCounters = perfmon_numCoreCountersBroadwellD;
+ break;
+ case BROADWELL_E:
+ box_map = broadwellEP_box_map;
+ eventHash = broadwellEP_arch_events;
+ counter_map = broadwellEP_counter_map;
+ perfmon_numArchEvents = perfmon_numArchEventsBroadwellEP;
+ perfmon_numCounters = perfmon_numCountersBroadwellEP;
+ perfmon_numCoreCounters = perfmon_numCoreCountersBroadwellEP;
+ break;
- eventHash = westmere_arch_events;
- perfmon_numArchEvents = perfmon_numArchEventsWestmere;
+ case SKYLAKE1:
+ case SKYLAKE2:
+ box_map = skylake_box_map;
+ eventHash = skylake_arch_events;
+ counter_map = skylake_counter_map;
+ perfmon_numArchEvents = perfmon_numArchEventsSkylake;
+ perfmon_numCounters = perfmon_numCountersSkylake;
+ perfmon_numCoreCounters = perfmon_numCoreCountersSkylake;
+ break;
- group_map = westmere_group_map;
- group_help = westmere_group_help;
- perfmon_numGroups = perfmon_numGroupsWestmere;
+ default:
+ ERROR_PLAIN_PRINT(Unsupported Processor);
+ break;
+ }
+ break;
- counter_map = nehalem_counter_map;
- perfmon_numCounters = perfmon_numCountersNehalem;
+ case MIC_FAMILY:
- initThreadArch = perfmon_init_nehalem;
- printDerivedMetrics = perfmon_printDerivedMetricsWestmere;
- perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesWestmere;
+ switch ( cpuid_info.model )
+ {
+ case XEON_PHI:
+ eventHash = phi_arch_events;
+ perfmon_numArchEvents = perfmon_numArchEventsPhi;
+ counter_map = phi_counter_map;
+ box_map = phi_box_map;
+ perfmon_numCounters = perfmon_numCountersPhi;
+ break;
- logDerivedMetrics = perfmon_logDerivedMetricsWestmere;
- perfmon_startCountersThread = perfmon_startCountersThread_nehalem;
- perfmon_stopCountersThread = perfmon_stopCountersThread_nehalem;
- perfmon_readCountersThread = perfmon_readCountersThread_nehalem;
- perfmon_setupCounterThread = perfmon_setupCounterThread_nehalem;
+ default:
+ ERROR_PLAIN_PRINT(Unsupported Processor);
break;
+ }
+ break;
- case IVYBRIDGE:
+ case K8_FAMILY:
+ eventHash = k8_arch_events;
+ perfmon_numArchEvents = perfmon_numArchEventsK8;
+ counter_map = k10_counter_map;
+ box_map = k10_box_map;
+ perfmon_numCounters = perfmon_numCountersK10;
+ break;
- case IVYBRIDGE_EP:
+ case K10_FAMILY:
+ eventHash = k10_arch_events;
+ perfmon_numArchEvents = perfmon_numArchEventsK10;
+ counter_map = k10_counter_map;
+ box_map = k10_box_map;
+ perfmon_numCounters = perfmon_numCountersK10;
+ break;
- power_init(0); /* FIXME Static coreId is dangerous */
- thermal_init(0);
- pci_init(socket_fd);
+ case K15_FAMILY:
+ eventHash = interlagos_arch_events;
+ perfmon_numArchEvents = perfmon_numArchEventsInterlagos;
+ counter_map = interlagos_counter_map;
+ box_map = interlagos_box_map;
+ perfmon_numCounters = perfmon_numCountersInterlagos;
+ break;
- eventHash = ivybridge_arch_events;
- perfmon_numArchEvents = perfmon_numArchEventsIvybridge;
+ case K16_FAMILY:
+ eventHash = kabini_arch_events;
+ perfmon_numArchEvents = perfmon_numArchEventsKabini;
+ counter_map = kabini_counter_map;
+ box_map = kabini_box_map;
+ perfmon_numCounters = perfmon_numCountersKabini;
+ break;
- group_map = ivybridge_group_map;
- group_help = ivybridge_group_help;
- perfmon_numGroups = perfmon_numGroupsIvybridge;
+ default:
+ ERROR_PLAIN_PRINT(Unsupported Processor);
+ break;
+ }
+ return;
+}
- counter_map = ivybridge_counter_map;
- perfmon_numCounters = perfmon_numCountersIvybridge;
+void
+perfmon_init_funcs(int* init_power, int* init_temp)
+{
+ int initialize_power = FALSE;
+ int initialize_thermal = FALSE;
+ switch ( cpuid_info.family )
+ {
+ case P6_FAMILY:
- initThreadArch = perfmon_init_ivybridge;
- printDerivedMetrics = perfmon_printDerivedMetricsIvybridge;
- perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesIvybridge;
+ switch ( cpuid_info.model )
+ {
+ case PENTIUM_M_BANIAS:
+ case PENTIUM_M_DOTHAN:
+ initThreadArch = perfmon_init_pm;
+ perfmon_startCountersThread = perfmon_startCountersThread_pm;
+ perfmon_stopCountersThread = perfmon_stopCountersThread_pm;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_pm;
+ perfmon_readCountersThread = perfmon_readCountersThread_pm;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_pm;
+ break;
- logDerivedMetrics = perfmon_logDerivedMetricsIvybridge;
- perfmon_startCountersThread = perfmon_startCountersThread_ivybridge;
- perfmon_stopCountersThread = perfmon_stopCountersThread_ivybridge;
- perfmon_readCountersThread = perfmon_readCountersThread_ivybridge;
- perfmon_setupCounterThread = perfmon_setupCounterThread_ivybridge;
+ case ATOM_45:
+ case ATOM_32:
+ case ATOM_22:
+ case ATOM:
+ initThreadArch = perfmon_init_core2;
+ perfmon_startCountersThread = perfmon_startCountersThread_core2;
+ perfmon_stopCountersThread = perfmon_stopCountersThread_core2;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_core2;
+ perfmon_readCountersThread = perfmon_readCountersThread_core2;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_core2;
break;
- case HASWELL:
+ case ATOM_SILVERMONT_E:
+ case ATOM_SILVERMONT_C:
+ case ATOM_SILVERMONT_Z1:
+ case ATOM_SILVERMONT_Z2:
+ case ATOM_SILVERMONT_F:
+ case ATOM_SILVERMONT_AIR:
+ initialize_power = TRUE;
+ initialize_thermal = TRUE;
+ initThreadArch = perfmon_init_silvermont;
+ perfmon_startCountersThread = perfmon_startCountersThread_silvermont;
+ perfmon_stopCountersThread = perfmon_stopCountersThread_silvermont;
+ perfmon_setupCountersThread = perfmon_setupCountersThread_silvermont;
+ perfmon_readCountersThread = perfmon_readCountersThread_silvermont;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_silvermont;
+ break;
- case HASWELL_EX:
- case HASWELL_M1:
+ case CORE_DUO:
+ ERROR_PLAIN_PRINT(Unsupported Processor);
+ break;
- case HASWELL_M2:
+ case XEON_MP:
+ case CORE2_65:
+ case CORE2_45:
+ initThreadArch = perfmon_init_core2;
+ perfmon_startCountersThread = perfmon_startCountersThread_core2;
+ perfmon_stopCountersThread = perfmon_stopCountersThread_core2;
+ perfmon_readCountersThread = perfmon_readCountersThread_core2;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_core2;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_core2;
+ break;
- power_init(0); /* FIXME Static coreId is dangerous */
- thermal_init(0);
+ case NEHALEM_EX:
+ initThreadArch = perfmon_init_nehalemEX;
+ perfmon_startCountersThread = perfmon_startCountersThread_nehalemEX;
+ perfmon_stopCountersThread = perfmon_stopCountersThread_nehalemEX;
+ perfmon_readCountersThread = perfmon_readCountersThread_nehalemEX;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_nehalemEX;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_nehalemEX;
+ break;
- eventHash = haswell_arch_events;
- perfmon_numArchEvents = perfmon_numArchEventsHaswell;
+ case WESTMERE_EX:
+ initThreadArch = perfmon_init_westmereEX;
+ perfmon_startCountersThread = perfmon_startCountersThread_westmereEX;
+ perfmon_stopCountersThread = perfmon_stopCountersThread_westmereEX;
+ perfmon_readCountersThread = perfmon_readCountersThread_westmereEX;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_westmereEX;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_westmereEX;
+ break;
- group_map = haswell_group_map;
- group_help = haswell_group_help;
- perfmon_numGroups = perfmon_numGroupsHaswell;
+ case NEHALEM_BLOOMFIELD:
+ case NEHALEM_LYNNFIELD:
+ initialize_thermal = TRUE;
+ initThreadArch = perfmon_init_nehalem;
+ perfmon_startCountersThread = perfmon_startCountersThread_nehalem;
+ perfmon_stopCountersThread = perfmon_stopCountersThread_nehalem;
+ perfmon_readCountersThread = perfmon_readCountersThread_nehalem;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_nehalem;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_nehalem;
+ break;
- counter_map = haswell_counter_map;
- perfmon_numCounters = perfmon_numCountersHaswell;
+ case NEHALEM_WESTMERE_M:
+ case NEHALEM_WESTMERE:
+ initialize_thermal = TRUE;
+ initThreadArch = perfmon_init_nehalem;
+ perfmon_startCountersThread = perfmon_startCountersThread_nehalem;
+ perfmon_stopCountersThread = perfmon_stopCountersThread_nehalem;
+ perfmon_readCountersThread = perfmon_readCountersThread_nehalem;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_nehalem;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_nehalem;
+ break;
+
+ case IVYBRIDGE_EP:
+ case IVYBRIDGE:
+ initialize_power = TRUE;
+ initialize_thermal = TRUE;
+ initThreadArch = perfmon_init_ivybridge;
+ perfmon_startCountersThread = perfmon_startCountersThread_ivybridge;
+ perfmon_stopCountersThread = perfmon_stopCountersThread_ivybridge;
+ perfmon_readCountersThread = perfmon_readCountersThread_ivybridge;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_ivybridge;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_ivybridge;
+ break;
+ case HASWELL_EP:
+ case HASWELL:
+ case HASWELL_M1:
+ case HASWELL_M2:
+ initialize_power = TRUE;
+ initialize_thermal = TRUE;
initThreadArch = perfmon_init_haswell;
- printDerivedMetrics = perfmon_printDerivedMetricsHaswell;
- perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesHaswell;
- logDerivedMetrics = perfmon_logDerivedMetricsHaswell;
perfmon_startCountersThread = perfmon_startCountersThread_haswell;
perfmon_stopCountersThread = perfmon_stopCountersThread_haswell;
perfmon_readCountersThread = perfmon_readCountersThread_haswell;
- perfmon_setupCounterThread = perfmon_setupCounterThread_haswell;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_haswell;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_haswell;
break;
- case SANDYBRIDGE:
-
case SANDYBRIDGE_EP:
-
- power_init(0); /* FIXME Static coreId is dangerous */
- thermal_init(0);
- pci_init(socket_fd);
-
- eventHash = sandybridge_arch_events;
- perfmon_numArchEvents = perfmon_numArchEventsSandybridge;
-
- group_map = sandybridge_group_map;
- group_help = sandybridge_group_help;
- perfmon_numGroups = perfmon_numGroupsSandybridge;
-
- counter_map = sandybridge_counter_map;
- perfmon_numCounters = perfmon_numCountersSandybridge;
-
+ case SANDYBRIDGE:
+ initialize_power = TRUE;
+ initialize_thermal = TRUE;
initThreadArch = perfmon_init_sandybridge;
- printDerivedMetrics = perfmon_printDerivedMetricsSandybridge;
- perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesSandybridge;
- logDerivedMetrics = perfmon_logDerivedMetricsSandybridge;
perfmon_startCountersThread = perfmon_startCountersThread_sandybridge;
perfmon_stopCountersThread = perfmon_stopCountersThread_sandybridge;
perfmon_readCountersThread = perfmon_readCountersThread_sandybridge;
- perfmon_setupCounterThread = perfmon_setupCounterThread_sandybridge;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_sandybridge;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_sandybridge;
+ break;
+
+ case BROADWELL:
+ case BROADWELL_E:
+ case BROADWELL_D:
+ initialize_power = TRUE;
+ initialize_thermal = TRUE;
+ initThreadArch = perfmon_init_broadwell;
+ perfmon_startCountersThread = perfmon_startCountersThread_broadwell;
+ perfmon_stopCountersThread = perfmon_stopCountersThread_broadwell;
+ perfmon_readCountersThread = perfmon_readCountersThread_broadwell;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_broadwell;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_broadwell;
+ break;
+
+ case SKYLAKE1:
+ case SKYLAKE2:
+ initialize_power = TRUE;
+ initialize_thermal = TRUE;
+ initThreadArch = perfmon_init_skylake;
+ perfmon_startCountersThread = perfmon_startCountersThread_skylake;
+ perfmon_stopCountersThread = perfmon_stopCountersThread_skylake;
+ perfmon_readCountersThread = perfmon_readCountersThread_skylake;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_skylake;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_skylake;
break;
default:
@@ -1562,25 +1035,12 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
switch ( cpuid_info.model )
{
case XEON_PHI:
-
- eventHash = phi_arch_events;
- perfmon_numArchEvents = perfmon_numArchEventsPhi;
-
- group_map = phi_group_map;
- group_help = phi_group_help;
- perfmon_numGroups = perfmon_numGroupsPhi;
-
- counter_map = phi_counter_map;
- perfmon_numCounters = perfmon_numCountersPhi;
-
initThreadArch = perfmon_init_phi;
- printDerivedMetrics = perfmon_printDerivedMetricsPhi;
- perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesPhi;
- logDerivedMetrics = perfmon_logDerivedMetricsPhi;
perfmon_startCountersThread = perfmon_startCountersThread_phi;
perfmon_stopCountersThread = perfmon_stopCountersThread_phi;
perfmon_readCountersThread = perfmon_readCountersThread_phi;
- perfmon_setupCounterThread = perfmon_setupCounterThread_phi;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_phi;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_phi;
break;
default:
@@ -1590,115 +1050,1640 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
break;
case K8_FAMILY:
- eventHash = k8_arch_events;
- perfmon_numArchEvents = perfmon_numArchEventsK8;
-
- group_map = k8_group_map;
- group_help = k8_group_help;
- perfmon_numGroups = perfmon_numGroupsK8;
-
- counter_map = k10_counter_map;
- perfmon_numCounters = perfmon_numCountersK10;
-
initThreadArch = perfmon_init_k10;
- printDerivedMetrics = perfmon_printDerivedMetricsK8;
- perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesK8;
- logDerivedMetrics = perfmon_logDerivedMetricsK8;
perfmon_startCountersThread = perfmon_startCountersThread_k10;
perfmon_stopCountersThread = perfmon_stopCountersThread_k10;
perfmon_readCountersThread = perfmon_readCountersThread_k10;
- perfmon_setupCounterThread = perfmon_setupCounterThread_k10;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_k10;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_k10;
break;
case K10_FAMILY:
- eventHash = k10_arch_events;
- perfmon_numArchEvents = perfmon_numArchEventsK10;
-
- group_map = k10_group_map;
- group_help = k10_group_help;
- perfmon_numGroups = perfmon_numGroupsK10;
-
- counter_map = k10_counter_map;
- perfmon_numCounters = perfmon_numCountersK10;
-
initThreadArch = perfmon_init_k10;
- printDerivedMetrics = perfmon_printDerivedMetricsK10;
- perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesK10;
- logDerivedMetrics = perfmon_logDerivedMetricsK10;
perfmon_startCountersThread = perfmon_startCountersThread_k10;
perfmon_stopCountersThread = perfmon_stopCountersThread_k10;
perfmon_readCountersThread = perfmon_readCountersThread_k10;
- perfmon_setupCounterThread = perfmon_setupCounterThread_k10;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_k10;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_k10;
break;
case K15_FAMILY:
- eventHash = interlagos_arch_events;
- perfmon_numArchEvents = perfmon_numArchEventsInterlagos;
-
- group_map = interlagos_group_map;
- group_help = interlagos_group_help;
- perfmon_numGroups = perfmon_numGroupsInterlagos;
-
- counter_map = interlagos_counter_map;
- perfmon_numCounters = perfmon_numCountersInterlagos;
-
initThreadArch = perfmon_init_interlagos;
- printDerivedMetrics = perfmon_printDerivedMetricsInterlagos;
- perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesInterlagos;
- logDerivedMetrics = perfmon_logDerivedMetricsInterlagos;
perfmon_startCountersThread = perfmon_startCountersThread_interlagos;
perfmon_stopCountersThread = perfmon_stopCountersThread_interlagos;
perfmon_readCountersThread = perfmon_readCountersThread_interlagos;
- perfmon_setupCounterThread = perfmon_setupCounterThread_interlagos;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_interlagos;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_interlagos;
break;
case K16_FAMILY:
- eventHash = kabini_arch_events;
- perfmon_numArchEvents = perfmon_numArchEventsKabini;
-
- group_map = kabini_group_map;
- group_help = kabini_group_help;
- perfmon_numGroups = perfmon_numGroupsKabini;
-
- counter_map = kabini_counter_map;
- perfmon_numCounters = perfmon_numCountersKabini;
-
initThreadArch = perfmon_init_kabini;
- printDerivedMetrics = perfmon_printDerivedMetricsKabini;
- perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesKabini;
- logDerivedMetrics = perfmon_logDerivedMetricsKabini;
perfmon_startCountersThread = perfmon_startCountersThread_kabini;
perfmon_stopCountersThread = perfmon_stopCountersThread_kabini;
perfmon_readCountersThread = perfmon_readCountersThread_kabini;
- perfmon_setupCounterThread = perfmon_setupCounterThread_kabini;
+ perfmon_setupCountersThread = perfmon_setupCounterThread_kabini;
+ perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_kabini;
break;
default:
ERROR_PLAIN_PRINT(Unsupported Processor);
break;
}
-
-
- for (int i=0; i<perfmon_numThreads; i++)
- {
- initThread(i,threads[i]);
- }
+ *init_power = initialize_power;
+ *init_temp = initialize_thermal;
}
-void
-perfmon_finalize()
+
+int
+perfmon_init(int nrThreads, int threadsToCpu[])
{
int i;
+ int ret;
+ int initialize_power = FALSE;
+ int initialize_thermal = FALSE;
- free(perfmon_threadData);
+ if (perfmon_initialized == 1)
+ {
+ return 0;
+ }
- for (i=0; i<perfmon_numThreads; i++)
+ if (nrThreads <= 0)
{
- free(perfmon_threadState[i]);
+ ERROR_PRINT(Number of threads must be greater than 0 but only %d given,nrThreads);
+ return -EINVAL;
}
- free(perfmon_threadState);
- msr_finalize();
- pci_finalize();
- accessClient_finalize(socket_fd);
-}
+ if (!lock_check())
+ {
+ ERROR_PLAIN_PRINT(Access to performance monitoring registers locked);
+ return -EINVAL;
+ }
+
+ if ((cpuid_info.family == 0) && (cpuid_info.model == 0))
+ {
+        ERROR_PLAIN_PRINT(Topology module not initialized. Needed to determine current CPU type);
+ return -ENODEV;
+ }
+
+ /* Check threadsToCpu array if only valid cpu_ids are listed */
+ if (groupSet != NULL)
+ {
+ /* TODO: Decision whether setting new thread count and adjust processorIds
+ * or just exit like implemented now
+ */
+ return -EEXIST;
+ }
+
+ groupSet = (PerfmonGroupSet*) malloc(sizeof(PerfmonGroupSet));
+ if (groupSet == NULL)
+ {
+ ERROR_PLAIN_PRINT(Cannot allocate group descriptor);
+ return -ENOMEM;
+ }
+ groupSet->threads = (PerfmonThread*) malloc(nrThreads * sizeof(PerfmonThread));
+ if (groupSet->threads == NULL)
+ {
+ ERROR_PLAIN_PRINT(Cannot allocate set of threads);
+ free(groupSet);
+ return -ENOMEM;
+ }
+ groupSet->numberOfThreads = nrThreads;
+ groupSet->numberOfGroups = 0;
+ groupSet->numberOfActiveGroups = 0;
+ groupSet->groups = NULL;
+ groupSet->activeGroup = -1;
+
+ for(i=0; i<MAX_NUM_NODES; i++) socket_lock[i] = LOCK_INIT;
+ for(i=0; i<MAX_NUM_THREADS; i++) tile_lock[i] = LOCK_INIT;
+
+ /* Initialize maps pointer to current architecture maps */
+ perfmon_init_maps();
+
+ /* Initialize access interface */
+ ret = HPMinit();
+ if (ret)
+ {
+ ERROR_PLAIN_PRINT(Cannot set access functions);
+ free(groupSet->threads);
+ free(groupSet);
+ exit(EXIT_FAILURE);
+ return ret;
+ }
+ timer_init();
+
+
+ /* Initialize function pointer to current architecture functions */
+ perfmon_init_funcs(&initialize_power, &initialize_thermal);
+
+ /* Store thread information and reset counters for processor*/
+ /* If the arch supports it, initialize power and thermal measurements */
+ for(i=0;i<nrThreads;i++)
+ {
+ if (HPMaddThread(threadsToCpu[i]) != 0)
+ {
+ ERROR_PLAIN_PRINT(Cannot get access to performance counters);
+ }
+ groupSet->threads[i].thread_id = i;
+ groupSet->threads[i].processorId = threadsToCpu[i];
+
+ if (HPMcheck(MSR_DEV, threadsToCpu[i]) == 0)
+ {
+ fprintf(stderr, "Cannot get access to MSRs. Please check permissions to the MSRs\n");
+ exit(EXIT_FAILURE);
+ }
+ if (initialize_power == TRUE)
+ {
+ power_init(threadsToCpu[i]);
+ }
+ if (initialize_thermal == TRUE)
+ {
+ thermal_init(threadsToCpu[i]);
+ }
+ initThreadArch(threadsToCpu[i]);
+ }
+ perfmon_initialized = 1;
+ return 0;
+}
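
To put the refactored initialization in context, a minimal usage sketch of the perfmon API as it now appears in this file (error handling omitted; the event string and CPU list are examples, and likwid.h is assumed to export these prototypes):

    #include <likwid.h>

    int cpus[2] = {0, 1};
    perfmon_init(2, cpus);                                     /* 0 on success */
    int gid = perfmon_addEventSet("INSTR_RETIRED_ANY:FIXC0");  /* returns group id */
    perfmon_setupCounters(gid);
    perfmon_startCounters();
    /* ... region of interest ... */
    perfmon_stopCounters();
    double instr = perfmon_getResult(gid, 0, 0);               /* event 0, thread 0 */
    perfmon_finalize();
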
+
+void
+perfmon_finalize(void)
+{
+ int group, event;
+ int thread;
+ if (perfmon_initialized == 0)
+ {
+ return;
+ }
+ if (groupSet == NULL)
+ {
+ return;
+ }
+ for(group=0;group < groupSet->numberOfActiveGroups; group++)
+ {
+
+ for (thread=0;thread< groupSet->numberOfThreads; thread++)
+ {
+ perfmon_finalizeCountersThread(thread, &(groupSet->groups[group]));
+ }
+ for (event=0;event < groupSet->groups[group].numberOfEvents; event++)
+ {
+ if (groupSet->groups[group].events[event].threadCounter)
+ free(groupSet->groups[group].events[event].threadCounter);
+ }
+ if (groupSet->groups[group].events != NULL)
+ free(groupSet->groups[group].events);
+ perfmon_delEventSet(group);
+ groupSet->groups[group].state = STATE_NONE;
+ }
+ if (groupSet->groups != NULL)
+ free(groupSet->groups);
+ if (groupSet->threads != NULL)
+ free(groupSet->threads);
+ groupSet->activeGroup = -1;
+ if (groupSet)
+ free(groupSet);
+ for (group=0; group < MAX_NUM_THREADS; group++)
+ {
+ memset(currentConfig[group], 0, NUM_PMC * sizeof(uint64_t));
+ }
+ if (markerResults != NULL)
+ {
+ perfmon_destroyMarkerResults();
+ }
+ power_finalize();
+ HPMfinalize();
+ perfmon_initialized = 0;
+ groupSet = NULL;
+ return;
+}
+
+int
+perfmon_addEventSet(char* eventCString)
+{
+ int i, j, err;
+ bstring eventBString;
+ struct bstrList* eventtokens;
+ PerfmonEventSet* eventSet;
+ PerfmonEventSetEntry* event;
+ char* cstringcopy;
+ Configuration_t config;
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ config = get_configuration();
+
+ if (eventCString == NULL)
+ {
+ DEBUG_PLAIN_PRINT(DEBUGLEV_INFO, Event string is empty. Trying environment variable LIKWID_EVENTS);
+ eventCString = getenv("LIKWID_EVENTS");
+ if (eventCString == NULL)
+ {
+            ERROR_PLAIN_PRINT(Cannot read event string. Environment variable LIKWID_EVENTS is not set either);
+ return -EINVAL;
+ }
+ }
+
+ if (strchr(eventCString, '-') != NULL)
+ {
+ ERROR_PLAIN_PRINT(Event string contains invalid character -);
+ return -EINVAL;
+ }
+ if (strchr(eventCString, '.') != NULL)
+ {
+ ERROR_PLAIN_PRINT(Event string contains invalid character .);
+ return -EINVAL;
+ }
+ if (groupSet->numberOfActiveGroups == 0)
+ {
+ groupSet->groups = (PerfmonEventSet*) malloc(sizeof(PerfmonEventSet));
+ if (groupSet->groups == NULL)
+ {
+            ERROR_PLAIN_PRINT(Cannot allocate initial event group list);
+ return -ENOMEM;
+ }
+ groupSet->numberOfGroups = 1;
+ groupSet->numberOfActiveGroups = 0;
+ groupSet->activeGroup = -1;
+
+ /* Only one group exists by now */
+ groupSet->groups[0].rdtscTime = 0;
+ groupSet->groups[0].runTime = 0;
+ groupSet->groups[0].numberOfEvents = 0;
+ }
+
+ if ((groupSet->numberOfActiveGroups > 0) && (groupSet->numberOfActiveGroups == groupSet->numberOfGroups))
+ {
+ groupSet->numberOfGroups++;
+ groupSet->groups = (PerfmonEventSet*)realloc(groupSet->groups, groupSet->numberOfGroups*sizeof(PerfmonEventSet));
+ if (groupSet->groups == NULL)
+ {
+ ERROR_PLAIN_PRINT(Cannot allocate additional group);
+ return -ENOMEM;
+ }
+ groupSet->groups[groupSet->numberOfActiveGroups].rdtscTime = 0;
+ groupSet->groups[groupSet->numberOfActiveGroups].runTime = 0;
+ groupSet->groups[groupSet->numberOfActiveGroups].numberOfEvents = 0;
+ DEBUG_PLAIN_PRINT(DEBUGLEV_INFO, Allocating new group structure for group.);
+ }
+    DEBUG_PRINT(DEBUGLEV_INFO, Currently %d of %d groups active,
+ groupSet->numberOfActiveGroups+1,
+ groupSet->numberOfGroups+1);
+
+ if (strchr(eventCString, ':') == NULL)
+ {
+ err = read_group(config->groupPath, cpuid_info.short_name,
+ eventCString,
+ &groupSet->groups[groupSet->numberOfActiveGroups].group);
+ if (err)
+ {
+ ERROR_PRINT(Cannot read performance group %s, eventCString);
+ return err;
+ }
+ }
+ else
+ {
+ err = custom_group(eventCString, &groupSet->groups[groupSet->numberOfActiveGroups].group);
+ if (err)
+ {
+ ERROR_PRINT(Cannot transform %s to performance group, eventCString);
+ return err;
+ }
+ }
+ char * evstr = get_eventStr(&groupSet->groups[groupSet->numberOfActiveGroups].group);
+ eventBString = bfromcstr(evstr);
+ eventtokens = bsplit(eventBString,',');
+ free(evstr);
+ bdestroy(eventBString);
+
+ eventSet = &(groupSet->groups[groupSet->numberOfActiveGroups]);
+ eventSet->events = (PerfmonEventSetEntry*) malloc(eventtokens->qty * sizeof(PerfmonEventSetEntry));
+ if (eventSet->events == NULL)
+ {
+ ERROR_PRINT(Cannot allocate event list for group %d\n, groupSet->numberOfActiveGroups);
+ return -ENOMEM;
+ }
+ eventSet->numberOfEvents = 0;
+ eventSet->regTypeMask = 0x0ULL;
+
+
+ int forceOverwrite = 0;
+ if (getenv("LIKWID_FORCE") != NULL)
+ {
+ forceOverwrite = atoi(getenv("LIKWID_FORCE"));
+ }
+ for(i=0;i<eventtokens->qty;i++)
+ {
+ event = &(eventSet->events[i]);
+ struct bstrList* subtokens = bsplit(eventtokens->entry[i],':');
+ if (subtokens->qty < 2)
+ {
+ ERROR_PRINT(Cannot parse event descriptor %s, bdata(eventtokens->entry[i]));
+ bstrListDestroy(subtokens);
+ continue;
+ }
+ else
+ {
+ if (!getIndexAndType(subtokens->entry[1], &event->index, &event->type, forceOverwrite))
+ {
+ DEBUG_PRINT(DEBUGLEV_INFO, Counter register %s not supported or PCI device not available,
+ bdata(subtokens->entry[1]));
+ event->type = NOTYPE;
+ goto past_checks;
+ }
+
+ if (!getEvent(subtokens->entry[0], subtokens->entry[1], &event->event))
+ {
+ DEBUG_PRINT(DEBUGLEV_INFO, Event %s not found for current architecture,
+ bdata(subtokens->entry[0]));
+ event->type = NOTYPE;
+ goto past_checks;
+ }
+
+ if (!checkCounter(subtokens->entry[1], event->event.limit))
+ {
+ DEBUG_PRINT(DEBUGLEV_INFO, Register %s not allowed for event %s,
+ bdata(subtokens->entry[1]),bdata(subtokens->entry[0]));
+ event->type = NOTYPE;
+ goto past_checks;
+ }
+ if (parseOptions(subtokens, &event->event, event->index) < 0)
+ {
+ DEBUG_PRINT(DEBUGLEV_INFO, Cannot parse options in %s, bdata(eventtokens->entry[i]));
+ event->type = NOTYPE;
+ goto past_checks;
+ }
+
+ eventSet->regTypeMask |= REG_TYPE_MASK(event->type);
+past_checks:
+ event->threadCounter = (PerfmonCounter*) malloc(
+ groupSet->numberOfThreads * sizeof(PerfmonCounter));
+
+ if (event->threadCounter == NULL)
+ {
+ ERROR_PRINT(Cannot allocate counter for all threads in group %d,groupSet->numberOfActiveGroups);
+ //bstrListDestroy(subtokens);
+ continue;
+ }
+ for(j=0;j<groupSet->numberOfThreads;j++)
+ {
+ event->threadCounter[j].counterData = 0;
+ event->threadCounter[j].startData = 0;
+ event->threadCounter[j].fullResult = 0.0;
+ event->threadCounter[j].lastResult = 0.0;
+ event->threadCounter[j].overflows = 0;
+ event->threadCounter[j].init = FALSE;
+ }
+ eventSet->numberOfEvents++;
+
+ if (event->type != NOTYPE)
+ {
+ DEBUG_PRINT(DEBUGLEV_INFO,
+ Added event %s for counter %s to group %d,
+ event->event.name,
+ counter_map[event->index].key,
+ groupSet->numberOfActiveGroups);
+ }
+ }
+ bstrListDestroy(subtokens);
+ }
+ bstrListDestroy(eventtokens);
+ if ((eventSet->numberOfEvents > 0) && (eventSet->regTypeMask != 0x0ULL))
+ {
+ eventSet->state = STATE_NONE;
+ groupSet->numberOfActiveGroups++;
+ return groupSet->numberOfActiveGroups-1;
+ }
+ else
+ {
+ fprintf(stderr,"No event in given event string can be configured\n");
+ return -EINVAL;
+ }
+}
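
As a usage note on the two accepted forms of the event string (the group name below is only an example; which groups exist depends on the files under config->groupPath):

    /* A string without ':' is treated as a performance group and resolved
     * through read_group(); a string with ':' is parsed as a custom,
     * comma separated event set. */
    perfmon_addEventSet("BRANCH");
    perfmon_addEventSet("INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1");
    /* '-' and '.' are rejected, and LIKWID_EVENTS is consulted when the
     * argument is NULL. */
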
+
+void
+perfmon_delEventSet(int groupID)
+{
+ if (groupID >= groupSet->numberOfGroups || groupID < 0)
+ return;
+ return_group(&groupSet->groups[groupID].group);
+ return;
+}
+
+int
+__perfmon_setupCountersThread(int thread_id, int groupId)
+{
+ int i;
+ if (groupId >= groupSet->numberOfActiveGroups)
+ {
+ ERROR_PRINT(Group %d does not exist in groupSet, groupId);
+ return -ENOENT;
+ }
+
+ CHECK_AND_RETURN_ERROR(perfmon_setupCountersThread(thread_id, &groupSet->groups[groupId]),
+ Setup of counters failed);
+
+ groupSet->activeGroup = groupId;
+ return 0;
+}
+
+int
+perfmon_setupCounters(int groupId)
+{
+ int i;
+ int ret = 0;
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (unlikely(groupSet == NULL))
+ {
+ return -EINVAL;
+ }
+ if (groupId >= groupSet->numberOfActiveGroups)
+ {
+ ERROR_PRINT(Group %d does not exist in groupSet, groupId);
+ return -ENOENT;
+ }
+
+ for(i=0;i<groupSet->numberOfThreads;i++)
+ {
+ ret = __perfmon_setupCountersThread(groupSet->threads[i].thread_id, groupId);
+ if (ret != 0)
+ {
+ return ret;
+ }
+ }
+ groupSet->groups[groupId].state = STATE_SETUP;
+ return 0;
+}
+
+int
+__perfmon_startCounters(int groupId)
+{
+ int i = 0;
+ int ret = 0;
+ if (groupSet->groups[groupId].state != STATE_SETUP)
+ {
+ return -EINVAL;
+ }
+ for(;i<groupSet->numberOfThreads;i++)
+ {
+ ret = perfmon_startCountersThread(groupSet->threads[i].thread_id, &groupSet->groups[groupId]);
+ if (ret)
+ {
+ return -groupSet->threads[i].thread_id-1;
+ }
+ }
+ groupSet->groups[groupId].state = STATE_START;
+ timer_start(&groupSet->groups[groupId].timer);
+ return 0;
+}
+
+int perfmon_startCounters(void)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (unlikely(groupSet == NULL))
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (groupSet->activeGroup < 0)
+ {
+ ERROR_PLAIN_PRINT(Cannot find group to start);
+ return -EINVAL;
+ }
+ return __perfmon_startCounters(groupSet->activeGroup);
+}
+
+int perfmon_startGroupCounters(int groupId)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (unlikely(groupSet == NULL))
+ {
+ return -EINVAL;
+ }
+ if (((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups)) && (groupSet->activeGroup >= 0))
+ {
+ groupId = groupSet->activeGroup;
+ }
+ else
+ {
+ ERROR_PLAIN_PRINT(Cannot find group to start);
+ return -EINVAL;
+ }
+ return __perfmon_startCounters(groupId);
+}
+
+int
+__perfmon_stopCounters(int groupId)
+{
+ int i = 0;
+ int j = 0;
+ int ret = 0;
+ double result = 0.0;
+
+ timer_stop(&groupSet->groups[groupId].timer);
+
+ for (i = 0; i<groupSet->numberOfThreads; i++)
+ {
+ ret = perfmon_stopCountersThread(groupSet->threads[i].thread_id, &groupSet->groups[groupId]);
+ if (ret)
+ {
+ return -groupSet->threads[i].thread_id-1;
+ }
+ }
+
+ for (i=0; i<perfmon_getNumberOfEvents(groupId); i++)
+ {
+ for (j=0; j<perfmon_getNumberOfThreads(); j++)
+ {
+ result = calculateResult(groupId, i, j);
+ groupSet->groups[groupId].events[i].threadCounter[j].lastResult = result;
+ groupSet->groups[groupId].events[i].threadCounter[j].fullResult += result;
+ }
+ }
+ groupSet->groups[groupId].state = STATE_SETUP;
+ groupSet->groups[groupId].rdtscTime =
+ timer_print(&groupSet->groups[groupId].timer);
+ groupSet->groups[groupId].runTime += groupSet->groups[groupId].rdtscTime;
+ return 0;
+}
+
+int perfmon_stopCounters(void)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (unlikely(groupSet == NULL))
+ {
+ return -EINVAL;
+ }
+ if (groupSet->activeGroup < 0)
+ {
+        ERROR_PLAIN_PRINT(Cannot find group to stop);
+ return -EINVAL;
+ }
+ if (groupSet->groups[groupSet->activeGroup].state != STATE_START)
+ {
+ return -EINVAL;
+ }
+ return __perfmon_stopCounters(groupSet->activeGroup);
+}
+
+int perfmon_stopGroupCounters(int groupId)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (unlikely(groupSet == NULL))
+ {
+ return -EINVAL;
+ }
+ if (((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups)) && (groupSet->activeGroup >= 0))
+ {
+ groupId = groupSet->activeGroup;
+ }
+ else
+ {
+ ERROR_PLAIN_PRINT(Cannot find group to start);
+ return -EINVAL;
+ }
+ if (groupSet->groups[groupId].state != STATE_START)
+ {
+ return -EINVAL;
+ }
+ return __perfmon_stopCounters(groupId);
+}
+
+int
+__perfmon_readCounters(int groupId, int threadId)
+{
+ int ret = 0;
+ int i = 0, j = 0;
+ double result = 0.0;
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups)) && (groupSet->activeGroup >= 0))
+ {
+ groupId = groupSet->activeGroup;
+ }
+ if (groupSet->groups[groupId].state != STATE_START)
+ {
+ return -EINVAL;
+ }
+ timer_stop(&groupSet->groups[groupId].timer);
+ groupSet->groups[groupId].rdtscTime = timer_print(&groupSet->groups[groupId].timer);
+ groupSet->groups[groupId].runTime += groupSet->groups[groupId].rdtscTime;
+ if (threadId == -1)
+ {
+ for (threadId = 0; threadId<groupSet->numberOfThreads; threadId++)
+ {
+ ret = perfmon_readCountersThread(threadId, &groupSet->groups[groupId]);
+ if (ret)
+ {
+ return -threadId-1;
+ }
+ for (j=0; j < groupSet->groups[groupId].numberOfEvents; j++)
+ {
+ result = calculateResult(groupId, j, threadId);
+ groupSet->groups[groupId].events[j].threadCounter[threadId].lastResult = result;
+ groupSet->groups[groupId].events[j].threadCounter[threadId].fullResult += result;
+ }
+ }
+ }
+ else if ((threadId >= 0) && (threadId < groupSet->numberOfThreads))
+ {
+ ret = perfmon_readCountersThread(threadId, &groupSet->groups[groupId]);
+ if (ret)
+ {
+ return -threadId-1;
+ }
+ for (j=0; j < groupSet->groups[groupId].numberOfEvents; j++)
+ {
+ result = calculateResult(groupId, j, threadId);
+ groupSet->groups[groupId].events[j].threadCounter[threadId].lastResult = result;
+ groupSet->groups[groupId].events[j].threadCounter[threadId].fullResult += result;
+ }
+}
+ timer_start(&groupSet->groups[groupId].timer);
+ return 0;
+}
+
+int perfmon_readCounters(void)
+{
+ return __perfmon_readCounters(-1,-1);
+}
+
+int perfmon_readCountersCpu(int cpu_id)
+{
+ int i;
+ int thread_id = -1;
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ for(i=0;i<groupSet->numberOfThreads;i++)
+ {
+ if (groupSet->threads[i].processorId == cpu_id)
+ {
+ thread_id = groupSet->threads[i].thread_id;
+ break;
+ }
+ }
+ if (thread_id < 0)
+ {
+ ERROR_PRINT(Failed to read counters for CPU %d, cpu_id);
+ return -EINVAL;
+ }
+ i = __perfmon_readCounters(groupSet->activeGroup, thread_id);
+ return i;
+}
+
+int perfmon_readGroupCounters(int groupId)
+{
+ return __perfmon_readCounters(groupId, -1);
+}
+int perfmon_readGroupThreadCounters(int groupId, int threadId)
+{
+ return __perfmon_readCounters(groupId, threadId);
+}
+
+
+double
+perfmon_getResult(int groupId, int eventId, int threadId)
+{
+ if (unlikely(groupSet == NULL))
+ {
+ return 0;
+ }
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return 0;
+ }
+ if (groupSet->numberOfActiveGroups == 0)
+ {
+ return 0;
+ }
+ if ((groupId < 0) && (groupSet->activeGroup >= 0))
+ {
+ groupId = groupSet->activeGroup;
+ }
+ if (eventId >= groupSet->groups[groupId].numberOfEvents)
+ {
+ printf("ERROR: EventID greater than defined events\n");
+ return 0;
+ }
+ if (threadId >= groupSet->numberOfThreads)
+ {
+ printf("ERROR: ThreadID greater than defined threads\n");
+ return 0;
+ }
+
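+ /* prefer the result accumulated over all stop/read cycles; fall back to the
+ last single measurement if nothing has been accumulated yet */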
+ if (groupSet->groups[groupId].events[eventId].threadCounter[threadId].fullResult == 0)
+ {
+ return groupSet->groups[groupId].events[eventId].threadCounter[threadId].lastResult;
+ }
+ return groupSet->groups[groupId].events[eventId].threadCounter[threadId].fullResult;
+}
+
+double
+perfmon_getLastResult(int groupId, int eventId, int threadId)
+{
+ if (unlikely(groupSet == NULL))
+ {
+ return 0;
+ }
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return 0;
+ }
+ if (groupSet->numberOfActiveGroups == 0)
+ {
+ return 0;
+ }
+ if ((groupId < 0) && (groupSet->activeGroup >= 0))
+ {
+ groupId = groupSet->activeGroup;
+ }
+ if (eventId >= groupSet->groups[groupId].numberOfEvents)
+ {
+ printf("ERROR: EventID greater than defined events\n");
+ return 0;
+ }
+ if (threadId >= groupSet->numberOfThreads)
+ {
+ printf("ERROR: ThreadID greater than defined threads\n");
+ return 0;
+ }
+
+ return groupSet->groups[groupId].events[eventId].threadCounter[threadId].lastResult;
+}
+
+double
+perfmon_getMetric(int groupId, int metricId, int threadId)
+{
+ int e = 0;
+ double result = 0;
+ CounterList clist;
+ if (unlikely(groupSet == NULL))
+ {
+ return 0;
+ }
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return 0;
+ }
+ if (groupSet->numberOfActiveGroups == 0)
+ {
+ return 0;
+ }
+ if ((groupId < 0) && (groupSet->activeGroup >= 0))
+ {
+ groupId = groupSet->activeGroup;
+ }
+ if (groupSet->groups[groupId].group.nmetrics == 0)
+ {
+ return 0.0;
+ }
+ if ((metricId < 0) || (metricId >= groupSet->groups[groupId].group.nmetrics))
+ {
+ return 0.0;
+ }
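+ /* collect each raw event result under its counter register name, add the
+ derived entries "time" and "inverseClock" and evaluate the metric formula
+ against this counter list */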
+ timer_init();
+ init_clist(&clist);
+ for (e=0;e<groupSet->groups[groupId].numberOfEvents;e++)
+ {
+ add_to_clist(&clist,groupSet->groups[groupId].group.counters[e],
+ perfmon_getResult(groupId, e, threadId));
+ }
+ add_to_clist(&clist, "time", perfmon_getLastTimeOfGroup(groupId));
+ add_to_clist(&clist, "inverseClock", 1.0/timer_getCycleClock());
+ e = calc_metric(groupSet->groups[groupId].group.metricformulas[metricId], &clist, &result);
+ if (e < 0)
+ {
+ result = 0.0;
+ ERROR_PRINT(Cannot calculate formula %s, groupSet->groups[groupId].group.metricformulas[metricId]);
+ }
+ destroy_clist(&clist);
+ return result;
+}
+
+double
+perfmon_getLastMetric(int groupId, int metricId, int threadId)
+{
+ int e = 0;
+ double result = 0;
+ CounterList clist;
+ if (unlikely(groupSet == NULL))
+ {
+ return 0;
+ }
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return 0;
+ }
+ if (groupSet->numberOfActiveGroups == 0)
+ {
+ return 0;
+ }
+ if ((groupId < 0) && (groupSet->activeGroup >= 0))
+ {
+ groupId = groupSet->activeGroup;
+ }
+ if (groupSet->groups[groupId].group.nmetrics == 0)
+ {
+ return 0.0;
+ }
+ if ((metricId < 0) || (metricId >= groupSet->groups[groupId].group.nmetrics))
+ {
+ return 0.0;
+ }
+ timer_init();
+ init_clist(&clist);
+ for (e=0;e<groupSet->groups[groupId].numberOfEvents;e++)
+ {
+ add_to_clist(&clist,groupSet->groups[groupId].group.counters[e],
+ perfmon_getLastResult(groupId, e, threadId));
+ }
+ add_to_clist(&clist, "time", perfmon_getLastTimeOfGroup(groupId));
+ add_to_clist(&clist, "inverseClock", 1.0/timer_getCycleClock());
+ e = calc_metric(groupSet->groups[groupId].group.metricformulas[metricId], &clist, &result);
+ if (e < 0)
+ {
+ result = 0.0;
+ ERROR_PRINT(Cannot calculate formula %s, groupSet->groups[groupId].group.metricformulas[metricId]);
+ }
+ destroy_clist(&clist);
+ return result;
+}
+
+
+int __perfmon_switchActiveGroupThread(int thread_id, int new_group)
+{
+ int ret = 0;
+ int i = 0;
+ GroupState state;
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+
+ timer_stop(&groupSet->groups[groupSet->activeGroup].timer);
+ groupSet->groups[groupSet->activeGroup].rdtscTime =
+ timer_print(&groupSet->groups[groupSet->activeGroup].timer);
+ groupSet->groups[groupSet->activeGroup].runTime += groupSet->groups[groupSet->activeGroup].rdtscTime;
+ state = groupSet->groups[groupSet->activeGroup].state;
+
+ if (state == STATE_START)
+ {
+ ret = perfmon_stopCounters();
+ }
+
+ if (state == STATE_SETUP)
+ {
+ for(i=0; i<groupSet->groups[groupSet->activeGroup].numberOfEvents;i++)
+ {
+ groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].init = FALSE;
+ }
+ }
+ ret = perfmon_setupCounters(new_group);
+ if (ret != 0)
+ {
+ return ret;
+ }
+ if (groupSet->groups[groupSet->activeGroup].state == STATE_SETUP)
+ {
+ ret = perfmon_startCounters();
+ if (ret != 0)
+ {
+ return ret;
+ }
+ }
+ return 0;
+}
+
+int
+perfmon_switchActiveGroup(int new_group)
+{
+ int i = 0;
+ int ret = 0;
+ for(i=0;i<groupSet->numberOfThreads;i++)
+ {
+ ret = __perfmon_switchActiveGroupThread(groupSet->threads[i].thread_id, new_group);
+ if (ret != 0)
+ {
+ return ret;
+ }
+ }
+ return 0;
+}
+
+int
+perfmon_getNumberOfGroups(void)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ return groupSet->numberOfActiveGroups;
+}
+
+int
+perfmon_getIdOfActiveGroup(void)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ return groupSet->activeGroup;
+}
+
+int
+perfmon_getNumberOfThreads(void)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ return groupSet->numberOfThreads;
+}
+
+int
+perfmon_getNumberOfEvents(int groupId)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (groupId < 0)
+ {
+ groupId = groupSet->activeGroup;
+ }
+ return groupSet->groups[groupId].numberOfEvents;
+}
+
+double
+perfmon_getTimeOfGroup(int groupId)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (groupId < 0)
+ {
+ groupId = groupSet->activeGroup;
+ }
+ return groupSet->groups[groupId].runTime;
+}
+
+double
+perfmon_getLastTimeOfGroup(int groupId)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (groupId < 0)
+ {
+ groupId = groupSet->activeGroup;
+ }
+ return groupSet->groups[groupId].rdtscTime;
+}
+
+uint64_t
+perfmon_getMaxCounterValue(RegisterType type)
+{
+ int width = 48;
+ uint64_t tmp = 0x0ULL;
+ if (box_map && (box_map[type].regWidth > 0))
+ {
+ width = box_map[type].regWidth;
+ }
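+ /* set the lowest 'width' bits, i.e. 2^width - 1, the largest raw value the
+ counter register can hold */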
+ for(int i=0;i<width;i++)
+ {
+ tmp |= (1ULL<<i);
+ }
+ return tmp;
+}
+
+char* perfmon_getEventName(int groupId, int eventId)
+{
+ if (unlikely(groupSet == NULL))
+ {
+ return NULL;
+ }
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return NULL;
+ }
+ if (groupSet->numberOfActiveGroups == 0)
+ {
+ return NULL;
+ }
+ if ((groupId < 0) && (groupSet->activeGroup >= 0))
+ {
+ groupId = groupSet->activeGroup;
+ }
+ if ((groupSet->groups[groupId].group.nevents == 0) ||
+ (eventId >= groupSet->groups[groupId].group.nevents))
+ {
+ return NULL;
+ }
+ return groupSet->groups[groupId].group.events[eventId];
+}
+
+char* perfmon_getCounterName(int groupId, int eventId)
+{
+ if (unlikely(groupSet == NULL))
+ {
+ return NULL;
+ }
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return NULL;
+ }
+ if (groupSet->numberOfActiveGroups == 0)
+ {
+ return NULL;
+ }
+ if ((groupId < 0) && (groupSet->activeGroup >= 0))
+ {
+ groupId = groupSet->activeGroup;
+ }
+ if ((groupSet->groups[groupId].group.nevents == 0) ||
+ (eventId >= groupSet->groups[groupId].group.nevents))
+ {
+ return NULL;
+ }
+ return groupSet->groups[groupId].group.counters[eventId];
+}
+
+char* perfmon_getMetricName(int groupId, int metricId)
+{
+ if (unlikely(groupSet == NULL))
+ {
+ return NULL;
+ }
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return NULL;
+ }
+ if (groupSet->numberOfActiveGroups == 0)
+ {
+ return NULL;
+ }
+ if ((groupId < 0) && (groupSet->activeGroup >= 0))
+ {
+ groupId = groupSet->activeGroup;
+ }
+ if ((metricId < 0) || (metricId >= groupSet->groups[groupId].group.nmetrics))
+ {
+ return NULL;
+ }
+ return groupSet->groups[groupId].group.metricnames[metricId];
+}
+
+char* perfmon_getGroupName(int groupId)
+{
+ if (unlikely(groupSet == NULL))
+ {
+ return NULL;
+ }
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return NULL;
+ }
+ if (groupSet->numberOfActiveGroups == 0)
+ {
+ return NULL;
+ }
+ if ((groupId < 0) && (groupSet->activeGroup >= 0))
+ {
+ groupId = groupSet->activeGroup;
+ }
+ return groupSet->groups[groupId].group.groupname;
+}
+
+char* perfmon_getGroupInfoShort(int groupId)
+{
+ if (unlikely(groupSet == NULL))
+ {
+ return NULL;
+ }
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return NULL;
+ }
+ if (groupSet->numberOfActiveGroups == 0)
+ {
+ return NULL;
+ }
+ if ((groupId < 0) && (groupSet->activeGroup >= 0))
+ {
+ groupId = groupSet->activeGroup;
+ }
+ return groupSet->groups[groupId].group.shortinfo;
+}
+
+char* perfmon_getGroupInfoLong(int groupId)
+{
+ if (unlikely(groupSet == NULL))
+ {
+ return NULL;
+ }
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return NULL;
+ }
+ if (groupSet->numberOfActiveGroups == 0)
+ {
+ return NULL;
+ }
+ if ((groupId < 0) && (groupSet->activeGroup >= 0))
+ {
+ groupId = groupSet->activeGroup;
+ }
+ return groupSet->groups[groupId].group.longinfo;
+}
+
+int perfmon_getGroups(char*** groups, char*** shortinfos, char*** longinfos)
+{
+ int ret = 0;
+ init_configuration();
+ Configuration_t config = get_configuration();
+ ret = get_groups(config->groupPath, cpuid_info.short_name, groups, shortinfos, longinfos);
+ return ret;
+}
+
+void perfmon_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos)
+{
+ return_groups(nrgroups, groups, shortinfos, longinfos);
+}
+
+int perfmon_getNumberOfMetrics(int groupId)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (groupId < 0)
+ {
+ groupId = groupSet->activeGroup;
+ }
+ return groupSet->groups[groupId].group.nmetrics;
+}
+
+void perfmon_printMarkerResults()
+{
+ int i = 0, j = 0, k = 0;
+ for (i=0; i<markerRegions; i++)
+ {
+ printf("Region %d : %s\n", i, bdata(markerResults[i].tag));
+ printf("Group %d\n", markerResults[i].groupID);
+ for (j=0;j<markerResults[i].threadCount; j++)
+ {
+ printf("Thread %d on CPU %d\n", j, markerResults[i].cpulist[j]);
+ printf("\t Measurement time %f sec\n", markerResults[i].time[j]);
+ printf("\t Call count %d\n", markerResults[i].count[j]);
+ for(k=0;k<markerResults[i].eventCount;k++)
+ {
+ printf("\t Event %d : %f\n", k, markerResults[i].counters[j][k]);
+ }
+ }
+ }
+}
+
+int perfmon_getNumberOfRegions()
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (markerResults == NULL)
+ {
+ return 0;
+ }
+ return markerRegions;
+}
+
+
+int perfmon_getGroupOfRegion(int region)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (region < 0 || region >= markerRegions)
+ {
+ return -EINVAL;
+ }
+ if (markerResults == NULL)
+ {
+ return 0;
+ }
+ return markerResults[region].groupID;
+}
+
+char* perfmon_getTagOfRegion(int region)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return NULL;
+ }
+ if (region < 0 || region >= markerRegions)
+ {
+ return NULL;
+ }
+ if (markerResults == NULL)
+ {
+ return NULL;
+ }
+ return bdata(markerResults[region].tag);
+}
+
+
+int perfmon_getEventsOfRegion(int region)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (region < 0 || region >= markerRegions)
+ {
+ return -EINVAL;
+ }
+ if (markerResults == NULL)
+ {
+ return 0;
+ }
+ return markerResults[region].eventCount;
+}
+
+int perfmon_getMetricsOfRegion(int region)
+{
+
+ if (region < 0 || region >= markerRegions)
+ {
+ return -EINVAL;
+ }
+ if (markerResults == NULL)
+ {
+ return 0;
+ }
+ return perfmon_getNumberOfMetrics(markerResults[region].groupID);
+}
+
+
+int perfmon_getThreadsOfRegion(int region)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (region < 0 || region >= markerRegions)
+ {
+ return -EINVAL;
+ }
+ if (markerResults == NULL)
+ {
+ return 0;
+ }
+ return markerResults[region].threadCount;
+}
+
+int perfmon_getCpulistOfRegion(int region, int count, int* cpulist)
+{
+ int i;
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (region < 0 || region >= markerRegions)
+ {
+ return -EINVAL;
+ }
+ if (markerResults == NULL)
+ {
+ return 0;
+ }
+ if (cpulist == NULL)
+ {
+ return -EINVAL;
+ }
+ for (i=0; i< MIN(count, markerResults[region].threadCount); i++)
+ {
+ cpulist[i] = markerResults[region].cpulist[i];
+ }
+ return MIN(count, markerResults[region].threadCount);
+}
+
+
+double perfmon_getTimeOfRegion(int region, int thread)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (region < 0 || region >= markerRegions)
+ {
+ return -EINVAL;
+ }
+ if (thread < 0 || thread >= groupSet->numberOfThreads)
+ {
+ return -EINVAL;
+ }
+ if (markerResults == NULL || markerResults[region].time == NULL)
+ {
+ return 0.0;
+ }
+ return markerResults[region].time[thread];
+}
+
+int perfmon_getCountOfRegion(int region, int thread)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (region < 0 || region >= markerRegions)
+ {
+ return -EINVAL;
+ }
+ if (thread < 0 || thread >= groupSet->numberOfThreads)
+ {
+ return -EINVAL;
+ }
+ if (markerResults == NULL || markerResults[region].count == NULL)
+ {
+ return 0;
+ }
+ return markerResults[region].count[thread];
+}
+
+double perfmon_getResultOfRegionThread(int region, int event, int thread)
+{
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (region < 0 || region >= markerRegions)
+ {
+ return -EINVAL;
+ }
+ if (markerResults == NULL)
+ {
+ return 0;
+ }
+ if (thread < 0 || thread >= markerResults[region].threadCount)
+ {
+ return -EINVAL;
+ }
+ if (event < 0 || event >= markerResults[region].eventCount)
+ {
+ return -EINVAL;
+ }
+ if (markerResults[region].counters[thread] == NULL)
+ {
+ return 0.0;
+ }
+ return markerResults[region].counters[thread][event];
+}
+
+double
+perfmon_getMetricOfRegionThread(int region, int metricId, int threadId)
+{
+ int e = 0, err = 0;
+ double result = 0.0;
+ CounterList clist;
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (region < 0 || region >= markerRegions)
+ {
+ return -EINVAL;
+ }
+ if (markerResults == NULL)
+ {
+ return 0.0;
+ }
+ if (threadId < 0 || threadId >= markerResults[region].threadCount)
+ {
+ return -EINVAL;
+ }
+ if (metricId < 0 || metricId >= groupSet->groups[markerResults[region].groupID].group.nmetrics)
+ {
+ return -EINVAL;
+ }
+ timer_init();
+ init_clist(&clist);
+ for (e=0;e<markerResults[region].eventCount;e++)
+ {
+ err = add_to_clist(&clist,
+ groupSet->groups[markerResults[region].groupID].group.counters[e],
+ perfmon_getResultOfRegionThread(region, e, threadId));
+ if (err)
+ {
+ printf("Cannot add counter %s to counter list for metric calculation\n",
+ counter_map[groupSet->groups[markerResults[region].groupID].events[e].index].key);
+ destroy_clist(&clist);
+ return 0;
+ }
+ }
+ add_to_clist(&clist, "time", perfmon_getTimeOfRegion(region, threadId));
+ add_to_clist(&clist, "inverseClock", 1.0/timer_getCycleClock());
+ err = calc_metric(groupSet->groups[markerResults[region].groupID].group.metricformulas[metricId], &clist, &result);
+ if (err < 0)
+ {
+ ERROR_PRINT(Cannot calculate formula %s, groupSet->groups[markerResults[region].groupID].group.metricformulas[metricId]);
+ }
+ destroy_clist(&clist);
+ return result;
+}
+
+int perfmon_readMarkerFile(const char* filename)
+{
+ FILE* fp = NULL;
+ int i = 0;
+ char buf[2048];
+ buf[0] = '\0';
+ char *ptr = NULL;
+ int cpus = 0, groups = 0, regions = 0;
+
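+ /* Expected marker file layout (derived from the parsing below):
+ line 1: "<numCpus> <numRegions> <numGroups>"
+ per region a header line "<regionId>:<tag>-<groupId>" followed by one line
+ per CPU with "<regionId> <groupId> <cpu> <callCount> <time> <numEvents>"
+ and the space-separated event values. */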
+ if (filename == NULL)
+ {
+ return -EINVAL;
+ }
+ if (access(filename, R_OK))
+ {
+ return -EINVAL;
+ }
+ fp = fopen(filename, "r");
+ if (fp == NULL)
+ {
+ fprintf(stderr, "Error opening file %s\n", filename);
+ }
+ ptr = fgets(buf, sizeof(buf), fp);
+ sscanf(buf, "%d %d %d", &cpus, ®ions, &groups);
+ //markerResults = malloc(regions * sizeof(LikwidResults));
+ markerResults = realloc(markerResults, regions * sizeof(LikwidResults));
+ if (markerResults == NULL)
+ {
+ fprintf(stderr, "Failed to allocate %lu bytes for the marker results storage\n", regions * sizeof(LikwidResults));
+ return -ENOMEM;
+ }
+ int* regionCPUs = (int*)malloc(regions * sizeof(int));
+ if (regionCPUs == NULL)
+ {
+ fprintf(stderr, "Failed to allocate %lu bytes for temporal cpu count storage\n", regions * sizeof(int));
+ return -ENOMEM;
+ }
+ markerRegions = regions;
+ groupSet->numberOfThreads = cpus;
+ for ( uint32_t i=0; i < regions; i++ )
+ {
+ regionCPUs[i] = 0;
+ markerResults[i].threadCount = cpus;
+ markerResults[i].time = (double*) malloc(cpus * sizeof(double));
+ if (!markerResults[i].time)
+ {
+ fprintf(stderr, "Failed to allocate %lu bytes for the time storage\n", cpus * sizeof(double));
+ break;
+ }
+ markerResults[i].count = (uint32_t*) malloc(cpus * sizeof(uint32_t));
+ if (!markerResults[i].count)
+ {
+ fprintf(stderr, "Failed to allocate %lu bytes for the count storage\n", cpus * sizeof(uint32_t));
+ break;
+ }
+ markerResults[i].cpulist = (int*) malloc(cpus * sizeof(int));
+ if (!markerResults[i].cpulist)
+ {
+ fprintf(stderr, "Failed to allocate %lu bytes for the cpulist storage\n", cpus * sizeof(int));
+ break;
+ }
+ markerResults[i].counters = (double**) malloc(cpus * sizeof(double*));
+ if (!markerResults[i].counters)
+ {
+ fprintf(stderr, "Failed to allocate %lu bytes for the counter result storage\n", cpus * sizeof(double*));
+ break;
+ }
+ }
+ while (fgets(buf, sizeof(buf), fp))
+ {
+ if (strchr(buf,':'))
+ {
+ int regionid = 0, groupid = 0;
+ char regiontag[100];
+ regiontag[0] = '\0';
+ sscanf(buf, "%d:%s-%d", ®ionid, regiontag, &groupid);
+ snprintf(regiontag, strlen(buf)-4, "%s", &(buf[2]));
+ markerResults[regionid].groupID = groupid;
+ markerResults[regionid].tag = bfromcstr(regiontag);
+ }
+ else
+ {
+ int regionid = 0, groupid = 0, cpu = 0, count = 0, nevents = 0;
+ int cpuidx = 0, eventidx = 0;
+ double time = 0;
+ char remain[1024];
+ remain[0] = '\0';
+ sscanf(buf, "%d %d %d %d %lf %d %[^\t\n]", ®ionid, &groupid, &cpu, &count, &time, &nevents, remain);
+ if (cpu >= 0)
+ {
+ cpuidx = regionCPUs[regionid];
+ markerResults[regionid].cpulist[cpuidx] = cpu;
+ markerResults[regionid].eventCount = nevents;
+ markerResults[regionid].time[cpuidx] = time;
+ markerResults[regionid].count[cpuidx] = count;
+ markerResults[regionid].counters[cpuidx] = malloc(nevents * sizeof(double));
+
+ eventidx = 0;
+ ptr = strtok(remain, " ");
+ while (ptr != NULL && eventidx < nevents)
+ {
+ sscanf(ptr, "%lf", &(markerResults[regionid].counters[cpuidx][eventidx]));
+ ptr = strtok(NULL, " ");
+ eventidx++;
+ }
+ regionCPUs[regionid]++;
+ }
+ }
+ }
+ for ( uint32_t i=0; i < regions; i++ )
+ {
+ markerResults[i].threadCount = regionCPUs[i];
+ }
+ free(regionCPUs);
+ fclose(fp);
+ return 0;
+}
+
+void perfmon_destroyMarkerResults()
+{
+ int i = 0, j = 0;
+ if (markerResults != NULL)
+ {
+ for (i = 0; i < markerRegions; i++)
+ {
+ free(markerResults[i].time);
+ free(markerResults[i].count);
+ free(markerResults[i].cpulist);
+ for (j = 0; j < markerResults[i].threadCount; j++)
+ {
+ free(markerResults[i].counters[j]);
+ }
+ free(markerResults[i].counters);
+ bdestroy(markerResults[i].tag);
+ }
+ free(markerResults);
+ }
+}
diff --git a/src/perfmon_perf.c b/src/perfmon_perf.c
new file mode 100644
index 0000000..17a56c0
--- /dev/null
+++ b/src/perfmon_perf.c
@@ -0,0 +1,260 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: perfmon_perf.c
+ *
+ * Description: Example perfmon module for software events through perf_event
+ * Currently not integrated in perfmon.
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Author: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <linux/perf_event.h>
+#include <asm/unistd.h>
+
+#include <topology.h>
+#include <error.h>
+#include <perfmon.h>
+#include <perfmon_perf.h>
+
+static int* cpu_event_fds[MAX_NUM_THREADS] = { NULL };
+
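+/* Maps an event's umask value (used as array index) to the corresponding
+ * Linux perf software event; the selected entry is written to attr.config
+ * in setup_perf_event(). */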
+const uint64_t configList[MAX_SW_EVENTS] = {
+ [0x00] = PERF_COUNT_SW_CPU_CLOCK,
+ [0x01] = PERF_COUNT_SW_TASK_CLOCK,
+ [0x02] = PERF_COUNT_SW_PAGE_FAULTS,
+ [0x03] = PERF_COUNT_SW_CONTEXT_SWITCHES,
+ [0x04] = PERF_COUNT_SW_CPU_MIGRATIONS,
+ [0x05] = PERF_COUNT_SW_PAGE_FAULTS_MIN,
+ [0x06] = PERF_COUNT_SW_PAGE_FAULTS_MAJ,
+ [0x07] = PERF_COUNT_SW_ALIGNMENT_FAULTS,
+ [0x08] = PERF_COUNT_SW_EMULATION_FAULTS,
+};
+
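+/* glibc ships no wrapper for perf_event_open(2), hence the raw syscall. Each
+ * opened event yields a file descriptor that is controlled via ioctl()
+ * (PERF_EVENT_IOC_ENABLE/DISABLE/RESET) and sampled with read(2) below. */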
+static long
+perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
+ int cpu, int group_fd, unsigned long flags)
+{
+ int ret;
+
+ ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
+ group_fd, flags);
+ return ret;
+}
+
+int init_perf_event(int cpu_id)
+{
+ if (cpu_event_fds[cpu_id] == NULL)
+ {
+ cpu_event_fds[cpu_id] = (int*) malloc(MAX_SW_EVENTS * sizeof(int));
+ if (cpu_event_fds[cpu_id] == NULL)
+ {
+ return -ENOMEM;
+ }
+ memset(cpu_event_fds[cpu_id], -1, MAX_SW_EVENTS * sizeof(int));
+ }
+ return 0;
+}
+
+int setup_perf_event(int cpu_id, PerfmonEvent* event)
+{
+ struct perf_event_attr attr;
+ if (event == NULL)
+ {
+ return -EINVAL;
+ }
+ if (cpu_event_fds[cpu_id] == NULL)
+ {
+ return -EFAULT;
+ }
+ if (cpu_event_fds[cpu_id][event->umask] != -1)
+ {
+ return 0;
+ }
+ memset(&attr, 0, sizeof(struct perf_event_attr));
+ attr.type = PERF_TYPE_SOFTWARE;
+ attr.size = sizeof(struct perf_event_attr);
+ attr.config = configList[event->umask];
+ attr.exclude_kernel = 1;
+ attr.exclude_hv = 1;
+ attr.disabled = 1;
+ attr.inherit = 1;
+ if (event->numberOfOptions > 0)
+ {
+ for(int j = 0; j < event->numberOfOptions; j++)
+ {
+ switch (event->options[j].type)
+ {
+ case EVENT_OPTION_COUNT_KERNEL:
+ attr.exclude_kernel = 0;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ cpu_event_fds[cpu_id][event->umask] = perf_event_open(&attr, 0, cpu_id, -1, 0);
+ if (cpu_event_fds[cpu_id][event->umask] < 0)
+ {
+ printf("Setup of event %llu failed\n", event->umask);
+ return -EFAULT;
+ }
+ return 0;
+}
+
+int read_perf_event(int cpu_id, uint64_t eventID, uint64_t *data)
+{
+ int ret = 0;
+ long long tmp = 0;
+ *data = 0x0ULL;
+ if ((cpu_event_fds[cpu_id] != NULL) && (cpu_event_fds[cpu_id][eventID] != -1))
+ {
+ ret = read(cpu_event_fds[cpu_id][eventID], &tmp, sizeof(long long));
+ if (ret == sizeof(long long))
+ {
+ *data = (uint64_t) tmp;
+ }
+ }
+ else
+ {
+ printf("FD for event %llu not initialized\n", eventID);
+ return -ENODEV;
+ }
+ return 0;
+}
+
+int stop_perf_event(int cpu_id, uint64_t eventID)
+{
+ if ((cpu_event_fds[cpu_id] != NULL) && (cpu_event_fds[cpu_id][eventID] != -1))
+ {
+ ioctl(cpu_event_fds[cpu_id][eventID], PERF_EVENT_IOC_DISABLE, 0);
+ }
+ else
+ {
+ return -ENODEV;
+ }
+ return 0;
+}
+
+int stop_all_perf_event(int cpu_id)
+{
+ if (cpu_event_fds[cpu_id] != NULL)
+ {
+ for (int i = 0; i< MAX_SW_EVENTS; i++)
+ {
+ if (cpu_event_fds[cpu_id][i] != -1)
+ {
+ stop_perf_event(cpu_id, i);
+ }
+ }
+ }
+ return 0;
+}
+
+int clear_perf_event(int cpu_id, uint64_t eventID)
+{
+ if ((cpu_event_fds[cpu_id] != NULL) && (cpu_event_fds[cpu_id][eventID] != -1))
+ {
+ ioctl(cpu_event_fds[cpu_id][eventID], PERF_EVENT_IOC_RESET, 0);
+ }
+ else
+ {
+ return -ENODEV;
+ }
+ return 0;
+}
+
+int clear_all_perf_event(int cpu_id)
+{
+ if (cpu_event_fds[cpu_id] != NULL)
+ {
+ for (int i = 0; i< MAX_SW_EVENTS; i++)
+ {
+ if (cpu_event_fds[cpu_id][i] != -1)
+ {
+ clear_perf_event(cpu_id, i);
+ }
+ }
+ }
+ return 0;
+}
+
+int start_perf_event(int cpu_id, uint64_t eventID)
+{
+ if ((cpu_event_fds[cpu_id] != NULL) && (cpu_event_fds[cpu_id][eventID] != -1))
+ {
+ ioctl(cpu_event_fds[cpu_id][eventID], PERF_EVENT_IOC_ENABLE, 0);
+ }
+ else
+ {
+ return -ENODEV;
+ }
+ return 0;
+}
+
+int start_all_perf_event(int cpu_id)
+{
+ if (cpu_event_fds[cpu_id] != NULL)
+ {
+ for (int i = 0; i< MAX_SW_EVENTS; i++)
+ {
+ if (cpu_event_fds[cpu_id][i] != -1)
+ {
+ start_perf_event(cpu_id, i);
+ }
+ }
+ }
+ return 0;
+}
+
+int close_perf_event(int cpu_id, uint64_t eventID)
+{
+ if ((cpu_event_fds[cpu_id] != NULL) && (cpu_event_fds[cpu_id][eventID] != -1))
+ {
+ close(cpu_event_fds[cpu_id][eventID]);
+ cpu_event_fds[cpu_id][eventID] = -1;
+ }
+ return 0;
+}
+
+int finalize_perf_event(int cpu_id)
+{
+ if (cpu_event_fds[cpu_id] != NULL)
+ {
+ for (int i = 0; i< MAX_SW_EVENTS; i++)
+ {
+ if (cpu_event_fds[cpu_id][i] != -1)
+ {
+ close_perf_event(cpu_id, i);
+ }
+ }
+ free(cpu_event_fds[cpu_id]);
+ }
+
+ return 0;
+}
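+
+/*
+ * Minimal usage sketch (illustrative only, since this module is not yet
+ * integrated into perfmon; the chosen event value is an assumption): the
+ * functions are intended to be called per CPU in the order
+ * init -> setup -> start -> read -> stop -> finalize.
+ *
+ *   PerfmonEvent ev = { .umask = 0x02 };  // index into configList: PERF_COUNT_SW_PAGE_FAULTS
+ *   uint64_t faults = 0;
+ *   init_perf_event(0);
+ *   setup_perf_event(0, &ev);
+ *   start_perf_event(0, ev.umask);
+ *   // ... region of interest ...
+ *   stop_perf_event(0, ev.umask);
+ *   read_perf_event(0, ev.umask, &faults);
+ *   finalize_perf_event(0);
+ */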
diff --git a/src/power.c b/src/power.c
index 3f4118c..d76c965 100644
--- a/src/power.c
+++ b/src/power.c
@@ -5,13 +5,14 @@
*
* Description: Module implementing Intel RAPL interface
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Authors: Jan Treibig (jt), jan.treibig at gmail.com,
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -33,136 +34,476 @@
#include <types.h>
#include <power.h>
-#include <cpuid.h>
+#include <topology.h>
/* ##### EXPORTED VARIABLES ########################################### */
PowerInfo power_info;
-const uint32_t power_regs[4] = {MSR_PKG_ENERGY_STATUS,
- MSR_PP0_ENERGY_STATUS,
- MSR_PP1_ENERGY_STATUS,
- MSR_DRAM_ENERGY_STATUS};
/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
-
+static int power_initialized = 0;
/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-void
+int
power_init(int cpuId)
{
uint64_t flags;
- int hasRAPL = 0;
- uint32_t info_register = 0x0;
+ int i;
+ int err;
/* determine Turbo Mode features */
double busSpeed;
+ if (power_initialized)
+ {
+ return 0;
+ }
+
+ power_info.baseFrequency = 0;
+ power_info.minFrequency = 0;
+ power_info.turbo.numSteps = 0;
+ power_info.turbo.steps = NULL;
+ power_info.powerUnit = 0;
+ power_info.timeUnit = 0;
+ power_info.hasRAPL = 0;
- if ((cpuid_info.model == SANDYBRIDGE_EP) ||
- (cpuid_info.model == SANDYBRIDGE) ||
- (cpuid_info.model == HASWELL) ||
- (cpuid_info.model == HASWELL_EX) ||
- (cpuid_info.model == IVYBRIDGE_EP) ||
- (cpuid_info.model == IVYBRIDGE))
+ switch (cpuid_info.model)
{
- hasRAPL = 1;
- info_register = MSR_PKG_POWER_INFO;
+ case SANDYBRIDGE:
+ case IVYBRIDGE:
+ case HASWELL:
+ case SANDYBRIDGE_EP:
+ case IVYBRIDGE_EP:
+ case HASWELL_EP:
+ case ATOM_SILVERMONT_E:
+ case ATOM_SILVERMONT_Z1:
+ case ATOM_SILVERMONT_Z2:
+ case ATOM_SILVERMONT_F:
+ case BROADWELL:
+ case BROADWELL_E:
+ case BROADWELL_D:
+ case HASWELL_M1:
+ case HASWELL_M2:
+ case SKYLAKE1:
+ case SKYLAKE2:
+ power_info.hasRAPL = 1;
+ break;
+ case ATOM_SILVERMONT_C:
+ power_info.hasRAPL = 1;
+ /* The info_regs list needs an update for Silvermont Type C
+ because it uses another info register */
+ info_regs[PKG] = MSR_PKG_POWER_INFO_SILVERMONT;
+ break;
+ default:
+ DEBUG_PLAIN_PRINT(DEBUGLEV_INFO, NO RAPL SUPPORT);
+ return 0;
+ break;
}
- else if (cpuid_info.model == ATOM_SILVERMONT_C)
+
+ perfmon_init_maps();
+ if (!HPMinitialized())
{
- hasRAPL = 1;
- info_register = MSR_PKG_POWER_INFO_SILVERMONT;
+ HPMinit();
+ HPMaddThread(cpuId);
}
- else if ((cpuid_info.model == ATOM_SILVERMONT_E) ||
- (cpuid_info.model == ATOM_SILVERMONT_F1) ||
- (cpuid_info.model == ATOM_SILVERMONT_F2) ||
- (cpuid_info.model == ATOM_SILVERMONT_F3))
+ if ( power_info.hasRAPL )
{
- hasRAPL = 1;
+ busSpeed = 100.0;
+ }
+ else
+ {
+ busSpeed = 133.33;
}
-
if (cpuid_info.turbo)
{
- flags = msr_read(cpuId, MSR_PLATFORM_INFO);
-
- if ( hasRAPL )
- {
- busSpeed = 100.0;
- }
- else
+ err = HPMread(cpuId, MSR_DEV, MSR_PLATFORM_INFO, &flags);
+ if (err == 0)
{
- busSpeed = 133.33;
- }
-
- power_info.baseFrequency = busSpeed * (double) extractBitField(flags,8,8);
- power_info.minFrequency = busSpeed * (double) extractBitField((flags>>(32)),8,8);
+ power_info.baseFrequency = busSpeed * (double) extractBitField(flags,8,8);
+ power_info.minFrequency = busSpeed * (double) extractBitField((flags>>(32)),8,8);
- power_info.turbo.numSteps = cpuid_topology.numCoresPerSocket;
- power_info.turbo.steps = (double*) malloc(power_info.turbo.numSteps * sizeof(double));
-
- flags = msr_read(cpuId, MSR_TURBO_RATIO_LIMIT);
+ power_info.turbo.numSteps = cpuid_topology.numCoresPerSocket;
+ if (cpuid_info.model == WESTMERE_EX)
+ {
+ power_info.turbo.numSteps = 4;
+ }
+ power_info.turbo.steps = (double*) malloc(power_info.turbo.numSteps * sizeof(double));
+ if (!power_info.turbo.steps)
+ {
+ return -ENOMEM;
+ }
- for (int i=0; i < power_info.turbo.numSteps; i++)
- {
- if (i < 8)
+ err = HPMread(cpuId, MSR_DEV, MSR_TURBO_RATIO_LIMIT, &flags);
+ if (err)
{
- power_info.turbo.steps[i] = busSpeed * (double) field64(flags,i*8, 8);
+ fprintf(stderr,"Cannot gather values from MSR_TURBO_RATIO_LIMIT,\n");
}
else
{
- power_info.turbo.steps[i] = power_info.turbo.steps[7];
+ for (int i=0; i < power_info.turbo.numSteps; i++)
+ {
+ if (i < 8)
+ {
+ power_info.turbo.steps[i] = busSpeed * (double) field64(flags,i*8, 8);
+ }
+ else
+ {
+ power_info.turbo.steps[i] = power_info.turbo.steps[7];
+ }
+ }
}
+ //TODO: Haswell EP and possibly Broadwell EP support multiple turbo
+ // registers besides MSR_TURBO_RATIO_LIMIT:
+ // MSR_TURBO_RATIO_LIMIT1 and MSR_TURBO_RATIO_LIMIT2
+ }
+ else
+ {
+ fprintf(stderr,"Cannot gather values from MSR_PLATFORM_INFO,\n");
}
- }
- else
- {
- power_info.turbo.numSteps = 0;
}
/* determine RAPL parameters */
- if ( hasRAPL )
+ if ( power_info.hasRAPL )
{
- flags = msr_read(cpuId, MSR_RAPL_POWER_UNIT);
-
- power_info.powerUnit = pow(0.5,(double) extractBitField(flags,4,0));
- power_info.energyUnit = pow(0.5,(double) extractBitField(flags,5,8));
- power_info.timeUnit = pow(0.5,(double) extractBitField(flags,4,16));
-
- if (info_register != 0x0)
+ err = HPMread(cpuId, MSR_DEV, MSR_RAPL_POWER_UNIT, &flags);
+ if (err == 0)
{
- flags = msr_read(cpuId, info_register);
- power_info.tdp = (double) extractBitField(flags,15,0) * power_info.powerUnit;
- if (cpuid_info.model != ATOM_SILVERMONT_C)
+ double energyUnit;
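+ /* MSR_RAPL_POWER_UNIT: bits 3:0 select the power unit, bits 12:8 the energy
+ unit and bits 19:16 the time unit; power and time are converted to micro
+ units here, the energy unit becomes the Joule value of one raw counter
+ increment (with a special case for ATOM_SILVERMONT_E below). */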
+ power_info.powerUnit = 1000000 / (1<<(flags & 0xF));
+ power_info.timeUnit = 1000000 / (1 << ((flags>>16) & 0xF));
+ if (cpuid_info.model != ATOM_SILVERMONT_E)
{
- power_info.minPower = (double) extractBitField(flags,15,16) * power_info.powerUnit;
- power_info.maxPower = (double) extractBitField(flags,15,32) * power_info.powerUnit;
- power_info.maxTimeWindow = (double) extractBitField(flags,7,48) * power_info.timeUnit;
+ energyUnit = 1.0 / (1 << ((flags >> 8) & 0x1F));
}
else
{
- power_info.minPower = 0.0;
- power_info.maxPower = 0.0;
- power_info.maxTimeWindow = 0.0;
+ energyUnit = 1.0 * (1 << ((flags >> 8) & 0x1F)) / 1000000;
+ }
+
+ for (i = 0; i < NUM_POWER_DOMAINS; i++)
+ {
+ power_info.domains[i].energyUnit = energyUnit;
+ power_info.domains[i].type = i;
+ power_info.domains[i].supportFlags = 0x0U;
+ power_info.domains[i].tdp = 0.0;
+ power_info.domains[i].minPower = 0.0;
+ power_info.domains[i].maxPower = 0.0;
+ power_info.domains[i].maxTimeWindow = 0.0;
+ }
+
+ if ((cpuid_info.model == HASWELL_EP) ||
+ (cpuid_info.model == HASWELL_M1) ||
+ (cpuid_info.model == HASWELL_M2))
+ {
+ power_info.domains[DRAM].energyUnit = 15.3E-6;
+ }
+
+ for(i = 0; i < NUM_POWER_DOMAINS; i++)
+ {
+ err = HPMread(cpuId, MSR_DEV, power_regs[i], &flags);
+ if (err == 0)
+ {
+ power_info.domains[i].supportFlags |= POWER_DOMAIN_SUPPORT_STATUS;
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_DETAIL, RAPL domain %s not supported, power_names[i]);
+ continue;
+ }
+ if (limit_regs[i] != 0x0)
+ {
+ err = HPMread(cpuId, MSR_DEV, limit_regs[i], &flags);
+ if (err == 0)
+ {
+ power_info.domains[i].supportFlags |= POWER_DOMAIN_SUPPORT_LIMIT;
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Deactivating limit register for RAPL domain %s, power_names[i]);
+ limit_regs[i] = 0x0;
+ }
+ }
+ if (info_regs[i] != 0x0)
+ {
+ err = HPMread(cpuId, MSR_DEV, info_regs[i], &flags);
+ if (err == 0)
+ {
+ power_info.domains[i].supportFlags |= POWER_DOMAIN_SUPPORT_INFO;
+ power_info.domains[i].tdp = (double) extractBitField(flags,15,0) * power_info.powerUnit;
+ if (cpuid_info.model != ATOM_SILVERMONT_C)
+ {
+ power_info.domains[i].minPower = (double) extractBitField(flags,15,16) * power_info.powerUnit;
+ power_info.domains[i].maxPower = (double) extractBitField(flags,15,32) * power_info.powerUnit;
+ power_info.domains[i].maxTimeWindow = (double) extractBitField(flags,7,48) * power_info.timeUnit;
+ }
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Deactivating info register for RAPL domain %s, power_names[i]);
+ info_regs[i] = 0x0;
+ }
+ }
+ if (policy_regs[i] != 0x0)
+ {
+ err = HPMread(cpuId, MSR_DEV, policy_regs[i], &flags);
+ if (err == 0)
+ {
+ power_info.domains[i].supportFlags |= POWER_DOMAIN_SUPPORT_POLICY;
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Deactivating policy register for RAPL domain %s, power_names[i]);
+ policy_regs[i] = 0x0;
+ }
+ }
+ if (perf_regs[i] != 0x0)
+ {
+ err = HPMread(cpuId, MSR_DEV, perf_regs[i], &flags);
+ if (err == 0)
+ {
+ power_info.domains[i].supportFlags |= POWER_DOMAIN_SUPPORT_PERF;
+ }
+ else
+ {
+ DEBUG_PRINT(DEBUGLEV_DETAIL, Deactivating perf register for RAPL domain %s, power_names[i]);
+ perf_regs[i] = 0x0;
+ }
+ }
}
}
else
{
- power_info.tdp = 0;
- power_info.minPower = 0.0;
- power_info.maxPower = 0.0;
- power_info.maxTimeWindow = 0.0;
+ fprintf(stderr,"Cannot gather values from MSR_RAPL_POWER_UNIT, deactivating RAPL support\n");
+ power_info.hasRAPL = 0;
}
+ power_initialized = 1;
+ return power_info.hasRAPL;
}
else
{
- power_info.powerUnit = 0.0;
- power_info.energyUnit = 0.0;
- power_info.timeUnit = 0.0;
- power_info.tdp = 0;
- power_info.minPower = 0.0;
- power_info.maxPower = 0.0;
- power_info.maxTimeWindow = 0.0;
+ return power_info.hasRAPL;
+ }
+ return 0;
+}
+
+/* All functions below are experimental and probably don't work */
+int power_perfGet(int cpuId, PowerType domain, uint32_t* status)
+{
+ int err = 0;
+ *status = 0x0U;
+ if (domain >= NUM_POWER_DOMAINS)
+ {
+ return -EINVAL;
+ }
+ if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_PERF)
+ {
+ err = HPMread(cpuId, MSR_DEV, perf_regs[domain], (uint64_t*)status);
+ if (err)
+ {
+ ERROR_PRINT(Failed to get power perf value for domain %s on CPU %d,power_names[domain], cpuId);
+ return -EFAULT;
+ }
+ }
+ return 0;
+}
+
+int power_limitSet(int cpuId, PowerType domain, double power, double time, int doClamping)
+{
+ int err = 0;
+ if (domain >= NUM_POWER_DOMAINS)
+ {
+ return -EINVAL;
+ }
+ fprintf(stderr, "Not implemented\n");
+ return 0;
+
+ uint32_t X = (log(time) - log(power_info.timeUnit))/log(2);
+ uint32_t powerField = (uint32_t)(power/(power_info.domains[domain].energyUnit));
+ uint64_t flags = (powerField & 0xFFFF)|((X & (0x1F))<<17);
+ // Construct flags missing. How is timeField calculated?
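+ // Presumably the time window would have to be encoded the way
+ // power_limitGet() below decodes it, i.e. time = 2^Y * (1 + Z/4) * timeUnit
+ // with Y in bits 21:17 and Z in bits 23:22, so the field could be built by
+ // inverting that relation; this remains unimplemented here.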
+ if (doClamping)
+ {
+ flags |= (1ULL<<16);
+ }
+ if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT)
+ {
+ err = HPMwrite(cpuId, MSR_DEV, limit_regs[domain], flags);
+ if (err)
+ {
+ fprintf(stderr, "Failed to set power limit for domain %s on CPU %d\n",power_names[domain], cpuId);
+ return -EFAULT;
+ }
}
+ return 0;
}
+int power_limitGet(int cpuId, PowerType domain, double* power, double* time)
+{
+ int err = 0;
+ *power = 0;
+ *time = 0;
+ unsigned int Y,Z;
+ if (domain >= NUM_POWER_DOMAINS)
+ {
+ return -EINVAL;
+ }
+ uint64_t flags = 0x0ULL;
+ if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT)
+ {
+ err = HPMread(cpuId, MSR_DEV, limit_regs[domain], &flags);
+ if (err)
+ {
+ fprintf(stderr, "Failed to set power limit for domain %s on CPU %d\n",power_names[domain], cpuId);
+ return -EFAULT;
+ }
+ *power = ((double)extractBitField(flags, 15, 0)) * power_info.domains[domain].energyUnit;
+ Y = extractBitField(flags, 5, 17);
+ Z = extractBitField(flags, 2, 22);
+ *time = pow(2,((double)Y)) * (1.0 + (((double)Z)/4.0)) * power_info.timeUnit;
+ }
+ return 0;
+}
+
+int power_limitState(int cpuId, PowerType domain)
+{
+ int err = 0;
+ if (domain >= NUM_POWER_DOMAINS)
+ {
+ return -EINVAL;
+ }
+ uint64_t flags = 0x0ULL;
+
+ if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT)
+ {
+ err = HPMread(cpuId, MSR_DEV, limit_regs[domain], &flags);
+ if (err)
+ {
+ ERROR_PRINT(Failed to read power limit for domain %s on CPU %d,power_names[domain], cpuId);
+ return -EFAULT;
+ }
+ }
+ if (flags & (1ULL<<15))
+ {
+ return 1;
+ }
+ return 0;
+}
+
+int power_limitActivate(int cpuId, PowerType domain)
+{
+ int err = 0;
+ if (domain >= NUM_POWER_DOMAINS)
+ {
+ return -EINVAL;
+ }
+ uint64_t flags = 0x0ULL;
+
+ if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT)
+ {
+ err = HPMread(cpuId, MSR_DEV, limit_regs[domain], &flags);
+ if (err)
+ {
+ ERROR_PRINT(Failed to activate power limit for domain %s on CPU %d,power_names[domain], cpuId);
+ return -EFAULT;
+ }
+ flags |= (1ULL<<15);
+ err = HPMwrite(cpuId, MSR_DEV, limit_regs[domain], flags);
+ if (err)
+ {
+ ERROR_PRINT(Failed to activate power limit for domain %s on CPU %d,power_names[domain], cpuId);
+ return -EFAULT;
+ }
+ }
+ return 0;
+}
+
+int power_limitDectivate(int cpuId, PowerType domain)
+{
+ int err = 0;
+ uint64_t flags = 0x0ULL;
+ if (domain >= NUM_POWER_DOMAINS)
+ {
+ return -EINVAL;
+ }
+
+ if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT)
+ {
+ err = HPMread(cpuId, MSR_DEV, limit_regs[domain], &flags);
+ if (err)
+ {
+ ERROR_PRINT(Failed to deactivate power limit for domain %s on CPU %d,power_names[domain], cpuId);
+ return -EFAULT;
+ }
+ flags &= ~(1ULL<<15);
+ err = HPMwrite(cpuId, MSR_DEV, limit_regs[domain], flags);
+ if (err)
+ {
+ ERROR_PRINT(Failed to deactivate power limit for domain %s on CPU %d,power_names[domain], cpuId);
+ return -EFAULT;
+ }
+ }
+ return 0;
+}
+
+int power_policySet(int cpuId, PowerType domain, uint32_t priority)
+{
+ int err = 0;
+ if (domain >= NUM_POWER_DOMAINS)
+ {
+ return -EINVAL;
+ }
+ priority = extractBitField(priority, 5, 0);
+ if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_POLICY)
+ {
+ err = HPMwrite(cpuId, MSR_DEV, policy_regs[domain], priority);
+ if (err)
+ {
+ ERROR_PRINT(Failed to set power policy for domain %s on CPU %d,power_names[domain], cpuId);
+ return -EFAULT;
+ }
+ }
+ return 0;
+}
+
+int power_policyGet(int cpuId, PowerType domain, uint32_t* priority)
+{
+ int err = 0;
+ *priority = 0x0U;
+ if (domain >= NUM_POWER_DOMAINS)
+ {
+ return -EINVAL;
+ }
+ if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_POLICY)
+ {
+ err = HPMread(cpuId, MSR_DEV, policy_regs[domain], (uint64_t*)priority);
+ if (err)
+ {
+ ERROR_PRINT(Failed to get power policy for domain %s on CPU %d,power_names[domain], cpuId);
+ return -EFAULT;
+ }
+ }
+ return 0;
+}
+
+
+void power_finalize(void)
+{
+ if (power_initialized == 0)
+ {
+ return;
+ }
+ if (power_info.turbo.steps != NULL)
+ {
+ free(power_info.turbo.steps);
+ }
+ power_info.turbo.steps = NULL;
+ power_info.baseFrequency = 0;
+ power_info.minFrequency = 0;
+ power_info.turbo.numSteps = 0;
+ power_info.powerUnit = 0;
+ power_info.timeUnit = 0;
+ power_info.hasRAPL = 0;
+ memset(power_info.domains, 0, NUM_POWER_DOMAINS*sizeof(PowerDomain));
+}
+
+PowerInfo_t get_powerInfo(void)
+{
+ return &power_info;
+}
diff --git a/src/pthread-overload/Makefile b/src/pthread-overload/Makefile
index 5f460a5..889d824 100644
--- a/src/pthread-overload/Makefile
+++ b/src/pthread-overload/Makefile
@@ -1,16 +1,18 @@
# =======================================================================================
-#
+#
# Filename: Makefile
-#
+#
# Description: pthread-overload Makefile
-#
-# Version: 3.1.3
-# Released: 4.11.2014
-#
+#
+# Version: 4.1
+# Released: 19.5.2016
+#
# Author: Jan Treibig (jt), jan.treibig at gmail.com
+# Thomas Roehl (tr), thomas.roehl at googlemail.com
+#
# Project: likwid
#
-# Copyright (C) 2014 Jan Treibig
+# Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -28,14 +30,17 @@
include ../../config.mk
include ../../make/include_$(COMPILER).mk
+include ../../make/config_checks.mk
+include ../../make/config_defines.mk
+
-TARGET = liblikwidpin.so
+TARGET = $(PINLIB)
ifneq ($(COLOR),NONE)
DEFINES += -DCOLOR=$(COLOR)
endif
-DEFINES += -DMAX_NUM_THREADS=$(MAX_NUM_THREADS)
+DEFINES += -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) -D_GNU_SOURCE
INCLUDES += -I../includes
LIBS += -ldl
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
@@ -43,5 +48,5 @@ CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
all: $(TARGET)
$(TARGET): pthread-overload.c
- $(CC) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $(INCLUDES) $(SHARED_CFLAGS) $(SHARED_LFLAGS) -o ../../$(TARGET) pthread-overload.c $(LIBS)
+ $(CC) -Wl,-soname,$(TARGET).$(VERSION).$(RELEASE) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $(INCLUDES) $(SHARED_CFLAGS) $(SHARED_LFLAGS) -o ../../$(TARGET) pthread-overload.c $(LIBS)
diff --git a/src/pthread-overload/pthread-overload.c b/src/pthread-overload/pthread-overload.c
index e9d5dcc..f076b08 100644
--- a/src/pthread-overload/pthread-overload.c
+++ b/src/pthread-overload/pthread-overload.c
@@ -3,16 +3,16 @@
*
* Filename: pthread-overload.c
*
- * Description: Overloaded library for pthread_create call.
+ * Description: Overloaded library for pthread_create call.
* Implements pinning of threads together with likwid-pin.
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
* Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -60,7 +60,20 @@ static char * sosearchpaths[] = {
NULL
};
-int
+
+#ifdef COLOR
+#define color_print(format,...) do { \
+ color_on(BRIGHT, COLOR); \
+ printf(format, ##__VA_ARGS__); \
+ color_reset(); \
+ } while(0)
+#else
+#define color_print(format,...) do { \
+ printf(format, ##__VA_ARGS__); \
+ } while(0)
+#endif
+
+int __attribute__ ((visibility ("default") ))
pthread_create(pthread_t* thread,
const pthread_attr_t* attr,
void* (*start_routine)(void *),
@@ -74,28 +87,28 @@ pthread_create(pthread_t* thread,
static int npinned = 0;
static int ncalled = 0;
static int overflow = 0;
+ static int overflowed = 0;
static int silent = 0;
static int pin_ids[MAX_NUM_THREADS];
- static uint64_t skipMask = 0;
- static int got_skipMask = 0;
+ static uint64_t skipMask = 0x0;
+ static int ncpus = 0;
/* On first entry: Get Evironment Variable and initialize pin_ids */
if (ncalled == 0)
{
- char *str = getenv("LIKWID_SKIP");
+ char *str;
char *token, *saveptr;
char *delimiter = ",";
int i = 0;
- int ncpus = 0;
+ cpu_set_t cpuset;
+ str = getenv("LIKWID_SKIP");
if (str != NULL)
{
- skipMask = strtoul(str, &str, 10);
- got_skipMask = 1;
+ skipMask = strtoul(str, &str, 16);
}
-
- if ( got_skipMask == 0 && skipMask == 0x0 )
+ else if ( skipMask == 0x0 )
{
dlerror(); /* Clear any existing error */
dlsym(RTLD_DEFAULT,"__kmpc_begin");
@@ -104,18 +117,16 @@ pthread_create(pthread_t* thread,
skipMask = 0x1;
}
}
+
+
if (getenv("LIKWID_SILENT") != NULL)
{
silent = 1;
}
- else
- {
- color_on(BRIGHT, COLOR);
- }
if (!silent)
{
- printf("[pthread wrapper] ");
+ color_print("[pthread wrapper] \n");
}
str = getenv("LIKWID_PIN");
@@ -132,35 +143,32 @@ pthread_create(pthread_t* thread,
pin_ids[i++] = strtoul(token, &token, 10);
}
}
- ncpus--; /* last ID is the first (the process was pinned to) */
+ CPU_ZERO(&cpuset);
+ CPU_SET(pin_ids[ncpus-1], &cpuset);
+ ret = sched_setaffinity(getpid(), sizeof(cpu_set_t), &cpuset);
+ if (!silent)
+ {
+ color_print("[pthread wrapper] MAIN -> %d\n",pin_ids[ncpus-1]);
+ }
+ //ncpus--; /* last ID is the first (the process was pinned to) */
}
else
{
- printf("[pthread wrapper] ERROR: Environment Variabel LIKWID_PIN not set!\n");
+ color_print("[pthread wrapper] ERROR: Environment Variabel LIKWID_PIN not set!\n");
}
if (!silent)
{
- printf("[pthread wrapper] PIN_MASK: ");
+ color_print("[pthread wrapper] PIN_MASK: ");
- for (int i=0;i<ncpus;i++)
+ for (int i=0;i<ncpus-1;i++)
{
- printf("%d->%d ",i,pin_ids[i]);
+ color_print("%d->%d ",i,pin_ids[i]);
}
- printf("\n");
- printf("[pthread wrapper] SKIP MASK: 0x%llX\n",LLU_CAST skipMask);
+ color_print("\n[pthread wrapper] SKIP MASK: 0x%llX\n",LLU_CAST skipMask);
}
- overflow = ncpus;
- }
- else
- {
-#ifdef COLOR
- if (!silent)
- {
- color_on(BRIGHT, COLOR);
- }
-#endif
+ overflow = ncpus-1;
}
/* Handle dll related stuff */
@@ -171,7 +179,7 @@ pthread_create(pthread_t* thread,
{
break;
}
- if (sosearchpaths[reallpthrindex] != NULL)
+ if (sosearchpaths[reallpthrindex] != NULL)
{
reallpthrindex++;
}
@@ -181,7 +189,7 @@ pthread_create(pthread_t* thread,
if (!handle)
{
- printf("%s\n", dlerror());
+ color_print("%s\n", dlerror());
return -1;
}
@@ -190,7 +198,7 @@ pthread_create(pthread_t* thread,
if ((error = dlerror()) != NULL)
{
- printf("%s\n", error);
+ color_print("%s\n", error);
return -2;
}
@@ -205,39 +213,39 @@ pthread_create(pthread_t* thread,
{
if (!silent)
{
- printf("\tthreadid %lu -> SKIP \n", *thread);
+ color_print("\tthreadid %lu -> SKIP \n", *thread);
}
}
else
{
CPU_ZERO(&cpuset);
- CPU_SET(pin_ids[npinned], &cpuset);
+ CPU_SET(pin_ids[npinned%ncpus], &cpuset);
pthread_setaffinity_np(*thread, sizeof(cpu_set_t), &cpuset);
-
- if (npinned == overflow)
+ if ((npinned == overflow) && (!overflowed))
{
if (!silent)
{
- printf("Roundrobin placement triggered\n");
- printf("\tthreadid %lu -> core %d - OK", *thread, pin_ids[npinned]);
+ color_print("Roundrobin placement triggered\n\tthreadid %lu -> core %d - OK", *thread, pin_ids[npinned%ncpus]);
}
- npinned = 0;
+ overflowed = 1;
+ npinned = (npinned+1)%ncpus;
}
else
{
if (!silent)
{
- printf("\tthreadid %lu -> core %d - OK", *thread, pin_ids[npinned]);
+ color_print("\tthreadid %lu -> core %d - OK", *thread, pin_ids[npinned%ncpus]);
}
npinned++;
+ if ((npinned >= ncpus) && (overflowed))
+ {
+ npinned = 0;
+ }
}
if (!silent)
{
-#ifdef COLOR
- color_reset();
-#endif
- printf("\n");
+ color_print("\n");
}
}
}
diff --git a/src/strUtil.c b/src/strUtil.c
deleted file mode 100644
index cf37920..0000000
--- a/src/strUtil.c
+++ /dev/null
@@ -1,975 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: strUtil.c
- *
- * Description: Utility routines for strings. Depends on bstring lib.
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <sched.h>
-
-#include <error.h>
-#include <types.h>
-#include <bstrlib.h>
-#include <strUtil.h>
-#include <affinity.h>
-#include <cpuid.h>
-#include <pci.h>
-
-/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
-static int
-cpu_count(cpu_set_t* set)
-{
- uint32_t i;
- int s = 0;
- const __cpu_mask *p = set->__bits;
- const __cpu_mask *end = &set->__bits[sizeof(cpu_set_t) / sizeof (__cpu_mask)];
-
- while (p < end)
- {
- __cpu_mask l = *p++;
-
- if (l == 0)
- {
- continue;
- }
-
- for (i=0; i< (sizeof(__cpu_mask)*8); i++)
- {
- if (l&(1UL<<i))
- {
- s++;
- }
- }
- }
-
- return s;
-}
-
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-int str2int(const char* str)
-{
- char* endptr;
- errno = 0;
- unsigned long val;
- val = strtoul(str, &endptr, 10);
- if ((errno == ERANGE && val == LONG_MAX )
- || (errno != 0 && val == 0))
- {
- ERROR;
- }
-
- if (endptr == str)
- {
- ERROR_PRINT(Cannot parse string %s to digits, str);
- }
-
- return (int) val;
-}
-
-uint32_t
-bstr_to_cpuset_physical(uint32_t* threads, const_bstring q)
-{
- int i;
- unsigned int rangeBegin;
- unsigned int rangeEnd;
- uint32_t numThreads=0;
- struct bstrList* tokens;
- struct bstrList* subtokens;
-
- tokens = bsplit(q,',');
-
- for (i=0;i<tokens->qty;i++)
- {
- subtokens = bsplit(tokens->entry[i],'-');
-
- if( subtokens->qty == 1 )
- {
- threads[numThreads] = str2int((char *) bdata(subtokens->entry[0]));
- numThreads++;
- }
- else if ( subtokens->qty == 2 )
- {
- rangeBegin = str2int((char*) bdata(subtokens->entry[0]));
- rangeEnd = str2int((char*) bdata(subtokens->entry[1]));
-
- if (!(rangeBegin <= rangeEnd))
- {
- ERROR_PRINT(Range End %d bigger than begin %d, rangeEnd, rangeBegin);
- }
-
- while (rangeBegin <= rangeEnd) {
- threads[numThreads] = rangeBegin;
- numThreads++;
- rangeBegin++;
- }
- }
- else
- {
- ERROR_PLAIN_PRINT(Parse Error);
- }
- bstrListDestroy(subtokens);
- }
- if (numThreads > MAX_NUM_THREADS)
- {
- ERROR_PRINT(Number Of threads %d too large, numThreads);
- }
-
- bstrListDestroy(tokens);
-
- return numThreads;
-}
-
-uint32_t
-bstr_to_cpuset_logical(uint32_t* threads, const_bstring q)
-{
- int i;
- uint32_t j;
- int id;
- uint32_t tmpThreads[MAX_NUM_THREADS];
- int globalNumThreads=0;
- uint32_t numThreads=0;
- struct bstrList* tokens;
- struct bstrList* subtokens;
- const AffinityDomain* domain;
-
- tokens = bsplit(q,'@');
-
- for (i=0;i<tokens->qty;i++)
- {
- subtokens = bsplit(tokens->entry[i],':');
-
- if ( subtokens->qty == 2 )
- {
- domain = affinity_getDomain(subtokens->entry[0]);
-
- if (!domain)
- {
- ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0]));
- }
-
- numThreads = bstr_to_cpuset_physical(tmpThreads, subtokens->entry[1]);
-
- for (j=0; j<numThreads; j++)
- {
- if (! (tmpThreads[j] >= domain->numberOfProcessors))
- {
- id = (tmpThreads[j]/domain->numberOfCores) +
- (tmpThreads[j]%domain->numberOfCores) * cpuid_topology.numThreadsPerCore;
- threads[globalNumThreads++] = domain->processorList[id];
- }
- else
- {
- ERROR_PRINT(Too many threads requested. Avaialable 0-%d,domain->numberOfProcessors-1);
- }
- }
- }
- else
- {
- ERROR_PLAIN_PRINT(Parse Error);
- }
- bstrListDestroy(subtokens);
- }
-
- bstrListDestroy(tokens);
-
- return globalNumThreads;
-}
-
-#define PRINT_EXPR_ERR printf("SYNTAX ERROR: Expression must have the format E:<thread domain>:<num threads>[:chunk size>:<stride>]\n")
-
-uint32_t
-bstr_to_cpuset_expression(uint32_t* threads, const_bstring qi)
-{
- int i;
- uint32_t j;
- bstring q = (bstring) qi;
- int globalNumThreads=0;
- uint32_t numThreads=0;
- struct bstrList* tokens;
- struct bstrList* subtokens;
- const AffinityDomain* domain;
-
- bdelete (q, 0, 2);
- tokens = bsplit(q,'@');
-
- for (i=0;i<tokens->qty;i++)
- {
- subtokens = bsplit(tokens->entry[i],':');
-
- if ( subtokens->qty == 2 )
- {
- domain = affinity_getDomain(subtokens->entry[0]);
-
- if (!domain)
- {
- ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0]));
- }
-
- numThreads = str2int(bdata(subtokens->entry[1]));
-
- if (numThreads > domain->numberOfProcessors)
- {
- ERROR_PRINT(Invalid processor id requested. Available 0-%d,
- domain->numberOfProcessors-1);
- }
-
- for (j=0; j<numThreads; j++)
- {
- threads[globalNumThreads++] = domain->processorList[j];
- }
- }
- else if ( subtokens->qty == 4 )
- {
- int counter;
- int currentId = 0;
- int startId = 0;
- int chunksize = str2int(bdata(subtokens->entry[2]));
- int stride = str2int(bdata(subtokens->entry[3]));
- domain = affinity_getDomain(subtokens->entry[0]);
-
- if (!domain)
- {
- ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0]));
- }
-
- numThreads = str2int(bdata(subtokens->entry[1]));
-
- if (numThreads > domain->numberOfProcessors)
- {
- ERROR_PRINT(Invalid number of processors requested. Available 0-%d,
- domain->numberOfProcessors-1);
- }
-
-
- counter = 0;
- for (j=0; j<numThreads; j+=chunksize)
- {
- for(i=0;i<chunksize && j+i<numThreads ;i++)
- {
- threads[globalNumThreads++] = domain->processorList[counter+i];
- }
- counter += stride;
- if (counter >= domain->numberOfProcessors)
- {
- counter = 0;
- }
- }
- }
- else
- {
- PRINT_EXPR_ERR;
- ERROR_PLAIN_PRINT(Parse Error);
- }
- bstrListDestroy(subtokens);
- }
-
- bstrListDestroy(tokens);
-
- return globalNumThreads;
-}
-
-uint32_t
-bstr_to_cpuset_scatter(uint32_t* threads, const_bstring qi)
-{
- int domainId = 0;
- int id = 0;
- int threadId = 0;
- bstring q = (bstring) qi;
- bstring domaintag;
- int globalNumThreads=0;
- struct bstrList* subtokens;
- int numberOfDomains = 0;
- AffinityDomain* domain;
- AffinityDomain* tmpDomainPtr;
-
- domain = (AffinityDomain*) malloc(cpuid_topology.numHWThreads * sizeof(AffinityDomain));
-
- subtokens = bsplit(q,':');
-
- if ( subtokens->qty == 2 )
- {
- for(int i =0;;i++)
- {
- domaintag = bformat("%s%d",bdata(subtokens->entry[0]),i);
- tmpDomainPtr = (AffinityDomain*) affinity_getDomain(domaintag);
-
- if (tmpDomainPtr == NULL)
- {
- break;
- }
- else
- {
- memcpy(domain+i,tmpDomainPtr,sizeof(AffinityDomain));
- numberOfDomains++;
- }
- }
-
- threads[globalNumThreads++] = domain[domainId].processorList[0];
-
- for (uint32_t i=1; i<cpuid_topology.numHWThreads; i++)
- {
- domainId = i%numberOfDomains;
-
- if (domainId == 0)
- {
- threadId++;
- }
-
- id = (threadId/domain->numberOfCores) +
- (threadId%domain->numberOfCores) * cpuid_topology.numThreadsPerCore;
-
- threads[globalNumThreads++] = domain[domainId].processorList[id];
- }
- }
- else
- {
- PRINT_EXPR_ERR;
- ERROR_PLAIN_PRINT(Parse Error);
- }
- bstrListDestroy(subtokens);
- free(domain);
-
- return globalNumThreads;
-}
-
-
-
-#define CPUSET_ERROR \
- if (cpuid_isInCpuset()) { \
- ERROR_PLAIN_PRINT(You are running inside a cpuset. In cpusets only logical pinning inside set is allowed!); \
- }
-
-
-
-int
-bstr_to_cpuset(int* threadsIN, const_bstring q)
-{
- uint32_t i;
- int num=0;
- int cpuMapping[cpuid_topology.numHWThreads];
- cpu_set_t cpu_set;
- uint32_t numThreads;
- bstring domainStr = bformat("NSCM");
- const_bstring scatter = bformat("scatter");
- struct bstrList* tokens;
- CPU_ZERO(&cpu_set);
- sched_getaffinity(0,sizeof(cpu_set_t), &cpu_set);
- uint32_t* threads = (uint32_t*) threadsIN;
-
- if (binchr (q, 0, domainStr) != BSTR_ERR)
- {
- CPUSET_ERROR;
-
- if (binstr (q, 0 , scatter ) != BSTR_ERR)
- {
- numThreads = bstr_to_cpuset_scatter(threads,q);
- }
- else if (bstrchr (q, 'E') != BSTR_ERR)
- {
- numThreads = bstr_to_cpuset_expression(threads,q);
- }
- else
- {
- numThreads = bstr_to_cpuset_logical(threads,q);
- }
- }
- else if (bstrchr (q, 'L') != BSTR_ERR)
- {
- uint32_t count = cpu_count(&cpu_set);
-
- tokens = bsplit(q,':');
- numThreads = bstr_to_cpuset_physical(threads,tokens->entry[1]);
-
- for (i=0; i < cpuid_topology.numHWThreads; i++)
- {
- if (CPU_ISSET(i,&cpu_set))
- {
- cpuMapping[num++]=i;
- }
- }
-
- for (i=0; i < numThreads; i++)
- {
- if (!(threads[i] >= count))
- {
- threads[i] = cpuMapping[threads[i]];
- }
- else
- {
- fprintf(stderr, "Available CPUs: ");
- for (int j=0; j< num-1;j++)
- {
- fprintf(stderr, "%d,", cpuMapping[j]);
- }
- fprintf(stderr, "%d\n", cpuMapping[num-1]);
- ERROR_PRINT(Index %d out of range.,threads[i]);
- }
- }
- bstrListDestroy(tokens);
- }
- else
- {
- CPUSET_ERROR;
- numThreads = bstr_to_cpuset_physical(threads,q);
- }
-
- bdestroy(domainStr);
- return (int) numThreads;
-}
-
-
-void
-bstr_to_eventset(StrUtilEventSet* set, const_bstring q)
-{
- int i;
- struct bstrList* tokens;
- struct bstrList* subtokens;
-
- tokens = bsplit(q,',');
- set->numberOfEvents = tokens->qty;
- set->events = (StrUtilEvent*)
- malloc(set->numberOfEvents * sizeof(StrUtilEvent));
-
- for (i=0;i<tokens->qty;i++)
- {
- subtokens = bsplit(tokens->entry[i],':');
-
- if ( subtokens->qty != 2 )
- {
-
- fprintf(stderr, "Cannot parse event string %s, probably missing counter name\n"
- ,bdata(tokens->entry[i]));
- fprintf(stderr, "Format: <eventName>:<counter>,...\n");
- msr_finalize();
- pci_finalize();
- exit(EXIT_FAILURE);
-
- }
- else
- {
- set->events[i].eventName = bstrcpy(subtokens->entry[0]);
- set->events[i].counterName = bstrcpy(subtokens->entry[1]);
- }
-
- bstrListDestroy(subtokens);
- }
-
- bstrListDestroy(tokens);
-}
-
-FILE*
-bstr_to_outstream(const_bstring argString, bstring filter)
-{
- int i;
- char* cstr;
- FILE* STREAM;
- struct bstrList* tokens;
- bstring base;
- bstring suffix = bfromcstr(".");
- bstring filename;
-
- /* configure filter */
- tokens = bsplit(argString,'.');
-
- if (tokens->qty < 2)
- {
- fprintf(stderr, "Outputfile has no filetype suffix!\n");
- fprintf(stderr, "Add suffix .txt for raw output or any supported filter suffix.\n");
- exit(EXIT_FAILURE);
- }
-
- base = bstrcpy(tokens->entry[0]);
-
- if (biseqcstr(tokens->entry[1],"txt"))
- {
- bassigncstr(filter, "NO");
- }
- else
- {
- bassigncstr(filter, TOSTRING(LIKWIDFILTERPATH));
- bconchar(filter,'/');
- bconcat(filter,tokens->entry[1]);
- }
-
- bconcat(suffix,tokens->entry[1]);
- bstrListDestroy(tokens);
-
- tokens = bsplit(base,'_');
-
- if (tokens->qty < 1)
- {
- ERROR_PLAIN_PRINT(Error in parsing file string);
- }
-
- filename = bstrcpy(tokens->entry[0]);
-
- for (i=1; i<tokens->qty; i++)
- {
- if (biseqcstr(tokens->entry[i],"%j"))
- {
- cstr = getenv("PBS_JOBID");
- if (cstr != NULL)
- {
- bcatcstr(filename, "_");
- bcatcstr(filename, cstr);
- }
- }
- else if (biseqcstr(tokens->entry[i],"%r"))
- {
- cstr = getenv("PMI_RANK");
- if (cstr == NULL)
- {
- cstr = getenv("OMPI_COMM_WORLD_RANK");
- }
- if (cstr != NULL)
- {
- bcatcstr(filename, "_");
- bcatcstr(filename, cstr);
- }
- }
- else if (biseqcstr(tokens->entry[i],"%h"))
- {
- cstr = (char*) malloc(HOST_NAME_MAX * sizeof(char));
- gethostname(cstr,HOST_NAME_MAX);
- bcatcstr(filename, "_");
- bcatcstr(filename, cstr);
- free(cstr);
- }
- else if (biseqcstr(tokens->entry[i],"%p"))
- {
- bstring pid = bformat("_%d",getpid());
- bconcat(filename, pid);
- bdestroy(pid);
- }
- else
- {
- ERROR_PLAIN_PRINT(Unsupported placeholder in filename!);
- }
- }
-
- if (biseqcstr(filter,"NO"))
- {
- bconcat(filename, suffix);
- }
- else
- {
- bcatcstr(filter, " ");
- bcatcstr(filename, ".tmp");
- bconcat(filter, filename);
- }
-
- bstrListDestroy(tokens);
- STREAM = fopen(bdata(filename),"w");
- bdestroy(filename);
- bdestroy(suffix);
- bdestroy(base);
-
- return STREAM;
-}
-
-
-uint64_t
-bstr_to_doubleSize(const_bstring str, DataType type)
-{
- bstring unit = bmidstr(str, blength(str)-2, 2);
- bstring sizeStr = bmidstr(str, 0, blength(str)-2);
- uint64_t sizeU = str2int(bdata(sizeStr));
- uint64_t junk = 0;
- uint64_t bytesize = 0;
-
- switch (type)
- {
- case SINGLE:
- case SINGLE_RAND:
- bytesize = sizeof(float);
- break;
-
- case DOUBLE:
- case DOUBLE_RAND:
- bytesize = sizeof(double);
- break;
- }
-
- if (biseqcstr(unit, "kB")) {
- junk = (sizeU *1024)/bytesize;
- } else if (biseqcstr(unit, "MB")) {
- junk = (sizeU *1024*1024)/bytesize;
- } else if (biseqcstr(unit, "GB")) {
- junk = (sizeU *1024*1024*1024)/bytesize;
- }
-
- return junk;
-}
-
-void
-bstr_to_interval(const_bstring str, struct timespec* interval)
-{
- int size;
- int pos;
- bstring ms = bformat("ms");
-
- if ((pos = bstrrchr (str, 's')) != BSTR_ERR)
- {
- if (pos != (blength(str)-1))
- {
- fprintf(stderr, "You need to specify a time unit s or ms like 200ms\n");
- msr_finalize();
- exit(EXIT_FAILURE);
- }
-
- /* unit is ms */
- if (binstrr (str, blength(str), ms) != BSTR_ERR)
- {
- bstring sizeStr = bmidstr(str, 0, blength(str)-2);
- size = str2int(bdata(sizeStr));
- if (size >= 1000)
- {
- interval->tv_sec = size/1000;
- interval->tv_nsec = (size%1000) * 1.E06;
- }
- else
- {
- interval->tv_sec = 0L;
- interval->tv_nsec = size * 1.E06;
- }
- }
- /* unit is s */
- else
- {
- bstring sizeStr = bmidstr(str, 0, blength(str)-1);
- size = str2int(bdata(sizeStr));
- interval->tv_sec = size;
- interval->tv_nsec = 0L;
- }
- }
- else
- {
- fprintf(stderr, "You need to specify a time unit s or ms like 200ms\n");
- msr_finalize();
- exit(EXIT_FAILURE);
- }
-}
-
-
-void
-bstr_to_workgroup(Workgroup* group,
- const_bstring str,
- DataType type,
- int numberOfStreams)
-{
- uint32_t i;
- int parseStreams = 0;
- bstring threadInfo;
- bstring streams= bformat("0");
- struct bstrList* tokens;
- struct bstrList* subtokens;
- const AffinityDomain* domain;
-
- /* split the workgroup into the thread and the streams part */
- tokens = bsplit(str,'-');
-
- if (tokens->qty == 2)
- {
- threadInfo = bstrcpy(tokens->entry[0]);
- streams = bstrcpy(tokens->entry[1]);
- parseStreams = 1;
- }
- else if (tokens->qty == 1)
- {
- threadInfo = bstrcpy(tokens->entry[0]);
- }
- else
- {
- ERROR_PLAIN_PRINT(Error in parsing workgroup string);
- }
-
- bstrListDestroy (tokens);
- tokens = bsplit(threadInfo,':');
-
- if (tokens->qty == 5)
- {
- uint32_t maxNumThreads;
- int chunksize;
- int stride;
- int counter;
- int currentId = 0;
- int startId = 0;
-
- domain = affinity_getDomain(tokens->entry[0]);
-
- if (domain == NULL)
- {
- fprintf(stderr, "Error: Domain %s not available on current machine.\nTry likwid-bench -p for supported domains.",
- bdata(tokens->entry[0]));
- exit(EXIT_FAILURE);
- }
-
- group->size = bstr_to_doubleSize(tokens->entry[1], type);
- group->numberOfThreads = str2int(bdata(tokens->entry[2]));
- chunksize = str2int(bdata(tokens->entry[3]));
- stride = str2int(bdata(tokens->entry[4]));
- maxNumThreads = (domain->numberOfProcessors / stride) * chunksize;
-
- if (group->numberOfThreads > maxNumThreads)
- {
- fprintf(stderr, "Error: Domain %s supports only up to %d threads with used expression.\n",
- bdata(tokens->entry[0]), maxNumThreads);
- exit(EXIT_FAILURE);
- }
-
- group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int));
-
- counter = chunksize;
-
- for (i=0; i<group->numberOfThreads; i++)
- {
- if (counter)
- {
- group->processorIds[i] = domain->processorList[currentId++];
- }
- else
- {
- startId += stride;
- currentId = startId;
- group->processorIds[i] = domain->processorList[currentId++];
- counter = chunksize;
- }
- counter--;
- }
- }
- else if (tokens->qty == 3)
- {
- domain = affinity_getDomain(tokens->entry[0]);
-
- if (domain == NULL)
- {
- fprintf(stderr, "Error: Domain %s not available on current machine.\n", bdata(tokens->entry[0]));
- fprintf(stderr, "Try likwid-bench -p for supported domains.\n");
- exit(EXIT_FAILURE);
- }
-
- group->size = bstr_to_doubleSize(tokens->entry[1], type);
- group->numberOfThreads = str2int(bdata(tokens->entry[2]));
-
- if (group->numberOfThreads > domain->numberOfProcessors)
- {
- fprintf(stderr, "Error: Domain %s supports only up to %d threads.\n",
- bdata(tokens->entry[0]),domain->numberOfProcessors);
- exit(EXIT_FAILURE);
- }
-
- group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int));
-
- for (i=0; i<group->numberOfThreads; i++)
- {
- group->processorIds[i] = domain->processorList[i];
- }
- }
- else if (tokens->qty == 2)
- {
- domain = affinity_getDomain(tokens->entry[0]);
-
- if (domain == NULL)
- {
- fprintf(stderr, "Error: Domain %s not available on current machine.\n",
- bdata(tokens->entry[0]));
- fprintf(stderr, "Try likwid-bench -p for supported domains.\n");
- exit(EXIT_FAILURE);
- }
-
- group->size = bstr_to_doubleSize(tokens->entry[1], type);
- group->numberOfThreads = domain->numberOfProcessors;
- group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int));
-
- for (i=0; i<group->numberOfThreads; i++)
- {
- group->processorIds[i] = domain->processorList[i];
- }
- }
- else
- {
- ERROR_PLAIN_PRINT(Error in parsing workgroup string);
- }
-
- bstrListDestroy(tokens);
-
- /* parse stream list */
- if (parseStreams)
- {
- tokens = bsplit(streams,',');
-
- if (tokens->qty < numberOfStreams)
- {
- ERROR_PRINT(Testcase requires at least %d streams, numberOfStreams);
- }
-
- group->streams = (Stream*) malloc(numberOfStreams * sizeof(Stream));
-
- for (i=0;i<(uint32_t) tokens->qty;i++)
- {
- subtokens = bsplit(tokens->entry[i],':');
-
- if ( subtokens->qty == 3 )
- {
- int index = str2int(bdata(subtokens->entry[0]));
- if (index >= numberOfStreams)
- {
- ERROR_PRINT(Stream Index %d out of range,index);
- }
- group->streams[index].domain = bstrcpy(subtokens->entry[1]);
- group->streams[index].offset = str2int(bdata(subtokens->entry[2]));
- }
- else if ( subtokens->qty == 2 )
- {
- int index = str2int(bdata(subtokens->entry[0]));
- if (index >= numberOfStreams)
- {
- ERROR_PRINT(Stream Index %d out of range,index);
- }
- group->streams[index].domain = bstrcpy(subtokens->entry[1]);
- group->streams[index].offset = 0;
- }
- else
- {
- ERROR_PLAIN_PRINT(Error in parsing event string);
- }
-
- bstrListDestroy(subtokens);
- }
-
- bstrListDestroy(tokens);
- }
- else
- {
- group->streams = (Stream*) malloc(numberOfStreams * sizeof(Stream));
-
- for (i=0; i< (uint32_t)numberOfStreams; i++)
- {
- group->streams[i].domain = domain->tag;
- group->streams[i].offset = 0;
- }
- }
-
- group->size /= numberOfStreams;
-}
-
-
-#define INIT_SECURE_INPUT_LENGTH 256
-
-bstring
-bSecureInput (int maxlen, char* vgcCtx) {
- int i, m, c = 1;
- bstring b, t;
- int termchar = 0;
-
- if (!vgcCtx) return NULL;
-
- b = bfromcstralloc (INIT_SECURE_INPUT_LENGTH, "");
-
- for (i=0; ; i++)
- {
- if (termchar == c)
- {
- break;
- }
- else if ((maxlen > 0) && (i >= maxlen))
- {
- b = NULL;
- return b;
- }
- else
- {
- c = *(vgcCtx++);
- }
-
- if (EOF == c)
- {
- break;
- }
-
- if (i+1 >= b->mlen) {
-
- /* Double size, but deal with unusual case of numeric
- overflows */
-
- if ((m = b->mlen << 1) <= b->mlen &&
- (m = b->mlen + 1024) <= b->mlen &&
- (m = b->mlen + 16) <= b->mlen &&
- (m = b->mlen + 1) <= b->mlen)
- {
- t = NULL;
- }
- else
- {
- t = bfromcstralloc (m, "");
- }
-
- if (t)
- {
- memcpy (t->data, b->data, i);
- }
-
- bdestroy (b); /* Clean previous buffer */
- b = t;
- if (!b)
- {
- return b;
- }
- }
-
- b->data[i] = (unsigned char) c;
- }
-
- i--;
- b->slen = i;
- b->data[i] = (unsigned char) '\0';
- return b;
-}
-
-
-int
-bJustifyCenter (bstring b, int width)
-{
- unsigned char space = ' ';
- int alignSpace = (width - b->slen) / 2;
- int restSpace = (width - b->slen) % 2;
- if (width <= 0) return -__LINE__;
-
- if (b->slen <= width)
- {
- binsertch (b, 0, alignSpace, space);
- }
-
- binsertch (b, b->slen , alignSpace+restSpace, space);
-
- return BSTR_OK;
-}
-
-
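For reference, the removed bstr_to_cpuset() above accepts several pin-string forms: plain physical lists ("0,2,4-6"), logical selections inside an affinity domain (prefixes N, S, C, M, e.g. "S0:0-3"), expressions of the form E:<thread domain>:<num threads>[:<chunk size>:<stride>], an L: prefix for logical numbering inside a cpuset, and a <domain>:scatter policy. A minimal usage sketch, assuming likwid's internal headers (bstrlib.h, strUtil.h) and the MAX_NUM_THREADS constant are on the include path:

    #include <bstrlib.h>
    #include <strUtil.h>

    int pin_example(void)
    {
        int threads[MAX_NUM_THREADS];
        bstring q = bformat("S0:0-3");      /* logical: first four HW threads of socket 0 */
        int n = bstr_to_cpuset(threads, q); /* resolves to physical processor IDs */
        bdestroy(q);
        return n;                           /* number of resolved processor IDs */
    }
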
diff --git a/src/thermal.c b/src/thermal.c
index 0812086..e5cf7a9 100644
--- a/src/thermal.c
+++ b/src/thermal.c
@@ -5,13 +5,13 @@
*
* Description: Module implementing Intel TM/TM2 interface
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -34,7 +34,7 @@
#include <types.h>
#include <thermal.h>
-#include <cpuid.h>
+#include <topology.h>
/* ##### EXPORTED VARIABLES ########################################### */
@@ -49,10 +49,15 @@ ThermalInfo thermal_info;
void thermal_init(int cpuId)
{
uint64_t flags=0ULL;
+ HPMinit();
+ HPMaddThread(cpuId);
if ( cpuid_hasFeature(TM2) )
{
- flags = msr_read(cpuId, IA32_THERM_STATUS);
+ if (HPMread(cpuId, MSR_DEV, IA32_THERM_STATUS, &flags))
+ {
+ return;
+ }
if ( flags & 0x1 )
{
@@ -66,7 +71,10 @@ void thermal_init(int cpuId)
thermal_info.resolution = extractBitField(flags,4,27);
flags = 0ULL;
- flags = msr_read(cpuId, MSR_TEMPERATURE_TARGET);
+ if (HPMread(cpuId, MSR_DEV, MSR_TEMPERATURE_TARGET, &flags))
+ {
+ return;
+ }
thermal_info.activationT = extractBitField(flags,8,16);
thermal_info.offset = extractBitField(flags,6,24);
}
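The hunk above replaces the direct msr_read() calls with the HPM access layer (HPMinit/HPMaddThread/HPMread), so thermal_init() works both with direct MSR access and through the access daemon. A minimal sketch of the same read done against that layer; the call signatures and the MSR_DEV/IA32_THERM_STATUS names are taken from the hunk, and it is assumed they are reachable via likwid's headers:

    #include <stdint.h>
    #include <likwid.h>   /* assumed to declare HPMinit, HPMaddThread, HPMread */

    uint64_t read_therm_status(int cpuId)
    {
        uint64_t flags = 0ULL;
        HPMinit();                /* set up MSR access (direct or via access daemon) */
        HPMaddThread(cpuId);      /* open the MSR interface for this hardware thread */
        if (HPMread(cpuId, MSR_DEV, IA32_THERM_STATUS, &flags) != 0)
        {
            return 0ULL;          /* read failed, bail out as thermal_init() does above */
        }
        return flags;
    }
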
diff --git a/src/threads.c b/src/threads.c
deleted file mode 100644
index 87fa2b2..0000000
--- a/src/threads.c
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * =======================================================================================
- *
- * Filename: threads.c
- *
- * Description: High level interface to pthreads
- *
- * Version: 3.1.3
- * Released: 4.11.2014
- *
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
- * Project: likwid
- *
- * Copyright (C) 2014 Jan Treibig
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* ##### HEADER FILE INCLUDES ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-
-#include <error.h>
-#include <types.h>
-#include <threads.h>
-
-
-/* ##### EXPORTED VARIABLES ########################################### */
-
-pthread_barrier_t threads_barrier;
-ThreadData* threads_data;
-ThreadGroup* threads_groups;
-
-/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
-
-static pthread_t* threads = NULL;
-static pthread_attr_t attr;
-static int numThreads = 0;
-
-
-/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
-
-void
-threads_init(FILE* OUTSTREAM, int numberOfThreads)
-{
- int i;
- numThreads = numberOfThreads;
-
- threads = (pthread_t*) malloc(numThreads * sizeof(pthread_t));
- threads_data = (ThreadData*) malloc(numThreads * sizeof(ThreadData));
-
- for(i = 0; i < numThreads; i++)
- {
- threads_data[i].numberOfThreads = numThreads;
- threads_data[i].globalNumberOfThreads = numThreads;
- threads_data[i].globalThreadId = i;
- threads_data[i].threadId = i;
- threads_data[i].output = OUTSTREAM;
- }
-
- pthread_barrier_init(&threads_barrier, NULL, numThreads);
- pthread_attr_init(&attr);
- pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-}
-
-
-void
-threads_create(void *(*startRoutine)(void*))
-{
- int i;
-
- for(i = 0; i < numThreads; i++)
- {
- pthread_create(&threads[i],
- &attr,
- startRoutine,
- (void*) &threads_data[i]);
- }
-}
-
-void
-threads_createGroups(int numberOfGroups)
-{
- int i;
- int j;
- int numThreadsPerGroup;
- int globalId = 0;
-
- if (numThreads % numberOfGroups)
- {
- ERROR_PRINT(Number of threads %d not divisible by number of groups %d,numThreads,numberOfGroups);
- }
- else
- {
- numThreadsPerGroup = numThreads / numberOfGroups;
- }
-
- threads_groups = (ThreadGroup*) malloc(numberOfGroups *
- sizeof(ThreadGroup));
-
- for (i = 0; i < numberOfGroups; i++)
- {
- threads_groups[i].numberOfThreads = numThreadsPerGroup;
- threads_groups[i].threadIds = (int*) malloc(numThreadsPerGroup *
- sizeof(int));
-
- for (j = 0; j < numThreadsPerGroup; j++)
- {
- threads_data[globalId].threadId = j;
- threads_data[globalId].groupId = i;
- threads_data[globalId].numberOfGroups = numberOfGroups;
- threads_data[globalId].numberOfThreads = numThreadsPerGroup;
- threads_groups[i].threadIds[j] = globalId++;
- }
- }
-}
-
-
-void
-threads_registerDataAll(ThreadUserData* data, threads_copyDataFunc func)
-{
- int i;
-
- if (func == NULL)
- {
- for(i = 0; i < numThreads; i++)
- {
- threads_data[i].data = (*data);
- }
- }
- else
- {
- for(i = 0; i < numThreads; i++)
- {
- func( data, &threads_data[i].data);
- }
- }
-}
-
-void
-threads_registerDataThread(int threadId,
- ThreadUserData* data,
- threads_copyDataFunc func)
-{
- if (func == NULL)
- {
- threads_data[threadId].data = (*data);
- }
- else
- {
- func( data, &threads_data[threadId].data);
- }
-}
-
-void
-threads_registerDataGroup(int groupId,
- ThreadUserData* data,
- threads_copyDataFunc func)
-{
- int i;
-
- if (func == NULL)
- {
- for (i = 0; i < threads_groups[groupId].numberOfThreads; i++)
- {
- threads_data[threads_groups[groupId].threadIds[i]].data = (*data);
- }
- }
- else
- {
- for (i = 0; i < threads_groups[groupId].numberOfThreads; i++)
- {
- func( data,
- &threads_data[threads_groups[groupId].threadIds[i]].data);
- }
- }
-}
-
-void
-threads_join(void)
-{
- int i;
-
- for(i=0; i < numThreads; i++)
- {
- pthread_join(threads[i], NULL);
- }
-
- pthread_attr_destroy(&attr);
- pthread_barrier_destroy(&threads_barrier);
-}
-
-void
-threads_destroy(int numberOfGroups)
-{
- int i;
- free(threads_data);
- for(i=0;i<numberOfGroups;i++)
- {
- free(threads_groups[i].threadIds);
- }
- free(threads_groups);
- free(threads);
-}
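The deleted threads.c was a thin wrapper around pthreads used by likwid-bench. A minimal usage sketch of the API defined above; worker() is a hypothetical start routine and the internal threads.h header is assumed to be on the include path:

    #include <stdio.h>
    #include <threads.h>              /* the likwid-bench wrapper declared above, not C11 threads */

    static void* worker(void* arg)    /* hypothetical start routine */
    {
        ThreadData* data = (ThreadData*) arg;
        fprintf(data->output, "thread %d of %d\n",
                data->threadId, data->numberOfThreads);
        return NULL;
    }

    void run_threads(void)
    {
        threads_init(stdout, 4);      /* allocate per-thread data and the barrier */
        threads_createGroups(1);      /* one group holding all four threads */
        threads_create(worker);       /* spawn the workers */
        threads_join();               /* wait for them, tear down pthread objects */
        threads_destroy(1);           /* free thread and group bookkeeping */
    }
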
diff --git a/src/timer.c b/src/timer.c
index 337c13d..ce43bba 100644
--- a/src/timer.c
+++ b/src/timer.c
@@ -5,13 +5,13 @@
*
* Description: Implementation of timer module
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
* Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -27,23 +27,182 @@
*
* =======================================================================================
*/
-
+/* ##### HEADER FILE INCLUDES ######################################### */
#include <stdlib.h>
#include <stdio.h>
-#include <time.h>
+#include <unistd.h>
#include <sys/time.h>
+#include <time.h>
#include <types.h>
-#include <timer.h>
+#include <error.h>
+#include <likwid.h>
+#include <cpuid.h>
+/* ##### EXPORTED VARIABLES ########################################### */
+/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
static uint64_t baseline = 0ULL;
static uint64_t cpuClock = 0ULL;
+static uint64_t cyclesClock = 0ULL;
+static uint64_t sleepbase = 0ULL;
+static int timer_initialized = 0;
+
+void (*TSTART)(TscCounter*) = NULL;
+void (*TSTOP)(TscCounter*) = NULL;
+
+/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
+
+
+/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
+#if defined(__x86_64)
+static void fRDTSC(TscCounter* cpu_c)
+{
+ __asm__ volatile("xor %%eax,%%eax\n\t" \
+ "cpuid\n\t" \
+ "rdtsc\n\t" \
+ "movl %%eax, %0\n\t" \
+ "movl %%edx, %1\n\t" \
+ : "=r" ((cpu_c)->int32.lo), "=r" ((cpu_c)->int32.hi) \
+ : : "%eax","%ebx","%ecx","%edx");
+}
+
+static void fRDTSC_CR(TscCounter* cpu_c)
+{
+ __asm__ volatile( \
+ "rdtsc\n\t" \
+ "movl %%eax, %0\n\t" \
+ "movl %%edx, %1\n\t" \
+ : "=r" ((cpu_c)->int32.lo), "=r" ((cpu_c)->int32.hi) \
+ : : "%eax","%ebx","%ecx","%edx");
+}
+#ifndef __MIC__
+static void fRDTSCP(TscCounter* cpu_c)
+{
+ __asm__ volatile( \
+ "rdtscp\n\t" \
+ "movl %%eax, %0\n\t" \
+ "movl %%edx, %1\n\t" \
+ "cpuid\n\t" \
+ : "=r" ((cpu_c)->int32.lo), "=r" ((cpu_c)->int32.hi) \
+ : : "%eax","%ebx","%ecx","%edx");
+}
+#endif
+#endif
+
+#if defined(__i386__)
+static void fRDTSC(TscCounter* cpu_c)
+{
+ uint64_t tmp;
+ __asm__ volatile( \
+ "xchgl %%ebx, %2\n\t" \
+ "xor %%eax,%%eax\n\t" \
+ "cpuid\n\t" \
+ "rdtsc\n\t" \
+ "movl %%eax, %0\n\t" \
+ "movl %%edx, %1\n\t" \
+ "xchgl %2, %%ebx\n\t" \
+ : "=r" ((cpu_c)->int32.lo), "=r" ((cpu_c)->int32.hi), "=m" (tmp) \
+ : : "%eax","%ecx","%edx");
+}
+static void fRDTSC_CR(TscCounter* cpu_c)
+{
+ __asm__ volatile( \
+ "rdtsc\n\t" \
+ "movl %%eax, %0\n\t" \
+ "movl %%edx, %1\n\t" \
+ : "=r" ((cpu_c)->int32.lo), "=r" ((cpu_c)->int32.hi) \
+ : : "%eax","%edx");
+}
+#ifndef __MIC__
+static void fRDTSCP(TscCounter* cpu_c)
+{
+ uint64_t tmp;
+ __asm__ volatile( \
+ "rdtscp\n\t" \
+ "movl %%eax, %0\n\t" \
+ "movl %%edx, %1\n\t" \
+ "xchgl %%ebx, %2\n\t" \
+ "cpuid\n\t" \
+ "xchgl %2, %%ebx\n\t" \
+ : "=r" ((cpu_c)->int32.lo), "=r" ((cpu_c)->int32.hi), "=m" (tmp) \
+ : : "%eax","%ecx","%edx");
+}
+#endif
+#endif
+static void _timer_start( TimerData* time )
+{
+#if defined(__x86_64) || defined(__i386__)
+ if (TSTART)
+ TSTART(&(time->start));
+#endif
+#ifdef _ARCH_PPC
+ uint32_t tbl, tbu0, tbu1;
+
+ do {
+ __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
+ __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
+ __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
+ } while (tbu0 != tbu1);
-static uint64_t
+ time->start.int64 = (((uint64_t)tbu0) << 32) | tbl;
+#endif
+}
+
+static void _timer_stop( TimerData* time )
+{
+#if defined(__x86_64) || defined(__i386__)
+ if (TSTOP)
+ TSTOP(&(time->stop));
+#endif
+#ifdef _ARCH_PPC
+ uint32_t tbl, tbu0, tbu1;
+ do {
+ __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
+ __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
+ __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
+ } while (tbu0 != tbu1);
+
+ time->stop.int64 = (((uint64_t)tbu0) << 32) | tbl;
+#endif
+}
+
+static uint64_t _timer_printCycles( TimerData* time )
+{
+ /* clamp to zero if something goes wrong */
+ if (((time->stop.int64-baseline) < time->start.int64) ||
+ (time->start.int64 == time->stop.int64))
+ {
+ return 0ULL;
+ }
+ else
+ {
+ return (time->stop.int64 - time->start.int64 - baseline);
+ }
+}
+
+/* Return time duration in seconds */
+static double _timer_print( TimerData* time )
+{
+ uint64_t cycles;
+ /* clamp to zero if something goes wrong */
+ if (((time->stop.int64-baseline) < time->start.int64) ||
+ (time->start.int64 == time->stop.int64))
+ {
+ cycles = 0ULL;
+ }
+ else
+ {
+ cycles = time->stop.int64 - time->start.int64 - baseline;
+ }
+ return ((double) cycles / (double) cyclesClock);
+}
+
+static void
getCpuSpeed(void)
{
-#ifdef __x86_64
+#if defined(__x86_64) || defined(__i386__)
+ int i;
TimerData data;
TscCounter start;
TscCounter stop;
@@ -51,36 +210,40 @@ getCpuSpeed(void)
struct timeval tv1;
struct timeval tv2;
struct timezone tzp;
- struct timespec delay = { 0, 800000000 }; /* calibration time: 800 ms */
+ struct timespec delay = { 0, 500000000 }; /* calibration time: 500 ms */
- for (int i=0; i< 10; i++)
+ for (i=0; i< 10; i++)
{
- timer_start(&data);
- timer_stop(&data);
- result = MIN(result,timer_printCycles(&data));
+ _timer_start(&data);
+ _timer_stop(&data);
+ result = MIN(result,_timer_printCycles(&data));
}
baseline = result;
result = 0xFFFFFFFFFFFFFFFFULL;
+ data.stop.int64 = 0;
+ data.start.int64 = 0;
- for (int i=0; i< 2; i++)
+ for (i=0; i< 2; i++)
{
- RDTSC(start);
+ _timer_start(&data);
gettimeofday( &tv1, &tzp);
nanosleep( &delay, NULL);
- RDTSC_STOP(stop);
+ _timer_stop(&data);
gettimeofday( &tv2, &tzp);
- result = MIN(result,(stop.int64 - start.int64));
+ result = MIN(result,(data.stop.int64 - data.start.int64));
}
- return (result) * 1000000 /
+ cpuClock = (result) * 1000000 /
(((uint64_t)tv2.tv_sec * 1000000 + tv2.tv_usec) -
((uint64_t)tv1.tv_sec * 1000000 + tv1.tv_usec));
+ cyclesClock = cpuClock;
#endif
#ifdef _ARCH_PPC
FILE *fpipe;
char *command="grep timebase /proc/cpuinfo | awk '{ print $3 }'";
+ char *command2="grep clock /proc/cpuinfo | head -n 1 | awk '{ print $3 }'";
char buff[256];
if ( !(fpipe = (FILE*)popen(command,"r")) )
@@ -91,55 +254,223 @@ getCpuSpeed(void)
fgets(buff, 256, fpipe);
- return (uint64_t) atoi(buff);
+ cyclesClock = (uint64_t) atoi(buff);
+ if ( !(fpipe = (FILE*)popen(command2,"r")) )
+ { // If fpipe is NULL
+ perror("Problems with pipe");
+ exit(1);
+ }
+
+ fgets(buff, 256, fpipe);
+
+ cpuClock = (uint64_t) atoi(buff);
+ cpuClock *= 1E6;
#endif
}
+
+/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
+
+void init_sleep()
+{
+ int status;
+ TimerData timer;
+ struct timespec req = {0,1};
+ struct timespec rem = {0,0};
+ for (int i=0; i<10; ++i)
+ {
+ _timer_start(&timer);
+ status = clock_nanosleep(CLOCK_REALTIME,0,&req, &rem);
+ _timer_stop(&timer);
+ if (_timer_print(&timer)*1E6 > sleepbase)
+ {
+ sleepbase = _timer_print(&timer)*1E6 + 2;
+ }
+ }
+}
+
+
void timer_init( void )
{
- cpuClock = getCpuSpeed();
+ uint32_t eax = 0x0,ebx = 0x0,ecx = 0x0,edx = 0x0;
+ if (timer_initialized == 1)
+ {
+ return;
+ }
+ if ((!TSTART) && (!TSTOP))
+ {
+ TSTART = fRDTSC;
+ eax = 0x80000001;
+ CPUID (eax, ebx, ecx, edx);
+#ifndef __MIC__
+ if (edx & (1<<27))
+ {
+ TSTOP = fRDTSCP;
+ }
+ else
+ {
+ TSTOP = fRDTSC_CR;
+ }
+#else
+ TSTOP = fRDTSC_CR;
+#endif
+ }
+ if (cpuClock == 0ULL)
+ {
+ getCpuSpeed();
+ }
+ timer_initialized = 1;
}
uint64_t timer_printCycles( TimerData* time )
{
- /* clamp to zero if something goes wrong */
- if ((time->stop.int64-baseline) < time->start.int64)
+ if (timer_initialized != 1)
{
+ ERROR_PLAIN_PRINT(Timer module not properly initialized);
return 0ULL;
}
- else
- {
- return (time->stop.int64 - time->start.int64 - baseline);
- }
+ return _timer_printCycles(time);
}
/* Return time duration in seconds */
double timer_print( TimerData* time )
{
uint64_t cycles;
+ if (timer_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Timer module not properly initialized);
+ return 0ULL;
+ }
+ return _timer_print(time);
+}
- /* clamp to zero if something goes wrong */
- if ((time->stop.int64-baseline) < time->start.int64)
+uint64_t timer_getCpuClock( void )
+{
+ if (timer_initialized != 1)
{
- cycles = 0ULL;
+ ERROR_PLAIN_PRINT(Timer module not properly initialized);
+ return 0ULL;
}
- else
+ return cpuClock;
+}
+
+uint64_t timer_getCpuClockCurrent( int cpu_id )
+{
+ int err;
+ uint64_t clock = 0x0ULL;
+ FILE *fpipe;
+ char cmd[256];
+ char buff[256];
+ char* eptr, *rptr;
+
+ sprintf(buff, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq", cpu_id);
+ if (access(buff, R_OK))
{
- cycles = time->stop.int64 - time->start.int64 - baseline;
+ ERROR_PRINT(File %s not readable, buff);
+ return clock;
+ }
+ sprintf(cmd, "cat %s", buff);
+ if ( !(fpipe = (FILE*)popen(cmd,"r")) )
+ { // If fpipe is NULL
+ ERROR_PRINT(Problems reading cpu frequency of CPU %d, cpu_id);
+ return clock;
}
- return ((double) cycles / (double) cpuClock);
+ rptr = fgets(buff, 256, fpipe);
+ if (rptr != NULL)
+ {
+ clock = strtoull(buff, &eptr, 10);
+ }
+ return clock *1E3;
}
-uint64_t timer_getCpuClock( void )
+uint64_t timer_getCycleClock( void )
{
- return cpuClock;
+ if (timer_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Timer module not properly initialized);
+ return 0ULL;
+ }
+ return cyclesClock;
}
uint64_t timer_getBaseline( void )
{
+ if (timer_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Timer module not properly initialized);
+ return 0ULL;
+ }
return baseline;
}
+void timer_start( TimerData* time )
+{
+ if (timer_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Timer module not properly initialized);
+ return;
+ }
+ _timer_start(time);
+}
+
+
+void timer_stop( TimerData* time )
+{
+ if (timer_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Timer module not properly initialized);
+ return;
+ }
+ _timer_stop(time);
+}
+
+
+int timer_sleep(unsigned long usec)
+{
+ int status = -1;
+ struct timespec req;
+ struct timespec rem = {0,0};
+ if (sleepbase == 0x0ULL)
+ {
+ init_sleep();
+ }
+ if (usec >= 1000000)
+ {
+ status = sleep(usec / 1000000);
+ }
+ else
+ {
+ req.tv_sec = 0;
+ req.tv_nsec = (usec-sleepbase)*1.E3;
+ status = clock_nanosleep(CLOCK_REALTIME,0,&req, &rem);
+ if ((status == -1) && (errno == EINTR))
+ {
+ status = (rem.tv_sec * 1E6) + (rem.tv_nsec * 1E-3);
+ }
+ }
+ return status;
+}
+
+
+void timer_finalize(void)
+{
+ if (timer_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Timer module not properly initialized);
+ return;
+ }
+ baseline = 0ULL;
+ cpuClock = 0ULL;
+ TSTART = NULL;
+ TSTOP = NULL;
+ timer_initialized = 0;
+}
+
+void timer_reset( TimerData* time )
+{
+ time->start.int64 = 0;
+ time->stop.int64 = 0;
+}
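The reworked timer module above selects RDTSC/RDTSCP at runtime and refuses to run before timer_init(). A minimal usage sketch of the exported calls exactly as they appear in this hunk; do_work() is a hypothetical workload and the prototypes are assumed to come in through likwid.h, which timer.c itself includes:

    #include <stdio.h>
    #include <likwid.h>                 /* assumed to declare TimerData and the timer_* API */

    extern void do_work(void);          /* hypothetical workload */

    void time_work(void)
    {
        TimerData t;
        timer_init();                   /* calibrate baseline and CPU clock once */
        timer_start(&t);
        do_work();
        timer_stop(&t);
        printf("%.6f s (%llu cycles)\n",
               timer_print(&t),                              /* duration in seconds */
               (unsigned long long) timer_printCycles(&t));  /* raw cycle count */
        timer_finalize();
    }
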
diff --git a/src/topology.c b/src/topology.c
new file mode 100644
index 0000000..602abf2
--- /dev/null
+++ b/src/topology.c
@@ -0,0 +1,1041 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: topology.c
+ *
+ * Description: Interface to the topology backends
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Authors: Jan Treibig (jt), jan.treibig at gmail.com,
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sched.h>
+
+#include <likwid.h>
+
+#include <topology.h>
+#include <error.h>
+#include <tree.h>
+#include <bitUtil.h>
+//#include <strUtil.h>
+#include <configuration.h>
+
+
+static int topology_initialized = 0;
+CpuInfo cpuid_info;
+CpuTopology cpuid_topology;
+
+int affinity_thread2tile_lookup[MAX_NUM_THREADS];
+
+static char* pentium_m_b_str = "Intel Pentium M Banias processor";
+static char* pentium_m_d_str = "Intel Pentium M Dothan processor";
+static char* core_duo_str = "Intel Core Duo processor";
+static char* core_2a_str = "Intel Core 2 65nm processor";
+static char* core_2b_str = "Intel Core 2 45nm processor";
+static char* atom_45_str = "Intel Atom 45nm processor";
+static char* atom_32_str = "Intel Atom 32nm processor";
+static char* atom_22_str = "Intel Atom 22nm processor";
+static char* atom_silvermont_str = "Intel Atom (Silvermont) processor";
+static char* atom_airmont_str = "Intel Atom (Airmont) processor";
+static char* atom_goldmont_str = "Intel Atom (Goldmont) processor";
+static char* nehalem_bloom_str = "Intel Core Bloomfield processor";
+static char* nehalem_lynn_str = "Intel Core Lynnfield processor";
+static char* nehalem_west_str = "Intel Core Westmere processor";
+static char* sandybridge_str = "Intel Core SandyBridge processor";
+static char* ivybridge_str = "Intel Core IvyBridge processor";
+static char* ivybridge_ep_str = "Intel Xeon IvyBridge EN/EP/EX processor";
+static char* sandybridge_ep_str = "Intel Xeon SandyBridge EN/EP processor";
+static char* haswell_str = "Intel Core Haswell processor";
+static char* haswell_ep_str = "Intel Xeon Haswell EN/EP/EX processor";
+static char* broadwell_str = "Intel Core Broadwell processor";
+static char* broadwell_d_str = "Intel Xeon D Broadwell processor";
+static char* broadwell_ep_str = "Intel Xeon Broadwell EN/EP/EX processor";
+static char* skylake_str = "Intel Skylake processor";
+static char* nehalem_ex_str = "Intel Nehalem EX processor";
+static char* westmere_ex_str = "Intel Westmere EX processor";
+static char* xeon_mp_string = "Intel Xeon MP processor";
+static char* xeon_phi_string = "Intel Xeon Phi (Knights Corner) Coprocessor";
+static char* xeon_phi2_string = "Intel Xeon Phi (Knights Landing) Coprocessor";
+static char* barcelona_str = "AMD Barcelona processor";
+static char* shanghai_str = "AMD Shanghai processor";
+static char* istanbul_str = "AMD Istanbul processor";
+static char* magnycours_str = "AMD Magny Cours processor";
+static char* interlagos_str = "AMD Interlagos processor";
+static char* kabini_str = "AMD Family 16 model - Kabini processor";
+static char* opteron_sc_str = "AMD Opteron single core 130nm processor";
+static char* opteron_dc_e_str = "AMD Opteron Dual Core Rev E 90nm processor";
+static char* opteron_dc_f_str = "AMD Opteron Dual Core Rev F 90nm processor";
+static char* athlon64_str = "AMD Athlon64 X2 (AM2) Rev F 90nm processor";
+static char* athlon64_f_str = "AMD Athlon64 (AM2) Rev F 90nm processor";
+static char* athlon64_X2_g_str = "AMD Athlon64 X2 (AM2) Rev G 65nm processor";
+static char* athlon64_g_str = "AMD Athlon64 (AM2) Rev G 65nm processor";
+static char* amd_k8_str = "AMD K8 architecture";
+static char* unknown_intel_str = "Unknown Intel Processor";
+static char* unknown_amd_str = "Unknown AMD Processor";
+
+static char* short_core2 = "core2";
+static char* short_atom = "atom";
+static char* short_pm = "pentiumm";
+static char* short_silvermont = "silvermont";
+static char* short_goldmont = "goldmont";
+static char* short_nehalem = "nehalem";
+static char* short_nehalemEX = "nehalemEX";
+static char* short_westmere = "westmere";
+static char* short_westmereEX = "westmereEX";
+static char* short_haswell = "haswell";
+static char* short_haswell_ep = "haswellEP";
+static char* short_broadwell = "broadwell";
+static char* short_broadwell_d = "broadwellD";
+static char* short_broadwell_ep = "broadwellEP";
+static char* short_ivybridge = "ivybridge";
+static char* short_ivybridge_ep = "ivybridgeEP";
+static char* short_sandybridge = "sandybridge";
+static char* short_sandybridge_ep = "sandybridgeEP";
+static char* short_skylake = "skylake";
+static char* short_phi = "phi";
+static char* short_phi2 = "phi2";
+static char* short_k8 = "k8";
+static char* short_k10 = "k10";
+static char* short_k15 = "interlagos";
+static char* short_k16 = "kabini";
+static char* short_unknown = "unknown";
+
+
+
+int cpu_count(cpu_set_t* set)
+{
+ uint32_t i;
+ int s = 0;
+ const __cpu_mask *p = set->__bits;
+ const __cpu_mask *end = &set->__bits[sizeof(cpu_set_t) / sizeof (__cpu_mask)];
+
+ while (p < end)
+ {
+ __cpu_mask l = *p++;
+
+ if (l == 0)
+ {
+ continue;
+ }
+
+ for (i=0; i< (sizeof(__cpu_mask)*8); i++)
+ {
+ if (l&(1UL<<i))
+ {
+ s++;
+ }
+ }
+ }
+
+ return s;
+}
+
+static void initTopologyFile(FILE* file)
+{
+ size_t items;
+ HWThread* hwThreadPool;
+ CacheLevel* cacheLevels;
+ TreeNode* currentNode;
+
+ items = fread((void*) &cpuid_topology, sizeof(CpuTopology), 1, file);
+
+ hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
+ items = fread((void*) hwThreadPool, sizeof(HWThread), cpuid_topology.numHWThreads, file);
+ cpuid_topology.threadPool = hwThreadPool;
+
+ cacheLevels = (CacheLevel*) malloc(cpuid_topology.numCacheLevels * sizeof(CacheLevel));
+ items = fread((void*) cacheLevels, sizeof(CacheLevel), cpuid_topology.numCacheLevels, file);
+ cpuid_topology.cacheLevels = cacheLevels;
+ cpuid_topology.topologyTree = NULL;
+
+ tree_init(&cpuid_topology.topologyTree, 0);
+
+ for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++)
+ {
+ if (!tree_nodeExists(cpuid_topology.topologyTree,
+ hwThreadPool[i].packageId))
+ {
+ tree_insertNode(cpuid_topology.topologyTree,
+ hwThreadPool[i].packageId);
+ }
+ currentNode = tree_getNode(cpuid_topology.topologyTree,
+ hwThreadPool[i].packageId);
+
+ if (!tree_nodeExists(currentNode, hwThreadPool[i].coreId))
+ {
+ tree_insertNode(currentNode, hwThreadPool[i].coreId);
+ }
+ currentNode = tree_getNode(currentNode, hwThreadPool[i].coreId);
+
+ if (!tree_nodeExists(currentNode, i))
+ {
+ tree_insertNode(currentNode, i);
+ affinity_thread2tile_lookup[hwThreadPool[i].apicId] = hwThreadPool[i].coreId;
+ }
+ }
+}
+
+
+static int readTopologyFile(const char* filename)
+{
+ FILE* fp;
+ char structure[256];
+ char field[256];
+ char value[256];
+ char line[512];
+ int numHWThreads = -1;
+ int numCacheLevels = -1;
+ int numberOfNodes = -1;
+ int* tmpNumberOfProcessors;
+ int counter;
+ int i;
+ uint32_t tmp, tmp1;
+
+ fp = fopen(filename, "r");
+
+ while (fgets(line, 512, fp) != NULL) {
+ sscanf(line,"%s %s", structure, field);
+ if ((strcmp(structure, "cpuid_topology") == 0) && (strcmp(field, "numHWThreads") == 0))
+ {
+ sscanf(line,"%s %s = %d", structure, field, &numHWThreads);
+ }
+ else if ((strcmp(structure, "cpuid_topology") == 0) && (strcmp(field, "numCacheLevels") == 0))
+ {
+ sscanf(line,"%s %s = %d", structure, field, &numCacheLevels);
+ }
+ else if ((strcmp(structure, "numa_info") == 0) && (strcmp(field, "numberOfNodes") == 0))
+ {
+ sscanf(line,"%s %s = %d", structure, field, &numberOfNodes);
+ }
+ if ((numHWThreads >= 0) && (numCacheLevels >= 0) && (numberOfNodes >= 0))
+ {
+ break;
+ }
+ }
+ if (numHWThreads < 0 || numCacheLevels < 0 || numberOfNodes < 0)
+ {
+ ERROR_PRINT(Cannot read topology information from file %s, filename);
+ fclose(fp);
+ return -1;
+ }
+
+ tmpNumberOfProcessors = (int*) malloc(numberOfNodes *sizeof(int));
+ fseek(fp, 0, SEEK_SET);
+ counter = 0;
+ while (fgets(line, 512, fp) != NULL) {
+ sscanf(line,"%s %s %d %s = %d", structure, field, &tmp, value, &tmp1);
+ if ((strcmp(structure, "numa_info") == 0) && (strcmp(value, "numberOfProcessors") == 0))
+ {
+ tmpNumberOfProcessors[tmp-1] = tmp1;
+ counter++;
+ }
+ if (counter == numberOfNodes)
+ {
+ break;
+ }
+ }
+
+ cpuid_topology.threadPool = (HWThread*)malloc(numHWThreads * sizeof(HWThread));
+ cpuid_topology.cacheLevels = (CacheLevel*)malloc(numCacheLevels * sizeof(CacheLevel));
+ cpuid_topology.numHWThreads = numHWThreads;
+ cpuid_topology.numCacheLevels = numCacheLevels;
+
+ numa_info.nodes = (NumaNode*) malloc(numberOfNodes * sizeof(NumaNode));
+ numa_info.numberOfNodes = numberOfNodes;
+
+ for(i=0;i<numberOfNodes;i++)
+ {
+ numa_info.nodes[i].processors = (uint32_t*) malloc (tmpNumberOfProcessors[i] * sizeof(int));
+ numa_info.nodes[i].distances = (uint32_t*) malloc (numberOfNodes * sizeof(int));
+ }
+ free(tmpNumberOfProcessors);
+
+ fseek(fp, 0, SEEK_SET);
+
+ while (fgets(line, 512, fp) != NULL) {
+ sscanf(line,"%s %s", structure, field);
+ if (strcmp(structure, "cpuid_topology") == 0)
+ {
+ if (strcmp(field, "numSockets") == 0)
+ {
+ sscanf(line, "%s %s = %d", structure, field, &tmp);
+ cpuid_topology.numSockets = tmp;
+ }
+ else if (strcmp(field, "numCoresPerSocket") == 0)
+ {
+ sscanf(line, "%s %s = %d", structure, field, &tmp);
+ cpuid_topology.numCoresPerSocket = tmp;
+ }
+ else if (strcmp(field, "numThreadsPerCore") == 0)
+ {
+ sscanf(line, "%s %s = %d", structure, field, &tmp);
+ cpuid_topology.numThreadsPerCore = tmp;
+ }
+ else if (strcmp(field, "threadPool") == 0)
+ {
+ int thread;
+
+ sscanf(line, "%s %s %d %s = %d", structure, field, &thread, value, &tmp);
+
+ if (strcmp(value, "threadId") == 0)
+ {
+ cpuid_topology.threadPool[thread].threadId = tmp;
+ }
+ else if (strcmp(value, "coreId") == 0)
+ {
+ cpuid_topology.threadPool[thread].coreId = tmp;
+ }
+ else if (strcmp(value, "packageId") == 0)
+ {
+ cpuid_topology.threadPool[thread].packageId = tmp;
+ }
+ else if (strcmp(value, "apicId") == 0)
+ {
+ cpuid_topology.threadPool[thread].apicId = tmp;
+ }
+
+ }
+ else if (strcmp(field, "cacheLevels") == 0)
+ {
+ int level;
+ char type[128];
+ sscanf(line, "%s %s %d %s", structure, field, &level, value);
+
+ cpuid_topology.cacheLevels[level-1].level = level-1;
+ if (strcmp(value, "type") == 0)
+ {
+ sscanf(line, "%s %s %d %s = %s", structure, field, &level, value, type);
+ if (strcmp(type, "UNIFIEDCACHE") == 0)
+ {
+ cpuid_topology.cacheLevels[level-1].type = UNIFIEDCACHE;
+ }
+ else if (strcmp(type, "DATACACHE") == 0)
+ {
+ cpuid_topology.cacheLevels[level-1].type = DATACACHE;
+ }
+ else if (strcmp(type, "INSTRUCTIONCACHE") == 0)
+ {
+ cpuid_topology.cacheLevels[level-1].type = INSTRUCTIONCACHE;
+ }
+ else if (strcmp(type, "ITLB") == 0)
+ {
+ cpuid_topology.cacheLevels[level-1].type = ITLB;
+ }
+ else if (strcmp(type, "DTLB") == 0)
+ {
+ cpuid_topology.cacheLevels[level-1].type = DTLB;
+ }
+ else if (strcmp(type, "NOCACHE") == 0)
+ {
+ cpuid_topology.cacheLevels[level-1].type = NOCACHE;
+ }
+ }
+ else
+ {
+ sscanf(line, "%s %s %d %s = %d", structure, field, &level, value, &tmp);
+ if (strcmp(value, "associativity") == 0)
+ {
+ cpuid_topology.cacheLevels[level-1].associativity = tmp;
+ }
+ else if (strcmp(value, "sets") == 0)
+ {
+ cpuid_topology.cacheLevels[level-1].sets = tmp;
+ }
+ else if (strcmp(value, "lineSize") == 0)
+ {
+ cpuid_topology.cacheLevels[level-1].lineSize = tmp;
+ }
+ else if (strcmp(value, "size") == 0)
+ {
+ cpuid_topology.cacheLevels[level-1].size = tmp;
+ }
+ else if (strcmp(value, "threads") == 0)
+ {
+ cpuid_topology.cacheLevels[level-1].threads = tmp;
+ }
+ else if (strcmp(value, "inclusive") == 0)
+ {
+ cpuid_topology.cacheLevels[level-1].inclusive = tmp;
+ }
+ }
+
+ }
+ }
+ else if (strcmp(structure, "cpuid_info") == 0)
+ {
+ if (strcmp(field, "family") == 0)
+ {
+ sscanf(line, "%s %s = %d", structure, field, &tmp);
+ cpuid_info.family = tmp;
+
+ }
+ else if (strcmp(field, "model") == 0)
+ {
+ sscanf(line, "%s %s = %d", structure, field, &tmp);
+ cpuid_info.model = tmp;
+ }
+ else if (strcmp(field, "osname") == 0)
+ {
+ strcpy(value,&(line[strlen(structure)+strlen(field)+4]));
+ cpuid_info.osname = (char*) malloc((strlen(value)+1) * sizeof(char));
+ strncpy(cpuid_info.osname, value, strlen(value));
+ cpuid_info.osname[strlen(value)-1] = '\0';
+ }
+ else if (strcmp(field, "stepping") == 0)
+ {
+ sscanf(line, "%s %s = %d", structure, field, &tmp);
+ cpuid_info.stepping = tmp;
+
+ }
+ else if (strcmp(field, "clock") == 0)
+ {
+ sscanf(line, "%s %s = %d", structure, field, &tmp);
+ cpuid_info.clock = tmp;
+
+ }
+ else if (strcmp(field, "turbo") == 0)
+ {
+ sscanf(line, "%s %s = %d", structure, field, &tmp);
+ cpuid_info.turbo = tmp;
+
+ }
+ else if (strcmp(field, "isIntel") == 0)
+ {
+ sscanf(line, "%s %s = %d", structure, field, &tmp);
+ cpuid_info.isIntel = tmp;
+
+ }
+ else if (strcmp(field, "featureFlags") == 0)
+ {
+ sscanf(line, "%s %s = %d", structure, field, &tmp);
+ cpuid_info.featureFlags = tmp;
+
+ }
+ else if (strcmp(field, "perf_version") == 0)
+ {
+ sscanf(line, "%s %s = %d", structure, field, &tmp);
+ cpuid_info.perf_version = tmp;
+
+ }
+ else if (strcmp(field, "perf_num_ctr") == 0)
+ {
+ sscanf(line, "%s %s = %d", structure, field, &tmp);
+ cpuid_info.perf_num_ctr = tmp;
+
+ }
+ else if (strcmp(field, "perf_width_ctr") == 0)
+ {
+ sscanf(line, "%s %s = %d", structure, field, &tmp);
+ cpuid_info.perf_width_ctr = tmp;
+
+ }
+ else if (strcmp(field, "perf_num_fixed_ctr") == 0)
+ {
+ sscanf(line, "%s %s = %d", structure, field, &tmp);
+ cpuid_info.perf_num_fixed_ctr = tmp;
+
+ }
+ else if (strcmp(field, "features") == 0)
+ {
+ strcpy(value,&(line[strlen(structure)+strlen(field)+4]));
+ cpuid_info.features = (char*) malloc((strlen(value)+1) * sizeof(char));
+ strncpy(cpuid_info.features, value, strlen(value));
+ cpuid_info.features[strlen(value)-1] = '\0';
+ }
+ }
+ else if (strcmp(structure, "numa_info") == 0)
+ {
+ if (strcmp(field, "nodes") == 0)
+ {
+ int id;
+ sscanf(line, "%s %s %d %s", structure, field, &id, value);
+
+ if (strcmp(value,"numberOfProcessors") == 0)
+ {
+ sscanf(line, "%s %s %d %s = %d", structure, field, &id, value, &tmp);
+ numa_info.nodes[id-1].numberOfProcessors = tmp;
+ }
+ else if (strcmp(value, "freeMemory") == 0)
+ {
+ sscanf(line, "%s %s %d %s = %d", structure, field, &id, value, &tmp);
+ numa_info.nodes[id-1].freeMemory = tmp;
+ }
+ else if (strcmp(value, "id") == 0)
+ {
+ sscanf(line, "%s %s %d %s = %d", structure, field, &id, value, &tmp);
+ numa_info.nodes[id-1].id = tmp;
+ }
+ else if (strcmp(value, "totalMemory") == 0)
+ {
+ sscanf(line, "%s %s %d %s = %d", structure, field, &id, value, &tmp);
+ numa_info.nodes[id-1].totalMemory = tmp;
+ }
+ else if (strcmp(value, "numberOfDistances") == 0)
+ {
+ sscanf(line, "%s %s %d %s = %d", structure, field, &id, value, &tmp);
+ numa_info.nodes[id-1].numberOfDistances = tmp;
+ }
+ if (strcmp(value, "processors") == 0)
+ {
+ sscanf(line, "%s %s %d %s %d = %d", structure, field, &id, value, &tmp, &tmp1);
+ numa_info.nodes[id-1].processors[tmp-1] = tmp1;
+ }
+ else if (strcmp(value,"distances") == 0)
+ {
+ sscanf(line, "%s %s %d %s %d = %d", structure, field, &id, value, &tmp, &tmp1);
+ numa_info.nodes[id-1].distances[tmp] = tmp1;
+ }
+ }
+ }
+ }
+ fclose(fp);
+
+ return 0;
+}
+
+int topology_setName(void)
+{
+ switch ( cpuid_info.family )
+ {
+ case P6_FAMILY:
+ switch ( cpuid_info.model )
+ {
+ case PENTIUM_M_BANIAS:
+ cpuid_info.name = pentium_m_b_str;
+ cpuid_info.short_name = short_pm;
+ break;
+
+ case PENTIUM_M_DOTHAN:
+ cpuid_info.name = pentium_m_d_str;
+ cpuid_info.short_name = short_pm;
+ break;
+
+ case CORE_DUO:
+ cpuid_info.name = core_duo_str;
+ cpuid_info.short_name = short_core2;
+ break;
+
+ case CORE2_65:
+ cpuid_info.name = core_2a_str;
+ cpuid_info.short_name = short_core2;
+ break;
+
+ case CORE2_45:
+ cpuid_info.name = core_2b_str;
+ cpuid_info.short_name = short_core2;
+ break;
+
+ case NEHALEM_BLOOMFIELD:
+ cpuid_info.name = nehalem_bloom_str;
+ cpuid_info.short_name = short_nehalem;
+ break;
+
+ case NEHALEM_LYNNFIELD:
+ cpuid_info.name = nehalem_lynn_str;
+ cpuid_info.short_name = short_nehalem;
+ break;
+
+ case NEHALEM_WESTMERE_M:
+
+ case NEHALEM_WESTMERE:
+ cpuid_info.name = nehalem_west_str;
+ cpuid_info.short_name = short_westmere;
+ break;
+
+ case SANDYBRIDGE:
+ cpuid_info.name = sandybridge_str;
+ cpuid_info.short_name = short_sandybridge;
+ break;
+
+ case SANDYBRIDGE_EP:
+ cpuid_info.supportUncore = 1;
+ cpuid_info.name = sandybridge_ep_str;
+ cpuid_info.short_name = short_sandybridge_ep;
+ break;
+
+ case IVYBRIDGE:
+ cpuid_info.name = ivybridge_str;
+ cpuid_info.short_name = short_ivybridge;
+ break;
+
+ case IVYBRIDGE_EP:
+ cpuid_info.supportUncore = 1;
+ cpuid_info.name = ivybridge_ep_str;
+ cpuid_info.short_name = short_ivybridge_ep;
+ break;
+
+ case HASWELL_EP:
+ cpuid_info.supportUncore = 1;
+ cpuid_info.name = haswell_ep_str;
+ cpuid_info.short_name = short_haswell_ep;
+ break;
+ case HASWELL:
+ case HASWELL_M1:
+ case HASWELL_M2:
+ cpuid_info.name = haswell_str;
+ cpuid_info.short_name = short_haswell;
+ break;
+
+ case BROADWELL:
+ cpuid_info.name = broadwell_str;
+ cpuid_info.short_name = short_broadwell;
+ break;
+ case BROADWELL_D:
+ cpuid_info.name = broadwell_d_str;
+ cpuid_info.short_name = short_broadwell_d;
+ break;
+ case BROADWELL_E:
+ cpuid_info.name = broadwell_ep_str;
+ cpuid_info.short_name = short_broadwell_ep;
+ break;
+
+ case SKYLAKE1:
+ case SKYLAKE2:
+ cpuid_info.name = skylake_str;
+ cpuid_info.short_name = short_skylake;
+ break;
+
+ case XEON_PHI2:
+ cpuid_info.name = xeon_phi2_string;
+ cpuid_info.short_name = short_phi2;
+ break;
+
+ case NEHALEM_EX:
+ cpuid_info.name = nehalem_ex_str;
+ cpuid_info.short_name = short_nehalemEX;
+ break;
+
+ case WESTMERE_EX:
+ cpuid_info.name = westmere_ex_str;
+ cpuid_info.short_name = short_westmereEX;
+ break;
+
+ case XEON_MP:
+ cpuid_info.name = xeon_mp_string;
+ cpuid_info.short_name = short_core2;
+ break;
+
+ case ATOM_45:
+
+ case ATOM:
+ cpuid_info.name = atom_45_str;
+ cpuid_info.short_name = short_atom;
+ break;
+
+ case ATOM_32:
+ cpuid_info.name = atom_32_str;
+ cpuid_info.short_name = short_atom;
+ break;
+
+ case ATOM_22:
+ cpuid_info.name = atom_22_str;
+ cpuid_info.short_name = short_atom;
+ break;
+
+ case ATOM_SILVERMONT_E:
+ case ATOM_SILVERMONT_C:
+ case ATOM_SILVERMONT_Z1:
+ case ATOM_SILVERMONT_Z2:
+ case ATOM_SILVERMONT_F:
+ cpuid_info.name = atom_silvermont_str;
+ cpuid_info.short_name = short_silvermont;
+ break;
+ case ATOM_SILVERMONT_AIR:
+ cpuid_info.name = atom_airmont_str;
+ cpuid_info.short_name = short_silvermont;
+ break;
+ case ATOM_SILVERMONT_GOLD:
+ cpuid_info.name = atom_goldmont_str;
+ cpuid_info.short_name = short_goldmont;
+ break;
+
+ default:
+ cpuid_info.name = unknown_intel_str;
+ cpuid_info.short_name = short_unknown;
+ break;
+ }
+ break;
+
+ case MIC_FAMILY:
+ switch ( cpuid_info.model )
+ {
+ case XEON_PHI:
+ cpuid_info.name = xeon_phi_string;
+ cpuid_info.short_name = short_phi;
+ break;
+
+ }
+ break;
+
+ case K8_FAMILY:
+
+ if (cpuid_info.isIntel)
+ {
+ ERROR_PLAIN_PRINT(Netburst architecture is not supported);
+ }
+
+ switch ( cpuid_info.model )
+ {
+ case OPTERON_DC_E:
+ cpuid_info.name = opteron_dc_e_str;
+ break;
+
+ case OPTERON_DC_F:
+ cpuid_info.name = opteron_dc_f_str;
+ break;
+
+ case ATHLON64_X2:
+
+ case ATHLON64_X2_F:
+ cpuid_info.name = athlon64_str;
+ break;
+
+ case ATHLON64_F1:
+
+ case ATHLON64_F2:
+ cpuid_info.name = athlon64_f_str;
+ break;
+
+ case ATHLON64_X2_G:
+ cpuid_info.name = athlon64_X2_g_str;
+ break;
+
+ case ATHLON64_G1:
+
+ case ATHLON64_G2:
+ cpuid_info.name = athlon64_g_str;
+ break;
+
+ case OPTERON_SC_1MB:
+ cpuid_info.name = opteron_sc_str;
+ break;
+
+ default:
+ cpuid_info.name = amd_k8_str;
+ break;
+ }
+ cpuid_info.short_name = short_k8;
+ break;
+
+ case K10_FAMILY:
+ switch ( cpuid_info.model )
+ {
+ case BARCELONA:
+ cpuid_info.name = barcelona_str;
+ break;
+
+ case SHANGHAI:
+ cpuid_info.name = shanghai_str;
+ break;
+
+ case ISTANBUL:
+ cpuid_info.name = istanbul_str;
+ break;
+
+ case MAGNYCOURS:
+ cpuid_info.name = magnycours_str;
+ break;
+
+ default:
+ cpuid_info.name = unknown_amd_str;
+ break;
+ }
+ cpuid_info.short_name = short_k10;
+ break;
+
+ case K15_FAMILY:
+ cpuid_info.name = interlagos_str;
+ cpuid_info.short_name = short_k15;
+ break;
+
+ case K16_FAMILY:
+ cpuid_info.name = kabini_str;
+ cpuid_info.short_name = short_k16;
+ break;
+
+ default:
+ return EXIT_FAILURE;
+ break;
+ }
+ return EXIT_SUCCESS;
+}
+
+const struct topology_functions topology_funcs = {
+#ifndef LIKWID_USE_HWLOC
+ .init_cpuInfo = cpuid_init_cpuInfo,
+ .init_cpuFeatures = cpuid_init_cpuFeatures,
+ .init_nodeTopology = cpuid_init_nodeTopology,
+ .init_cacheTopology = cpuid_init_cacheTopology,
+ .close_topology = NULL,
+#else
+ .init_cpuInfo = hwloc_init_cpuInfo,
+ .init_nodeTopology = hwloc_init_nodeTopology,
+ .init_cacheTopology = hwloc_init_cacheTopology,
+ .init_cpuFeatures = proc_init_cpuFeatures,
+ .close_topology = hwloc_close,
+#endif
+ .init_fileTopology = initTopologyFile,
+};
+
+
+void topology_setupTree(void)
+{
+ uint32_t i;
+ TreeNode* currentNode;
+ HWThread* hwThreadPool = cpuid_topology.threadPool;
+
+ tree_init(&cpuid_topology.topologyTree, 0);
+ for (i=0; i< cpuid_topology.numHWThreads; i++)
+ {
+ /* Add node to Topology tree */
+ if (!tree_nodeExists(cpuid_topology.topologyTree,
+ hwThreadPool[i].packageId))
+ {
+ //printf("Insert Socket %d\n", hwThreadPool[i].packageId);
+ tree_insertNode(cpuid_topology.topologyTree,
+ hwThreadPool[i].packageId);
+ }
+ currentNode = tree_getNode(cpuid_topology.topologyTree,
+ hwThreadPool[i].packageId);
+ if (!tree_nodeExists(currentNode, hwThreadPool[i].coreId))
+ {
+ //printf("Insert Core %d at Socket %d\n", hwThreadPool[i].coreId, hwThreadPool[i].packageId);
+ tree_insertNode(currentNode, hwThreadPool[i].coreId);
+ }
+ currentNode = tree_getNode(currentNode, hwThreadPool[i].coreId);
+ if (!tree_nodeExists(currentNode, hwThreadPool[i].apicId))
+ {
+ /*
+ printf("WARNING: Thread already exists!\n");
+ */
+ //printf("Insert HWThread %d from Core %d at Socket %d\n", hwThreadPool[i].apicId, hwThreadPool[i].coreId, hwThreadPool[i].packageId);
+ tree_insertNode(currentNode, hwThreadPool[i].apicId);
+ affinity_thread2tile_lookup[hwThreadPool[i].apicId] = hwThreadPool[i].coreId;
+ }
+
+ }
+ cpuid_topology.numSockets = tree_countChildren(cpuid_topology.topologyTree);
+ currentNode = tree_getChildNode(cpuid_topology.topologyTree);
+ cpuid_topology.numCoresPerSocket = tree_countChildren(currentNode);
+ currentNode = tree_getChildNode(currentNode);
+ cpuid_topology.numThreadsPerCore = tree_countChildren(currentNode);
+ return;
+}
+
+int topology_init(void)
+{
+ int ret = 0;
+ cpu_set_t cpuSet;
+ struct topology_functions funcs = topology_funcs;
+
+ if (topology_initialized)
+ {
+ return EXIT_SUCCESS;
+ }
+
+ if (init_configuration())
+ {
+ ERROR_PLAIN_PRINT(Cannot initialize configuration module to check for topology file name);
+ return EXIT_FAILURE;
+ }
+
+ if ((config.topologyCfgFileName == NULL) || access(config.topologyCfgFileName, R_OK))
+ {
+standard_init:
+ CPU_ZERO(&cpuSet);
+ sched_getaffinity(0,sizeof(cpu_set_t), &cpuSet);
+ if (cpu_count(&cpuSet) < sysconf(_SC_NPROCESSORS_CONF))
+ {
+ funcs.init_cpuInfo = proc_init_cpuInfo;
+ funcs.init_cpuFeatures = proc_init_cpuFeatures;
+ funcs.init_nodeTopology = proc_init_nodeTopology;
+ funcs.init_cacheTopology = proc_init_cacheTopology;
+ cpuid_topology.activeHWThreads =
+ ((cpu_count(&cpuSet) < sysconf(_SC_NPROCESSORS_CONF)) ?
+ cpu_count(&cpuSet) :
+ sysconf(_SC_NPROCESSORS_CONF));
+ }
+ else
+ {
+ cpuid_topology.activeHWThreads = sysconf(_SC_NPROCESSORS_CONF);
+ }
+ funcs.init_cpuInfo(cpuSet);
+ topology_setName();
+ funcs.init_cpuFeatures();
+ funcs.init_nodeTopology(cpuSet);
+ topology_setupTree();
+ funcs.init_cacheTopology();
+ sched_setaffinity(0, sizeof(cpu_set_t), &cpuSet);
+ }
+ else
+ {
+ CPU_ZERO(&cpuSet);
+ sched_getaffinity(0,sizeof(cpu_set_t), &cpuSet);
+ DEBUG_PRINT(DEBUGLEV_INFO, Reading topology information from %s, config.topologyCfgFileName);
+ ret = readTopologyFile(config.topologyCfgFileName);
+ if (ret < 0)
+ goto standard_init;
+ cpuid_topology.activeHWThreads = 0;
+ for (int i=0;i<cpuid_topology.numHWThreads;i++)
+ {
+ if (CPU_ISSET(cpuid_topology.threadPool[i].apicId, &cpuSet))
+ {
+ cpuid_topology.activeHWThreads++;
+ cpuid_topology.threadPool[i].inCpuSet = 1;
+ }
+ }
+ topology_setName();
+ topology_setupTree();
+ }
+
+
+ topology_initialized = 1;
+ return EXIT_SUCCESS;
+}
+
+
+void topology_finalize(void)
+{
+ struct topology_functions funcs = topology_funcs;
+ if (!topology_initialized)
+ {
+ return;
+ }
+ if (cpuid_info.features != NULL)
+ {
+ free(cpuid_info.features);
+ cpuid_info.features = NULL;
+ }
+ if (cpuid_info.osname != NULL)
+ {
+ free(cpuid_info.osname);
+ cpuid_info.osname = NULL;
+ }
+ if (cpuid_topology.cacheLevels != NULL)
+ {
+ free(cpuid_topology.cacheLevels);
+ cpuid_topology.cacheLevels = NULL;
+ }
+ if (cpuid_topology.threadPool != NULL)
+ {
+ free(cpuid_topology.threadPool);
+ cpuid_topology.threadPool = NULL;
+ }
+ if (cpuid_topology.topologyTree != NULL)
+ {
+ tree_destroy(cpuid_topology.topologyTree);
+ cpuid_topology.topologyTree = NULL;
+ }
+ if (topology_funcs.close_topology != NULL)
+ {
+ topology_funcs.close_topology();
+ }
+ cpuid_info.family = 0;
+ cpuid_info.model = 0;
+ cpuid_info.stepping = 0;
+ cpuid_info.clock = 0;
+ cpuid_info.turbo = 0;
+ cpuid_info.name = NULL;
+ cpuid_info.short_name = NULL;
+ cpuid_info.isIntel = 0;
+ cpuid_info.supportUncore = 0;
+ cpuid_info.featureFlags = 0;
+ cpuid_info.perf_version = 0;
+ cpuid_info.perf_num_ctr = 0;
+ cpuid_info.perf_width_ctr = 0;
+ cpuid_info.perf_num_fixed_ctr = 0;
+
+ cpuid_topology.numHWThreads = 0;
+ cpuid_topology.activeHWThreads = 0;
+ cpuid_topology.numSockets = 0;
+ cpuid_topology.numCoresPerSocket = 0;
+ cpuid_topology.numThreadsPerCore = 0;
+ cpuid_topology.numCacheLevels = 0;
+
+ topology_initialized = 0;
+}
+
+
+
+
+
+void print_supportedCPUs (void)
+{
+ printf("Supported Intel processors:\n");
+ printf("\t%s\n",core_2a_str);
+ printf("\t%s\n",core_2b_str);
+ printf("\t%s\n",xeon_mp_string);
+ printf("\t%s\n",atom_45_str);
+ printf("\t%s\n",atom_32_str);
+ printf("\t%s\n",atom_22_str);
+ printf("\t%s\n",nehalem_bloom_str);
+ printf("\t%s\n",nehalem_lynn_str);
+ printf("\t%s\n",nehalem_west_str);
+ printf("\t%s\n",nehalem_ex_str);
+ printf("\t%s\n",westmere_ex_str);
+ printf("\t%s\n",sandybridge_str);
+ printf("\t%s\n",sandybridge_ep_str);
+ printf("\t%s\n",ivybridge_str);
+ printf("\t%s\n",ivybridge_ep_str);
+ printf("\t%s\n",haswell_str);
+ printf("\t%s\n",haswell_ep_str);
+ printf("\t%s\n",atom_silvermont_str);
+ printf("\t%s\n",atom_airmont_str);
+ printf("\t%s\n",xeon_phi_string);
+ printf("\t%s\n",broadwell_str);
+ printf("\t%s\n",broadwell_d_str);
+ printf("\t%s\n",broadwell_ep_str);
+ printf("\t%s\n",skylake_str);
+ printf("\n");
+ printf("Supported AMD processors:\n");
+ printf("\t%s\n",opteron_sc_str);
+ printf("\t%s\n",opteron_dc_e_str);
+ printf("\t%s\n",opteron_dc_f_str);
+ printf("\t%s\n",barcelona_str);
+ printf("\t%s\n",shanghai_str);
+ printf("\t%s\n",istanbul_str);
+ printf("\t%s\n",magnycours_str);
+ printf("\t%s\n",interlagos_str);
+ printf("\t%s\n",kabini_str);
+ printf("\n");
+}
+
+
+
+CpuTopology_t get_cpuTopology(void)
+{
+ return &cpuid_topology;
+}
+
+CpuInfo_t get_cpuInfo(void)
+{
+ return &cpuid_info;
+}
+NumaTopology_t get_numaTopology(void)
+{
+ return &numa_info;
+}
+
diff --git a/src/topology_cpuid.c b/src/topology_cpuid.c
new file mode 100644
index 0000000..504714d
--- /dev/null
+++ b/src/topology_cpuid.c
@@ -0,0 +1,939 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: topology_cpuid.c
+ *
+ * Description: Interface to the cpuid based topology backend
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Authors: Jan Treibig (jt), jan.treibig at gmail.com,
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sched.h>
+#include <unistd.h>
+
+#include <error.h>
+
+#include <tree.h>
+#include <bitUtil.h>
+#include <tlb-info.h>
+#include <topology.h>
+#include <cpuid.h>
+
+/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
+#define MAX_CACHE_LEVELS 4
+
+/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
+static int largest_function = 0;
+static uint32_t eax, ebx, ecx, edx;
+
+/* Dirty hack to avoid nonnull warnings */
+char* (*ownstrcpy)(char *__restrict __dest, const char *__restrict __src);
+
+/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
+static int intelCpuidFunc_4(CacheLevel** cachePool)
+{
+ int i;
+ int level=0;
+ int maxNumLevels=0;
+ uint32_t valid=1;
+ CacheLevel* pool;
+ while (valid)
+ {
+ eax = 0x04;
+ ecx = level;
+ CPUID(eax, ebx, ecx, edx);
+ valid = extractBitField(eax,5,0);
+ if (!valid)
+ {
+ break;
+ }
+ level++;
+ }
+
+ maxNumLevels = level;
+ *cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
+ pool = *cachePool;
+
+ for (i=0; i < maxNumLevels; i++)
+ {
+ eax = 0x04;
+ ecx = i;
+ CPUID(eax, ebx, ecx, edx);
+
+ pool[i].level = extractBitField(eax,3,5);
+ pool[i].type = (CacheType) extractBitField(eax,5,0);
+ pool[i].associativity = extractBitField(ebx,8,22)+1;
+ pool[i].sets = ecx+1;
+ pool[i].lineSize = extractBitField(ebx,12,0)+1;
+ pool[i].size = pool[i].sets *
+ pool[i].associativity *
+ pool[i].lineSize;
+ pool[i].threads = extractBitField(eax,10,14)+1;
+
+ /* WORKAROUND: cpuid reports a wrong number of threads on SMT processors
+ * with SMT turned off */
+ if (i < 3)
+ {
+ if ((cpuid_info.model == NEHALEM_BLOOMFIELD) ||
+ (cpuid_info.model == NEHALEM_LYNNFIELD) ||
+ (cpuid_info.model == NEHALEM_WESTMERE) ||
+ (cpuid_info.model == NEHALEM_WESTMERE_M) ||
+ (cpuid_info.model == SANDYBRIDGE) ||
+ (cpuid_info.model == SANDYBRIDGE_EP) ||
+ (cpuid_info.model == IVYBRIDGE) ||
+ (cpuid_info.model == IVYBRIDGE_EP) ||
+ (cpuid_info.model == HASWELL) ||
+ (cpuid_info.model == HASWELL_EP) ||
+ (cpuid_info.model == HASWELL_M1) ||
+ (cpuid_info.model == HASWELL_M2) ||
+ (cpuid_info.model == WESTMERE_EX) ||
+ (cpuid_info.model == NEHALEM_EX))
+ {
+ if (cpuid_topology.numThreadsPerCore == 1)
+ {
+ pool[i].threads = 1;
+ }
+ }
+ }
+
+ /* :WORKAROUND:08/13/2009 08:34:15 AM:jt: For L3 caches the value reported
+ * here is sometimes too large. Ask Intel what is wrong here!
+ * Limit the threads per socket to the maximum possible value. */
+ if(pool[i].threads > (int)
+ (cpuid_topology.numCoresPerSocket*
+ cpuid_topology.numThreadsPerCore))
+ {
+ pool[i].threads = cpuid_topology.numCoresPerSocket*
+ cpuid_topology.numThreadsPerCore;
+ }
+ pool[i].inclusive = edx&0x2;
+ }
+
+ return maxNumLevels;
+}
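+
+/*
+ * The size computed above follows the CPUID leaf 0x04 convention
+ * size = sets * associativity * lineSize; for example 8192 sets with 16 ways
+ * and 64-byte lines give 8192 * 16 * 64 = 8 MiB, a typical shared L3 cache.
+ */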
+
+static uint32_t amdGetAssociativity(uint32_t flag)
+{
+ uint32_t asso= 0;
+
+ switch ( flag )
+ {
+ case 0x0:
+ asso = 0;
+ break;
+
+ case 0x1:
+ asso = 1;
+ break;
+
+ case 0x2:
+ asso = 2;
+ break;
+
+ case 0x4:
+ asso = 4;
+ break;
+
+ case 0x6:
+ asso = 8;
+ break;
+
+ case 0x8:
+ asso = 16;
+ break;
+
+ case 0xA:
+ asso = 32;
+ break;
+
+ case 0xB:
+ asso = 48;
+ break;
+
+ case 0xC:
+ asso = 64;
+ break;
+
+ case 0xD:
+ asso = 96;
+ break;
+
+ case 0xE:
+ asso = 128;
+ break;
+
+ case 0xF:
+ asso = 0;
+ break;
+
+ default:
+ break;
+ }
+ return asso;
+
+}
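+
+/*
+ * The table above decodes the 4-bit associativity fields of CPUID leaf
+ * 0x80000006 as documented by AMD: a flag of 0x6 means an 8-way cache and
+ * 0xA a 32-way cache, while the special encodings 0x0 (disabled) and
+ * 0xF (fully associative) are both mapped to 0 here.
+ */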
+
+
+/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
+
+void cpuid_printTlbTopology()
+{
+ int i;
+ uint32_t loop = 1;
+
+ if (cpuid_info.isIntel)
+ {
+ eax = 0x02;
+ CPUID(eax, ebx, ecx, edx);
+
+
+ loop = extractBitField(eax,8,0);
+ for(i=1;i<loop;i++)
+ {
+ eax = 0x02;
+ CPUID(eax, ebx, ecx, edx);
+ }
+
+ for(i=8;i<32;i+=8)
+ {
+ if (extractBitField(eax,8,i) != 0x0)
+ {
+ if (intel_tlb_info[extractBitField(eax,8,i)])
+ printf("%s\n",intel_tlb_info[extractBitField(eax,8,i)]);
+ }
+ }
+ for(i=0;i<32;i+=8)
+ {
+ if (extractBitField(ebx,8,i) != 0x0)
+ {
+ if (intel_tlb_info[extractBitField(ebx,8,i)])
+ printf("%s\n",intel_tlb_info[extractBitField(ebx,8,i)]);
+ }
+ }
+ for(i=0;i<32;i+=8)
+ {
+ if (extractBitField(ecx,8,i) != 0x0)
+ {
+ if (intel_tlb_info[extractBitField(ecx,8,i)])
+ printf("%s\n",intel_tlb_info[extractBitField(ecx,8,i)]);
+ }
+ }
+ for(i=0;i<32;i+=8)
+ {
+ if (extractBitField(edx,8,i) != 0x0)
+ {
+ if (intel_tlb_info[extractBitField(edx,8,i)])
+ printf("%s\n",intel_tlb_info[extractBitField(edx,8,i)]);
+ }
+ }
+ }
+ else
+ {
+ eax = 0x80000005;
+ CPUID(eax, ebx, ecx, edx);
+ printf("L1DTlb2and4MAssoc: 0x%x\n",extractBitField(eax,8,24));
+ printf("L1DTlb2and4MSize: %d entries for 2MB pages\n",(uint32_t)extractBitField(eax,8,16));
+ printf("L1ITlb2and4MAssoc: 0x%x\n",extractBitField(eax,8,8));
+ printf("L1ITlb2and4MSize: %d entries for 2MB pages\n",(uint32_t)extractBitField(eax,8,0));
+ eax = 0x80000005;
+ CPUID(eax, ebx, ecx, edx);
+ printf("L1DTlb4KAssoc: 0x%x\n",extractBitField(ebx,8,24));
+ printf("L1DTlb4KSize: 0x%x\n",extractBitField(ebx,8,16));
+ printf("L1ITlb4KAssoc: 0x%x\n",extractBitField(ebx,8,8));
+ printf("L1ITlb4KSize: 0x%x\n",extractBitField(ebx,8,0));
+ eax = 0x80000006;
+ CPUID(eax, ebx, ecx, edx);
+ printf("L2DTlb2and4MAssoc: 0x%x\n",extractBitField(eax,4,24));
+ printf("L2DTlb2and4MAssoc_c: %d\n",amdGetAssociativity(extractBitField(eax,4,24)));
+ printf("L2DTlb2and4MSize: 0x%x\n",extractBitField(eax,12,16));
+ printf("L2ITlb2and4MAssoc: 0x%x\n",extractBitField(eax,4,12));
+ printf("L2ITlb2and4MAssoc_c: %d\n",amdGetAssociativity(extractBitField(eax,4,12)));
+ printf("L2ITlb2and4MSize: 0x%x\n",extractBitField(eax,12,0));
+ eax = 0x80000006;
+ CPUID(eax, ebx, ecx, edx);
+ printf("L2DTlb4KAssoc: 0x%x\n",extractBitField(ebx,4,24));
+ printf("L2DTlb4KAssoc_c: %d\n",amdGetAssociativity(extractBitField(ebx,4,24)));
+ printf("L2DTlb4KSize: 0x%x\n",extractBitField(ebx,12,16));
+ printf("L2ITlb4KAssoc: 0x%x\n",extractBitField(ebx,4,12));
+ printf("L2ITlb4KAssoc_c: %d\n",amdGetAssociativity(extractBitField(ebx,4,12)));
+ printf("L2ITlb4KSize: 0x%x\n",extractBitField(ebx,12,0));
+ }
+ return;
+}
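+
+/*
+ * Note on the Intel path above: CPUID leaf 0x02 packs one TLB/cache
+ * descriptor per register byte, with the low byte of EAX giving the number
+ * of times the leaf has to be queried. Every non-zero descriptor byte is
+ * used as an index into intel_tlb_info[], e.g. a descriptor byte of 0x03
+ * prints intel_tlb_info[0x03].
+ */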
+
+static void
+cpuid_set_osname(void)
+{
+ FILE *fp;
+ bstring nameString = bformat("model name");
+ cpuid_info.osname = malloc(MAX_MODEL_STRING_LENGTH * sizeof(char));
+ memset(cpuid_info.osname, '\0', MAX_MODEL_STRING_LENGTH);
+ ownstrcpy = strcpy;
+ int i;
+
+ if (NULL != (fp = fopen ("/proc/cpuinfo", "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ struct bstrList* tokens = bsplit(src,(char) '\n');
+
+ for (i=0;i<tokens->qty;i++)
+ {
+ if (binstr(tokens->entry[i],0,nameString) != BSTR_ERR)
+ {
+ struct bstrList* subtokens = bsplit(tokens->entry[i],(char) ':');
+ bltrimws(subtokens->entry[1]);
+ ownstrcpy(cpuid_info.osname, bdata(subtokens->entry[1]));
+ bstrListDestroy(subtokens);
+ }
+ }
+ bstrListDestroy(tokens);
+ bdestroy(src);
+ }
+ else
+ {
+ bdestroy(nameString);
+ ERROR;
+ }
+ bdestroy(nameString);
+ fclose(fp);
+}
+
+
+void cpuid_init_cpuInfo(cpu_set_t cpuSet)
+{
+ int cpus_in_set = 0;
+ cpuid_info.isIntel = 1;
+
+ eax = 0x00;
+ CPUID(eax, ebx, ecx, edx);
+
+ largest_function = eax;
+ if (ebx == 0x68747541U)
+ {
+ cpuid_info.isIntel = 0;
+ }
+
+ eax = 0x01;
+ CPUID(eax, ebx, ecx, edx);
+ cpuid_info.family = ((eax>>8)&0xFU) + ((eax>>20)&0xFFU);
+ cpuid_info.model = (((eax>>16)&0xFU)<<4) + ((eax>>4)&0xFU);
+ cpuid_info.stepping = (eax&0xFU);
+ cpuid_set_osname();
+ cpuid_topology.numHWThreads = sysconf(_SC_NPROCESSORS_CONF);
+ cpus_in_set = cpu_count(&cpuSet);
+ if (cpus_in_set < cpuid_topology.numHWThreads)
+ {
+ cpuid_topology.numHWThreads = cpus_in_set;
+ }
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, CPU-ID CpuInfo Family %d Model %d Stepping %d isIntel %d numHWThreads %d activeHWThreads %d,
+ cpuid_info.family,
+ cpuid_info.model,
+ cpuid_info.stepping,
+ cpuid_info.isIntel,
+ cpuid_topology.numHWThreads,
+ cpuid_topology.activeHWThreads)
+ return;
+}
+
+void cpuid_init_cpuFeatures(void)
+{
+ eax = 0x01;
+ CPUID(eax, ebx, ecx, edx);
+
+ cpuid_info.featureFlags = 0;
+ cpuid_info.features = (char*) malloc(MAX_FEATURE_STRING_LENGTH*sizeof(char));
+ cpuid_info.features[0] = '\0';
+ if (ecx & (1<<0))
+ {
+ strcat(cpuid_info.features, "SSE3 ");
+ cpuid_info.featureFlags |= (1<<SSE3);
+ }
+ if (ecx & (1<<3))
+ {
+ strcat(cpuid_info.features, "MONITOR ");
+ cpuid_info.featureFlags |= (1<<MONITOR);
+ }
+ if (ecx & (1<<5))
+ {
+ strcat(cpuid_info.features, "VMX ");
+ cpuid_info.featureFlags |= (1<<VMX);
+ }
+ if (ecx & (1<<7))
+ {
+ strcat(cpuid_info.features, "EIST ");
+ cpuid_info.featureFlags |= (1<<EIST);
+ }
+ if (ecx & (1<<8))
+ {
+ strcat(cpuid_info.features, "TM2 ");
+ cpuid_info.featureFlags |= (1<<TM2);
+ }
+ if (ecx & (1<<9))
+ {
+ strcat(cpuid_info.features, "SSSE3 ");
+ cpuid_info.featureFlags |= (1<<SSSE3);
+ }
+ if (ecx & (1<<12))
+ {
+ strcat(cpuid_info.features, "FMA ");
+ cpuid_info.featureFlags |= (1<<FMA);
+ }
+ if (ecx & (1<<19))
+ {
+ strcat(cpuid_info.features, "SSE4.1 ");
+ cpuid_info.featureFlags |= (1<<SSE41);
+ }
+ if (ecx & (1<<20))
+ {
+ strcat(cpuid_info.features, "SSE4.2 ");
+ cpuid_info.featureFlags |= (1<<SSE42);
+ }
+ if (ecx & (1<<25))
+ {
+ strcat(cpuid_info.features, "AES ");
+ cpuid_info.featureFlags |= (1<<AES);
+ }
+ if (ecx & (1<<28))
+ {
+ strcat(cpuid_info.features, "AVX ");
+ cpuid_info.featureFlags |= (1<<AVX);
+ }
+ if (ecx & (1<<30))
+ {
+ strcat(cpuid_info.features, "RDRAND ");
+ cpuid_info.featureFlags |= (1<<RDRAND);
+ }
+
+ if (edx & (1<<22))
+ {
+ strcat(cpuid_info.features, "ACPI ");
+ cpuid_info.featureFlags |= (1<<ACPI);
+ }
+ if (edx & (1<<23))
+ {
+ strcat(cpuid_info.features, "MMX ");
+ cpuid_info.featureFlags |= (1<<MMX);
+ }
+ if (edx & (1<<25))
+ {
+ strcat(cpuid_info.features, "SSE ");
+ cpuid_info.featureFlags |= (1<<SSE);
+ }
+ if (edx & (1<<26))
+ {
+ strcat(cpuid_info.features, "SSE2 ");
+ cpuid_info.featureFlags |= (1<<SSE2);
+ }
+ if (edx & (1<<28))
+ {
+ strcat(cpuid_info.features, "HTT ");
+ cpuid_info.featureFlags |= (1<<HTT);
+ }
+ if (edx & (1<<29))
+ {
+ strcat(cpuid_info.features, "TM ");
+ cpuid_info.featureFlags |= (1<<TM);
+ }
+
+ eax = 0x7;
+ ecx = 0x0;
+ CPUID(eax, ebx, ecx, edx);
+ if (ebx & (1<<5))
+ {
+ strcat(cpuid_info.features, "AVX2 ");
+ cpuid_info.featureFlags |= (1<<AVX2);
+ }
+ if (ebx & (1<<11))
+ {
+ strcat(cpuid_info.features, "RTM ");
+ cpuid_info.featureFlags |= (1<<RTM);
+ }
+ if (ebx & (1<<4))
+ {
+ strcat(cpuid_info.features, "HLE ");
+ cpuid_info.featureFlags |= (1<<HLE);
+ }
+ if (ebx & (1<<18))
+ {
+ strcat(cpuid_info.features, "RDSEED ");
+ cpuid_info.featureFlags |= (1<<RDSEED);
+ }
+
+ eax = 0x80000001;
+ CPUID(eax, ebx, ecx, edx);
+ if (edx & (1<<27))
+ {
+ strcat(cpuid_info.features, "RDTSCP ");
+ cpuid_info.featureFlags |= (1<<RDTSCP);
+ }
+
+ cpuid_info.perf_version = 0;
+ if( cpuid_info.family == P6_FAMILY && 0x0A <= largest_function)
+ {
+ eax = 0x0A;
+ CPUID(eax, ebx, ecx, edx);
+ cpuid_info.perf_version = (eax&0xFFU);
+ cpuid_info.perf_num_ctr = ((eax>>8)&0xFFU);
+ cpuid_info.perf_width_ctr = ((eax>>16)&0xFFU);
+ cpuid_info.perf_num_fixed_ctr = (edx&0xFU);
+
+ eax = 0x06;
+ CPUID(eax, ebx, ecx, edx);
+ if (eax & (1<<1))
+ {
+ cpuid_info.turbo = 1;
+ }
+ else
+ {
+ cpuid_info.turbo = 0;
+ }
+ }
+
+ return;
+}
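+
+/*
+ * The feature information is exported twice: as a printable string in
+ * cpuid_info.features and as a bit mask in cpuid_info.featureFlags, so that
+ * a caller can test a single ISA extension without string parsing, e.g. by
+ * branching on (cpuid_info.featureFlags & (1<<AVX2)).
+ */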
+
+void cpuid_init_nodeTopology(cpu_set_t cpuSet)
+{
+ uint32_t apicId;
+ uint32_t bitField;
+ int level;
+ int prevOffset = 0;
+ int currOffset = 0;
+ cpu_set_t set;
+ HWThread* hwThreadPool;
+ int hasBLeaf = 0;
+ int maxNumLogicalProcs;
+ int maxNumLogicalProcsPerCore;
+ int maxNumCores;
+ int width;
+
+ hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
+
+
+ /* check if 0x0B cpuid leaf is supported */
+ if (largest_function >= 0x0B)
+ {
+ eax = 0x0B;
+ ecx = 0;
+ CPUID(eax, ebx, ecx, edx);
+
+ if (ebx)
+ {
+ hasBLeaf = 1;
+ }
+ }
+
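+ /* With CPUID leaf 0x0B each level reports in EAX[4:0] how many of the low
+ * x2APIC ID bits are used by that level and the levels below it. As a
+ * worked example, assuming a shift width of 1 at the SMT level and 5 at
+ * the core level, an x2APIC ID of 0b101101 decodes to threadId = bit 0 = 1,
+ * coreId = bits 1..4 = 6 and packageId = the remaining upper bits = 1,
+ * which is what the extractBitField() calls below compute.
+ */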
+ if (hasBLeaf)
+ {
+ for (uint32_t i=0; i < cpuid_topology.numHWThreads; i++)
+ {
+ int id;
+ CPU_ZERO(&set);
+ CPU_SET(i,&set);
+ sched_setaffinity(0, sizeof(cpu_set_t), &set);
+ eax = 0x0B;
+ ecx = 0;
+ CPUID(eax, ebx, ecx, edx);
+ apicId = edx;
+ id = i;
+ hwThreadPool[id].apicId = i;
+ hwThreadPool[id].inCpuSet = 0;
+ if (CPU_ISSET(id, &cpuSet))
+ {
+ hwThreadPool[id].inCpuSet = 1;
+ }
+
+ for (level=0; level < 3; level++)
+ {
+ eax = 0x0B;
+ ecx = level;
+ CPUID(eax, ebx, ecx, edx);
+ currOffset = eax&0xFU;
+
+ switch ( level ) {
+ case 0: /* SMT thread */
+ bitField = extractBitField(apicId,
+ currOffset,
+ 0);
+ hwThreadPool[id].threadId = bitField;
+ break;
+
+ case 1: /* Core */
+ bitField = extractBitField(apicId,
+ currOffset-prevOffset,
+ prevOffset);
+ hwThreadPool[id].coreId = bitField;
+ break;
+
+ case 2: /* Package */
+ bitField = extractBitField(apicId,
+ 32-prevOffset,
+ prevOffset);
+ hwThreadPool[id].packageId = bitField;
+ break;
+
+ }
+ prevOffset = currOffset;
+ }
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, I[%d] ID[%d] APIC[%d] T[%d] C[%d] P [%d], i, id,
+ hwThreadPool[id].apicId, hwThreadPool[id].threadId,
+ hwThreadPool[id].coreId, hwThreadPool[id].packageId);
+ }
+ }
+ else
+ {
+ switch ( cpuid_info.family )
+ {
+
+ case MIC_FAMILY:
+
+ case P6_FAMILY:
+ eax = 0x01;
+ CPUID(eax, ebx, ecx, edx);
+ maxNumLogicalProcs = extractBitField(ebx,8,16);
+
+ /* Check number of cores per package */
+ eax = 0x04;
+ ecx = 0;
+ CPUID(eax, ebx, ecx, edx);
+ maxNumCores = extractBitField(eax,6,26)+1;
+
+ maxNumLogicalProcsPerCore = maxNumLogicalProcs/maxNumCores;
+
+ for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++)
+ {
+ int id;
+ CPU_ZERO(&set);
+ CPU_SET(i,&set);
+ sched_setaffinity(0, sizeof(cpu_set_t), &set);
+
+ eax = 0x01;
+ CPUID(eax, ebx, ecx, edx);
+ id = i;
+ hwThreadPool[id].apicId = i;//extractBitField(ebx,8,24);
+
+ /* ThreadId is extracted from the apicId using the bit width
+ * of the number of logical processors
+ */
+ hwThreadPool[id].threadId =
+ extractBitField(hwThreadPool[id].apicId,
+ getBitFieldWidth(maxNumLogicalProcsPerCore),0);
+
+ /* CoreId is extracted from the apicId using the bit width
+ * of the number of logical processors as offset and the
+ * bit width of the number of cores as width
+ */
+ hwThreadPool[id].coreId =
+ extractBitField(hwThreadPool[id].apicId,
+ getBitFieldWidth(maxNumCores),
+ getBitFieldWidth(maxNumLogicalProcsPerCore));
+
+ hwThreadPool[id].packageId =
+ extractBitField(hwThreadPool[id].apicId,
+ 8-getBitFieldWidth(maxNumLogicalProcs),
+ getBitFieldWidth(maxNumLogicalProcs));
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, I[%d] ID[%d] APIC[%d] T[%d] C[%d] P [%d], i, id,
+ hwThreadPool[id].apicId, hwThreadPool[id].threadId,
+ hwThreadPool[id].coreId, hwThreadPool[id].packageId);
+ }
+ break;
+
+ case K8_FAMILY:
+ /* AMD BIOS manual Rev. 2.28 section 3.1
+ * Legacy method */
+ /*FIXME: This is a bit of a hack */
+
+ maxNumLogicalProcsPerCore = 1;
+ maxNumLogicalProcs = 1;
+
+ eax = 0x80000008;
+ CPUID(eax, ebx, ecx, edx);
+
+ maxNumCores = extractBitField(ecx,8,0)+1;
+
+ for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++)
+ {
+ int id;
+ CPU_ZERO(&set);
+ CPU_SET(i,&set);
+ sched_setaffinity(0, sizeof(cpu_set_t), &set);
+
+ eax = 0x01;
+ CPUID(eax, ebx, ecx, edx);
+ id = extractBitField(ebx,8,24);
+ hwThreadPool[id].apicId = extractBitField(ebx,8,24);
+
+ /* ThreadId is extracted from the apicId using the bit width
+ * of the number of logical processors
+ */
+ hwThreadPool[id].threadId =
+ extractBitField(hwThreadPool[i].apicId,
+ getBitFieldWidth(maxNumLogicalProcsPerCore),0);
+
+ /* CoreId is extracted from the apicId using the bit width
+ * of the number of logical processors as offset and the
+ * bit width of the number of cores as width
+ */
+ hwThreadPool[id].coreId =
+ extractBitField(hwThreadPool[i].apicId,
+ getBitFieldWidth(maxNumCores),
+ 0);
+
+ hwThreadPool[id].packageId =
+ extractBitField(hwThreadPool[i].apicId,
+ 8-getBitFieldWidth(maxNumCores),
+ getBitFieldWidth(maxNumCores));
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, I[%d] ID[%d] APIC[%d] T[%d] C[%d] P [%d], i, id,
+ hwThreadPool[id].apicId, hwThreadPool[id].threadId,
+ hwThreadPool[id].coreId, hwThreadPool[id].packageId);
+ }
+ break;
+
+ case K16_FAMILY:
+
+ case K15_FAMILY:
+
+ case K10_FAMILY:
+ /* AMD BIOS manual Rev. 2.28 section 3.2
+ * Extended method */
+ eax = 0x80000008;
+ CPUID(eax, ebx, ecx, edx);
+
+ width = extractBitField(ecx,4,12);
+
+ if (width == 0)
+ {
+ width = extractBitField(ecx,8,0)+1;
+ }
+
+ eax = 0x01;
+ CPUID(eax, ebx, ecx, edx);
+ maxNumLogicalProcs = extractBitField(ebx,8,16);
+ maxNumCores = extractBitField(ecx,8,0)+1;
+
+
+ for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++)
+ {
+ int id;
+ CPU_ZERO(&set);
+ CPU_SET(i,&set);
+ sched_setaffinity(0, sizeof(cpu_set_t), &set);
+
+ eax = 0x01;
+ CPUID(eax, ebx, ecx, edx);
+ id = extractBitField(ebx,8,24);
+ hwThreadPool[id].apicId = extractBitField(ebx,8,24);
+ /* AMD only knows cores */
+ hwThreadPool[id].threadId = 0;
+
+ hwThreadPool[id].coreId =
+ extractBitField(hwThreadPool[i].apicId,
+ width, 0);
+ hwThreadPool[id].packageId =
+ extractBitField(hwThreadPool[i].apicId,
+ (8-width), width);
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, I[%d] ID[%d] APIC[%d] T[%d] C[%d] P [%d], i, id,
+ hwThreadPool[id].apicId, hwThreadPool[id].threadId,
+ hwThreadPool[id].coreId, hwThreadPool[id].packageId);
+ }
+
+ break;
+ }
+ }
+ cpuid_topology.threadPool = hwThreadPool;
+
+ return;
+}
+
+
+void cpuid_init_cacheTopology(void)
+{
+ int maxNumLevels=0;
+ int id=0;
+ CacheLevel* cachePool = NULL;
+ CacheType type = DATACACHE;
+
+ switch ( cpuid_info.family )
+ {
+ case MIC_FAMILY:
+
+ case P6_FAMILY:
+
+ if (largest_function >= 4)
+ {
+ maxNumLevels = intelCpuidFunc_4(&cachePool);
+ }
+ else
+ {
+ // intelCpuidFunc_2(&cachePool);
+ }
+
+ break;
+
+ case K8_FAMILY:
+ maxNumLevels = 2;
+ cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
+
+ eax = 0x80000005;
+ CPUID(eax, ebx, ecx, edx);
+ cachePool[0].level = 1;
+ cachePool[0].type = DATACACHE;
+ cachePool[0].associativity = extractBitField(ecx,8,16);
+ cachePool[0].lineSize = extractBitField(ecx,8,0);
+ cachePool[0].size = extractBitField(ecx,8,24) * 1024;
+ if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
+ {
+ cachePool[0].sets = cachePool[0].size/
+ (cachePool[0].associativity * cachePool[0].lineSize);
+ }
+ cachePool[0].threads = 1;
+ cachePool[0].inclusive = 1;
+
+ eax = 0x80000006;
+ CPUID(eax, ebx, ecx, edx);
+ cachePool[1].level = 2;
+ cachePool[1].type = UNIFIEDCACHE;
+ cachePool[1].associativity =
+ amdGetAssociativity(extractBitField(ecx,4,12));
+ cachePool[1].lineSize = extractBitField(ecx,8,0);
+ cachePool[1].size = extractBitField(ecx,16,16) * 1024;
+ if ((cachePool[1].associativity * cachePool[1].lineSize) != 0)
+ {
+ cachePool[1].sets = cachePool[1].size/
+ (cachePool[1].associativity * cachePool[1].lineSize);
+ }
+ cachePool[1].threads = 1;
+ cachePool[1].inclusive = 1;
+
+ break;
+
+
+ case K10_FAMILY:
+ /* FIXME: Adds one level for the instruction cache on Intel
+ * This fixes the level for the cores
+ */
+ maxNumLevels = 3;
+ cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
+
+ eax = 0x80000005;
+ CPUID(eax, ebx, ecx, edx);
+ cachePool[0].level = 1;
+ cachePool[0].type = DATACACHE;
+ cachePool[0].associativity = extractBitField(ecx,8,16);
+ cachePool[0].lineSize = extractBitField(ecx,8,0);
+ cachePool[0].size = extractBitField(ecx,8,24) * 1024;
+ if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
+ {
+ cachePool[0].sets = cachePool[0].size/
+ (cachePool[0].associativity * cachePool[0].lineSize);
+ }
+ cachePool[0].threads = 1;
+ cachePool[0].inclusive = 1;
+
+ eax = 0x80000006;
+ CPUID(eax, ebx, ecx, edx);
+ cachePool[1].level = 2;
+ cachePool[1].type = UNIFIEDCACHE;
+ cachePool[1].associativity =
+ amdGetAssociativity(extractBitField(ecx,4,12));
+ cachePool[1].lineSize = extractBitField(ecx,8,0);
+ cachePool[1].size = extractBitField(ecx,16,16) * 1024;
+ if ((cachePool[1].associativity * cachePool[1].lineSize) != 0)
+ {
+ cachePool[1].sets = cachePool[1].size/
+ (cachePool[1].associativity * cachePool[1].lineSize);
+ }
+ cachePool[1].threads = 1;
+ cachePool[1].inclusive = 1;
+
+ cachePool[2].level = 3;
+ cachePool[2].type = UNIFIEDCACHE;
+ cachePool[2].associativity =
+ amdGetAssociativity(extractBitField(edx,4,12));
+ cachePool[2].lineSize = extractBitField(edx,8,0);
+ cachePool[2].size = (extractBitField(edx,14,18)+1) * 524288;
+ if ((cachePool[2].associativity * cachePool[2].lineSize) != 0)
+ {
+ cachePool[2].sets = cachePool[2].size/
+ (cachePool[2].associativity * cachePool[2].lineSize);
+ }
+
+ if (cpuid_info.model != MAGNYCOURS)
+ {
+ cachePool[2].threads = cpuid_topology.numCoresPerSocket;
+ }
+ else
+ {
+ cachePool[2].threads = cpuid_topology.numCoresPerSocket/2;
+ cachePool[2].size /= 2 ;
+ }
+
+ cachePool[2].inclusive = 1;
+
+ break;
+
+ case K16_FAMILY:
+
+ case K15_FAMILY:
+
+ maxNumLevels = 0;
+ cachePool = (CacheLevel*) malloc(3 * sizeof(CacheLevel));
+
+ while (type)
+ {
+ ecx = id;
+ eax = 0x8000001D;
+ CPUID(eax, ebx, ecx, edx);
+ type = (CacheType) extractBitField(eax,4,0);
+
+ if ((type == DATACACHE) || (type == UNIFIEDCACHE))
+ {
+ cachePool[maxNumLevels].level = extractBitField(eax,3,5);
+ cachePool[maxNumLevels].type = type;
+ cachePool[maxNumLevels].associativity = extractBitField(ebx,10,22)+1;
+ cachePool[maxNumLevels].lineSize = extractBitField(ebx,12,0)+1;
+ cachePool[maxNumLevels].sets = extractBitField(ecx,32,0)+1;
+ cachePool[maxNumLevels].size = cachePool[maxNumLevels].associativity *
+ cachePool[maxNumLevels].lineSize * cachePool[maxNumLevels].sets;
+ cachePool[maxNumLevels].threads = extractBitField(eax,12,14)+1;
+ cachePool[maxNumLevels].inclusive = (edx & (0x1<<1));
+ maxNumLevels++;
+ }
+ id++;
+ }
+ break;
+
+ default:
+ ERROR_PLAIN_PRINT(Processor is not supported);
+ break;
+ }
+
+
+ cpuid_topology.numCacheLevels = maxNumLevels;
+ cpuid_topology.cacheLevels = cachePool;
+
+ return;
+}
diff --git a/src/topology_hwloc.c b/src/topology_hwloc.c
new file mode 100644
index 0000000..04c2417
--- /dev/null
+++ b/src/topology_hwloc.c
@@ -0,0 +1,327 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: topology_hwloc.c
+ *
+ * Description: Interface to the hwloc based topology backend
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Authors: Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <error.h>
+
+#include <topology.h>
+#ifdef LIKWID_USE_HWLOC
+#include <hwloc.h>
+#include <topology_hwloc.h>
+#endif
+
+hwloc_topology_t hwloc_topology = NULL;
+
+
+/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
+
+/* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */
+
+/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
+
+/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
+#ifdef LIKWID_USE_HWLOC
+int likwid_hwloc_record_objs_of_type_below_obj(hwloc_topology_t t, hwloc_obj_t obj, hwloc_obj_type_t type, int* index, uint32_t **list)
+{
+ int i;
+ int count = 0;
+ hwloc_obj_t walker;
+ if (!obj) return 0;
+ if (!obj->arity) return 0;
+ for (i=0;i<obj->arity;i++)
+ {
+ walker = obj->children[i];
+ if (walker->type == type)
+ {
+ if (list && *list && index)
+ {
+ (*list)[(*index)++] = walker->os_index;
+ }
+ count++;
+ }
+ count += likwid_hwloc_record_objs_of_type_below_obj(t, walker, type, index, list);
+ }
+ return count;
+}
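+
+/*
+ * The recursive helper above serves two purposes: called with a preallocated
+ * list and an index pointer it records the os_index of every matching object
+ * below obj, and called with NULL for both it merely counts them.
+ * hwloc_init_cacheTopology() below uses the counting form to determine how
+ * many hardware threads (HWLOC_OBJ_PU) share a given cache.
+ */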
+
+void hwloc_init_cpuInfo(cpu_set_t cpuSet)
+{
+ int i;
+ hwloc_obj_t obj;
+ if (perfmon_verbosity <= 1)
+ {
+ setenv("HWLOC_HIDE_ERRORS", "1", 1);
+ }
+ likwid_hwloc_topology_init(&hwloc_topology);
+ likwid_hwloc_topology_set_flags(hwloc_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_IO );
+ likwid_hwloc_topology_load(hwloc_topology);
+ obj = likwid_hwloc_get_obj_by_type(hwloc_topology, HWLOC_OBJ_SOCKET, 0);
+
+ cpuid_info.model = 0;
+ cpuid_info.family = 0;
+ cpuid_info.isIntel = 0;
+ cpuid_info.stepping = 0;
+ cpuid_info.osname = malloc(MAX_MODEL_STRING_LENGTH * sizeof(char));
+ cpuid_info.osname[0] = '\0';
+ if (!obj)
+ {
+ return;
+ }
+
+ const char * info;
+ if ((info = likwid_hwloc_obj_get_info_by_name(obj, "CPUModelNumber")))
+ cpuid_info.model = atoi(info);
+ if ((info = likwid_hwloc_obj_get_info_by_name(obj, "CPUFamilyNumber")))
+ cpuid_info.family = atoi(info);
+ if ((info = likwid_hwloc_obj_get_info_by_name(obj, "CPUVendor")))
+ cpuid_info.isIntel = strcmp(info, "GenuineIntel") == 0;
+ if ((info = likwid_hwloc_obj_get_info_by_name(obj, "CPUModel")))
+ strcpy(cpuid_info.osname, info);
+ if ((info = likwid_hwloc_obj_get_info_by_name(obj, "CPUStepping")))
+ cpuid_info.stepping = atoi(info);
+
+ cpuid_topology.numHWThreads = likwid_hwloc_get_nbobjs_by_type(hwloc_topology, HWLOC_OBJ_PU);
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, HWLOC CpuInfo Family %d Model %d Stepping %d isIntel %d numHWThreads %d activeHWThreads %d,
+ cpuid_info.family,
+ cpuid_info.model,
+ cpuid_info.stepping,
+ cpuid_info.isIntel,
+ cpuid_topology.numHWThreads,
+ cpuid_topology.activeHWThreads)
+ return;
+}
+
+void hwloc_init_nodeTopology(cpu_set_t cpuSet)
+{
+ HWThread* hwThreadPool;
+ int maxNumLogicalProcs;
+ int maxNumLogicalProcsPerCore;
+ int maxNumCores;
+ hwloc_obj_t obj;
+ int poolsize = 0;
+ int id = 0;
+ hwloc_obj_type_t socket_type = HWLOC_OBJ_SOCKET;
+ for (uint32_t i=0;i<cpuid_topology.numHWThreads;i++)
+ {
+ if (CPU_ISSET(i, &cpuSet))
+ {
+ poolsize = i+1;
+ }
+ }
+ hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
+ for (uint32_t i=0;i<cpuid_topology.numHWThreads;i++)
+ {
+ hwThreadPool[i].apicId = -1;
+ hwThreadPool[i].threadId = -1;
+ hwThreadPool[i].coreId = -1;
+ hwThreadPool[i].packageId = -1;
+ hwThreadPool[i].inCpuSet = 0;
+ }
+
+ maxNumLogicalProcs = likwid_hwloc_get_nbobjs_by_type(hwloc_topology, HWLOC_OBJ_PU);
+ maxNumCores = likwid_hwloc_get_nbobjs_by_type(hwloc_topology, HWLOC_OBJ_CORE);
+ if (likwid_hwloc_get_nbobjs_by_type(hwloc_topology, socket_type) == 0)
+ {
+ socket_type = HWLOC_OBJ_NODE;
+ }
+ maxNumLogicalProcsPerCore = maxNumLogicalProcs/maxNumCores;
+ for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++)
+ {
+ int skip = 0;
+ obj = likwid_hwloc_get_obj_by_type(hwloc_topology, HWLOC_OBJ_PU, i);
+ if (!obj)
+ {
+ continue;
+ }
+ id = obj->os_index;
+ hwThreadPool[id].inCpuSet = 1;
+ hwThreadPool[id].apicId = obj->os_index;
+ hwThreadPool[id].threadId = obj->sibling_rank;
+ while (obj->type != HWLOC_OBJ_CORE) {
+ obj = obj->parent;
+ if (!obj)
+ {
+ skip = 1;
+ break;
+ }
+ }
+ if (skip)
+ {
+ hwThreadPool[id].coreId = 0;
+ hwThreadPool[id].packageId = 0;
+ continue;
+ }
+ hwThreadPool[id].coreId = obj->os_index;
+ while (obj->type != socket_type) {
+ obj = obj->parent;
+ if (!obj)
+ {
+ skip = 1;
+ break;
+ }
+ }
+ if (skip)
+ {
+ hwThreadPool[id].packageId = 0;
+ continue;
+ }
+ hwThreadPool[id].packageId = obj->os_index;
+ /*DEBUG_PRINT(DEBUGLEV_DEVELOP, HWLOC Thread Pool PU %d Thread %d Core %d Socket %d,
+ hwThreadPool[threadIdx].apicId,
+ hwThreadPool[threadIdx].threadId,
+ hwThreadPool[threadIdx].coreId,
+ hwThreadPool[threadIdx].packageId)*/
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, I[%d] ID[%d] APIC[%d] T[%d] C[%d] P [%d], i, id,
+ hwThreadPool[id].apicId, hwThreadPool[id].threadId,
+ hwThreadPool[id].coreId, hwThreadPool[id].packageId);
+ }
+
+ cpuid_topology.threadPool = hwThreadPool;
+
+ return;
+}
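+
+/*
+ * Example of the walk above: for a PU with os_index 12 whose parent core has
+ * os_index 6 and whose enclosing socket (or NUMA node, if hwloc reports no
+ * socket objects) has os_index 1, the entry becomes apicId = 12, threadId =
+ * the PU's sibling_rank within the core, coreId = 6 and packageId = 1.
+ */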
+
+
+void hwloc_init_cacheTopology(void)
+{
+ int maxNumLevels=0;
+ int id=0;
+ CacheLevel* cachePool = NULL;
+ hwloc_obj_t obj;
+ int depth;
+ int d;
+ const char* info;
+
+ /* Sum up all depths with caches */
+ depth = likwid_hwloc_topology_get_depth(hwloc_topology);
+ for (d = 0; d < depth; d++)
+ {
+ if (likwid_hwloc_get_depth_type(hwloc_topology, d) == HWLOC_OBJ_CACHE)
+ maxNumLevels++;
+ }
+ cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
+ /* Start at the bottom of the tree to get all cache levels in order */
+ depth = likwid_hwloc_topology_get_depth(hwloc_topology);
+ id = 0;
+
+ for(d=depth-1;d >= 0; d--)
+ {
+ /* We only need caches, so skip other levels */
+ if (likwid_hwloc_get_depth_type(hwloc_topology, d) != HWLOC_OBJ_CACHE)
+ {
+ continue;
+ }
+ /* Get the cache object */
+ obj = likwid_hwloc_get_obj_by_depth(hwloc_topology, d, 0);
+ /* All caches have this attribute, so safe to access */
+ switch (obj->attr->cache.type)
+ {
+ case HWLOC_OBJ_CACHE_DATA:
+ cachePool[id].type = DATACACHE;
+ break;
+ case HWLOC_OBJ_CACHE_INSTRUCTION:
+ cachePool[id].type = INSTRUCTIONCACHE;
+ break;
+ case HWLOC_OBJ_CACHE_UNIFIED:
+ cachePool[id].type = UNIFIEDCACHE;
+ break;
+ default:
+ cachePool[id].type = NOCACHE;
+ break;
+ }
+
+ cachePool[id].associativity = obj->attr->cache.associativity;
+ cachePool[id].level = obj->attr->cache.depth;
+ cachePool[id].lineSize = obj->attr->cache.linesize;
+ cachePool[id].size = obj->attr->cache.size;
+ cachePool[id].sets = 0;
+ if ((cachePool[id].associativity * cachePool[id].lineSize) != 0)
+ {
+ cachePool[id].sets = cachePool[id].size /
+ (cachePool[id].associativity * cachePool[id].lineSize);
+ }
+
+ /* Count all HWThreads below the current cache */
+ cachePool[id].threads = likwid_hwloc_record_objs_of_type_below_obj(
+ hwloc_topology, obj, HWLOC_OBJ_PU, NULL, NULL);
+
+ while (!(info = likwid_hwloc_obj_get_info_by_name(obj, "inclusiveness")) && obj->next_cousin)
+ {
+ obj = obj->next_cousin; // If some PUs/cores are not bindable because of cgroups, hwloc may not know the inclusiveness of some of their caches.
+ }
+ if(info)
+ {
+ cachePool[id].inclusive = info[0]=='t';
+ }
+ else
+ {
+ ERROR_PLAIN_PRINT(Processor is not supported);
+ break;
+ }
+ id++;
+ }
+
+ cpuid_topology.numCacheLevels = maxNumLevels;
+ cpuid_topology.cacheLevels = cachePool;
+ return;
+}
+
+void hwloc_close(void)
+{
+ if (hwloc_topology)
+ {
+ hwloc_topology_destroy(hwloc_topology);
+ }
+}
+
+#else
+
+void hwloc_init_cpuInfo(void)
+{
+ return;
+}
+
+void hwloc_init_cpuFeatures(void)
+{
+ return;
+}
+
+void hwloc_init_nodeTopology(void)
+{
+ return;
+}
+
+void hwloc_init_cacheTopology(void)
+{
+ return;
+}
+#endif
diff --git a/src/topology_proc.c b/src/topology_proc.c
new file mode 100644
index 0000000..1d3d0e0
--- /dev/null
+++ b/src/topology_proc.c
@@ -0,0 +1,626 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: topology_proc.c
+ *
+ * Description: Interface to the procfs/sysfs based topology backend
+ *
+ * Version: 4.1
+ * Released: 19.5.2016
+ *
+ * Authors: Jan Treibig (jt), jan.treibig at gmail.com,
+ * Thomas Roehl (tr), thomas.roehl at googlemail.com
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <topology_proc.h>
+#include <cpuid.h>
+
+/* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */
+/* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */
+static int get_cpu_perf_data(void)
+{
+ uint32_t eax = 0x0U, ebx = 0x0U, ecx = 0x0U, edx = 0x0U;
+ int largest_function = 0;
+ eax = 0x00;
+ CPUID(eax, ebx, ecx, edx);
+ largest_function = eax;
+ if (cpuid_info.family == P6_FAMILY && 0x0A <= largest_function)
+ {
+ eax = 0x0A;
+ CPUID(eax, ebx, ecx, edx);
+ cpuid_info.perf_version = (eax&0xFFU);
+ cpuid_info.perf_num_ctr = ((eax>>8)&0xFFU);
+ cpuid_info.perf_width_ctr = ((eax>>16)&0xFFU);
+ cpuid_info.perf_num_fixed_ctr = (edx&0xFU);
+
+ eax = 0x06;
+ CPUID(eax, ebx, ecx, edx);
+ if (eax & (1<<1))
+ {
+ cpuid_info.turbo = 1;
+ }
+ else
+ {
+ cpuid_info.turbo = 0;
+ }
+ }
+ return 0;
+}
+
+int get_listPosition(int ownid, bstring list)
+{
+ int pos = -1;
+ bstring ownStr = bformat("%d",ownid);
+ struct bstrList* tokens = bsplit(list,(char) ',');
+ for(int i=0;i<tokens->qty;i++)
+ {
+ btrimws(tokens->entry[i]);
+ if (bstrcmp(ownStr, tokens->entry[i]) == BSTR_OK)
+ {
+ pos = i;
+ break;
+ }
+ }
+ /* Do not leak the temporary bstrings on the match path */
+ bdestroy(ownStr);
+ bstrListDestroy(tokens);
+ return pos;
+}
+
+int fillList(int* outList, int outOffset, bstring list)
+{
+ int current = 0;
+ int (*ownatoi)(const char*);
+ struct bstrList* tokens = bsplit(list,',');
+ ownatoi = &atoi;
+ for(int i=0;i<tokens->qty;i++)
+ {
+ btrimws(tokens->entry[i]);
+ if (bstrchrp(tokens->entry[i],'-',0) == BSTR_ERR)
+ {
+ if (outList)
+ {
+ outList[outOffset+current] = ownatoi(bdata(tokens->entry[i]));
+ }
+ current++;
+ }
+ else
+ {
+ struct bstrList* range = bsplit(tokens->entry[i],'-');
+ if (range->qty == 2)
+ {
+ for (int j=ownatoi(bdata(range->entry[0]));j<=ownatoi(bdata(range->entry[1]));j++)
+ {
+ if (outList)
+ {
+ outList[outOffset+current] = j;
+ }
+
+ current++;
+ }
+ }
+ bstrListDestroy(range);
+ }
+ }
+ bstrListDestroy(tokens);
+ return current;
+}
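+
+/*
+ * fillList() understands comma separated entries as well as ranges: the
+ * sysfs list "0-3,8,9" makes it return 6, and with a non-NULL outList the
+ * values 0,1,2,3,8,9 are additionally written starting at outOffset.
+ */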
+
+static int readCacheInclusiveIntel(int level)
+{
+ uint32_t eax = 0x0U, ebx = 0x0U, ecx = 0x0U, edx = 0x0U;
+ eax = 0x04;
+ ecx = level;
+ CPUID(eax, ebx, ecx, edx);
+ return edx & 0x2;
+}
+
+static int readCacheInclusiveAMD(int level)
+{
+ uint32_t eax = 0x0U, ebx = 0x0U, ecx = 0x0U, edx = 0x0U;
+ eax = 0x8000001D;
+ ecx = level;
+ CPUID(eax, ebx, ecx, edx);
+ return (edx & (0x1<<1));
+}
+
+/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
+void proc_init_cpuInfo(cpu_set_t cpuSet)
+{
+ int i = 0;
+ int HWthreads = 0;
+ FILE *fp = NULL;
+
+ int (*ownatoi)(const char*);
+ char* (*ownstrcpy)(char*,const char*);
+ ownatoi = &atoi;
+ ownstrcpy = &strcpy;
+
+ const_bstring countString = bformat("processor\t:");
+ const_bstring modelString = bformat("model\t\t:");
+ const_bstring familyString = bformat("cpu family\t:");
+ const_bstring steppingString = bformat("stepping\t:");
+ const_bstring vendorString = bformat("vendor_id\t:");
+ const_bstring vendorIntelString = bformat("GenuineIntel");
+ const_bstring nameString = bformat("model name\t:");
+
+ cpuid_info.isIntel = 0;
+ cpuid_info.model = 0;
+ cpuid_info.family = 0;
+ cpuid_info.stepping = 0;
+ cpuid_topology.numHWThreads = 0;
+ cpuid_info.osname = malloc(MAX_MODEL_STRING_LENGTH * sizeof(char));
+
+ if (NULL != (fp = fopen ("/proc/cpuinfo", "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ struct bstrList* tokens = bsplit(src,(char) '\n');
+ bdestroy(src);
+ fclose(fp);
+ for (i=0;i<tokens->qty;i++)
+ {
+ if (binstr(tokens->entry[i],0,countString) != BSTR_ERR)
+ {
+ HWthreads++;
+ }
+ else if ((cpuid_info.model == 0) && (binstr(tokens->entry[i],0,modelString) != BSTR_ERR))
+ {
+ struct bstrList* subtokens = bsplit(tokens->entry[i],(char) ':');
+ bltrimws(subtokens->entry[1]);
+ cpuid_info.model = ownatoi(bdata(subtokens->entry[1]));
+ }
+ else if ((cpuid_info.family == 0) && (binstr(tokens->entry[i],0,familyString) != BSTR_ERR))
+ {
+ struct bstrList* subtokens = bsplit(tokens->entry[i],(char) ':');
+ bltrimws(subtokens->entry[1]);
+ cpuid_info.family = ownatoi(bdata(subtokens->entry[1]));
+ }
+ else if (binstr(tokens->entry[i],0,steppingString) != BSTR_ERR)
+ {
+ struct bstrList* subtokens = bsplit(tokens->entry[i],(char) ':');
+ bltrimws(subtokens->entry[1]);
+ cpuid_info.stepping = ownatoi(bdata(subtokens->entry[1]));
+ }
+ else if (binstr(tokens->entry[i],0,nameString) != BSTR_ERR)
+ {
+ struct bstrList* subtokens = bsplit(tokens->entry[i],(char) ':');
+ bltrimws(subtokens->entry[1]);
+ ownstrcpy(cpuid_info.osname, bdata(subtokens->entry[1]));
+ }
+ else if (binstr(tokens->entry[i],0,vendorString) != BSTR_ERR)
+ {
+ struct bstrList* subtokens = bsplit(tokens->entry[i],(char) ':');
+ bltrimws(subtokens->entry[1]);
+ if (bstrcmp(subtokens->entry[1], vendorIntelString) == BSTR_OK)
+ {
+ cpuid_info.isIntel = 1;
+ }
+ }
+ }
+ cpuid_topology.numHWThreads = HWthreads;
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, PROC CpuInfo Family %d Model %d Stepping %d isIntel %d numHWThreads %d,
+ cpuid_info.family,
+ cpuid_info.model,
+ cpuid_info.stepping,
+ cpuid_info.isIntel,
+ cpuid_topology.numHWThreads)
+ }
+ return;
+}
+
+void proc_init_cpuFeatures(void)
+{
+ int ret;
+ FILE* file;
+ char buf[1024];
+ char ident[30];
+ char delimiter[] = " ";
+ char* cptr;
+
+ if ( (file = fopen( "/proc/cpuinfo", "r")) == NULL )
+ {
+ fprintf(stderr, "Cannot open /proc/cpuinfo\n");
+ return;
+ }
+ ret = 0;
+ while( fgets(buf, sizeof(buf)-1, file) )
+ {
+ ret = sscanf(buf, "%s\t:", &(ident[0]));
+ if (ret != 1 || strcmp(ident,"flags") != 0)
+ {
+ continue;
+ }
+ else
+ {
+ ret = 1;
+ break;
+ }
+ }
+ fclose(file);
+ if (ret == 0)
+ {
+ return;
+ }
+
+ cpuid_info.featureFlags = 0;
+ cpuid_info.features = (char*) malloc(MAX_FEATURE_STRING_LENGTH*sizeof(char));
+ cpuid_info.features[0] = '\0';
+ buf[strcspn(buf, "\n")] = '\0';
+ cptr = strtok(&(buf[6]),delimiter);
+
+ while (cptr != NULL)
+ {
+ if (strcmp(cptr,"ssse3") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<SSSE3);
+ strcat(cpuid_info.features, "SSSE3 ");
+ }
+ else if (strcmp(cptr,"sse3") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<SSE3);
+ strcat(cpuid_info.features, "SSE3 ");
+ }
+ else if (strcmp(cptr,"monitor") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<MONITOR);
+ strcat(cpuid_info.features, "MONITOR ");
+ }
+ else if (strcmp(cptr,"mmx") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<MMX);
+ strcat(cpuid_info.features, "MMX ");
+ }
+ else if (strcmp(cptr,"sse") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<SSE);
+ strcat(cpuid_info.features, "SSE ");
+ }
+ else if (strcmp(cptr,"sse2") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<SSE2);
+ strcat(cpuid_info.features, "SSE2 ");
+ }
+ else if (strcmp(cptr,"acpi") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<ACPI);
+ strcat(cpuid_info.features, "ACPI ");
+ }
+ else if (strcmp(cptr,"rdtscp") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<RDTSCP);
+ strcat(cpuid_info.features, "RDTSCP ");
+ }
+ else if (strcmp(cptr,"vmx") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<VMX);
+ strcat(cpuid_info.features, "VMX ");
+ }
+ else if (strcmp(cptr,"est") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<EIST);
+ strcat(cpuid_info.features, "EIST ");
+ }
+ else if (strcmp(cptr,"tm") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<TM);
+ strcat(cpuid_info.features, "TM ");
+ }
+ else if (strcmp(cptr,"tm2") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<TM2);
+ strcat(cpuid_info.features, "TM2 ");
+ }
+ else if (strcmp(cptr,"aes") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<AES);
+ strcat(cpuid_info.features, "AES ");
+ }
+ else if (strcmp(cptr,"rdrand") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<RDRAND);
+ strcat(cpuid_info.features, "RDRAND ");
+ }
+ else if (strcmp(cptr,"sse4_1") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<SSE41);
+ strcat(cpuid_info.features, "SSE4.1 ");
+ }
+ else if (strcmp(cptr,"sse4_2") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<SSE42);
+ strcat(cpuid_info.features, "SSE4.2 ");
+ }
+ else if (strcmp(cptr,"avx") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<AVX);
+ strcat(cpuid_info.features, "AVX ");
+ }
+ else if (strcmp(cptr,"fma") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<FMA);
+ strcat(cpuid_info.features, "FMA ");
+ }
+ else if (strcmp(cptr,"avx2") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<AVX2);
+ strcat(cpuid_info.features, "AVX2 ");
+ }
+ else if (strcmp(cptr,"rtm") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<RTM);
+ strcat(cpuid_info.features, "RTM ");
+ }
+ else if (strcmp(cptr,"hle") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<HLE);
+ strcat(cpuid_info.features, "HLE ");
+ }
+ else if (strcmp(cptr,"rdseed") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<RDSEED);
+ strcat(cpuid_info.features, "RDSEED ");
+ }
+ else if (strcmp(cptr,"ht") == 0)
+ {
+ cpuid_info.featureFlags |= (1<<HTT);
+ strcat(cpuid_info.features, "HTT ");
+ }
+ cptr = strtok(NULL, delimiter);
+ }
+
+ if ((cpuid_info.featureFlags & (1<<SSSE3)) && !((cpuid_info.featureFlags) & (1<<SSE3)))
+ {
+ cpuid_info.featureFlags |= (1<<SSE3);
+ strcat(cpuid_info.features, "SSE3 ");
+ }
+
+ get_cpu_perf_data();
+ return;
+}
+
+
+
+void proc_init_nodeTopology(cpu_set_t cpuSet)
+{
+ HWThread* hwThreadPool;
+ FILE *fp;
+ bstring cpudir;
+ bstring file;
+ int (*ownatoi)(const char*);
+ ownatoi = &atoi;
+
+ hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
+ for (uint32_t i=0;i<cpuid_topology.numHWThreads;i++)
+ {
+ hwThreadPool[i].apicId = i;
+ hwThreadPool[i].threadId = -1;
+ hwThreadPool[i].coreId = -1;
+ hwThreadPool[i].packageId = -1;
+ hwThreadPool[i].inCpuSet = 1;
+ if (!CPU_ISSET(i, &cpuSet))
+ {
+ hwThreadPool[i].inCpuSet = 0;
+ }
+ cpudir = bformat("/sys/devices/system/cpu/cpu%d/topology",i);
+ file = bformat("%s/core_id", bdata(cpudir));
+ if (NULL != (fp = fopen (bdata(file), "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ hwThreadPool[i].coreId = ownatoi(bdata(src));
+ fclose(fp);
+ }
+ bdestroy(file);
+ file = bformat("%s/physical_package_id", bdata(cpudir));
+ if (NULL != (fp = fopen (bdata(file), "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ hwThreadPool[i].packageId = ownatoi(bdata(src));
+ fclose(fp);
+ }
+ bdestroy(file);
+ file = bformat("%s/thread_siblings_list", bdata(cpudir));
+ if (NULL != (fp = fopen (bdata(file), "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ hwThreadPool[i].threadId = get_listPosition(i, src);
+ fclose(fp);
+ }
+ bdestroy(file);
+ DEBUG_PRINT(DEBUGLEV_DEVELOP, PROC Thread Pool PU %d Thread %d Core %d Socket %d,
+ hwThreadPool[i].apicId,
+ hwThreadPool[i].threadId,
+ hwThreadPool[i].coreId,
+ hwThreadPool[i].packageId)
+ bdestroy(cpudir);
+ }
+ cpuid_topology.threadPool = hwThreadPool;
+ return;
+}
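+
+/*
+ * Example: for hardware thread 5 the loop above reads core_id and
+ * physical_package_id from /sys/devices/system/cpu/cpu5/topology/ directly
+ * and derives threadId from the position of 5 in thread_siblings_list, so a
+ * sibling list of "5,17" gives threadId 0 for cpu5 and threadId 1 for cpu17.
+ */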
+
+void proc_init_cacheTopology(void)
+{
+ FILE *fp;
+ CacheLevel* cachePool = NULL;
+ int maxNumLevels = 0;
+ int nrCaches = 0;
+ bstring cpudir = bformat("/sys/devices/system/cpu/cpu0/cache");
+ bstring levelStr;
+ int (*ownatoi)(const char*);
+ ownatoi = &atoi;
+ for (int i=0;i<10;i++)
+ {
+ levelStr = bformat("%s/index%d/level",bdata(cpudir),i);
+ if (NULL != (fp = fopen (bdata(levelStr), "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ int tmp = 0;
+ tmp = ownatoi(bdata(src));
+ if (tmp > maxNumLevels)
+ {
+ maxNumLevels = tmp;
+ }
+ nrCaches++;
+ fclose(fp);
+ }
+ else
+ {
+ bdestroy(levelStr);
+ break;
+ }
+ bdestroy(levelStr);
+ }
+
+ cachePool = (CacheLevel*) malloc(nrCaches * sizeof(CacheLevel));
+ for (int i=0;i<nrCaches;i++)
+ {
+ levelStr = bformat("%s/index%d/level",bdata(cpudir),i);
+ if (NULL != (fp = fopen (bdata(levelStr), "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ cachePool[i].level = ownatoi(bdata(src));
+ fclose(fp);
+ bdestroy(src);
+ }
+ bdestroy(levelStr);
+ levelStr = bformat("%s/index%d/type",bdata(cpudir),i);
+ if (NULL != (fp = fopen (bdata(levelStr), "r")))
+ {
+ bstring unifiedStr = bformat("Unified");
+ bstring dataStr = bformat("Data");
+ bstring intrStr = bformat("Instruction");
+ bstring src = bread ((bNread) fread, fp);
+ btrimws(src);
+ if (bstrcmp(dataStr, src) == BSTR_OK)
+ {
+ cachePool[i].type = DATACACHE;
+ }
+ else if (bstrcmp(intrStr, src) == BSTR_OK)
+ {
+ cachePool[i].type = INSTRUCTIONCACHE;
+ }
+ else if (bstrcmp(unifiedStr, src) == BSTR_OK)
+ {
+ cachePool[i].type = UNIFIEDCACHE;
+ }
+ else
+ {
+ cachePool[i].type = NOCACHE;
+ }
+ fclose(fp);
+ bdestroy(unifiedStr);
+ bdestroy(dataStr);
+ bdestroy(intrStr);
+ bdestroy(src);
+ }
+ bdestroy(levelStr);
+ levelStr = bformat("%s/index%d/size",bdata(cpudir),i);
+ if (NULL != (fp = fopen (bdata(levelStr), "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ btrimws(src);
+ bdelete(src, blength(src)-1, 1);
+ cachePool[i].size = ownatoi(bdata(src)) * 1024;
+ fclose(fp);
+ bdestroy(src);
+ }
+ else
+ {
+ cachePool[i].size = 0;
+ }
+ bdestroy(levelStr);
+ levelStr = bformat("%s/index%d/ways_of_associativity",bdata(cpudir),i);
+ if (NULL != (fp = fopen (bdata(levelStr), "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ btrimws(src);
+ cachePool[i].associativity = ownatoi(bdata(src));
+ fclose(fp);
+ bdestroy(src);
+ }
+ else
+ {
+ cachePool[i].associativity = 0;
+ }
+ bdestroy(levelStr);
+ levelStr = bformat("%s/index%d/coherency_line_size",bdata(cpudir),i);
+ if (NULL != (fp = fopen (bdata(levelStr), "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ btrimws(src);
+ cachePool[i].lineSize = ownatoi(bdata(src));
+ fclose(fp);
+ bdestroy(src);
+ }
+ else
+ {
+ cachePool[i].lineSize = 0;
+ }
+ bdestroy(levelStr);
+ levelStr = bformat("%s/index%d/number_of_sets",bdata(cpudir),i);
+ if (NULL != (fp = fopen (bdata(levelStr), "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ btrimws(src);
+ cachePool[i].sets = ownatoi(bdata(src));
+ fclose(fp);
+ bdestroy(src);
+ }
+ else
+ {
+ if ((cachePool[i].associativity * cachePool[i].lineSize) != 0)
+ {
+ cachePool[i].sets = cachePool[i].size /
+ (cachePool[i].associativity * cachePool[i].lineSize);
+ }
+ }
+ bdestroy(levelStr);
+ levelStr = bformat("%s/index%d/shared_cpu_list",bdata(cpudir),i);
+ if (NULL != (fp = fopen (bdata(levelStr), "r")))
+ {
+ bstring src = bread ((bNread) fread, fp);
+ btrimws(src);
+ cachePool[i].threads = fillList(NULL, 0, src);
+ fclose(fp);
+ bdestroy(src);
+ }
+ bdestroy(levelStr);
+
+ switch ( cpuid_info.family )
+ {
+ case MIC_FAMILY:
+ case P6_FAMILY:
+ cachePool[i].inclusive = readCacheInclusiveIntel(cachePool[i].level);
+ break;
+ case K16_FAMILY:
+ case K15_FAMILY:
+ cachePool[i].inclusive = readCacheInclusiveAMD(cachePool[i].level);
+ break;
+ /* For K8 and K10 the caches are known to be inclusive */
+ case K8_FAMILY:
+ case K10_FAMILY:
+ cachePool[i].inclusive = 1;
+ break;
+ default:
+ ERROR_PLAIN_PRINT(Processor is not supported);
+ break;
+ }
+ }
+ bdestroy(cpudir);
+ cpuid_topology.numCacheLevels = nrCaches;
+ cpuid_topology.cacheLevels = cachePool;
+ return;
+}
+
diff --git a/src/tree.c b/src/tree.c
index 795dd17..2ac8ab8 100644
--- a/src/tree.c
+++ b/src/tree.c
@@ -5,13 +5,13 @@
*
* Description: Module implementing a tree data structure
*
- * Version: 3.1.3
- * Released: 4.11.2014
+ * Version: 4.1
+ * Released: 19.5.2016
*
- * Author: Jan Treibig (jt), jan.treibig at gmail.com
+ * Author: Jan Treibig (jt), jan.treibig at gmail.com
* Project: likwid
*
- * Copyright (C) 2014 Jan Treibig
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
@@ -34,12 +34,35 @@
#include <error.h>
#include <tree.h>
+/* ##### FUNCTION DEFINITIONS - INTERNAL FUNCTIONS ################## */
+void _tree_destroy(TreeNode* nodePtr)
+{
+ if (nodePtr == NULL)
+ return;
+ if (nodePtr->rlink)
+ {
+ _tree_destroy(nodePtr->rlink);
+ free(nodePtr->rlink);
+ }
+ if (nodePtr->llink)
+ {
+ _tree_destroy(nodePtr->llink);
+ free(nodePtr->llink);
+ }
+ return;
+}
+
/* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */
void
tree_init(TreeNode** root, int id)
{
*root = (TreeNode*) malloc(sizeof(TreeNode));
+ if (!(*root))
+ {
+ *root = NULL;
+ return;
+ }
(*root)->id = id;
(*root)->llink = NULL;
(*root)->rlink = NULL;
@@ -48,52 +71,71 @@ tree_init(TreeNode** root, int id)
void
tree_print(TreeNode* nodePtr)
{
- int level = 0;
+ int level = 0;
- if (nodePtr != NULL)
+ if (nodePtr != NULL)
+ {
+
+ TreeNode* digger = NULL;
+ TreeNode* walker = NULL;
+
+ digger = nodePtr->llink;
+
+ while (digger != NULL)
{
+ printf("\n Level %d:\n", level++);
+ printf("%d ", digger->id);
+ walker = digger->rlink;
- TreeNode* digger;
- TreeNode* walker;
+ while (walker != NULL)
+ {
+ printf("%d ", walker->id);
+ walker = walker->rlink;
+ }
- digger = nodePtr->llink;
+ digger = digger->llink;
+ }
- while (digger != NULL)
- {
- printf("\n Level %d:\n", level++);
- printf("%d ", digger->id);
- walker = digger->rlink;
+ printf("\n ");
+ }
+}
- while (walker != NULL)
- {
- printf("%d ", walker->id);
- walker = walker->rlink;
- }
- digger = digger->llink;
- }
+void
+tree_destroy(TreeNode* nodePtr)
+{
- printf("\n ");
+ if (nodePtr != NULL)
+ {
+ _tree_destroy(nodePtr);
+ free(nodePtr);
}
}
void
tree_insertNode(TreeNode* nodePtr, int id)
{
- TreeNode* currentNode;
- TreeNode* tmpNode;
+ TreeNode* currentNode = NULL;
+ TreeNode* tmpNode = NULL;
+ TreeNode* newNode = NULL;
if (nodePtr == NULL)
{
ERROR_PLAIN_PRINT(Node invalid);
}
+ newNode = (TreeNode*) malloc(sizeof(TreeNode));
+ if (!newNode)
+ {
+ return;
+ }
+ newNode->id = id;
+ newNode->llink = NULL;
+ newNode->rlink = NULL;
+
if (nodePtr->llink == NULL)
{
- nodePtr->llink = (TreeNode*) malloc(sizeof(TreeNode));
- nodePtr->llink->id = id;
- nodePtr->llink->llink = NULL;
- nodePtr->llink->rlink = NULL;
+ nodePtr->llink = newNode;
}
else
{
@@ -104,29 +146,21 @@ tree_insertNode(TreeNode* nodePtr, int id)
if (id < currentNode->rlink->id)
{
tmpNode = currentNode->rlink;
- currentNode->rlink = (TreeNode*) malloc(sizeof(TreeNode));
- currentNode->rlink->id = id;
- currentNode->rlink->llink = NULL;
+ currentNode->rlink = newNode;
currentNode->rlink->rlink = tmpNode;
return;
}
currentNode = currentNode->rlink;
}
-
if (id > currentNode->id)
{
- currentNode->rlink = (TreeNode*) malloc(sizeof(TreeNode));
- currentNode->rlink->id = id;
- currentNode->rlink->llink = NULL;
- currentNode->rlink->rlink = NULL;
+ currentNode->rlink = newNode;
}
else
{
tmpNode = currentNode;
- nodePtr->llink = (TreeNode*) malloc(sizeof(TreeNode));
- nodePtr->llink->id = id;
- nodePtr->llink->llink = NULL;
+ nodePtr->llink = newNode;
nodePtr->llink->rlink = tmpNode;
}
}
@@ -140,6 +174,7 @@ tree_nodeExists(TreeNode* nodePtr, int id)
if (nodePtr == NULL)
{
ERROR_PLAIN_PRINT(Node invalid);
+ return 0;
}
walker = nodePtr->llink;
@@ -168,6 +203,7 @@ tree_countChildren(TreeNode* nodePtr)
if (nodePtr == NULL)
{
ERROR_PLAIN_PRINT(Node invalid);
+ return 0;
}
if (nodePtr->llink == NULL)
{
@@ -193,6 +229,7 @@ tree_getNode(TreeNode* nodePtr, int id)
if (nodePtr == NULL)
{
ERROR_PLAIN_PRINT(Node invalid);
+ return NULL;
}
if (nodePtr->llink == NULL)
{
@@ -222,6 +259,7 @@ tree_getChildNode(TreeNode* nodePtr)
if (nodePtr == NULL)
{
ERROR_PLAIN_PRINT(Node invalid);
+ return NULL;
}
if (nodePtr->llink == NULL)
{
diff --git a/test/MPI_pin_test.c b/test/MPI_pin_test.c
index 5624a95..f0e1271 100644
--- a/test/MPI_pin_test.c
+++ b/test/MPI_pin_test.c
@@ -1,15 +1,46 @@
#include <stdio.h>
+#include <stdlib.h>
#include <unistd.h>
#include <mpi.h>
+#include <sys/types.h>
+#include <string.h>
+#include <sys/syscall.h>
+
#ifdef _OPENMP
extern int omp_get_num_threads();
extern int omp_get_thread_num();
#endif
-#include <affinity.h>
+#include <sched.h>
+
+int get_cpu_id()
+{
+ int i;
+ int cpu_id = 0;
+ /* Get the current process' stat file from the proc filesystem */
+ FILE* procfile = fopen("/proc/self/stat", "r");
+ long to_read = 8192;
+ char* line;
+ char buffer[to_read];
+ int read = fread(buffer, sizeof(char), to_read - 1, procfile);
+ buffer[read] = '\0'; /* terminate the buffer before tokenizing with strtok */
+ fclose(procfile);
+
+ // Field with index 38 (zero-based counting) is the one we want
+ line = strtok(buffer, " ");
+ for (i = 1; i < 38; i++)
+ {
+ line = strtok(NULL, " ");
+ }
+
+ line = strtok(NULL, " ");
+ cpu_id = atoi(line);
+ return cpu_id;
+}
+#define HOST_NAME_MAX 1024
#define MASTER(msg) \
if (rank == 0) printf(#msg "\n")
+#define gettid() (int)syscall(SYS_gettid)
main(int argc, char **argv)
{
@@ -19,27 +50,31 @@ main(int argc, char **argv)
MPI_Init(&argc,&argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
host = (char*) malloc(HOST_NAME_MAX * sizeof(char));
- gethostname(host,HOST_NAME_MAX);
+ gethostname(host, HOST_NAME_MAX);
MASTER(MPI started);
MPI_Barrier(MPI_COMM_WORLD);
- printf("Process with rank %d running on Node %s Core %d\n",rank ,host, likwid_getProcessorId());
- fflush(stdout);
+ printf("Process with rank %d running on Node %s Core %d/%d\n",rank ,host, sched_getcpu(),get_cpu_id());
MPI_Barrier(MPI_COMM_WORLD);
MASTER(Enter OpenMP parallel region);
MPI_Barrier(MPI_COMM_WORLD);
#pragma omp parallel
{
- int coreId = likwid_getProcessorId();
+#pragma omp master
+ {
+ pid_t pid = getppid();
+ char cmd[1024];
+ sprintf(cmd, "pstree -p -H %d %d",pid, pid);
+ system(cmd);
+ }
#pragma omp critical
{
- printf ("Rank %d Thread %d running on core %d \n",rank,omp_get_thread_num(), coreId);
- fflush(stdout);
+ printf ("Rank %d Thread %d running on core %d/%d with pid %d and tid %d\n",rank,omp_get_thread_num(), sched_getcpu(),get_cpu_id(), getpid(),gettid());
}
- }
- sleep(2);
+ }
+ free(host);
MPI_Finalize();
}
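
The rewritten pinning test reports, for every MPI rank and OpenMP thread, the core as returned by sched_getcpu() alongside the CPU field parsed from /proc/self/stat, together with the pid and tid. A minimal sketch of building and running it; the process and thread counts are only an example, and mpirun/OMP_NUM_THREADS are assumed to be whatever your MPI stack provides:

    make test-mpi
    OMP_NUM_THREADS=2 mpirun -np 2 ./test-mpi
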
diff --git a/test/Makefile b/test/Makefile
index 56fece1..1209136 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -1,29 +1,66 @@
-LIKWID_LIB = -L../ -llikwid
-INCLUDES = -I../src/includes -I../ -I../MIC
+include ../config.mk
-all: testmarker testmarkerF90 stream streamM
+LIKWID_LIB ?= -L$(PREFIX)/lib -llikwid
+LIKWID_INCLUDE ?= -I$(PREFIX)/include
+LIKWID_DEFINES ?= -DLIKWID_PERFMON
-testmarkerF90: chaos.F90
- ifort $(INCLUDES) -O3 -o $@ chaos.F90 $(LIKWID_LIB) -lpthread
+all: streamGCC
+
+GCC_C11_SUPPORT_MAJOR=$(shell /bin/bash -c "g++ -v 2>&1 | grep -o -E '([0-9])\.' | head -n 1 | tr -d '[:punct:]'")
+GCC_C11_SUPPORT_MINOR=$(shell /bin/bash -c "g++ -v 2>&1 | grep -o -E '\.([0-9])\.' | head -n 1 | tr -d '[:punct:]'")
+ICC_AVAILABLE=$(shell /bin/bash -c "which icc | wc -l")
+ICPC_AVAILABLE=$(shell /bin/bash -c "which icpc | wc -l")
+TBB_AVAILABLE=$(shell /bin/bash -c "ldconfig -v 2>/dev/null | grep libtbb.so | wc -l")
+
+streamGCC: stream.c
+ gcc -O3 -std=c99 $(LIKWID_INCLUDE) $(LIKWID_DEFINES) -fopenmp -o $@ stream.c $(LIKWID_LIB) -lm
+
+streamAPIGCC: stream-API.c
+ gcc -O3 -std=c99 $(LIKWID_INCLUDE) -fopenmp -ftree-vectorize -ffast-math -o $@ stream-API.c $(LIKWID_LIB) -lm
+
+serial: serial.c
+ gcc -O3 -std=c99 $(LIKWID_INCLUDE) $(LIKWID_DEFINES) -o $@ serial.c $(LIKWID_LIB) -lm
+
+test-likwidAPI: test-likwidAPI.c
+ gcc -O3 -std=c99 $(LIKWID_INCLUDE) $(LIKWID_DEFINES) -o $@ test-likwidAPI.c $(LIKWID_LIB) -lm
+
+test-msr-access: test-msr-access.c
+ gcc -o $@ test-msr-access.c
-stream: stream.c
- icc -O3 $(INCLUDES) -mmic -openmp -o $@ -DLIKWID_PERFMON stream.c $(LIKWID_LIB) -lm
+streamICC: stream.c
+ if [ $(ICC_AVAILABLE) -ne 0 ]; then icc -O3 -xHost -std=c99 $(LIKWID_INCLUDES) -openmp -o $@ $(LIKWID_DEFINES) stream.c $(LIKWID_LIB) -lm; fi
+
+streamGCC_C11: stream.cc
+ @if [ $(GCC_C11_SUPPORT_MAJOR) -eq 4 -a $(GCC_C11_SUPPORT_MINOR) -gt 8 ]; then g++ -O3 -std=c++11 -pthread -o $@ $(LIKWID_DEFINES) stream.cc $(LIKWID_LIB) -lm; fi
+ @if [ $(GCC_C11_SUPPORT_MAJOR) -gt 4 ]; then g++ -O3 -std=c++11 -pthread -o $@ $(LIKWID_DEFINES) stream.cc $(LIKWID_LIB) -lm; fi
+
+streamICC_C11: stream.cc
+ @if [ $(ICPC_AVAILABLE) -ne 0 ]; then icpc -restrict -O3 -std=c++11 -pthread -o $@ $(LIKWID_DEFINES) stream.cc $(LIKWID_LIB) -lm; fi
+
+testmarker-cnt: testmarker-cnt.c
+ gcc -O3 -std=c99 $(LIKWID_INCLUDES) -fopenmp $(LIKWID_DEFINES) -o $@ testmarker-cnt.c $(LIKWID_LIB) -lm
+
+testmarker-omp: testmarker-omp.c
+ gcc -O3 -std=c99 $(LIKWID_INCLUDES) -fopenmp $(LIKWID_DEFINES) -o $@ testmarker-omp.c $(LIKWID_LIB)
+
+testmarkerF90: chaos.F90
+ ifort $(LIKWID_INCLUDES) $(LIKWID_DEFINES) -O3 -o $@ chaos.F90 $(LIKWID_LIB) -lpthread
-streamM: stream.c
- gcc -O3 $(INCLUDES) -fopenmp -o $@ -DLIKWID_PERFMON stream.c $(LIKWID_LIB) -lm
+test-mpi: MPI_pin_test.c
+ mpicc -O2 -fopenmp -D_GNU_SOURCE -o $@ MPI_pin_test.c
-testmarker:
- gcc -O3 -std=c99 $(INCLUDES) -fopenmp -DLIKWID_PERFMON -o $@ testmarker-cnt.c $(LIKWID_LIB) -lm
+stream_cilk: stream_cilk.c
+ @if [ $(ICC_AVAILABLE) -ne 0 ]; then icc -O3 $(LIKWID_DEFINES) $(LIKWID_INCLUDES) -o $@ stream_cilk.c $(LIKWID_LIB); fi
-testmarker-omp:
- gcc -O3 -std=c99 $(INCLUDES) -openmp -DLIKWID_PERFMON -o $@ testmarker-omp.c $(LIKWID_LIB)
+testTBBGCC:
+ @if [ $(TBB_AVAILABLE) -ne 0 ]; then g++ -O3 $(LIKWID_DEFINES) $(LIKWID_INCLUDES) -o $@ testTBB.cc -ltbb $(LIKWID_LIB); fi
-test-mpi:
- mpicc -DMAX_NUM_THREADS=128 -O2 -openmp -I../src/includes -I../GCC -D_GNU_SOURCE -o $@ MPI_pin_test.c $(LIKWID_LIB)
+testTBBICC:
+ @if [ $(TBB_AVAILABLE) -ne 0 -a $(ICPC_AVAILABLE) -ne 0 ]; then icpc -O3 $(LIKWID_DEFINES) $(LIKWID_INCLUDES) -o $@ testTBB.cc -ltbb $(LIKWID_LIB); else echo "Either TBB or ICPC missing"; fi
-.PHONY: clean
+.PHONY: clean streamGCC streamICC streamGCC_C11 streamICC_C11 testmarker-cnt testmarker-omp testmarkerF90 test-mpi stream_cilk serial test-likwidAPI streamAPIGCC test-msr-access testTBBGCC testTBBICC
-clean:
- rm -f stream streamM testmarker testmarkerF90
+clean:
+ rm -f streamGCC streamICC streamGCC_C11 streamICC_C11 stream_cilk testmarker-cnt testmarkerF90 test-mpi testmarker-omp serial test-likwidAPI streamAPIGCC test-msr-access testTBBGCC testTBBICC
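
The reworked test Makefile links against an installed LIKWID (PREFIX taken from ../config.mk) and only builds the ICC/ICPC/TBB variants when the corresponding tools are detected. A hedged usage sketch; the group name MEM and the core selection S0:0 are only examples:

    make streamGCC
    likwid-perfctr -C S0:0 -g MEM -m ./streamGCC
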
diff --git a/test/accuracy/Makefile b/test/accuracy/Makefile
index f84b1cd..0740b0c 100644
--- a/test/accuracy/Makefile
+++ b/test/accuracy/Makefile
@@ -1,25 +1,30 @@
LIKWID_PATH=../..
+LIKWID_BENCH_PATH=../../bench
LIKWID_APP=likwid-bench
HOST=$(shell hostname -s)
-all: plain marker
-
-plain:
- sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = false#'/g $(LIKWID_PATH)/config.mk
- sed -i -e s/'CPPFLAGS := -DPAPI '/'CPPFLAGS := '/g $(LIKWID_PATH)/Makefile
- cd $(LIKWID_PATH) && make distclean && make
- cp $(LIKWID_PATH)/$(LIKWID_APP) $(LIKWID_APP)-plain
+all: clean marker localize_likwid
marker:
- sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = true#'/g $(LIKWID_PATH)/config.mk
- sed -i -e s/'CPPFLAGS := -DPAPI '/'CPPFLAGS := '/g $(LIKWID_PATH)/Makefile
- cd $(LIKWID_PATH) && make distclean && make
- cp $(LIKWID_PATH)/$(LIKWID_APP) $(LIKWID_APP)-marker
+ @echo "===> Building instrumented likwid-bench"
+ @sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = true#'/g $(LIKWID_PATH)/config.mk
+ @sed -i -e s/'CPPFLAGS := -DPAPI '/'CPPFLAGS := '/g $(LIKWID_PATH)/Makefile
+ @cd $(LIKWID_PATH) && make distclean >/dev/null && make >/dev/null 2>/dev/null
+ @cp $(LIKWID_BENCH_PATH)/$(LIKWID_APP) $(LIKWID_APP)-marker
+
papi:
- sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = false#'/g $(LIKWID_PATH)/config.mk
- cp $(LIKWID_PATH)/Makefile $(LIKWID_PATH)/Makefile.orig
- sed -i -e s/'CPPFLAGS := '/'CPPFLAGS := -DPAPI '/g $(LIKWID_PATH)/Makefile
- cd $(LIKWID_PATH) && make distclean && make
- cp $(LIKWID_PATH)/$(LIKWID_APP) $(LIKWID_APP)-papi
- mv $(LIKWID_PATH)/Makefile.orig $(LIKWID_PATH)/Makefile
+ @echo "===> Building instrumented likwid-bench using PAPI API"
+ @sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = false#'/g $(LIKWID_PATH)/config.mk
+ @cp $(LIKWID_PATH)/Makefile $(LIKWID_PATH)/Makefile.orig
+ @sed -i -e s/'CPPFLAGS := '/'CPPFLAGS := -DPAPI '/g $(LIKWID_PATH)/Makefile
+ @cd $(LIKWID_PATH) && make distclean >/dev/null && make >/dev/null 2>/dev/null
+ @cp $(LIKWID_BENCH_PATH)/$(LIKWID_APP) $(LIKWID_APP)-papi
+ @mv $(LIKWID_PATH)/Makefile.orig $(LIKWID_PATH)/Makefile
+
+localize_likwid:
+ @cd $(LIKWID_PATH) && make local >/dev/null && cd - >/dev/null
+
+clean:
+ @echo "===> Cleaning old likwid-bench executables"
+ @rm -f $(LIKWID_APP)-plain $(LIKWID_APP)-marker $(LIKWID_APP)-papi
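
The accuracy Makefile now rebuilds LIKWID itself, toggling INSTRUMENT_BENCH in config.mk via sed, copies the instrumented likwid-bench from the bench/ directory and prepares a locally usable build via the top-level 'make local' target. A typical sequence would simply be:

    make          # clean + marker + localize_likwid (default target)
    make papi     # optionally also build the PAPI-instrumented variant
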
diff --git a/test/accuracy/README b/test/accuracy/README
index 9dd8a78..6baaa01 100644
--- a/test/accuracy/README
+++ b/test/accuracy/README
@@ -1,6 +1,6 @@
LIKWID accuracy tester
-likwid-tester and likwid-tester-plot are test applications written in Perl. The likwid-accuracy.py application does the same but is written in Python.
+The likwid-accuracy.py application, written in Python, tests the accuracy of LIKWID's measurements by comparing the values calculated by likwid-bench with the values measured by an instrumented likwid-bench. By scaling the calculated likwid-bench results, it also takes write-allocates into account.
Usage:
make #build non-instrumented and LIKWID-instrumented versions of
@@ -9,10 +9,11 @@ Adjust test files in TESTS.
Adjust test set file SET.txt or use the -s/--sets switch on commandline.
likwid-accuracy.py #Runs the tests of all sets and saves results in folder RESULTS/<hostname>
+You should pass at least one plotting option on the command line, otherwise no plot files are written.
+
Options for likwid-accuracy.py:
--pgf: Create a TeX file containing the definition of a PGF plot with suffix .tex -> .pdf
--grace: Create grace batch file for further manipulation with XMgrace or create plot with gracebat .agr/.bat -> .png
--gnuplot: Create GNUplot script .plot -> .jpg
---script: Create a Bash script containing all commands to create all plots using pdflatex, gracebat and gnuplot.
+--script: Create a Bash script containing all commands to create all plots using pdflatex, gracebat and/or gnuplot.
--scriptname: Set name for Bash script, default is $CWD/create_plots.sh
---wiki/--only_wiki: Create a Wiki page for the Google Code Wiki including the .png pics found in Google Code Wiki picture path (http://<project>.googlecode.com/svn/wiki/images).
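
Putting the pieces together, a complete accuracy run could look like the following; the set names are only an example, and running the generated create_plots.sh assumes the respective plotting tools are installed:

    make
    ./likwid-accuracy.py -s L2,L3,MEM --gnuplot --script
    cd RESULTS/$(hostname) && ./create_plots.sh
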
diff --git a/test/accuracy/TESTS/BRANCH.txt b/test/accuracy/TESTS/BRANCH.txt
new file mode 100644
index 0000000..11efe50
--- /dev/null
+++ b/test/accuracy/TESTS/BRANCH.txt
@@ -0,0 +1,42 @@
+REGEX_BENCH NOTHING
+REGEX_PERF \|\s+Instructions per branch\s+\|\s+([0-9\.e\+\-]+)
+
+TEST load
+RUNS 5
+WA_FACTOR 11.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT 4MB 7500
+VARIANT 1GB 50
+
+TEST triad
+RUNS 5
+WA_FACTOR 19.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT 4MB 7500
+VARIANT 1GB 50
+
+TEST copy
+RUNS 5
+WA_FACTOR 11.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT 4MB 7500
+VARIANT 1GB 50
+
+TEST stream
+RUNS 5
+WA_FACTOR 19.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT 4MB 7500
+VARIANT 1GB 50
+
+TEST store
+RUNS 5
+WA_FACTOR 7.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT 4MB 7500
+VARIANT 1GB 50
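
The file above and the other files below TESTS/ share the same plain-text format: REGEX_BENCH and REGEX_PERF are the regular expressions used to extract the reference value from the likwid-bench output and the measured value from the likwid-perfctr output, and each TEST block names a likwid-bench kernel, the number of RUNS per size, an optional WA_FACTOR used to scale the calculated value for write-allocate traffic, and one VARIANT line per working-set size with an iteration count. A minimal block (values here are purely illustrative) looks like:

    TEST copy
    RUNS 5
    WA_FACTOR 1.5
    VARIANT 1MB 10000
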
diff --git a/test/accuracy/TESTS/CLOCK.txt b/test/accuracy/TESTS/CLOCK.txt
new file mode 100644
index 0000000..3ee855c
--- /dev/null
+++ b/test/accuracy/TESTS/CLOCK.txt
@@ -0,0 +1,53 @@
+REGEX_BENCH Instructions:\s+([0-9]+)
+REGEX_PERF \|\s+INSTR_RETIRED_ANY\s+\|\s+FIXC0\s+\|\s+([0-9\.e\+\-]+)
+
+
+TEST daxpy
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+
+TEST ddot
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+TEST copy
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+TEST load
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+TEST store
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+TEST stream
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+TEST triad
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
diff --git a/test/accuracy/TESTS/DATA.txt b/test/accuracy/TESTS/DATA.txt
new file mode 100644
index 0000000..454d10b
--- /dev/null
+++ b/test/accuracy/TESTS/DATA.txt
@@ -0,0 +1,34 @@
+REGEX_BENCH NOTHING
+REGEX_PERF \|\s+Load to store ratio\s+\|\s+([0-9\.e\+\-]+)
+
+TEST store
+RUNS 5
+WA_FACTOR 0.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT 4MB 7500
+VARIANT 1GB 50
+
+TEST copy
+RUNS 5
+WA_FACTOR 1.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT 4MB 7500
+VARIANT 1GB 50
+
+TEST stream
+RUNS 5
+WA_FACTOR 2.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT 4MB 7500
+VARIANT 1GB 50
+
+TEST triad
+RUNS 5
+WA_FACTOR 3.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT 4MB 7500
+VARIANT 1GB 50
diff --git a/test/accuracy/TESTS/FLOPS_AVX.txt b/test/accuracy/TESTS/FLOPS_AVX.txt
index f5ce80e..7c2ea39 100644
--- a/test/accuracy/TESTS/FLOPS_AVX.txt
+++ b/test/accuracy/TESTS/FLOPS_AVX.txt
@@ -1,5 +1,13 @@
REGEX_BENCH MFlops\/s:\s+([0-9]+)
-REGEX_PERF \|\s+DP MFlops\/s\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+Packed DP MFLOP\/s\s+\|\s+([0-9\.e\+\-]+)
+
+
+TEST triad_avx
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
TEST stream_avx
RUNS 10
@@ -8,10 +16,23 @@ VARIANT 128kB 10000
VARIANT 2MB 5000
VARIANT 1GB 50
-TEST triad_avx
+TEST daxpy_avx
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST ddot_avx
RUNS 10
VARIANT 24kB 20000
VARIANT 128kB 10000
VARIANT 2MB 5000
VARIANT 1GB 50
+TEST sum_avx
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
diff --git a/test/accuracy/TESTS/FLOPS_DP.txt b/test/accuracy/TESTS/FLOPS_DP.txt
index da6f8be..810308b 100644
--- a/test/accuracy/TESTS/FLOPS_DP.txt
+++ b/test/accuracy/TESTS/FLOPS_DP.txt
@@ -1,5 +1,5 @@
REGEX_BENCH MFlops\/s:\s+([0-9]+)
-REGEX_PERF \|\s+MFlops\/s\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+MFLOP\/s\s+\|\s+([0-9\.e\+\-]+)
TEST stream
RUNS 10
@@ -9,6 +9,22 @@ VARIANT 128kB 10000
VARIANT 2MB 5000
VARIANT 1GB 50
+TEST stream_sse
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST stream_avx
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
TEST triad
RUNS 10
VARIANT 12kB 20000
@@ -17,3 +33,90 @@ VARIANT 128kB 10000
VARIANT 2MB 5000
VARIANT 1GB 50
+TEST triad_sse
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST triad_avx
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST daxpy
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST daxpy_sse
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST daxpy_avx
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST ddot
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST ddot_sse
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST ddot_avx
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST sum
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST sum_sse
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST sum_avx
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
diff --git a/test/accuracy/TESTS/FLOPS_SP.txt b/test/accuracy/TESTS/FLOPS_SP.txt
index 3bad7d7..72f2a62 100644
--- a/test/accuracy/TESTS/FLOPS_SP.txt
+++ b/test/accuracy/TESTS/FLOPS_SP.txt
@@ -1,5 +1,26 @@
REGEX_BENCH MFlops\/s:\s+([0-9]+)
-REGEX_PERF \|\s+MFlops\/s\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+MFLOP\/s\s+\|\s+([0-9\.e\+\-]+)
+
+TEST sum_sp
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST sum_sp_sse
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST sum_sp_avx
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
TEST stream_sp
RUNS 10
@@ -8,6 +29,20 @@ VARIANT 128kB 10000
VARIANT 2MB 5000
VARIANT 1GB 50
+TEST stream_sp_sse
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST stream_sp_avx
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
TEST triad_sp
RUNS 10
VARIANT 24kB 20000
@@ -15,3 +50,58 @@ VARIANT 128kB 10000
VARIANT 2MB 5000
VARIANT 1GB 50
+TEST triad_sp_sse
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST triad_sp_avx
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST ddot_sp
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST ddot_sp_sse
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST ddot_sp_avx
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST daxpy_sp
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST daxpy_sp_sse
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
+
+TEST daxpy_sp_avx
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT 2MB 5000
+VARIANT 1GB 50
diff --git a/test/accuracy/TESTS/HA.txt b/test/accuracy/TESTS/HA.txt
new file mode 100644
index 0000000..037c980
--- /dev/null
+++ b/test/accuracy/TESTS/HA.txt
@@ -0,0 +1,58 @@
+REGEX_BENCH MByte\/s:\s+([0-9]+)
+REGEX_PERF \|\s+Memory bandwidth \[MBytes\/s\]\s+\|\s+([0-9\.e\+\-]+)
+
+TEST load
+RUNS 10
+WA_FACTOR 1.0
+VARIANT 228863kB 1000
+VARIANT 435096kB 1000
+VARIANT 641329kB 1000
+VARIANT 847562kB 1000
+
+TEST store
+RUNS 10
+WA_FACTOR 2.0
+VARIANT 228863kB 1000
+VARIANT 435096kB 1000
+VARIANT 641329kB 1000
+VARIANT 847562kB 1000
+
+TEST copy
+RUNS 10
+WA_FACTOR 1.5
+VARIANT 228863kB 1000
+VARIANT 435096kB 1000
+VARIANT 641329kB 1000
+VARIANT 847562kB 1000
+
+TEST stream
+RUNS 10
+WA_FACTOR 1.3333
+VARIANT 228863kB 1000
+VARIANT 435096kB 1000
+VARIANT 641329kB 1000
+VARIANT 847562kB 1000
+
+TEST triad
+RUNS 10
+WA_FACTOR 1.25
+VARIANT 228863kB 1000
+VARIANT 435096kB 1000
+VARIANT 641329kB 1000
+VARIANT 847562kB 1000
+
+TEST daxpy
+RUNS 10
+WA_FACTOR 1.0
+VARIANT 228863kB 1000
+VARIANT 435096kB 1000
+VARIANT 641329kB 1000
+VARIANT 847562kB 1000
+
+TEST ddot
+RUNS 10
+WA_FACTOR 1.0
+VARIANT 228863kB 1000
+VARIANT 435096kB 1000
+VARIANT 641329kB 1000
+VARIANT 847562kB 1000
diff --git a/test/accuracy/TESTS/L2.txt b/test/accuracy/TESTS/L2.txt
index 35b2bea..6924c89 100644
--- a/test/accuracy/TESTS/L2.txt
+++ b/test/accuracy/TESTS/L2.txt
@@ -1,38 +1,58 @@
REGEX_BENCH MByte\/s:\s+([0-9]+)
-REGEX_PERF \|\s+L2 bandwidth \[MBytes\/s\]\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+L2 bandwidth \[MBytes\/s\]\s+\|\s+([0-9\.e\+\-]+)
TEST load
RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT 4MB 7500
-VARIANT 1GB 50
+WA_FACTOR 1.0
+VARIANT 85kB 1000
+VARIANT 129kB 1000
+VARIANT 173kB 1000
+VARIANT 217kB 1000
TEST store
RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT 4MB 7500
-VARIANT 1GB 50
+WA_FACTOR 2.0
+VARIANT 85kB 1000
+VARIANT 129kB 1000
+VARIANT 173kB 1000
+VARIANT 217kB 1000
TEST copy
RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT 4MB 7500
-VARIANT 1GB 50
+WA_FACTOR 1.5
+VARIANT 85kB 1000
+VARIANT 129kB 1000
+VARIANT 173kB 1000
+VARIANT 217kB 1000
TEST stream
RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT 4MB 7500
-VARIANT 1GB 50
+WA_FACTOR 1.3333
+VARIANT 85kB 1000
+VARIANT 129kB 1000
+VARIANT 173kB 1000
+VARIANT 217kB 1000
TEST triad
RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT 4MB 7500
-VARIANT 1GB 50
+WA_FACTOR 1.25
+VARIANT 85kB 1000
+VARIANT 129kB 1000
+VARIANT 173kB 1000
+VARIANT 217kB 1000
+TEST daxpy
+RUNS 5
+WA_FACTOR 1.0
+VARIANT 85kB 1000
+VARIANT 129kB 1000
+VARIANT 173kB 1000
+VARIANT 217kB 1000
+
+TEST ddot
+RUNS 5
+WA_FACTOR 1.0
+VARIANT 85kB 1000
+VARIANT 129kB 1000
+VARIANT 173kB 1000
+VARIANT 217kB 1000
diff --git a/test/accuracy/TESTS/L3.txt b/test/accuracy/TESTS/L3.txt
index 8ff6c62..a124cdb 100644
--- a/test/accuracy/TESTS/L3.txt
+++ b/test/accuracy/TESTS/L3.txt
@@ -1,38 +1,58 @@
REGEX_BENCH MByte\/s:\s+([0-9]+)
-REGEX_PERF \|\s+L3 bandwidth \[MBytes\/s\]\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+L3 bandwidth \[MBytes\/s\]\s+\|\s+([0-9\.e\+\-]+)
TEST load
RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT 4MB 7500
-VARIANT 1GB 50
+WA_FACTOR 1.0
+VARIANT 5810kB 1000
+VARIANT 11288kB 1000
+VARIANT 16766kB 1000
+VARIANT 22244kB 1000
TEST store
RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT 4MB 2000
-VARIANT 1GB 50
+WA_FACTOR 2.0
+VARIANT 5810kB 1000
+VARIANT 11288kB 1000
+VARIANT 16766kB 1000
+VARIANT 22244kB 1000
TEST copy
RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT 4MB 2000
-VARIANT 1GB 50
+WA_FACTOR 1.5
+VARIANT 5810kB 1000
+VARIANT 11288kB 1000
+VARIANT 16766kB 1000
+VARIANT 22244kB 1000
TEST stream
RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT 4MB 2000
-VARIANT 1GB 50
+WA_FACTOR 1.333
+VARIANT 5810kB 1000
+VARIANT 11288kB 1000
+VARIANT 16766kB 1000
+VARIANT 22244kB 1000
TEST triad
RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT 4MB 2000
-VARIANT 1GB 50
+WA_FACTOR 1.333
+VARIANT 5810kB 1000
+VARIANT 11288kB 1000
+VARIANT 16766kB 1000
+VARIANT 22244kB 1000
+TEST daxpy
+RUNS 5
+WA_FACTOR 1.0
+VARIANT 5810kB 1000
+VARIANT 11288kB 1000
+VARIANT 16766kB 1000
+VARIANT 22244kB 1000
+
+TEST ddot
+RUNS 5
+WA_FACTOR 1.0
+VARIANT 5810kB 1000
+VARIANT 11288kB 1000
+VARIANT 16766kB 1000
+VARIANT 22244kB 1000
diff --git a/test/accuracy/TESTS/MEM.txt b/test/accuracy/TESTS/MEM.txt
index 09993f6..71288a4 100644
--- a/test/accuracy/TESTS/MEM.txt
+++ b/test/accuracy/TESTS/MEM.txt
@@ -1,38 +1,58 @@
REGEX_BENCH MByte\/s:\s+([0-9]+)
-REGEX_PERF \|\s+Memory bandwidth \[MBytes\/s\]\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+Memory bandwidth \[MBytes\/s\]\s+\|\s+([0-9\.e\+\-]+)
TEST load
RUNS 10
-VARIANT 24kB 20000
-VARIANT 128kB 10000
-VARIANT 2MB 7500
-VARIANT 1GB 50
+WA_FACTOR 1.0
+VARIANT 218725kB 1000
+VARIANT 426801kB 1000
+VARIANT 634877kB 1000
+VARIANT 842953kB 1000
TEST store
RUNS 10
-VARIANT 24kB 20000
-VARIANT 128kB 10000
-VARIANT 2MB 7500
-VARIANT 1GB 50
+WA_FACTOR 2.0
+VARIANT 218725kB 1000
+VARIANT 426801kB 1000
+VARIANT 634877kB 1000
+VARIANT 842953kB 1000
TEST copy
RUNS 10
-VARIANT 24kB 20000
-VARIANT 128kB 10000
-VARIANT 2MB 7500
-VARIANT 1GB 50
+WA_FACTOR 1.5
+VARIANT 218725kB 1000
+VARIANT 426801kB 1000
+VARIANT 634877kB 1000
+VARIANT 842953kB 1000
TEST stream
RUNS 10
-VARIANT 24kB 20000
-VARIANT 128kB 10000
-VARIANT 2MB 7500
-VARIANT 1GB 50
+WA_FACTOR 1.3333
+VARIANT 218725kB 1000
+VARIANT 426801kB 1000
+VARIANT 634877kB 1000
+VARIANT 842953kB 1000
TEST triad
RUNS 10
-VARIANT 24kB 20000
-VARIANT 128kB 10000
-VARIANT 2MB 7500
-VARIANT 1GB 50
+WA_FACTOR 1.25
+VARIANT 218725kB 1000
+VARIANT 426801kB 1000
+VARIANT 634877kB 1000
+VARIANT 842953kB 1000
+TEST daxpy
+RUNS 10
+WA_FACTOR 1.0
+VARIANT 218725kB 1000
+VARIANT 426801kB 1000
+VARIANT 634877kB 1000
+VARIANT 842953kB 1000
+
+TEST ddot
+RUNS 10
+WA_FACTOR 1.0
+VARIANT 218725kB 1000
+VARIANT 426801kB 1000
+VARIANT 634877kB 1000
+VARIANT 842953kB 1000
diff --git a/test/accuracy/TESTS/UOPS.txt b/test/accuracy/TESTS/UOPS.txt
new file mode 100644
index 0000000..1ebb4fe
--- /dev/null
+++ b/test/accuracy/TESTS/UOPS.txt
@@ -0,0 +1,30 @@
+REGEX_BENCH UOPs:\s+([0-9]+)
+REGEX_PERF \|\s+Retired UOPs\s+\|\s+([0-9\.e\+\-]+)
+
+TEST ddot
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+TEST stream
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+TEST daxpy
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+TEST triad
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
diff --git a/test/accuracy/likwid-accuracy.py b/test/accuracy/likwid-accuracy.py
index 3d2d63c..916ed38 100755
--- a/test/accuracy/likwid-accuracy.py
+++ b/test/accuracy/likwid-accuracy.py
@@ -16,6 +16,8 @@ bench_marker = "./likwid-bench-marker"
bench_papi = "./likwid-bench-papi"
perfctr = "../../likwid-perfctr"
topology = "../../likwid-topology"
+topology_name = re.compile("^CPU name:\s+(.*)")
+topology_stepping = re.compile("^CPU stepping:\s+(\d*)")
topology_type = re.compile("^CPU type:\s+(.*)")
topology_sockets = re.compile("^Sockets:\s+(\d+)")
topology_corespersocket = re.compile("^Cores per socket:\s+(\d+)")
@@ -24,10 +26,18 @@ testlist = "SET.txt"
testfolder = "TESTS"
resultfolder = "RESULTS"
hostname = socket.gethostname()
-picture_base = "http://likwid.googlecode.com/svn/wiki/images"
+picture_base = ".."
+topology_outputfile = "topology.dat"
+nrThreads = 1
-gnu_colors = ["red","blue","green"]#,"black","brown", "gray","violet", "cyan", "magenta","orange","#4B0082","#800000","turquoise","#006400","yellow"]
-gnu_marks = [5,13,9]#,2,3,4,6,7,8,9,10,11,12,14,15]
+gnu_colors = ["red","blue","green","black"]#,"brown", "gray","violet", "cyan", "magenta","orange","#4B0082","#800000","turquoise","#006400","yellow"]
+gnu_marks = [5,13,9,2]#,3,4,6,7,8,9,10,11,12,14,15]
+
+units = { "L2" : "MByte/s", "L3" : "MByte/s", "MEM" : "MByte/s", "HA" : "MByte/s",
+ "FLOPS_SP" : "MFLOP/s", "FLOPS_DP" : "MFLOP/s", "FLOPS_AVX" : "MFLOP/s",
+ "DATA": "Load/Store ratio", "BRANCH" : "Instructions per branch",
+ "CLOCK" : "Instructions", "UOPS" : "UOPs"}
+translate_group = {"CLOCK" : "INST_RETIRED_ANY", "UOPS" : "UOPS_RETIRED_ANY"}
wiki = False
papi = False
@@ -38,14 +48,28 @@ out_gnuplot = False
out_grace = False
scriptfilename = "create_plots.sh"
out_script = False
+test_set = {}
+plain_set = {}
+corrected_set = {}
+marker_set = {}
+papi_set = {}
+
+if not os.path.exists(bench_marker):
+ print "Please run make before using likwid-accuracy.py"
+ sys.exit(1)
+if not os.path.exists(perfctr):
+ print "Cannot find likwid-perfctr"
+ sys.exit(1)
+
def usage():
print "Execute and evaluate accuracy tests for LIKWID with likwid-bench and likwid-perfctr"
print
print "-h/--help:\tPrint this help text"
print "-s/--sets:\tSpecifiy testgroups (comma separated). Can also be set in SET.txt"
- print "--wiki:\t\tBesides testing write out results in Google code wiki syntax"
- print "--only_wiki:\tDo not run benchmarks, read results from file and write out results in Google code wiki syntax"
+# print "--wiki:\t\tBesides testing write out results in Google code wiki syntax"
+# print "--only_wiki:\tDo not run benchmarks, read results from file and write out results in Google code wiki syntax"
+ print "-c <nrThreads>:\tSet number of threads. The accuracy tool uses the E notation of likwid like E:N:<nrThreads>:1:2. Default is 1 thread."
print "Picture options:"
print "--pgf:\t\tCreate TeX document for each test with PGFPlot"
print "--gnuplot:\tCreate GNUPlot script for each test"
@@ -53,30 +77,6 @@ def usage():
print "--script:\tActivate recording of commands in a bash script"
print "--scriptname:\tRecord commands to create pictures in file (default: %s)" % (os.path.join(os.path.join(resultfolder,hostname),scriptfilename))
-def get_system_info():
- name = None
- sockets = 0
- corespersocket = 0
- threadspercore = 0
-
- p = subprocess.Popen(topology, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
- p.wait()
- if p.returncode != 0:
- name = "Unknown system"
- return
- for line in p.stdout.read().split("\n"):
- if not line.strip() or line.startswith("*") or line.startswith("-"): continue
- if line.startswith("CPU type"):
- name = topology_type.match(line).group(1).strip()
- if line.startswith("Sockets"):
- sockets = int(topology_sockets.match(line).group(1))
- if line.startswith("Cores per socket"):
- corespersocket = int(topology_corespersocket.match(line).group(1))
- if line.startswith("Threads per core"):
- threadspercore = int(topology_threadspercore.match(line).group(1))
- if name and sockets > 0 and corespersocket > 0 and threadspercore > 0:
- break
- return name, sockets, corespersocket, threadspercore
def get_groups():
groups = {}
@@ -87,8 +87,10 @@ def get_groups():
for line in p.stdout.read().split("\n"):
if line.startswith("-") or not line.strip(): continue
if line.startswith("Available"): continue
- name, description = line.split(":")
- groups[name.strip()] = description.strip()
+ linelist = re.split("\s+", line.strip())
+ name = linelist[0]
+ description = " ".join(linelist[1:])
+ groups[name] = description
return groups
def get_test_groups(groupdict):
@@ -99,52 +101,103 @@ def get_test_groups(groupdict):
setfp = open("SET.txt",'r')
setlist = setfp.read().strip().split("\n")
setfp.close()
-
+
filelist = glob.glob(testfolder+"/*.txt")
for name in setlist:
- tests = []
- file = os.path.join(testfolder, name) + ".txt"
- if not os.path.exists(file): continue
- fp = open(file,'r')
- finput = fp.read().strip().split("\n")
- fp.close()
- for line in finput:
- if line.startswith("TEST"):
- tests.append(line.split(" ")[1])
- groups[name] = tests
-
-
+ if name in get_groups():
+ tests = []
+ file = os.path.join(testfolder, name) + ".txt"
+ if not os.path.exists(file): continue
+ fp = open(file,'r')
+ finput = fp.read().strip().split("\n")
+ fp.close()
+ for line in finput:
+ if line.startswith("TEST"):
+ tests.append(line.split(" ")[1])
+ groups[name] = tests
+
+
return groups
-
-def get_values_from_file(file, lineoffset, linecount):
- results = []
- fp = open(file,'r')
- finput = fp.read().strip().split("\n")
- fp.close()
+
+def write_topology(path):
+ try:
+ f = open(os.path.join(path, topology_outputfile),"w")
+ except:
+ print "Cannot write topology file %s" % (os.path.join(path, topology_outputfile),)
+ return
+ p = subprocess.Popen(topology, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+ p.wait()
+ if p.returncode != 0:
+ return
+ f.write(p.stdout.read())
+ f.close()
+
+def approx(in1, in2):
+ if in1 > (0.95*in2) and in1 < (1.05*in2):
+ return 1
+ return 0
+
+def legend(file1, file2):
+ input1 = []
+ input2 = []
+ numbers1 = []
+ numbers2 = []
+ try:
+ f=open(file1,"r")
+ input1 = f.read().strip().split("\n")
+ f.close()
+ except:
+ print "Cannot open file "+file1
try:
- for line in finput[lineoffset:lineoffset+linecount]:
- results.append(float(line.split(" ")[1]))
+ f=open(file2,"r")
+ input2 = f.read().strip().split("\n")
+ f.close()
except:
- print "Cannot read file %s from %d to %d" % (file, lineoffset,lineoffset+linecount, )
- for line in finput[lineoffset:lineoffset+linecount]:
- print line
- return results
+ print "Cannot open file "+file2
+ if len(input1) == 0 and len(input2) == 0:
+ return "no"
+ for line in input1:
+ numbers1.append(line.split(" ")[1])
+ for line in input2:
+ numbers2.append(line.split(" ")[1])
+ if float(numbers1[0]) > float(numbers1[-1]) and float(numbers2[0]) > float(numbers2[-1]):
+ return "no"
+ elif float(numbers1[0]) < float(numbers1[-1]) and float(numbers2[0]) < float(numbers2[-1]):
+ return "so"
+ elif approx(float(numbers1[0]), float(numbers1[-1])) and approx(float(numbers2[0]), float(numbers2[-1])):
+ return "so"
+ return "no"
+
-def write_pgf(group, test, plain_file, marker_file, papi_file=None, execute=False, script=None):
- filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".tex")
+def write_pgf(group, test, plain_file, marker_file, scale=0.0,papi_file=None, execute=False, script=None):
+ printgrp = group
+ if translate_group.has_key(group):
+ printgrp = translate_group[group]
+ filename = os.path.join(os.path.join(resultfolder,hostname),printgrp+"_"+test+".tex")
+ sizelist = []
+ sizeindex = []
+ lentry = "north east"
+ if legend(plain_file, marker_file) == "so":
+ lentry = "south east"
+ for i,variant in enumerate(test_set[group][test]["variants"]):
+ sizelist.append(variant)
+ sizeindex.append(str((i+0.5)*test_set[group][test]["RUNS"]))
fp = open(filename,'w')
fp.write("\documentclass{article}\n")
fp.write("\usepackage{pgfplots}\n")
fp.write("\\begin{document}\n")
fp.write("% cut from here\n")
fp.write("\\begin{tikzpicture}\n")
- fp.write("\\begin{axis}[xlabel={Run}, ylabel={MFlops/s / MBytes/s},title={%s\_%s},legend pos=south east,xtick=data,width=.75\\textwidth]\n" % (group.replace("_","\_"),test.replace("_","\_"),))
+ fp.write("\\begin{axis}[xmin=0,xmax=%d,xlabel={Size - %d runs each}, ylabel={%s},title={Group %s - Test %s},legend pos=%s,xtick=data,width=.75\\textwidth,xticklabels={%s},xtick={%s}]\n" % (test_set[group][test]["RUNS"]*len(test_set[group][test]["variants"]),test_set[group][test]["RUNS"],units[group],printgrp.replace("_","\_"),test.replace("_","\_"),lentry,",".join(sizelist),",".join(sizeindex)))
fp.write("\\addplot+[red,mark=square*,mark options={draw=red, fill=red}] table {%s};\n" % (os.path.basename(plain_file),))
- fp.write("\\addlegendentry{plain};\n")
- fp.write("\\addplot+[blue,mark=diamond*,mark options={draw=blue, fill=blue}] table {%s};\n" % (os.path.basename(marker_file),))
- fp.write("\\addlegendentry{marker};\n")
+ fp.write("\\addlegendentry{bench};\n")
+ if scale > 0.0:
+ fp.write("\\addplot+[blue,mark=*,mark options={draw=blue, fill=blue}] table[x index=0, y expr=\\thisrowno{1}*%f] {%s};\n" % (scale, os.path.basename(plain_file),))
+ fp.write("\\addlegendentry{scaled bench};\n")
+ fp.write("\\addplot+[green,mark=diamond*,mark options={draw=green, fill=green}] table {%s};\n" % (os.path.basename(marker_file),))
+ fp.write("\\addlegendentry{perfctr};\n")
if papi and papi_file:
- fp.write("\\addplot+[green,mark=triangle*,mark options={draw=green, fill=green}] table {%s};\n" % (os.path.basename(papi_file),))
+ fp.write("\\addplot+[black,mark=triangle*,mark options={draw=black, fill=black}] table {%s};\n" % (os.path.basename(papi_file),))
fp.write("\\addlegendentry{papi};\n")
fp.write("\\end{axis}\n")
fp.write("\\end{tikzpicture}\n")
@@ -161,21 +214,36 @@ def write_pgf(group, test, plain_file, marker_file, papi_file=None, execute=Fals
if script:
script.write("pdflatex %s\n" % (os.path.basename(filename),))
return filename
-
-def write_gnuplot(group, test, plain_file, marker_file, papi_file, execute=False, script=None):
- filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".plot")
+
+def write_gnuplot(group, test, plain_file, marker_file, scale = 1.0, papi_file=None, execute=False, script=None):
+ printgrp = group
+ if translate_group.has_key(group):
+ printgrp = translate_group[group]
+ filename = os.path.join(os.path.join(resultfolder,hostname),printgrp+"_"+test+".plot")
fp = open(filename,'w')
for i,color in enumerate(gnu_colors):
fp.write("set style line %d linetype 1 linecolor rgb '%s' lw 2 pt %s\n" % (i+1, color,gnu_marks[i]))
fp.write("set terminal jpeg\n")
- fp.write("set title '%s_%s'\n" % (group, test,))
- fp.write("set output '%s'\n" % (os.path.basename(os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".jpg")),))
- fp.write("set xlabel 'Run'\n")
- fp.write("set ylabel 'MFlops/s / MBytes/s'\n")
- #fp.write("set xtics 1\n")
- plot_string = "plot '%s' using 1:2 title 'plain' with linespoints ls 1, \\\n '%s' using 1:2 title 'marker' with linespoints ls 2" % (os.path.basename(plain_file), os.path.basename(marker_file),)
+ fp.write("set encoding utf8\n")
+ fp.write("set title 'Group %s - Test %s'\n" % (printgrp, test,))
+ if legend(plain_file, marker_file) == "no":
+ fp.write("set key top right\n")
+ else:
+ fp.write("set key bottom right\n")
+ fp.write("set output '%s'\n" % (os.path.basename(os.path.join(os.path.join(resultfolder,hostname),printgrp+"_"+test+".jpg")),))
+ fp.write("set xlabel 'Size - %d runs each'\n" % (test_set[group][test]["RUNS"],))
+ fp.write("set ylabel '%s'\n" % (units[group],))
+ fp.write("set yrange [0:]\n")
+ #fp.write("set xtics 0,%d,%d\n" % (test_set[group][test]["RUNS"], test_set[group][test]["RUNS"]*len(test_set[group][test]["variants"]),))
+ fp.write("set xtics %d\n" % (test_set[group][test]["RUNS"]*len(test_set[group][test]["variants"]),))
+ for i,variant in enumerate(test_set[group][test]["variants"]):
+ fp.write("set xtics add (\"%s\" %f)\n" % (variant, (i*test_set[group][test]["RUNS"])+(0.5*test_set[group][test]["RUNS"]),))
+ plot_string = "plot '%s' using 1:2 title 'bench' with linespoints ls 1, \\\n" % (os.path.basename(plain_file),)
+ if scale > 0.0:
+ plot_string = plot_string+" '%s' using 1:($2*%f) title 'scaled bench' with linespoints ls 2, \\\n" % (os.path.basename(plain_file), scale,)
+ plot_string = plot_string+" '%s' using 1:2 title 'perfctr' with linespoints ls 3" % (os.path.basename(marker_file),)
if papi and papi_file:
- plot_string += ", \\\n '%s' using 1:2 title 'papi' with linespoints ls 3\n" % (os.path.basename(papi_file),)
+ plot_string += ", \\\n '%s' using 1:2 title 'papi' with linespoints ls 4\n" % (os.path.basename(papi_file),)
fp.write(plot_string+"\n")
fp.close()
if execute:
@@ -190,31 +258,38 @@ def write_gnuplot(group, test, plain_file, marker_file, papi_file, execute=False
script.write("gnuplot %s\n" % (os.path.basename(filename),))
return filename
-def write_grace(group, test, plain_file, marker_file, papi_file=None, execute=False, script=None):
- filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".bat")
- agrname = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".agr")
- pngname = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".png")
+def write_grace(group, test, plain_file, correct_file, marker_file, papi_file=None, execute=False, script=None):
+ printgrp = group
+ if translate_group.has_key(group):
+ printgrp = translate_group[group]
+ filename = os.path.join(os.path.join(resultfolder,hostname),printgrp+"_"+test+".bat")
+ agrname = os.path.join(os.path.join(resultfolder,hostname),printgrp+"_"+test+".agr")
+ pngname = os.path.join(os.path.join(resultfolder,hostname),printgrp+"_"+test+".png")
if execute or script:
plain_file = os.path.basename(plain_file)
marker_file = os.path.basename(marker_file)
+ correct_file = os.path.basename(correct_file)
if papi_file: papi_file = os.path.basename(papi_file)
pngname = os.path.basename(pngname)
agrname = os.path.basename(agrname)
- cmd_options = "-autoscale xy -nxy %s -nxy %s "% (plain_file,marker_file,)
+ cmd_options = "-autoscale xy -nxy %s -nxy %s -nxy %s " % (plain_file, correct_file, marker_file,)
if papi and papi_file:
cmd_options += "-nxy %s " % (papi_file,)
out_options = "-hdevice PNG -printfile %s " % (pngname,)
out_options += "-saveall %s" % (agrname,)
fp = open(filename,'w')
- fp.write("title \"%s_%s\"\n" % (group, test,))
+ fp.write("title \"Group %s - Test %s\"\n" % (printgrp, test,))
fp.write("xaxis label \"Run\"\n")
fp.write("xaxis label char size 1.2\n")
- fp.write("xaxis ticklabel char size 1.2\n")
- fp.write("yaxis label \"MFlops/s / MBytes/s\"\n")
+ fp.write("xaxis ticklabel char size 1.2\n" % (units[group],))
+ fp.write("yaxis label \"%s\"\n")
fp.write("yaxis label char size 1.2\n")
fp.write("yaxis ticklabel char size 1.2\n")
- fp.write("legend 0.8,0.7\n")
- fp.write("s0 legend \"plain\"\n")
+ if legend(plain_file, marker_file) == "no":
+ fp.write("legend 0.8,0.7\n")
+ else:
+ fp.write("legend 0.2,0.7\n")
+ fp.write("s0 legend \"bench\"\n")
fp.write("s0 symbol 2\n")
fp.write("s0 symbol size 1\n")
fp.write("s0 symbol color 2\n")
@@ -228,7 +303,7 @@ def write_grace(group, test, plain_file, marker_file, papi_file=None, execute=Fa
fp.write("s0 line linestyle 1\n")
fp.write("s0 line linewidth 2\n")
fp.write("s0 line pattern 1\n")
- fp.write("s1 legend \"marker\"\n")
+ fp.write("s1 legend \"scaled bench\"\n")
fp.write("s1 symbol 3\n")
fp.write("s1 symbol size 1\n")
fp.write("s1 symbol color 4\n")
@@ -242,21 +317,35 @@ def write_grace(group, test, plain_file, marker_file, papi_file=None, execute=Fa
fp.write("s1 line linestyle 1\n")
fp.write("s1 line linewidth 2\n")
fp.write("s1 line pattern 1\n")
+ fp.write("s2 legend \"perfctr\"\n")
+ fp.write("s2 symbol 4\n")
+ fp.write("s2 symbol size 1\n")
+ fp.write("s2 symbol color 3\n")
+ fp.write("s2 symbol pattern 1\n")
+ fp.write("s2 symbol fill color 3\n")
+ fp.write("s2 symbol fill pattern 1\n")
+ fp.write("s2 symbol linewidth 2\n")
+ fp.write("s2 symbol linestyle 1\n")
+ fp.write("s2 line type 1\n")
+ fp.write("s2 line color 3\n")
+ fp.write("s2 line linestyle 1\n")
+ fp.write("s2 line linewidth 2\n")
+ fp.write("s2 line pattern 1\n")
if papi and papi_file:
- fp.write("s2 legend \"papi\"\n")
- fp.write("s2 symbol 4\n")
- fp.write("s2 symbol size 1\n")
- fp.write("s2 symbol color 3\n")
- fp.write("s2 symbol pattern 1\n")
- fp.write("s2 symbol fill color 3\n")
- fp.write("s2 symbol fill pattern 1\n")
- fp.write("s2 symbol linewidth 2\n")
- fp.write("s2 symbol linestyle 1\n")
- fp.write("s2 line type 1\n")
- fp.write("s2 line color 3\n")
- fp.write("s2 line linestyle 1\n")
- fp.write("s2 line linewidth 2\n")
- fp.write("s2 line pattern 1\n")
+ fp.write("s3 legend \"papi\"\n")
+ fp.write("s3 symbol 5\n")
+ fp.write("s3 symbol size 1\n")
+ fp.write("s3 symbol color \"black\"\n")
+ fp.write("s3 symbol pattern 1\n")
+ fp.write("s3 symbol fill color \"black\"\n")
+ fp.write("s3 symbol fill pattern 1\n")
+ fp.write("s3 symbol linewidth 2\n")
+ fp.write("s3 symbol linestyle 1\n")
+ fp.write("s3 line type 1\n")
+ fp.write("s3 line color \"black\"\n")
+ fp.write("s3 line linestyle 1\n")
+ fp.write("s3 line linewidth 2\n")
+ fp.write("s3 line pattern 1\n")
fp.close()
if execute:
cmd = "cd %s && gracebat %s -param %s %s && cd -" % (os.path.dirname(filename), cmd_options, os.path.basename(filename),out_options,)
@@ -269,8 +358,10 @@ def write_grace(group, test, plain_file, marker_file, papi_file=None, execute=Fa
script.write("gracebat %s -param %s %s\n" % (cmd_options, os.path.basename(filename),out_options,))
return filename
+
+
try:
- opts, args = getopt.getopt(sys.argv[1:], "hs:", ["help", "sets=","script","scriptname=","wiki","only_wiki","pgf","gnuplot","grace","papi"])
+ opts, args = getopt.getopt(sys.argv[1:], "hs:c:", ["help", "sets=","script","scriptname=","wiki","only_wiki=","pgf","gnuplot","grace","papi"])
except getopt.GetoptError as err:
print str(err)
usage()
@@ -288,8 +379,15 @@ for o, a in opts:
wiki = True
if o == "--only_wiki":
only_wiki = True
+ hostname = a
if o == "--papi":
papi = True
+ if o == "-c":
+ try:
+ nrThreads = int(a)
+ except:
+ print "Argument to -c not valid. Must be a number"
+ sys.exit(1)
if o == "--pgf":
out_pgf = True
if o == "--gnuplot":
@@ -303,55 +401,74 @@ for o, a in opts:
if o == "--scriptname":
scriptfilename = a
-if not os.path.exists(testlist):
+if len(sets) == 0 and not os.path.exists(testlist):
print "Cannot find file %s containing list of testgroups" % (testlist,)
sys.exit(1)
if not os.path.exists(testfolder):
print "Cannot find folder %s containing the testgroups" % (testfolder,)
sys.exit(1)
-test_set = {}
-plain_set = {}
-marker_set = {}
-papi_set = {}
-fp = open(testlist,'r')
-for line in fp.read().split("\n"):
+
+if len(sets) == 0:
+ fp = open(testlist,'r')
+ tmp = fp.read().split("\n")
+ for item in tmp:
+ if not item.strip() or item.startswith("#"): continue
+ sets.append(item)
+ fp.close()
+for line in sets:
if not line.strip() or line.startswith("#"): continue
- if os.path.exists("%s/%s.txt" % (testfolder,line.strip(),)):
- test_set[line.strip()] = {}
- plain_set[line.strip()] = {}
- marker_set[line.strip()] = {}
- papi_set[line.strip()] = {}
- testfp = open("%s/%s.txt" % (testfolder,line.strip(),),'r')
+ filename = "%s/%s.txt" % (testfolder,line.strip(),)
+ if os.path.exists(filename):
+ groupname = line.strip()
+ testfp = open(filename,'r')
+ for line in testfp.read().split("\n"):
+ if line.startswith("GROUP"):
+ match = re.match("^GROUP\s+(\.+)")
+ if match:
+ groupname = match.group(1)
+ break
+ testfp.close()
+ test_set[groupname] = {}
+ plain_set[groupname] = {}
+ corrected_set[groupname] = {}
+ marker_set[groupname] = {}
+ papi_set[groupname] = {}
+ testfp = open(filename,'r')
test = None
for i,testline in enumerate(testfp.read().split("\n")):
if test and not testline.strip(): test = None
if testline.startswith("REGEX_BENCH"):
- test_set[line.strip()]["REGEX_BENCH"] = re.compile(" ".join(testline.split(" ")[1:]))
+ test_set[groupname]["REGEX_BENCH"] = re.compile(" ".join(testline.split(" ")[1:]))
if testline.startswith("REGEX_PERF"):
- test_set[line.strip()]["REGEX_PERF"] = re.compile(" ".join(testline.split(" ")[1:]))
+ test_set[groupname]["REGEX_PERF"] = re.compile(" ".join(testline.split(" ")[1:]))
if testline.startswith("REGEX_PAPI"):
- test_set[line.strip()]["REGEX_PAPI"] = re.compile(" ".join(testline.split(" ")[1:]))
+ test_set[groupname]["REGEX_PAPI"] = re.compile(" ".join(testline.split(" ")[1:]))
if testline.startswith("TEST"):
test = testline.split(" ")[1]
- test_set[line.strip()][test] = {}
- plain_set[line.strip()][test] = {}
- marker_set[line.strip()][test] = {}
- papi_set[line.strip()][test] = {}
+ test_set[groupname][test] = {}
+ test_set[groupname][test]["WA_FACTOR"] = 0.0
+ plain_set[groupname][test] = {}
+ corrected_set[groupname][test] = {}
+ marker_set[groupname][test] = {}
+ papi_set[groupname][test] = {}
if testline.startswith("RUNS") and test:
- test_set[line.strip()][test]["RUNS"] = int(testline.split(" ")[1])
+ test_set[groupname][test]["RUNS"] = int(testline.split(" ")[1])
+ if testline.startswith("WA_FACTOR") and test:
+ test_set[groupname][test]["WA_FACTOR"] = float(testline.split(" ")[1])
if testline.startswith("VARIANT") and test:
linelist = re.split("\s+",testline);
variant = linelist[1]
- if not test_set[line.strip()][test].has_key("variants"):
- test_set[line.strip()][test]["variants"] = []
- test_set[line.strip()][test][variant] = linelist[2]
- test_set[line.strip()][test]["variants"].append(linelist[1])
- plain_set[line.strip()][test][variant] = []
- marker_set[line.strip()][test][variant] = []
- papi_set[line.strip()][test][variant] = []
+ if not test_set[groupname][test].has_key("variants"):
+ test_set[groupname][test]["variants"] = []
+ test_set[groupname][test][variant] = linelist[2]
+ test_set[groupname][test]["variants"].append(linelist[1])
+ plain_set[groupname][test][variant] = []
+ corrected_set[groupname][test][variant] = []
+ marker_set[groupname][test][variant] = []
+ papi_set[groupname][test][variant] = []
testfp.close()
-fp.close()
+
if len(test_set.keys()) == 0:
@@ -362,22 +479,24 @@ if not os.path.exists(resultfolder):
os.mkdir(resultfolder)
if not os.path.exists(os.path.join(resultfolder,hostname)):
os.mkdir(os.path.join(resultfolder,hostname))
-
+write_topology(os.path.join(resultfolder,hostname))
if not only_wiki:
scriptfile = os.path.join(os.path.join(resultfolder,hostname),scriptfilename)
script = open(scriptfile,'w')
script.write("#!/bin/bash\n")
for group in test_set.keys():
- perfctr_string = "%s -c S0:0 -g %s -m " % (perfctr,group,)
+ perfctr_string = "%s -C E:N:%d:1:2 -g %s -m " % (perfctr,nrThreads, group,)
+ no_scale = False
for test in test_set[group].keys():
if test.startswith("REGEX"): continue
file_plain = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_plain.dat")
raw_plain = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_plain.raw")
+ file_correct = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_correct.dat")
file_marker = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_marker.dat")
raw_marker = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_marker.raw")
outfp_plain = open(file_plain,'w')
- rawfp_plain = open(raw_plain,'w')
+ outfp_correct = open(file_correct,'w')
outfp_marker = open(file_marker,'w')
rawfp_marker = open(raw_marker,'w')
if papi:
@@ -389,44 +508,17 @@ if not only_wiki:
file_papi = None
raw_papi = None
counter = 1
+ print "Group %s Test %s" % (group, test,)
for size in test_set[group][test]["variants"]:
if size.startswith("RUNS"): continue
- bench_options = "-t %s -i %s -g 1 -w N:%s:1" % (test, test_set[group][test][size], size,)
+ print "Size "+size+": ",
+ bench_options = "-t %s -w N:%s:%d" % (test, size, nrThreads)
for i in range(0,test_set[group][test]["RUNS"]):
- # Run with plain likwid-bench
- p = subprocess.Popen(bench_plain+" "+bench_options, shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
- try:
- p.wait()
- stdout = p.stdout.read()
- p.stdout.close()
- except:
- sys.exit(1)
- for line in stdout.split("\n"):
- if p.returncode != 0: print line
- match = test_set[group]["REGEX_BENCH"].match(line)
- if match:
- plain_set[group][test][size].append(match.group(1))
- outfp_plain.write(str(counter)+" "+match.group(1)+"\n")
- rawfp_plain.write(line+"\n")
- # Run with papi instrumented likwid-bench
- if papi:
- os.environ["PAPI_BENCH"] = str(group)
- p = subprocess.Popen(bench_papi+" "+bench_options, shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
- try:
- p.wait()
- stdout = p.stdout.read()
- p.stdout.close()
- except:
- sys.exit(1)
- for line in stdout.split("\n"):
- if p.returncode != 0: print line
- match = test_set[group]["REGEX_PAPI"].match(line)
- if match:
- papi_set[group][test][size].append(match.group(1))
- outfp_papi.write(str(counter)+" "+match.group(1)+"\n")
- rawfp_papi.write(line+"\n")
+ print "*",
+ sys.stdout.flush()
# Run with LIKWID instrumented likwid-bench and likwid-perfctr
- p = subprocess.Popen(perfctr_string+" "+bench_marker+" "+bench_options, shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
+ rawfp_marker.write(perfctr_string+" "+bench_marker+" "+bench_options+"\n")
+ p = subprocess.Popen(perfctr_string+" "+bench_marker+" "+bench_options, shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT,executable="/bin/bash")
stdout = ""
try:
p.wait()
@@ -434,100 +526,50 @@ if not only_wiki:
p.stdout.close()
except:
sys.exit(1)
+ found_bench = False
+ found_perfctr = False
for line in stdout.split("\n"):
- if p.returncode != 0: print line
- match = test_set[group]["REGEX_PERF"].match(line)
- if match:
- marker_set[group][test][size].append(float(match.group(1)))
- outfp_marker.write(str(counter)+" "+str(float(match.group(1)))+"\n")
+ if p.returncode == 0:
+ match = test_set[group]["REGEX_PERF"].match(line)
+ if match:
+ marker_set[group][test][size].append(float(match.group(1)))
+ outfp_marker.write(str(counter)+" "+str(float(match.group(1)))+"\n")
+ found_perfctr = True
+ match = test_set[group]["REGEX_BENCH"].match(line)
+ if match:
+ found_bench = True
+ value = float(match.group(1)) * test_set[group][test]["WA_FACTOR"]
+ plain_set[group][test][size].append(match.group(1))
+ corrected_set[group][test][size].append(str(value))
+ outfp_plain.write(str(counter)+" "+match.group(1)+"\n")
+ outfp_correct.write(str(counter)+" "+str(value)+"\n")
rawfp_marker.write(line+"\n")
+ if not found_bench:
+ value = str(test_set[group][test]["WA_FACTOR"])
+ plain_set[group][test][size].append(value)
+ corrected_set[group][test][size].append(value)
+ outfp_plain.write(str(counter)+" "+value+"\n")
+ outfp_correct.write(str(counter)+" "+value+"\n")
+ no_scale = True
+ if not found_perfctr:
+ marker_set[group][test][size].append(0)
+ outfp_marker.write(str(counter)+" "+str(0)+"\n")
counter += 1
+ print("")
outfp_plain.close()
- rawfp_plain.close()
+ outfp_correct.close()
outfp_marker.close()
rawfp_marker.close()
if papi:
outfp_papi.close()
rawfp_papi.close()
- if out_pgf: pgf_file = write_pgf(group, test, file_plain, file_marker, file_papi, script=script)
- if out_gnuplot: plot_file = write_gnuplot(group, test, file_plain, file_marker, file_papi, script=script)
- if out_grace: grace_file = write_grace(group, test, file_plain, file_marker, file_papi, script=script)
+ if no_scale:
+ test_set[group][test]["WA_FACTOR"] = 0.0
+ if out_pgf:
+ pgf_file = write_pgf(group, test, file_plain, file_marker, test_set[group][test]["WA_FACTOR"],file_papi, script=script)
+ if out_gnuplot: plot_file = write_gnuplot(group, test, file_plain,file_marker, test_set[group][test]["WA_FACTOR"], file_papi, script=script)
+ if out_grace: grace_file = write_grace(group, test, file_plain, file_correct, file_marker, file_papi, script=script)
script.close()
os.chmod(scriptfile, stat.S_IRWXU)
-#if only_wiki:
-# for group in test_set.keys():
-# for test in test_set[group].keys():
-# if test.startswith("REGEX"): continue
-# filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_plain.dat")
-# for i,size in enumerate(test_set[group][test]["variants"]):
-# start = i*test_set[group][test]["RUNS"]
-# end = (i+1)*test_set[group][test]["RUNS"]
-# runs = test_set[group][test]["RUNS"]
-# print "Read file %s for size %s from %d to %d" % (filename,size, start, end,)
-# plain_set[group][test][size] = get_values_from_file(filename, start, runs)
-# if len(plain_set[group][test][size]) == 0: plain_set[group][test][size].append(0)
-# filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_marker.dat")
-# for i,size in enumerate(test_set[group][test]["variants"]):
-# start = i*test_set[group][test]["RUNS"]
-# end = (i+1)*test_set[group][test]["RUNS"]
-# runs = test_set[group][test]["RUNS"]
-# print "Read file %s for size %s from %d to %d" % (filename,size, start, end,)
-# marker_set[group][test][size] = get_values_from_file(filename, start, runs)
-# if len(marker_set[group][test][size]) == 0: marker_set[group][test][size].append(0)
-
-
-if wiki or only_wiki:
- name, sockets, corespersocket, threadspercore = get_system_info();
- groups = get_groups()
- testable_groups = get_test_groups(groups)
- #print groups
- #print testable_groups
- #if testable_groups.has_key("FLOPS_DP"): del testable_groups["FLOPS_DP"]
-
- print "#summary Accuracy Tests for %s\n" % (name,)
- print "= Hardware description ="
- print "Sockets: %d<br>" % (sockets,)
- print "Cores per socket: %d<br>" % (corespersocket,)
- print "Threads per core: %d<br>" % (threadspercore,)
- print "Total number of processing units: %d<br>" % (sockets * corespersocket * threadspercore)
- print
- print "= Available groups ="
- print "Each architecture defines a different set of groups. Here all the groups available for the %s are listed:<br>" % (name,)
- for grp in groups.keys():
- print "%s: %s<br>" % (grp, groups[grp],)
- print
- print "= Available verification tests ="
- print "Not all groups can be tested for accuracy. Here only the groups are listed that can be verified. Each group is followed by the low-level benchmarks that are performed for comparison.<br>"
- #print testable_groups
- for grp in testable_groups.keys():
- print "%s: %s<br>" % (grp, ", ".join (testable_groups[grp]))
- print
- print "= Accuracy comparison ="
- print "For each varification group, the tests are performed twice. Once in a plain manner without measuring but calculating the resulting values and once through an instumented code with LIKWID.<br>"
-
-
- for grp in testable_groups.keys():
- print "== Verification of Group %s ==" % (grp,)
- for test in testable_groups[grp]:
- #print grp, test, test_set[grp][test]
- print "=== Verification of Group %s with Test %s ===" % (grp, test,)
- print "|| *Stream size* || *Iterations* ||"
- for variant in test_set[grp][test]["variants"]:
- print "|| %s || %s ||" % (variant, test_set[grp][test][variant], )
- print
- print "Each data size is tested %d times, hence the first %d entries on the x-axis correspond to the %d runs for the first data size of %s and so on.<br>" % (test_set[grp][test]["RUNS"],test_set[grp][test]["RUNS"],test_set[grp][test]["RUNS"],test_set[grp][test]["variants"][0],)
- print "%s/accuracy/%s/%s_%s.png" % (picture_base,hostname, grp, test,)
- print
- file_plain = os.path.join(os.path.join(resultfolder,hostname),grp+"_"+test+"_plain.dat")
- file_marker = os.path.join(os.path.join(resultfolder,hostname),grp+"_"+test+"_marker.dat")
- print "|| Variant || Plain (Min) || LIKWID (Min) || Plain (Max) || LIKWID (Max) || Plain (Avg) || LIKWID (Avg) ||"
- for i, variant in enumerate(test_set[grp][test]["variants"]):
- results_plain = get_values_from_file(file_plain, i*test_set[grp][test]["RUNS"], test_set[grp][test]["RUNS"])
- results_marker = get_values_from_file(file_marker, i*test_set[grp][test]["RUNS"], test_set[grp][test]["RUNS"])
- if results_plain == []: results_plain.append(0)
- if results_marker == []: results_marker.append(0)
- print "|| %s || %d || %d || %d || %d || %d || %d ||" % (variant, min(results_plain), min(results_marker), max(results_plain), max(results_marker), int(statistics.mean(results_plain)), int(statistics.mean(results_marker)),)
- print
- print
diff --git a/test/accuracy/likwid-adjust-test-sizes.py b/test/accuracy/likwid-adjust-test-sizes.py
new file mode 100755
index 0000000..0deb5dd
--- /dev/null
+++ b/test/accuracy/likwid-adjust-test-sizes.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+
+
+import os, sys, os.path, re, subprocess
+
+topology_exec = "../../likwid-topology"
+topology_re_size = re.compile("^Size:\s+(.*)")
+re_size_unit = re.compile("(\d+)\s(\w+)")
+
+cachesizes = []
+
+def get_caches():
+ level = 0
+ p = subprocess.Popen(topology_exec, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+ p.wait()
+ if p.returncode != 0:
+ return level
+ for line in p.stdout.read().split("\n"):
+ if line.startswith("Size:"):
+ string = topology_re_size.match(line).group(1).strip()
+ size, unit = re_size_unit.match(string).groups()
+ if unit == "kB":
+ size = int(size)*1024
+ elif unit == "MB":
+ size = int(size)*1024*1024
+ cachesizes.append(size)
+ level += 1
+ fp = open("/proc/meminfo")
+ f = fp.read().strip().split("\n")
+ fp.close()
+ for line in f:
+ if line.startswith("MemTotal:"):
+ linelist = re.split("\s+", line)
+ size = int(linelist[1])
+ if linelist[2] == "kB":
+ size *= 1024
+ elif linelist[2] == "MB":
+ size *= 1024*1024
+ if size > 1024*1024*1024:
+ size = 1024*1024*1024
+ cachesizes.append(size)
+ return level
+
+def get_important_tests():
+ important = ["L2", "L3", "MEM", "CLOCK", "UOPS"]
+ adjust = []
+ regular = []
+ fp = open("SET.txt")
+ f = fp.read().strip().split("\n")
+ fp.close()
+ for line in f:
+ found = False
+ for imp in important:
+ if imp in line:
+ adjust.append(line)
+ found = True
+ if not found:
+ regular.append(line)
+ return adjust, regular
+
+def adjust_tests(testgroup):
+ fp = open("TESTS/"+testgroup+".txt", "r")
+ f = fp.read().strip().split("\n")
+ fp.close()
+ newdata = []
+ level = re.match("L(\d+)", testgroup)
+ if level:
+ level = int(level.group(1))-1
+ else:
+ level = len(cachesizes)-1
+ min_size = int((cachesizes[level-1] + (0.3*cachesizes[level-1]))/1024)
+ max_size = int((cachesizes[level] - (0.2*cachesizes[level]))/1024)
+ diff = (cachesizes[level] - cachesizes[level-1])/1024
+ step = diff/5
+ i = 0
+ while i < len(f):
+ if not f[i].startswith("VARIANT"):
+ newdata.append(f[i]+"\n")
+ i+=1
+ else:
+ count = 0
+ for j in range(i,i+4):
+ if f[j].startswith("VARIANT"):
+ count += 1
+ else: break
+ i += count
+ newdata.append("VARIANT %dkB 1000\n" % (int(min_size+step),))
+ newdata.append("VARIANT %dkB 1000\n" % (int(min_size+(2*step)),))
+ newdata.append("VARIANT %dkB 1000\n" % (int(min_size+(3*step)),))
+ newdata.append("VARIANT %dkB 1000\n" % (int(min_size+(4*step)),))
+ fp = open("TESTS/"+testgroup+".txt", "w")
+ for line in newdata:
+ fp.write(line)
+ fp.close()
+
+level = get_caches()
+adjust, regular = get_important_tests()
+for testgroup in adjust:
+ print("Adjusting "+testgroup)
+ adjust_tests(testgroup)
+if len(regular) > 0:
+ print("Not adjusting:")
+ print(regular)
+
+
diff --git a/test/accuracy/likwid-tester b/test/accuracy/likwid-tester
deleted file mode 100755
index ea264ae..0000000
--- a/test/accuracy/likwid-tester
+++ /dev/null
@@ -1,220 +0,0 @@
-#!/usr/bin/perl
-use lib '../../perl';
-use strict;
-use warnings;
-use xmgrace;
-use Cwd 'abs_path';
-use Data::Dumper;
-use File::Copy;
-
-my $domain = 'S0';
-
-my $hostname = `hostname`;
-chomp $hostname;
-my %GROUPS;
-my $TEST_ROOT = abs_path('./');
-my $RESULT_DIR = "$TEST_ROOT/RESULTS/$hostname";
-my $LIKWID_ROOT = "$TEST_ROOT/../..";
-my $PERFCTR = "$LIKWID_ROOT/likwid-perfctr";
-my $BENCH_PLAIN = "$LIKWID_ROOT/likwid-bench-plain";
-my $BENCH_MARKER = "$LIKWID_ROOT/likwid-bench-marker";
-
-sub extract_result
-{
- my $type = shift;
- my $REGEX;
- my $REGEX_PLAIN = shift;
- my $REGEX_MARKER = shift;
-
- if ( $type eq 'plain' ) {
- $REGEX = $REGEX_PLAIN;
- }
- elsif ($type eq 'marker') {
- $REGEX = $REGEX_MARKER;
- }
-
- open (INPUT,"<out-$hostname.txt");
- while (<INPUT>) {
- if (/$REGEX/) {
- return $1;
- }
- }
- close INPUT;
-
- return 0;
-}
-
-# determine capabilities of platform
-open (INPUT, "$PERFCTR -a |");
-
-while (<INPUT>) {
- if (/(.+):/) {
- $GROUPS{$1}='true';
- }
-}
-
-close INPUT;
-
-mkdir $RESULT_DIR if (not -d $RESULT_DIR);
-
-
-# collect tests
-chdir ("$TEST_ROOT/TESTS") or die "Cannot change in $TEST_ROOT/TESTS $!\n";
-opendir (DIR, './') or die "Cannot open current directory: $!\n";
-my $TESTS = {};
-my $test_ptr;
-
-while (defined(my $file = readdir(DIR))) {
- if ($file !~ /^\./) {
- print "SCANNING $file\n";
- open (TESTCASE, "<$file");
- $file =~ s/.txt//;
- $TESTS->{$file}->{benchmarks} = [];
-
- while ( <TESTCASE> ) {
-
- if (/REGEX_BENCH[ ](.+)/) {
- $TESTS->{$file}->{REGEX_BENCH} = $1;
- } elsif (/REGEX_PERF[ ](.+)/) {
- $TESTS->{$file}->{REGEX_PERF} = $1;
- } elsif (/TEST\s+(.+)/) {
- push (@{ $TESTS->{$file}->{benchmarks} },
- {name => $1,
- runs => 0,
- variants => []});
-
- $test_ptr = $TESTS->{$file}->{benchmarks}[-1];
-
- } elsif (/RUNS\s+(.+)/) {
- $test_ptr->{runs} = $1;
- } elsif (/VARIANT\s+(.+B)\s+([0-9]+)/) {
- push (@{ $test_ptr->{variants} },{size => $1, iter => $2});
- }
- }
- close TESTCASE;
- }
-}
-
-closedir DIR;
-chdir "$TEST_ROOT";
-
-# Read in Test set
-my %FILTER;
-open FILE,"<SET.txt";
-while ( <FILE> ) {
- if ( not /^#/ ) {
- chomp;
- $FILTER{$_} = 'true';
- }
-}
-close FILE;
-
-#run tests
-foreach my $test ( keys %$TESTS ) {
-
- if ((exists $GROUPS{$test}) and (exists $FILTER{$test})) {
- print "RUNNING $test : ";
-
- foreach my $bench ( @{ $TESTS->{$test}->{benchmarks} } ) {
- my $benchmark = $bench->{name};
- my $runs = $bench->{runs};
- open (DATAFILE1, ">out-$hostname-1.dat");
- open (DATAFILE2, ">out-$hostname-2.dat");
- my $globalrun = 0;
- print "$bench->{name} ";
-
- foreach my $variant ( @{ $bench->{variants} } ) {
- foreach ( 0 ... $runs ) {
- print DATAFILE1 "$globalrun ";
- print DATAFILE2 "$globalrun ";
- #print "$BENCH_PLAIN -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt";
- system ("$BENCH_PLAIN -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt");
- my $result = extract_result('plain',$TESTS->{$test}->{REGEX_BENCH},$TESTS->{$test}->{REGEX_PERF});
- print DATAFILE1 "$result\n";
- #print "$PERFCTR -C E:". $domain .":0 -m -g $test $BENCH_MARKER -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt";
- system ("$PERFCTR -C E:". $domain .":1 -m -g $test $BENCH_MARKER -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt");
- $result = extract_result('marker',$TESTS->{$test}->{REGEX_BENCH},$TESTS->{$test}->{REGEX_PERF});
- print DATAFILE2 "$result\n";
- $globalrun++;
- }
- }
-
- close DATAFILE1;
- close DATAFILE2;
-
-#output results
- if (system('gracebat >/dev/null 2>&1') ) {
- mkdir "$RESULT_DIR/tmp" if (not -d "$RESULT_DIR/tmp");
-
- copy ("$LIKWID_ROOT/test/accuracy/out-$hostname-1.dat", "$RESULT_DIR/tmp/$test-$bench->{name}-plain.dat");
- copy ("$LIKWID_ROOT/test/accuracy/out-$hostname-2.dat", "$RESULT_DIR/tmp/$test-$bench->{name}-marker.dat");
-
- } else {
-
- my $series = [];
-
- push @{$series},
- { "title" => "plain",
- "data file" => "$LIKWID_ROOT/test/accuracy/out-$hostname-1.dat",
- "line" => {
- "type" => "1",
- "color" => "2",
- "linewidth" => "2",
- "linestyle" => "1",
- "pattern" => "1",
- },
- "symbol" => {
- "type" => "2",
- "color" => "2",
- "pattern" => "1",
- "linewidth" => "2",
- "linestyle" => "1",
- "size" => "1",
- "fill pattern" => "1",
- "fill color" => "2",
- }
- };
-
- push @{$series},
- { "title" => "marker",
- "data file" => "$LIKWID_ROOT/test/accuracy/out-$hostname-2.dat",
- "line" => {
- "type" => "1",
- "color" => "4",
- "linewidth" => "2",
- "linestyle" => "1",
- "pattern" => "1",
- },
- "symbol" => {
- "type" => "3",
- "color" => "4",
- "pattern" => "1",
- "linewidth" => "2",
- "linestyle" => "1",
- "size" => "1",
- "fill pattern" => "1",
- "fill color" => "4",
- }
- };
-
- xmgrace ({"title" => "$test",
- "subtitle" => "$bench->{name}",
- "legend" => "0.8,0.7",
- "device" => 'PNG',
- "output file" => "$RESULT_DIR/$test\_".$bench->{name}.".png",
- "grace output file" => "$RESULT_DIR/$test\_".$bench->{name}.".agr",
- "xaxis label" => "run",
- "yaxis label" => "MFlops/s / MBytes/s"
- },
- $series);
- }
- }
- print "\n";
- }
-}
-
-unlink 'out-$hostname.txt';
-unlink 'out-$hostname-1.dat';
-unlink 'out-$hostname-2.dat';
-
-
diff --git a/test/accuracy/likwid-tester-plot b/test/accuracy/likwid-tester-plot
deleted file mode 100755
index ec6af41..0000000
--- a/test/accuracy/likwid-tester-plot
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/usr/bin/perl
-use lib '../../perl';
-use strict;
-use warnings;
-use xmgrace;
-use Cwd 'abs_path';
-
-my $TEST_ROOT = abs_path('./');
-my $machine = $ARGV[0];
-my $RESULT_DIR = "$TEST_ROOT/RESULTS/$machine";
-
-chdir "$TEST_ROOT/RESULTS/$machine/tmp/";
-opendir (DIR, './') or die "Cannot open current directory: $!\n";
-
-while (defined(my $file = readdir(DIR))) {
-
- if ($file =~ /([A-Z0-9_]+)-(.*)-marker\.dat/) {
- my $series = [];
- my $test = $1;
- my $name = $2;
-
- push @{$series},
- { "title" => "plain",
- "data file" => "$TEST_ROOT/RESULTS/$machine/tmp/$test-$name-plain.dat",
- "line" => {
- "type" => "1",
- "color" => "2",
- "linewidth" => "2",
- "linestyle" => "1",
- "pattern" => "1",
- },
- "symbol" => {
- "type" => "2",
- "color" => "2",
- "pattern" => "1",
- "linewidth" => "2",
- "linestyle" => "1",
- "size" => "1",
- "fill pattern" => "1",
- "fill color" => "2",
- }
- };
-
- push @{$series},
- { "title" => "marker",
- "data file" => "$TEST_ROOT/RESULTS/$machine/tmp/$file",
- "line" => {
- "type" => "1",
- "color" => "4",
- "linewidth" => "2",
- "linestyle" => "1",
- "pattern" => "1",
- },
- "symbol" => {
- "type" => "3",
- "color" => "4",
- "pattern" => "1",
- "linewidth" => "2",
- "linestyle" => "1",
- "size" => "1",
- "fill pattern" => "1",
- "fill color" => "4",
- }
- };
-
- xmgrace ({"title" => "$test",
- "subtitle" => "$name",
- "legend" => "0.8,0.7",
- "device" => 'PNG',
- "output file" => "$RESULT_DIR/$test\_".$name.".png",
- "grace output file" => "$RESULT_DIR/$test\_".$name.".agr",
- "xaxis label" => "run",
- "yaxis label" => "MFlops/s / MBytes/s"
- },
- $series);
- }
-}
-
diff --git a/test/executable_tests/Makefile b/test/executable_tests/Makefile
index 08acc2a..11b12b7 100644
--- a/test/executable_tests/Makefile
+++ b/test/executable_tests/Makefile
@@ -1,6 +1,6 @@
-all: topology pin perfctr memsweeper powermeter features bench genCfg setFreq
+all: topology pin markerAPI perfctr memsweeper powermeter bench genTopoCfg setFrequencies
topology:
./tester.sh likwid-topology
@@ -12,11 +12,11 @@ memsweeper:
./tester.sh likwid-memsweeper
powermeter:
./tester.sh likwid-powermeter
-features:
- ./tester.sh likwid-features
bench:
./tester.sh likwid-bench
-genCfg:
- ./tester.sh likwid-genCfg
-setFreq:
- ./tester.sh likwid-setFreq
+genTopoCfg:
+ ./tester.sh likwid-genTopoCfg
+setFrequencies:
+ ./tester.sh likwid-setFrequencies
+markerAPI:
+ make -s -C .. streamGCC
diff --git a/test/executable_tests/README b/test/executable_tests/README
index 99ab560..45fbe9a 100644
--- a/test/executable_tests/README
+++ b/test/executable_tests/README
@@ -6,3 +6,6 @@ For batch testing all executables simply type make
All lines in the <executable>.txt file are executed and the output evaluated.
Only simple checks are made using grep.
+
+For testing likwid-mpirun, Intel MPI must be present on your system. The test only
+exercises the command line options and runs only on the local host.
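As a quick sketch of the format these <executable>.txt files use: tester.sh splits each non-comment line at the '|' characters; the first field holds the command line options passed to the executable, the EXIT field gives the expected return code, and any further GREP (output must contain the pattern) or NGREP (output must not contain it) fields are checked against the captured output. A minimal hypothetical entry, assuming an option -x that demands an argument, could look like:

    -h | EXIT 0 | GREP Help message
    -x | EXIT 1 | GREP Option requires an argument | NGREP Help message

The -x option in the second line is only an illustration; the real option lists are in the likwid-*.txt files added and updated by this patch.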
diff --git a/test/executable_tests/likwid-bench.txt b/test/executable_tests/likwid-bench.txt
index 474b160..72670d9 100644
--- a/test/executable_tests/likwid-bench.txt
+++ b/test/executable_tests/likwid-bench.txt
@@ -3,27 +3,22 @@
-v | EXIT 0 | GREP likwid-bench
-p | EXIT 0 | GREP Domain
-a | EXIT 0 | GREP sum
--i | EXIT 1 | GREP requires an argument
+-i | EXIT 1 | GREP option requires an argument
-i 0 | EXIT 1 | GREP Iterations must be greater than 0
--i 100 | EXIT 1 | GREP Number of Workgroups must be 1 or greater
--l | EXIT 1 | GREP requires an argument
+-i 100 | EXIT 1 | GREP At least one workgroup (-w) must be set on commandline
+-l | EXIT 1 | GREP option requires an argument
-l sum | EXIT 0 | GREP Name: sum
--l XXX | EXIT 0 | GREP Unknown test case XXX
--t | EXIT 1 | GREP requires an argument
--t sum | EXIT 1 | GREP Number of Workgroups must be 1 or greater
--t XXX | EXIT 1 | GREP Number of Workgroups must be 1 or greater
--g | EXIT 1 | GREP requires an argument
--g 0 | EXIT 1 | GREP Number of Workgroups must be 1 or greater
--g 1 | EXIT 1 | GREP workgroups requested but only 0 given on commandline
--g X | EXIT 1 | GREP Number of Workgroups must be 1 or greater
--w | EXIT 1 | GREP requires an argument
--g 1 -w X | EXIT 1 | GREP You need to specify a test case first
--t sum -g 1 -w X | EXIT 1 | GREP Error in parsing workgroup string
--t sum -g 1 -w N:1 | EXIT 1 | GREP Cannot parse string
--t XXX -g 1 -w N:1MB:1 | EXIT 1 | GREP You need to specify a test case first
--g 1 -w N:100kB:1 | EXIT 1 | GREP You need to specify a test case first
--i 100 -t sum -g 1 -w N:100kB:1 | EXIT 0 | GREP Number of Flops
--i 100 -t sum -g 2 -w N:100kB:1 | EXIT 1 | GREP workgroups requested but only 1 given on commandline
--i 100 -t sum -g 2 -w N:100kB:1 -w N:100kB:1 | EXIT 0 | GREP Number of Flops
--i 100 -t sum -g 1 -w N:100kB:2:1 | EXIT 1 | GREP Error in parsing workgroup string
--i 100 -t sum -g 1 -w N:100kB:2:1:2 | EXIT 0 | GREP Number of Flops
+-l XXX | EXIT 1 | GREP Unknown test case XXX
+-t | EXIT 1 | GREP option requires an argument
+-t sum | EXIT 1 | GREP At least one workgroup (-w) must be set on commandline
+-t XXX | EXIT 1 | GREP Unknown test case XXX
+-w | EXIT 1 | GREP option requires an argument
+-w X | EXIT 1 | GREP Unknown test case. Please check likwid-bench -a for available tests
+-t sum -w X | EXIT 1 | GREP Misformated workgroup string
+-t sum -w N:1 | EXIT 1 | GREP Stream size cannot be read
+-t XXX -w N:1MB:1 | EXIT 1 | GREP Unknown test case XXX
+-w N:100kB:1 | EXIT 1 | GREP Unknown test case. Please check likwid-bench -a for available tests
+-t sum -w N:100kB:1 | EXIT 0 | GREP Number of Flops
+-t sum -w N:100kB:1 -w N:100kB:1 | EXIT 0 | GREP Number of Flops
+-t sum -w N:100kB:2:1 | EXIT 1 | GREP Misformated workgroup string
+-t sum -w N:100kB:2:1:2 | EXIT 0 | GREP Number of Flops
diff --git a/test/executable_tests/likwid-features.txt b/test/executable_tests/likwid-features.txt
deleted file mode 100644
index ce95592..0000000
--- a/test/executable_tests/likwid-features.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-| EXIT 0 | GREP Performance monitoring | GREP CPU core id
--h | EXIT 0 | GREP Help message
--v | EXIT 0 | GREP likwid-features
--c | EXIT 1 | GREP option requires an argument
--s | EXIT 1 | GREP option requires an argument
--u | EXIT 1 | GREP option requires an argument
--c 0 | EXIT 0 | GREP Performance monitoring | GREP CPU core id
--s HW_PREFETCHER | EXIT 0 | GREP Performance monitoring | GREP CPU core id
--u HW_PREFETCHER | EXIT 0 | GREP Performance monitoring | GREP CPU core id
diff --git a/test/executable_tests/likwid-genCfg.txt b/test/executable_tests/likwid-genCfg.txt
deleted file mode 100644
index 6369b70..0000000
--- a/test/executable_tests/likwid-genCfg.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-| EXIT 1 | GREP Permission denied
--h | EXIT 0 | GREP Help message
--v | EXIT 0 | GREP likwid-genCfg
--o | EXIT 1 | GREP option requires an argument
--o /tmp/topo.txt | EXIT 0 | GREP CPU name
diff --git a/test/executable_tests/likwid-genTopoCfg.txt b/test/executable_tests/likwid-genTopoCfg.txt
new file mode 100644
index 0000000..1323e7b
--- /dev/null
+++ b/test/executable_tests/likwid-genTopoCfg.txt
@@ -0,0 +1,5 @@
+| EXIT 1 | GREP Cannot open file
+-h | EXIT 0 | GREP Help message
+-v | EXIT 0 | GREP likwid-genTopoCfg
+-o | EXIT 1 | GREP Option requires an argument
+-o /tmp/topo.txt | EXIT 0
diff --git a/test/executable_tests/likwid-memsweeper.txt b/test/executable_tests/likwid-memsweeper.txt
index 6c4cd0e..b90fbe8 100644
--- a/test/executable_tests/likwid-memsweeper.txt
+++ b/test/executable_tests/likwid-memsweeper.txt
@@ -1,8 +1,8 @@
| EXIT 0 | GREP Sweeping domain
-h | EXIT 0 | GREP Help message
-v | EXIT 0 | GREP likwid-memsweeper
--c | EXIT 1 | GREP option requires an argument
--c - | EXIT 1 | GREP Cannot parse string
--c -1 | EXIT 0 | GREP Sweeping domain
+-c | EXIT 1 | GREP Option requires an argument
+-c - | EXIT 1 | GREP Cannot parse node string
+-c -1 | EXIT 1 | GREP Cannot parse node string
-c 0 | EXIT 0 | GREP Sweeping domain
--c 10 | EXIT 1 | GREP ERROR | GREP numa
+-c 10 | EXIT 1 | GREP Cannot parse node string
diff --git a/test/executable_tests/likwid-mpirun.txt b/test/executable_tests/likwid-mpirun.txt
new file mode 100644
index 0000000..6287100
--- /dev/null
+++ b/test/executable_tests/likwid-mpirun.txt
@@ -0,0 +1,39 @@
+-h | EXIT 0 | GREP Help message
+-v | EXIT 0 | GREP likwid-mpirun
+-d | EXIT 1 | GREP No option
+-np | EXIT 1 | GREP Option requires an argument
+-nperdomain | EXIT 1 | GREP Option requires an argument
+-pin | EXIT 1 | GREP Option requires an argument
+-s | EXIT 1 | GREP Option requires an argument
+-mpi | EXIT 1 | GREP Option requires an argument
+-omp | EXIT 1 | GREP Option requires an argument
+-hostfile | EXIT 1 | GREP Option requires an argument
+-g | EXIT 1 | GREP Option requires an argument
+-m | EXIT 1 | GREP No option
+-O | EXIT 1 | GREP No option
+-f | EXIT 1 | GREP No option
+-np 1 | EXIT 1 | GREP No executable given on commandline
+-nperdomain N:1 | EXIT 1 | GREP No executable given on commandline
+-pin N:1 | EXIT 1 | GREP No executable given on commandline
+-mpi asd | EXIT 1 | GREP No option
+-omp asd | EXIT 1 | GREP No option
+-hostfile asd | EXIT 1 | GREP No option
+-g asd | EXIT 1 | GREP No option
+-np 1 cat /proc/version | EXIT 0 | GREP Linux
+-nperdomain N:1 cat /proc/version | EXIT 0 | GREP Linux
+-pin N:1 cat /proc/version | EXIT 0 | GREP Linux
+-np 1 -d cat /proc/version | EXIT 0 | GREP Linux
+-nperdomain N:1 -d cat /proc/version | EXIT 0 | GREP Linux
+-pin N:1 -d cat /proc/version | EXIT 0 | GREP Linux
+-np 1 -mpi intelmpi cat /proc/version | EXIT 0 | GREP Linux
+-np 1 -mpi openmpi cat /proc/version | EXIT 1 | GREP Cannot find executable
+-np 1 -mpi mvapich2 cat /proc/version | EXIT 1 | GREP Cannot find executable
+-np 1 -g ASD -f cat /proc/version | EXIT 1 | GREP Empty event list
+-np 1 -g CLOCK -f cat /proc/version | EXIT 0 | GREP Linux | GREP CPI
+-np 1 -nperdomain N:1 cat /proc/version | EXIT 0 | GREP Linux
+-np 1 -pin N:1 cat /proc/version | EXIT 0 | GREP Linux
+-np 1 -pin N:1 -s 0x1 cat /proc/version | EXIT 0 | GREP Linux
+-np 1 -hostfile 123456789 cat /proc/version | EXIT 1 | GREP Cannot open hostfile 123456789
+-np 1 -g CLOCK -f -O cat /proc/version | EXIT 0 | GREP Linux | GREP CPI,
+-np 1 -g CLOCK -f -O -m ../streamGCC | EXIT 0 | GREP Region: triad | GREP CPI,
+-np 1 -f -O -m ../streamGCC | EXIT 1 | GREP You selected the MarkerAPI feature
diff --git a/test/executable_tests/likwid-perfctr.txt b/test/executable_tests/likwid-perfctr.txt
index 80ac60d..e3a7fa9 100644
--- a/test/executable_tests/likwid-perfctr.txt
+++ b/test/executable_tests/likwid-perfctr.txt
@@ -2,37 +2,44 @@
-h | EXIT 0 | GREP Help message
-v | EXIT 0 | GREP likwid-perfctr
-i | EXIT 0 | GREP CPU family
--V -c 0 hostname | EXIT 0 | GREP NOTICE
--V | EXIT 1 | GREP You must specify at least one processor
--g | EXIT 1 | GREP option requires an argument
+-V 1 -c 0 hostname | EXIT 0 | GREP Option(s) -g <string> must be given on commandline
+-V 1 | EXIT 1 | GREP Option -c <list> or -C <list> must be given on commandline
+-V | EXIT 1 | GREP Option requires an argument
+-g | EXIT 1 | GREP Option requires an argument
-g BRANCH -H | EXIT 0 | GREP Group BRANCH:
--a | EXIT 0 | GREP Available groups
--V -e | EXIT 0 | GREP This architecture
--t 200ms | EXIT 1 | GREP You must specify at least one processor
--c | EXIT 1 | GREP option requires an argument
--c 0 | EXIT 1 | GREP You have to specify a program to measure as argument
--t 200ms -c 0 | EXIT 1 | GREP Executable must be given on commandline
--S | EXIT 1 | GREP option requires an argument
--o | EXIT 1 | GREP option requires an argument
--o /tmp/test | EXIT 1 | GREP Outputfile has no filetype suffix
--o /tmp/test.txt | EXIT 1 | GREP You must specify at least one processor
--S 1 | EXIT 1 | GREP You must specify at least one processor
--S 1 -c 0 | EXIT 1 | GREP You have to specify a group or event set to measure using the -g option.
--S 1 -C 0 | EXIT 1 | GREP You have to specify a group or event set to measure using the -g option.
--S 1 -c 0 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP Branch
--S 1 -C 0 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP Branch
--S 1 -c 0,1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -c 0-1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -c 0,1-1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -C 0,1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -C 0-1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -C 0,1-1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -c E:N:2 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--S 1 -c E:N:2:1:2 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--S 1 -c M:scatter -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--S 1 -C E:N:2 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--S 1 -C E:N:2:1:2 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--S 1 -C M:scatter -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--c 0 -g BRANCH hostname | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--C 0 -g BRANCH hostname | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--C 0 -g BRANCH -m hostname | EXIT 1 | GREP The marker result file could not be found
+-a | EXIT 0 | GREP Group Name | GREP Description
+-e | EXIT 0 | GREP This architecture
+-t 200ms | EXIT 1 | GREP Option -c <list> or -C <list> must be given on commandline
+-c | EXIT 1 | GREP Option requires an argument
+-c 0 | EXIT 1 | GREP Option(s) -g <string> must be given on commandline
+-t 200ms -c 0 | EXIT 1 | GREP Option(s) -g <string> must be given on commandline
+-S | EXIT 1 | GREP Option requires an argument
+-o | EXIT 1 | GREP Option requires an argument
+-o /tmp/test | EXIT 1 | GREP Option -c <list> or -C <list> must be given on commandline
+-c 0 -o /tmp/test | EXIT 1 | GREP Option(s) -g <string> must be given on commandline
+-C 0 -g BRANCH -o /tmp/test | EXIT 1 | GREP No Executable can be found on commandline
+-C 0 -g BRANCH -o /tmp/test hostname | EXIT 0
+-o /tmp/test.txt | EXIT 1 | GREP Option -c <list> or -C <list> must be given on commandline
+-C 0 -g BRANCH -o /tmp/test.txt hostname | EXIT 0 | NGREP Cannot find filter script, save output in CSV format
+-S 1s | EXIT 1 | GREP Option -c <list> or -C <list> must be given on commandline
+-S 1s -c 0 | EXIT 1 | GREP Option(s) -g <string> must be given on commandline
+-S 1s -C 0 | EXIT 1 | GREP Option(s) -g <string> must be given on commandline
+-S 1s -c 0 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP Branch
+-S 1s -C 0 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP Branch
+-S 1s -c 0,1 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1s -c 0-1 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1s -c 0,1-1 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1s -C 0,1 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1s -C 0-1 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1s -C 0,1-1 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1s -c E:N:2 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP Branch
+-S 1s -c E:N:2:1:2 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP Branch
+-S 1s -c M:scatter -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP Branch
+-S 1s -C E:N:2 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP Branch
+-S 1s -C E:N:2:1:2 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP Branch
+-S 1s -C M:scatter -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP Branch
+-c 0 -g BRANCH -f hostname | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP Branch
+-C 0 -g BRANCH -f hostname | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP Branch
+-C 0 -g BRANCH -f -m hostname | EXIT 1 | GREP No regions
+-C 0 -g BRANCH -f -t 200ms hostname | EXIT 0 | GREP CORES: 0
+-C 0 -g BRANCH -f -m ../streamGCC | EXIT 0 | GREP Region triad | GREP Region copy
diff --git a/test/executable_tests/likwid-pin.txt b/test/executable_tests/likwid-pin.txt
index 801f79c..64d2d96 100644
--- a/test/executable_tests/likwid-pin.txt
+++ b/test/executable_tests/likwid-pin.txt
@@ -4,23 +4,23 @@
-i hostname | EXIT 0 | GREP Set mem_policy to interleaved
-S | EXIT 1 |GREP Executable must be given on commandline
-S hostname | EXIT 0 | GREP Sweeping memory
--c | EXIT 1 |GREP option requires an argument
--p | EXIT 0 | GREP Domain | GREP Tag
+-c | EXIT 1 |GREP Option requires an argument
+-p | EXIT 0 | GREP Domain
-c 0 | EXIT 1 | GREP Executable must be given on commandline
-c 0 -p | EXIT 0 | GREP 0
-c N:0 -p | EXIT 0 | GREP 0
-c S0:0-1 -p | EXIT 0 | GREP 0,1
-c N:0 at N:1 -p | EXIT 0 | GREP 0,1
-c N:0 at N:1 at N:2 -p | EXIT 0 | GREP 0,1,2
--c C0:1-0 -p | EXIT 1 | GREP Range End
+-c C0:1-0 -p | EXIT 0 | GREP 1,0
-c E:N:1 -p | EXIT 0 | GREP 0
-c E:N:2 -p | EXIT 0 | LISTLEN , 2
-c E:N:2:1:2 -p | EXIT 0 | LISTLEN , 2
-c E:N:2:1:2 -d . -p | EXIT 0 | LISTLEN . 2
-c M:scatter -p | EXIT 0
--s | EXIT 1 | GREP option requires an argument
+-s | EXIT 1 | GREP Option requires an argument
-s 0x1 | EXIT 1 | GREP Executable must be given on commandline
--s 0x1 hostname | EXIT 0 | GREP Main PID
+-s 0x1 hostname | EXIT 0
-q | EXIT 1 | GREP Executable must be given on commandline
--q hostname | EXIT 1 | NGREP Main PID
+-q hostname | EXIT 0
diff --git a/test/executable_tests/likwid-powermeter.txt b/test/executable_tests/likwid-powermeter.txt
index f733b06..b09412e 100644
--- a/test/executable_tests/likwid-powermeter.txt
+++ b/test/executable_tests/likwid-powermeter.txt
@@ -1,14 +1,20 @@
-| EXIT 0 | GREP Help message
+| EXIT 0 | GREP Runtime: 2
-h | EXIT 0 | GREP Help message
-v | EXIT 0 | GREP likwid-powermeter
-i | EXIT 0 | GREP Base clock | GREP Power
--c | EXIT 1 | GREP option requires an argument | GREP Help message
--s | EXIT 1 | GREP option requires an argument | GREP Help message
--M | EXIT 1 | GREP option requires an argument | GREP Help message
--s 1 | EXIT 0 | GREP consumed
--c 0 | EXIT 1 | GREP Commandline option -c requires an executable if not used in combination with -s
--p | EXIT 1 | GREP Commandline option -p requires an executable
--c 0 -s 1 | EXIT 0 | GREP consumed | GREP Socket 0
--p hostname | EXIT 0 | Measuring group CLOCK
--c 0 hostname | EXIT 0 | GREP consumed | GREP Socket 0
--M 1 | EXIT 1 | GREP Either -s <seconds> or executable must be given on commandline
+-c | EXIT 1 | GREP Option requires an argument
+-s | EXIT 1 | GREP Option requires an argument
+-M | EXIT 1 | GREP Option requires an argument
+-V | EXIT 1 | GREP Option requires an argument
+-V 1 | EXIT 0 | GREP Base clock | GREP Runtime: 2
+-s 1 | EXIT 1 | GREP Cannot parse time
+-s 1s | EXIT 0 | GREP Runtime: 1
+-c 0 | EXIT 0 | GREP Runtime: 2
+-p | EXIT 0 | GREP Group 1: CLOCK
+-c 0 -s 1 | EXIT 0 | GREP Cannot parse time
+-p hostname | EXIT 0 | GREP Group 1: CLOCK
+-c 0 hostname | EXIT 0 | GREP consumed | GREP socket 0
+-M 1 | EXIT 0 | GREP Runtime: 2
+-M 0 | EXIT 1 | GREP Operation not permitted
+-t | EXIT 0 | GREP Current core temperatures: | GREP Socket 0 Core 0:
+-f | EXIT 0 | GREP Current core temperatures: | GREP Socket 0 Core 0:
diff --git a/test/executable_tests/likwid-setFreq.txt b/test/executable_tests/likwid-setFreq.txt
deleted file mode 100644
index 56c495b..0000000
--- a/test/executable_tests/likwid-setFreq.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-| EXIT 1 | GREP Usage
-0 | EXIT 1 | GREP Usage
-0 0 | EXIT 1 | GREP Frequency must be greater than 0
-0 -1 | EXIT 1 | GREP Frequency must be greater than 0
--1 -1 | EXIT 1 | GREP not a valid CPU ID. Range from 0 to
-100 0 | EXIT 1 | GREP not a valid CPU ID. Range from 0 to
diff --git a/test/executable_tests/likwid-setFrequencies.txt b/test/executable_tests/likwid-setFrequencies.txt
new file mode 100644
index 0000000..821f925
--- /dev/null
+++ b/test/executable_tests/likwid-setFrequencies.txt
@@ -0,0 +1,14 @@
+| EXIT 0 | GREP Help message
+-h | EXIT 0 | GREP Help message
+-v | EXIT 0 | GREP likwid-setFrequencies
+-p | EXIT 0 | GREP Current frequencies: | GREP CPU | GREP GHz
+-l | EXIT 0 | GREP Available frequencies:
+-m | EXIT 0 | GREP Available governors:
+-c | EXIT 1 | GREP Option requires an argument
+-c 0 | EXIT 1 | GREP You need to set either a frequency or governor for the selected CPUs on commandline
+-g | EXIT 1 | GREP Option requires an argument
+-f | EXIT 1 | GREP Option requires an argument
+-g performance | EXIT 0
+-f FREQ | EXIT 0
+-c 0 -g conservative | EXIT 0
+-c 0 -f FREQ | EXIT 0
diff --git a/test/executable_tests/likwid-topology.txt b/test/executable_tests/likwid-topology.txt
index 810b1e9..3e6eed5 100644
--- a/test/executable_tests/likwid-topology.txt
+++ b/test/executable_tests/likwid-topology.txt
@@ -1,11 +1,14 @@
--h | EXIT 0 | Help message
+-h | EXIT 0 | GREP Options
-v | EXIT 0 | GREP likwid-topology
-c | EXIT 0 | GREP Cache line size
-C | EXIT 0 | GREP CPU clock
-g | EXIT 0 | GREP +--------
+-V | EXIT 1 | GREP Option requires an argument
+-V 1 | EXIT 0 | GREP Hardware Thread Topology
-g -v | EXIT 0 | GREP likwid-topology
-c -g | EXIT 0 | GREP +-------- | GREP Cache line size
-c -g -C | EXIT 0 | GREP +-------- | GREP Cache line size | GREP CPU clock
--o | EXIT 1
--o /tmp/out | EXIT 1 | GREP filter suffix
+-O | EXIT 0 | GREP STRUCT,Info
+-o | EXIT 1 | GREP Option requires an argument
+-o /tmp/out | EXIT 0
-o /tmp/out.txt | EXIT 0
diff --git a/test/executable_tests/tester.sh b/test/executable_tests/tester.sh
index 71342df..119613f 100755
--- a/test/executable_tests/tester.sh
+++ b/test/executable_tests/tester.sh
@@ -1,13 +1,13 @@
#!/bin/bash
-
if [ $# -ne 1 ]; then
echo "You need to give application to test on commandline"
exit 1
fi
-EXECPATH=../..
+EXECPATH=/usr/local/bin
EXEC=$1
TMPFILE=/tmp/testout
+FREQ="2.3"
f_grep() {
ARG="$1"
@@ -41,15 +41,25 @@ if [ ! -e ${EXEC}.txt ]; then
echo "Cannot find testfile ${EXEC}.txt"
exit 1
fi
+if [ "${EXEC}" == "likwid-setFrequencies" ]; then
+ FREQ=$(likwid-setFrequencies -l | grep -v frequencies | awk '{print $2}')
+ CURFREQ=$(likwid-setFrequencies -p | head -n2 | tail -n 1 | rev | awk '{print $2}' | rev)
+fi
+if [ "${EXEC}" == "likwid-mpirun" ]; then
+ if [ -z "$(which mpiexec)" ] && [ -z "$(which mpiexec.hydra)" ] && [ -z "$(which mpirun)" ]; then
+ echo "Cannot find MPI implementation, neither mpiexec, mpiexec.hydra nor mpirun can be found in any directory in PATH"
+ exit 1
+ fi
+fi
while read -r LINE || [[ -n $LINE ]]; do
if [ -z "${LINE}" ]; then continue; fi
if [[ "${LINE}" =~ \#.* ]]; then continue; fi
-
OPTIONS=$(echo "${LINE}" | cut -d '|' -f 1)
+ OPTIONS=${OPTIONS//'FREQ'/"${FREQ}"}
RESULTS=$(echo "${LINE}" | cut -d '|' -f 2-)
NUM_RESULTS="${RESULTS//[^|]}"
- EXITCODE=$(${EXECPATH}/${EXEC} ${OPTIONS} 1>${TMPFILE} 2>&1 ; echo $?)
+ EXITCODE=$(${EXECPATH}/${EXEC} ${OPTIONS} 1>${TMPFILE} 2>&1 </dev/null; echo $?)
STATE=0
for ((i=1;i<=${#NUM_RESULTS}+1;i++)); do
RESULT=$(echo ${RESULTS} | cut -d '|' -f ${i})
@@ -78,3 +88,9 @@ while read -r LINE || [[ -n $LINE ]]; do
done < ${EXEC}.txt
+if [ "${EXEC}" == "likwid-setFrequencies" ]; then
+ ${EXEC} -f "${CURFREQ}"
+fi
+
+rm -f /tmp/topo.txt /tmp/test /tmp/test.txt /tmp/out.txt /tmp/out
+
diff --git a/test/serial.c b/test/serial.c
new file mode 100644
index 0000000..3debf10
--- /dev/null
+++ b/test/serial.c
@@ -0,0 +1,43 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <likwid.h>
+
+int main(int argc, char* argv[])
+{
+ int i, j;
+ int size;
+ double* vector;
+ if (argc != 2)
+ return 1;
+
+ size = atoi(argv[1]);
+ vector = (double*) malloc(size * sizeof(double));
+ if (!vector)
+ return 2;
+
+ LIKWID_MARKER_INIT;
+
+ LIKWID_MARKER_START("init");
+ for (i=0;i<size;i++)
+ vector[i] = 2.0;
+ LIKWID_MARKER_STOP("init");
+
+
+ LIKWID_MARKER_START("pow");
+ for (j=0;j<10;j++)
+ {
+ for (i=0;i<size;i++)
+ vector[i] = vector[i] * vector[i];
+ }
+ LIKWID_MARKER_STOP("pow");
+
+ LIKWID_MARKER_CLOSE;
+
+ free(vector);
+ return 0;
+
+
+
+}
diff --git a/test/stream.c b/test/stream.c
deleted file mode 100644
index 0214747..0000000
--- a/test/stream.c
+++ /dev/null
@@ -1,423 +0,0 @@
-/*-----------------------------------------------------------------------*/
-/* Program: Stream */
-/* Revision: $Id: stream.c,v 5.8 2007/02/19 23:57:39 mccalpin Exp mccalpin $ */
-/* Original code developed by John D. McCalpin */
-/* Programmers: John D. McCalpin */
-/* Joe R. Zagar */
-/* */
-/* This program measures memory transfer rates in MB/s for simple */
-/* computational kernels coded in C. */
-/*-----------------------------------------------------------------------*/
-/* Copyright 1991-2005: John D. McCalpin */
-/*-----------------------------------------------------------------------*/
-/* License: */
-/* 1. You are free to use this program and/or to redistribute */
-/* this program. */
-/* 2. You are free to modify this program for your own use, */
-/* including commercial use, subject to the publication */
-/* restrictions in item 3. */
-/* 3. You are free to publish results obtained from running this */
-/* program, or from works that you derive from this program, */
-/* with the following limitations: */
-/* 3a. In order to be referred to as "STREAM benchmark results", */
-/* published results must be in conformance to the STREAM */
-/* Run Rules, (briefly reviewed below) published at */
-/* http://www.cs.virginia.edu/stream/ref.html */
-/* and incorporated herein by reference. */
-/* As the copyright holder, John McCalpin retains the */
-/* right to determine conformity with the Run Rules. */
-/* 3b. Results based on modified source code or on runs not in */
-/* accordance with the STREAM Run Rules must be clearly */
-/* labelled whenever they are published. Examples of */
-/* proper labelling include: */
-/* "tuned STREAM benchmark results" */
-/* "based on a variant of the STREAM benchmark code" */
-/* Other comparable, clear and reasonable labelling is */
-/* acceptable. */
-/* 3c. Submission of results to the STREAM benchmark web site */
-/* is encouraged, but not required. */
-/* 4. Use of this program or creation of derived works based on this */
-/* program constitutes acceptance of these licensing restrictions. */
-/* 5. Absolutely no warranty is expressed or implied. */
-/*-----------------------------------------------------------------------*/
-#define _GNU_SOURCE
-#include <stdlib.h>
-# include <stdio.h>
-# include <math.h>
-# include <float.h>
-# include <omp.h>
-# include <limits.h>
-# include <sys/time.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <unistd.h>
-#include <sched.h>
-#include <time.h>
-#include <pthread.h>
-
-/* INSTRUCTIONS:
- *
- * 1) Stream requires a good bit of memory to run. Adjust the
- * value of 'N' (below) to give a 'timing calibration' of
- * at least 20 clock-ticks. This will provide rate estimates
- * that should be good to about 5% precision.
- */
-
-# define N 60000000
-# define NTIMES 10
-# define OFFSET 0
-
-/*
- * 3) Compile the code with full optimization. Many compilers
- * generate unreasonably bad code before the optimizer tightens
- * things up. If the results are unreasonably good, on the
- * other hand, the optimizer might be too smart for me!
- *
- * Try compiling with:
- * cc -O stream_omp.c -o stream_omp
- *
- * This is known to work on Cray, SGI, IBM, and Sun machines.
- *
- *
- * 4) Mail the results to mccalpin at cs.virginia.edu
- * Be sure to include:
- * a) computer hardware model number and software revision
- * b) the compiler flags
- * c) all of the output from the test case.
- * Thanks!
- *
- */
-#define gettid() syscall(SYS_gettid)
-#include <likwid.h>
-
-# define HLINE "-------------------------------------------------------------\n"
-
-# ifndef MIN
-# define MIN(x,y) ((x)<(y)?(x):(y))
-# endif
-# ifndef MAX
-# define MAX(x,y) ((x)>(y)?(x):(y))
-# endif
-
-static double a[N+OFFSET],
- b[N+OFFSET],
- c[N+OFFSET];
-
-static double avgtime[4] = {0}, maxtime[4] = {0},
- mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
-
-static char *label[4] = {"Copy: ", "Scale: ",
- "Add: ", "Triad: "};
-
-static double bytes[4] = {
- 2 * sizeof(double) * N,
- 2 * sizeof(double) * N,
- 3 * sizeof(double) * N,
- 3 * sizeof(double) * N
- };
-
-static int
-getProcessorID(cpu_set_t* cpu_set)
-{
- int processorId;
-
- for (processorId=0;processorId<128;processorId++)
- {
- if (CPU_ISSET(processorId,cpu_set))
- {
- break;
- }
- }
- return processorId;
-}
-
-int threadGetProcessorId()
-{
- cpu_set_t cpu_set;
- CPU_ZERO(&cpu_set);
- sched_getaffinity(gettid(),sizeof(cpu_set_t), &cpu_set);
-
- return getProcessorID(&cpu_set);
-}
-
-extern double mysecond();
-extern void checkSTREAMresults();
-#ifdef _OPENMP
-extern int omp_get_num_threads();
-#endif
-int
-main()
- {
- int quantum, checktick();
- int BytesPerWord;
- register int j, k;
- double scalar, t, times[4][NTIMES];
-
- /* --- SETUP --- determine precision and check timing --- */
-
- printf(HLINE);
- printf("STREAM version $Revision: 5.8 $\n");
- printf(HLINE);
- BytesPerWord = sizeof(double);
- printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
- BytesPerWord);
-
- printf(HLINE);
- printf("Array size = %d, Offset = %d\n" , N, OFFSET);
- printf("Total memory required = %.1f MB.\n",
- (3.0 * BytesPerWord) * ( (double) N / 1048576.0));
- printf("Each test is run %d times, but only\n", NTIMES);
- printf("the *best* time for each is used.\n");
-
-#ifdef LIKWID_PERFMON
- printf("Using likwid\n");
-#endif
-
- LIKWID_MARKER_INIT;
-
-#ifdef _OPENMP
- printf(HLINE);
-#pragma omp parallel
- {
- LIKWID_MARKER_THREADINIT;
-#pragma omp master
- {
- k = omp_get_num_threads();
- printf ("Number of Threads requested = %i\n",k);
- }
-
- printf ("Thread %d running on processor %d ....\n",omp_get_thread_num(),threadGetProcessorId());
- }
-#endif
-
- LIKWID_MARKER_START("init");
- /* Get initial value for system clock. */
-//#pragma omp parallel for
- for (j=0; j<N; j++) {
- a[j] = 1.0;
- b[j] = 2.0;
- c[j] = 0.0;
- }
- LIKWID_MARKER_STOP("init");
-
- printf(HLINE);
-
- if ( (quantum = checktick()) >= 1)
- printf("Your clock granularity/precision appears to be "
- "%d microseconds.\n", quantum);
- else {
- printf("Your clock granularity appears to be "
- "less than one microsecond.\n");
- quantum = 1;
- }
-
- t = mysecond();
-#pragma omp parallel for
- for (j = 0; j < N; j++)
- a[j] = 2.0E0 * a[j];
- t = 1.0E6 * (mysecond() - t);
-
- printf("Each test below will take on the order"
- " of %d microseconds.\n", (int) t );
- printf(" (= %d clock ticks)\n", (int) (t/quantum) );
- printf("Increase the size of the arrays if this shows that\n");
- printf("you are not getting at least 20 clock ticks per test.\n");
-
- printf(HLINE);
-
- printf("WARNING -- The above is only a rough guideline.\n");
- printf("For best results, please be sure you know the\n");
- printf("precision of your system timer.\n");
- printf(HLINE);
-
- /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
-
- scalar = 3.0;
- for (k=0; k<NTIMES; k++)
- {
- times[0][k] = mysecond();
-#pragma omp parallel
- {
- LIKWID_MARKER_START("copy");
-#pragma omp for
- for (j=0; j<N; j++)
- c[j] = a[j];
- LIKWID_MARKER_STOP("copy");
- }
- times[0][k] = mysecond() - times[0][k];
-
- times[1][k] = mysecond();
-#pragma omp parallel
- {
- LIKWID_MARKER_START("scale");
-#pragma omp for
- for (j=0; j<N; j++)
- b[j] = scalar*c[j];
- LIKWID_MARKER_STOP("scale");
- }
- times[1][k] = mysecond() - times[1][k];
-
- times[2][k] = mysecond();
-#pragma omp parallel
- {
- LIKWID_MARKER_START("add");
-#pragma omp for
- for (j=0; j<N; j++)
- c[j] = a[j]+b[j];
- LIKWID_MARKER_STOP("add");
- }
- times[2][k] = mysecond() - times[2][k];
-
- times[3][k] = mysecond();
-#pragma omp parallel
- {
- LIKWID_MARKER_START("triad");
-#pragma omp for
- for (j=0; j<N; j++)
- a[j] = b[j]+scalar*c[j];
- LIKWID_MARKER_STOP("triad");
- }
- times[3][k] = mysecond() - times[3][k];
- }
-
- /* --- SUMMARY --- */
-
- for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
- {
- for (j=0; j<4; j++)
- {
- avgtime[j] = avgtime[j] + times[j][k];
- mintime[j] = MIN(mintime[j], times[j][k]);
- maxtime[j] = MAX(maxtime[j], times[j][k]);
- }
- }
-
- printf("Function Rate (MB/s) Avg time Min time Max time\n");
- for (j=0; j<4; j++) {
- avgtime[j] = avgtime[j]/(double)(NTIMES-1);
-
- printf("%s%11.4f %11.4f %11.4f %11.4f\n", label[j],
- 1.0E-06 * bytes[j]/mintime[j],
- avgtime[j],
- mintime[j],
- maxtime[j]);
- }
- printf(HLINE);
-
- /* --- Check Results --- */
- checkSTREAMresults();
- printf(HLINE);
-
- LIKWID_MARKER_CLOSE;
- return 0;
-}
-
-# define M 20
-
-int
-checktick()
- {
- int i, minDelta, Delta;
- double t1, t2, timesfound[M];
-
-/* Collect a sequence of M unique time values from the system. */
-
- for (i = 0; i < M; i++) {
- t1 = mysecond();
- while( ((t2=mysecond()) - t1) < 1.0E-6 )
- ;
- timesfound[i] = t1 = t2;
- }
-
-/*
- * Determine the minimum difference between these M values.
- * This result will be our estimate (in microseconds) for the
- * clock granularity.
- */
-
- minDelta = 1000000;
- for (i = 1; i < M; i++) {
- Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
- minDelta = MIN(minDelta, MAX(Delta,0));
- }
-
- return(minDelta);
- }
-
-
-
-/* A gettimeofday routine to give access to the wall
- clock timer on most UNIX-like systems. */
-
-#include <sys/time.h>
-
-double mysecond()
-{
- struct timeval tp;
- struct timezone tzp;
- int i;
-
- i = gettimeofday(&tp,&tzp);
- return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
-}
-
-void checkSTREAMresults ()
-{
- double aj,bj,cj,scalar;
- double asum,bsum,csum;
- double epsilon;
- int j,k;
-
- /* reproduce initialization */
- aj = 1.0;
- bj = 2.0;
- cj = 0.0;
- /* a[] is modified during timing check */
- aj = 2.0E0 * aj;
- /* now execute timing loop */
- scalar = 3.0;
- for (k=0; k<NTIMES; k++)
- {
- cj = aj;
- bj = scalar*cj;
- cj = aj+bj;
- aj = bj+scalar*cj;
- }
- aj = aj * (double) (N);
- bj = bj * (double) (N);
- cj = cj * (double) (N);
-
- asum = 0.0;
- bsum = 0.0;
- csum = 0.0;
- for (j=0; j<N; j++) {
- asum += a[j];
- bsum += b[j];
- csum += c[j];
- }
-
-#ifndef abs
-#define abs(a) ((a) >= 0 ? (a) : -(a))
-#endif
- epsilon = 1.e-8;
-
- if (abs(aj-asum)/asum > epsilon) {
- printf ("Failed Validation on array a[]\n");
- printf (" Expected : %f \n",aj);
- printf (" Observed : %f \n",asum);
- }
- else if (abs(bj-bsum)/bsum > epsilon) {
- printf ("Failed Validation on array b[]\n");
- printf (" Expected : %f \n",bj);
- printf (" Observed : %f \n",bsum);
- }
- else if (abs(cj-csum)/csum > epsilon) {
- printf ("Failed Validation on array c[]\n");
- printf (" Expected : %f \n",cj);
- printf (" Observed : %f \n",csum);
- }
- else {
- printf ("Solution Validates\n");
- }
-}
-
diff --git a/test/test-likwidAPI.c b/test/test-likwidAPI.c
new file mode 100644
index 0000000..7a2001f
--- /dev/null
+++ b/test/test-likwidAPI.c
@@ -0,0 +1,2099 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <math.h>
+
+#include <likwid.h>
+//#include <configuration.h>
+//#include <access.h>
+//#include <types.h>
+//#include <perfmon.h>
+
+typedef struct {
+ char* testname;
+ int(*testfunc)(void);
+ int result;
+} test;
+
+static int verbose = 0;
+
+static char eventset_ok[] = "INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,CPU_CLK_UNHALTED_REF:FIXC2";
+static char event1_ok[] = "INSTR_RETIRED_ANY";
+static char event2_ok[] = "CPU_CLK_UNHALTED_CORE";
+static char event3_ok[] = "CPU_CLK_UNHALTED_REF";
+static char ctr1_ok[] = "FIXC0";
+static char ctr2_ok[] = "FIXC1";
+static char ctr3_ok[] = "FIXC2";
+static char eventset_option[] = "INSTR_RETIRED_ANY:FIXC0:ANYTHREAD,CPU_CLK_UNHALTED_CORE:FIXC1:ANYTHREAD,CPU_CLK_UNHALTED_REF:FIXC2:ANYTHREAD";
+static int isIntel = 0;
+static char perfgroup_ok[] = "BRANCH";
+static char perfgroup_fail[] = "BRAN";
+
+
+
+
+
+int test_initconfig()
+{
+ int ret;
+ ret = init_configuration();
+ if (ret != 0)
+ goto fail;
+ Configuration_t config = get_configuration();
+ if (config == NULL)
+ goto fail;
+ if ((config->daemonMode != ACCESSMODE_DIRECT) && (config->daemonMode != ACCESSMODE_DAEMON))
+ goto fail;
+ if ((config->daemonMode == ACCESSMODE_DAEMON) && (config->daemonPath == NULL))
+ goto fail;
+ destroy_configuration();
+ return 1;
+fail:
+ destroy_configuration();
+ return 0;
+}
+
+int enable_configuration()
+{
+ init_configuration();
+ return 1;
+}
+
+int disable_configuration()
+{
+ destroy_configuration();
+ return 1;
+}
+
+int test_hpmmode()
+{
+ Configuration_t config;
+ config = get_configuration();
+ int def = config->daemonMode;
+ HPMmode(ACCESSMODE_DIRECT);
+ if (config->daemonMode != ACCESSMODE_DIRECT)
+ goto fail;
+ HPMmode(ACCESSMODE_DAEMON);
+ if (config->daemonMode != ACCESSMODE_DAEMON)
+ goto fail;
+ HPMmode(def);
+ HPMmode(ACCESSMODE_DAEMON+1);
+ if (config->daemonMode != def)
+ goto fail;
+ return 1;
+fail:
+ return 0;
+}
+
+int test_hpminit()
+{
+ int ret = HPMinit();
+ if (ret != 0)
+ return 0;
+ HPMfinalize();
+ return 1;
+}
+
+int test_hpmaddthread()
+{
+ HPMinit();
+ int ret = HPMaddThread(0);
+ if (ret != 0)
+ return 0;
+ HPMfinalize();
+ return 1;
+}
+
+int enable_hpm()
+{
+ HPMinit();
+ HPMaddThread(0);
+ return 1;
+}
+
+int disable_hpm()
+{
+ HPMfinalize();
+ return 1;
+}
+
+int test_topologyinit()
+{
+ int i, j;
+ int ret = topology_init();
+ if (ret != 0)
+ goto fail;
+ CpuInfo_t cpuinfo = get_cpuInfo();
+ if (cpuinfo == NULL)
+ goto fail;
+ if (cpuinfo->family == 0)
+ goto fail;
+ if (cpuinfo->model == 0)
+ goto fail;
+ if (cpuinfo->osname == NULL)
+ goto fail;
+ if (cpuinfo->name == NULL)
+ goto fail;
+ if (cpuinfo->features == NULL)
+ goto fail;
+ CpuTopology_t cputopo = get_cpuTopology();
+ if (cputopo->threadPool == NULL)
+ goto fail;
+ if (cputopo->cacheLevels == NULL)
+ goto fail;
+ if (cputopo->numHWThreads == 0)
+ goto fail;
+ if (cputopo->activeHWThreads == 0)
+ goto fail;
+ if (cputopo->numSockets == 0)
+ goto fail;
+ if (cputopo->numCoresPerSocket < 1)
+ goto fail;
+ if (cputopo->numThreadsPerCore < 1)
+ goto fail;
+ if (cputopo->numHWThreads > 0)
+ {
+ for (i = 0; i < cputopo->numHWThreads; i++)
+ {
+ for (j=0;j< cputopo->numHWThreads; j++)
+ {
+ if ((i != j) && (cputopo->threadPool[i].apicId == cputopo->threadPool[j].apicId))
+ goto fail;
+ }
+ if (cputopo->threadPool[i].threadId >= cputopo->numThreadsPerCore)
+ {
+ goto fail;
+ }
+ if (cputopo->threadPool[i].packageId >= cputopo->numSockets)
+ {
+ goto fail;
+ }
+ }
+ }
+ if (cputopo->numCacheLevels > 0)
+ {
+ for (i=0;i<cputopo->numCacheLevels;i++)
+ {
+ if (cputopo->cacheLevels[i].level > cputopo->numCacheLevels)
+ {
+ goto fail;
+ }
+
+ }
+ }
+ isIntel = cpuinfo->isIntel;
+ topology_finalize();
+ return 1;
+fail:
+ topology_finalize();
+ return 0;
+}
+
+int enable_topology()
+{
+ topology_init();
+ return 1;
+}
+
+int disable_topology()
+{
+ topology_finalize();
+ return 1;
+}
+
+int test_numainit()
+{
+ int i = 0;
+ topology_init();
+ numa_init();
+ NumaTopology_t numainfo = get_numaTopology();
+ if (numainfo == NULL)
+ goto fail;
+ if (numainfo->numberOfNodes <= 0)
+ goto fail;
+ if (likwid_getNumberOfNodes() <= 0)
+ goto fail;
+ for (i = 0; i < likwid_getNumberOfNodes(); i++)
+ {
+ if (numainfo->nodes[i].totalMemory == 0)
+ goto fail;
+ if (numainfo->nodes[i].freeMemory == 0)
+ goto fail;
+ if (numainfo->nodes[i].numberOfProcessors == 0)
+ goto fail;
+ if (numainfo->nodes[i].numberOfDistances == 0)
+ goto fail;
+ if (numainfo->nodes[i].numberOfDistances != likwid_getNumberOfNodes())
+ goto fail;
+ }
+ numa_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ numa_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_affinityinit()
+{
+ int i = 0;
+ topology_init();
+ CpuTopology_t cputopo = get_cpuTopology();
+ numa_init();
+ affinity_init();
+ AffinityDomains_t doms = get_affinityDomains();
+ if (doms == NULL)
+ goto fail;
+ if (doms->numberOfSocketDomains != cputopo->numSockets)
+ goto fail;
+ if (doms->numberOfNumaDomains == 0)
+ goto fail;
+ if (doms->numberOfProcessorsPerSocket == 0)
+ goto fail;
+ if (doms->numberOfAffinityDomains == 0)
+ goto fail;
+ if (doms->numberOfCacheDomains == 0)
+ goto fail;
+ if (doms->numberOfCoresPerCache == 0)
+ goto fail;
+ if (doms->numberOfProcessorsPerCache == 0)
+ goto fail;
+ if (doms->numberOfProcessorsPerCache < doms->numberOfCoresPerCache)
+ goto fail;
+ if (doms->domains == NULL)
+ goto fail;
+ for (i = 0; i < doms->numberOfAffinityDomains; i++)
+ {
+ if (doms->domains[i].numberOfProcessors == 0)
+ goto fail;
+ if (doms->domains[i].numberOfCores == 0)
+ goto fail;
+ if (doms->domains[i].numberOfProcessors < doms->domains[i].numberOfCores)
+ goto fail;
+ if (doms->domains[i].processorList == NULL)
+ goto fail;
+ }
+ affinity_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ affinity_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_cpustring_logical()
+{
+ int test[5];
+ int len = 5;
+ int ret = cpustr_to_cpulist("S0:0-3", test, len);
+ if (ret < 0)
+ {
+ if (verbose) printf("Returned %d\n", ret);
+ return 0;
+ }
+ if (ret != 4)
+ {
+ if (verbose) printf("Returned with %d not enough CPUs\n", ret);
+ return 0;
+ }
+ return 1;
+}
+
+int test_cpustring_physical()
+{
+ int test[5];
+ int len = 5;
+ int ret = cpustr_to_cpulist("0,1,2,3", test, len);
+ if (ret < 0)
+ {
+ if (verbose) printf("Returned %d\n", ret);
+ return 0;
+ }
+ if (ret != 4)
+ {
+ if (verbose) printf("Returned with %d not enough CPUs\n", ret);
+ return 0;
+ }
+ return 1;
+}
+
+int test_cpustring_expression()
+{
+ int test[5];
+ int len = 5;
+ int ret = cpustr_to_cpulist("E:S0:4:1:2", test, len);
+ if (ret < 0)
+ {
+ if (verbose) printf("Returned %d\n", ret);
+ return 0;
+ }
+ if (ret != 4)
+ {
+ if (verbose) printf("Returned with %d not enough CPUs\n", ret);
+ return 0;
+ }
+ return 1;
+}
+
+int test_cpustring_scatter()
+{
+ int test[100];
+ int len = 100;
+ int ret = cpustr_to_cpulist("S:scatter", test, len);
+ if (ret < 0)
+ {
+ if (verbose) printf("Returned %d\n", ret);
+ return 0;
+ }
+ CpuTopology_t cputopo = get_cpuTopology();
+ if (ret != cputopo->numHWThreads)
+ {
+ if (verbose) printf("Returned with %d not enough CPUs (%d)\n", ret, cputopo->numHWThreads);
+ return 0;
+ }
+ return 1;
+}
+
+int test_cpustring_combined()
+{
+ int test[100];
+ int len = 100;
+ int ret = cpustr_to_cpulist("N:0-3 at S0:0-3", test, len);
+ if (ret < 0)
+ {
+ if (verbose) printf("Returned %d\n", ret);
+ return 0;
+ }
+ if (ret != 8)
+ {
+ if (verbose) printf("Returned with %d not enough CPUs\n", ret);
+ return 0;
+ }
+ return 1;
+}
+
+int test_perfmoninit_faulty()
+{
+ int cpu = 0;
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ perfmon_finalize();
+ return 0;
+fail:
+ perfmon_finalize();
+ return 1;
+}
+
+int test_perfmoninit_valid()
+{
+ int cpu = 0;
+ topology_init();
+ affinity_init();
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ if (perfmon_getNumberOfGroups() != 0)
+ goto fail;
+ if (perfmon_getNumberOfThreads() != 1)
+ goto fail;
+ perfmon_finalize();
+ affinity_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ affinity_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmoninit()
+{
+ int cpu = 0;
+ int i;
+ topology_init();
+ affinity_init();
+ for(i=0;i<10;i++)
+ {
+ perfmon_init(1, &cpu);
+ perfmon_finalize();
+ }
+ affinity_finalize();
+ topology_finalize();
+ return 1;
+}
+
+int test_perfmonfinalize()
+{
+ perfmon_finalize();
+ return 1;
+}
+
+int test_perfmonaddeventset()
+{
+ char eventset_fail1[] = "INSTR_RETIRED.ANY:FIXC0";
+ char eventset_fail2[] = "INSTR_RETIRED-ANY:FIXC0";
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0) {
+ if (verbose > 0) printf("Perfmon init failed\n");
+ goto fail;
+ }
+ if (perfmon_getNumberOfGroups() != 0) {
+ if (verbose > 0) printf("Perfmon number of groups != 0\n");
+ goto fail;
+ }
+ if (perfmon_getNumberOfThreads() != 1) {
+ if (verbose > 0) printf("Perfmon number of threads != 1\n");
+ goto fail;
+ }
+ if (perfmon_getIdOfActiveGroup() != -1) {
+ if (verbose > 0) printf("Perfmon id of active group != -1\n");
+ goto fail;
+ }
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0) {
+ if (verbose > 0) printf("Perfmon addEventSet(ok) failed\n");
+ goto fail;
+ }
+ if (perfmon_getNumberOfGroups() != 1) {
+ if (verbose > 0) printf("Perfmon number of groups != 1\n");
+ goto fail;
+ }
+ if (perfmon_getNumberOfEvents(ret) != 3) {
+ if (verbose > 0) printf("Perfmon number of events != 3\n");
+ goto fail;
+ }
+ if (perfmon_getIdOfActiveGroup() != -1) {
+ if (verbose > 0) printf("Perfmon id of active group != -1\n");
+ goto fail;
+ }
+ ret = perfmon_addEventSet(eventset_option);
+ if (ret != 1) {
+ if (verbose > 0) printf("Perfmon addEventSet(options) failed\n");
+ goto fail;
+ }
+ if (perfmon_getNumberOfGroups() != 2) {
+ if (verbose > 0) printf("Perfmon number of groups != 2\n");
+ goto fail;
+ }
+ if (perfmon_getNumberOfEvents(ret) != 3) {
+ if (verbose > 0) printf("Perfmon number of events != 3\n");
+ goto fail;
+ }
+ if (perfmon_getIdOfActiveGroup() != -1) {
+ if (verbose > 0) printf("Perfmon id of active group != -1\n");
+ goto fail;
+ }
+ ret = perfmon_addEventSet(eventset_fail1);
+ if (ret >= 0) {
+ if (verbose > 0) printf("Perfmon addEventSet(fail1) failed\n");
+ goto fail;
+ }
+ if (perfmon_getNumberOfGroups() != 2) {
+ if (verbose > 0) printf("Perfmon number of groups != 2\n");
+ goto fail;
+ }
+ ret = perfmon_addEventSet(eventset_fail2);
+ if (ret >= 0) {
+ if (verbose > 0) printf("Perfmon addEventSet(fail2) failed\n");
+ goto fail;
+ }
+ if (perfmon_getNumberOfGroups() != 2) {
+ if (verbose > 0) printf("Perfmon number of groups != 2\n");
+ goto fail;
+ }
+ if (perfmon_getIdOfActiveGroup() != -1) {
+ if (verbose > 0) printf("Perfmon id of active group != -1\n");
+ goto fail;
+ }
+ perfmon_finalize();
+ affinity_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ affinity_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonaddeventset_noinit()
+{
+ int ret = perfmon_addEventSet(eventset_ok);
+ if (ret == 0)
+ goto fail;
+ return 1;
+fail:
+ return 0;
+}
+
+int test_perfmoncustomgroup()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0) {
+ if (verbose > 0) printf("Perfmon init failed\n");
+ goto fail;
+ }
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0) {
+ if (verbose > 0) printf("Perfmon addEventSet(ok) failed\n");
+ goto fail;
+ }
+ if (perfmon_getNumberOfEvents(ret) != 3) {
+ if (verbose > 0) printf("Perfmon number of events != 3\n");
+ goto fail;
+ }
+ if (perfmon_getNumberOfMetrics(ret) != 0) {
+ if (verbose > 0) printf("Perfmon number of metrics != 0\n");
+ goto fail;
+ }
+ if (strcmp(perfmon_getEventName(ret, 0), event1_ok) != 0)
+ {
+ goto fail;
+ }
+ if (strcmp(perfmon_getEventName(ret, 1), event2_ok) != 0)
+ {
+ goto fail;
+ }
+ if (strcmp(perfmon_getEventName(ret, 2), event3_ok) != 0)
+ {
+ goto fail;
+ }
+ if (strcmp(perfmon_getCounterName(ret, 0), ctr1_ok) != 0)
+ {
+ goto fail;
+ }
+ if (strcmp(perfmon_getCounterName(ret, 1), ctr2_ok) != 0)
+ {
+ goto fail;
+ }
+ if (strcmp(perfmon_getCounterName(ret, 2), ctr3_ok) != 0)
+ {
+ goto fail;
+ }
+
+ if (strcmp(perfmon_getGroupName(ret), "Custom") != 0)
+ {
+ goto fail;
+ }
+ if (strcmp(perfmon_getGroupInfoShort(ret), "Custom") != 0)
+ {
+ goto fail;
+ }
+ if (strcmp(perfmon_getGroupInfoLong(ret), "Custom") != 0)
+ {
+ goto fail;
+ }
+ if (perfmon_getLastTimeOfGroup(ret) != 0)
+ {
+ goto fail;
+ }
+
+ perfmon_finalize();
+ affinity_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ affinity_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmongetgroups()
+{
+ int i;
+ topology_init();
+ char** glist = NULL;
+ char** slist = NULL;
+ char** llist = NULL;
+ int ret = perfmon_getGroups(&glist, &slist, &llist);
+
+ if (ret <= 0)
+ {
+ goto fail;
+ }
+ for (i=0; i< ret; i++)
+ {
+ if (strcmp(glist[i], "") == 0)
+ {
+ goto fail;
+ }
+ if (strcmp(slist[i], "") == 0)
+ {
+ goto fail;
+ }
+ if (strcmp(llist[i], "") == 0)
+ {
+ goto fail;
+ }
+ }
+ perfmon_returnGroups(ret, glist, slist, llist);
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_returnGroups(ret, glist, slist, llist);
+ topology_finalize();
+ return 0;
+}
+
+int _test_perfmonperfgroup(char* perfgroup)
+{
+ CpuInfo_t cpuinfo;
+ int i;
+ int cpu = 0;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0) {
+ if (verbose > 0) printf("Perfmon init failed\n");
+ goto fail;
+ }
+ ret = perfmon_addEventSet(perfgroup);
+ if (ret != 0) {
+ if (verbose > 0) printf("Perfmon addEventSet(%s) failed\n", perfgroup);
+ goto fail;
+ }
+ if (perfmon_getNumberOfEvents(ret) == 0) {
+ if (verbose > 0) printf("Perfmon number of events == 0\n");
+ goto fail;
+ }
+ if (perfmon_getNumberOfMetrics(ret) == 0) {
+ if (verbose > 0) printf("Perfmon number of metrics == 0\n");
+ goto fail;
+ }
+ for (i=0; i<perfmon_getNumberOfEvents(ret); i++) {
+ if (strcmp(perfmon_getEventName(ret, i), "") == 0)
+ {
+ if (verbose > 0) printf("Perfmon event name zero\n");
+ goto fail;
+ }
+ if (strcmp(perfmon_getCounterName(ret, i), "") == 0)
+ {
+ if (verbose > 0) printf("Perfmon counter name zero\n");
+ goto fail;
+ }
+ }
+ if (strcmp(perfmon_getGroupName(ret), "Custom") == 0)
+ {
+ if (verbose > 0) printf("Perfmon groupName %s == %s\n", perfgroup, perfmon_getGroupName(ret));
+ goto fail;
+ }
+ if (strcmp(perfmon_getGroupInfoShort(ret), "Custom") == 0)
+ {
+ if (verbose > 0) printf("Perfmon shortInfo %s == %s\n", perfgroup, perfmon_getGroupInfoShort(ret));
+ goto fail;
+ }
+ if (strcmp(perfmon_getGroupInfoLong(ret), "Custom") == 0)
+ {
+ if (verbose > 0) printf("Perfmon longInfo %s == %s\n", perfgroup, perfmon_getGroupInfoLong(ret));
+ goto fail;
+ }
+ if (perfmon_getLastTimeOfGroup(ret) != 0)
+ {
+ if (verbose > 0) printf("Perfmon last time of %s: %f\n", perfgroup, perfmon_getLastTimeOfGroup(ret));
+ goto fail;
+ }
+ if (perfmon_getTimeOfGroup(ret) != 0)
+ {
+ if (verbose > 0) printf("Perfmon time of %s: %f\n", perfgroup, perfmon_getTimeOfGroup(ret));
+ goto fail;
+ }
+ perfmon_setupCounters(ret);
+ perfmon_startCounters();
+ sleep(1);
+ perfmon_stopCounters();
+ for (i=0; i<perfmon_getNumberOfMetrics(ret); i++) {
+ if (strcmp(perfmon_getMetricName(ret, i), "") == 0)
+ {
+ if (verbose > 0) printf("Perfmon metric name zero\n");
+ goto fail;
+ }
+ double res = perfmon_getMetric(ret, i, 0);
+ if ((res != 0.0) && (res < 0))
+ {
+ if (verbose > 0) printf("Perfmon metric %s result %f\n", perfmon_getMetricName(ret, i), res );
+ goto fail;
+ }
+ double lastres = perfmon_getLastMetric(ret, i, 0);
+ if ((ret >= 0) &&
+ (lastres >= 0) &&
+ (res != lastres))
+ {
+ if (verbose > 0) printf("Perfmon metric %s result %f not equal to last %f\n", perfmon_getMetricName(ret, i), res, lastres);
+ goto fail;
+ }
+ }
+ perfmon_finalize();
+ affinity_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ affinity_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonperfgroup_ok()
+{
+ return _test_perfmonperfgroup(perfgroup_ok);
+}
+
+int test_perfmonperfgroup_fail()
+{
+ return !_test_perfmonperfgroup(perfgroup_fail);
+}
+
+int test_perfmonsetup()
+{
+ CpuInfo_t cpuinfo;
+ int group1, group2;
+ int cpu = 0;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ if (perfmon_getNumberOfGroups() != 0)
+ goto fail;
+ if (perfmon_getNumberOfThreads() != 1)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group1 = ret;
+ if (perfmon_getNumberOfGroups() != 1)
+ goto fail;
+ if (perfmon_getNumberOfEvents(group1) != 3)
+ goto fail;
+ ret = perfmon_setupCounters(group1);
+ if (ret != 0)
+ goto fail;
+ if (perfmon_getIdOfActiveGroup() != group1)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_option);
+ if (ret != 1)
+ goto fail;
+ group2 = ret;
+ if (perfmon_getIdOfActiveGroup() != group1)
+ goto fail;
+ if (perfmon_getNumberOfGroups() != 2)
+ goto fail;
+ if (perfmon_getNumberOfEvents(group1) != 3)
+ goto fail;
+ if (perfmon_getNumberOfEvents(group2) != 3)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonswitch()
+{
+ CpuInfo_t cpuinfo;
+ int group1, group2;
+ int cpu = 0;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group1 = ret;
+ ret = perfmon_addEventSet(eventset_option);
+ if (ret != 1)
+ goto fail;
+ group2 = ret;
+ ret = perfmon_setupCounters(group1);
+ if (ret != 0)
+ goto fail;
+ if (perfmon_getIdOfActiveGroup() != group1)
+ goto fail;
+ ret = perfmon_switchActiveGroup(group2);
+ if (perfmon_getIdOfActiveGroup() != group2)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonstart()
+{
+ CpuInfo_t cpuinfo;
+ int group1, group2;
+ int cpu = 0;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group1 = ret;
+ ret = perfmon_setupCounters(group1);
+ if (ret != 0)
+ goto fail;
+ if (perfmon_getIdOfActiveGroup() != group1)
+ goto fail;
+ ret = perfmon_startCounters();
+ if (ret != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonsetup_noinit()
+{
+ int ret = perfmon_setupCounters(0);
+ if (ret == 0)
+ goto fail;
+ return 1;
+fail:
+ return 0;
+}
+
+int test_perfmonsetup_noadd()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_setupCounters(0);
+ if (ret == 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonstart_noinit()
+{
+ int ret = perfmon_startCounters();
+ if (ret == 0)
+ goto fail;
+ return 1;
+fail:
+ return 0;
+}
+
+int test_perfmonstart_noadd()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_startCounters();
+ if (ret == 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonstop()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ ret = perfmon_setupCounters(group);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_startCounters();
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_stopCounters();
+ if (ret != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonstop_noinit()
+{
+ int ret = perfmon_stopCounters();
+ if (ret == 0)
+ goto fail;
+ return 1;
+fail:
+ return 0;
+}
+
+int test_perfmonstop_noadd()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_stopCounters();
+ if (ret == 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonstop_nosetup()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_stopCounters();
+ if (ret == 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonstop_nostart()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ ret = perfmon_setupCounters(group);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_stopCounters();
+ if (ret == 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonresult_noinit()
+{
+ double result = perfmon_getResult(0,0,0);
+ if (result != 0)
+ goto fail;
+ return 1;
+fail:
+ return 0;
+}
+
+int test_perfmonresult_noadd()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ double result = perfmon_getResult(0,0,0);
+ if (result != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonresult_nosetup()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ double result = perfmon_getResult(group,0,0);
+ if (result != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonresult_nostart()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ ret = perfmon_setupCounters(group);
+ if (ret != 0)
+ goto fail;
+ double result = perfmon_getResult(group,0,0);
+ if (result != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonresult_nostop()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ ret = perfmon_setupCounters(group);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_startCounters();
+ if (ret != 0)
+ goto fail;
+ double result = perfmon_getResult(group,0,0);
+ if (result != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonresult()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ ret = perfmon_setupCounters(group);
+ if (ret != 0)
+ goto fail;
+
+ ret = perfmon_startCounters();
+ if (ret != 0)
+ goto fail;
+ sleep(1);
+ ret = perfmon_stopCounters();
+ if (ret != 0)
+ goto fail;
+ if ((perfmon_getResult(group,0,0) == 0)||(perfmon_getResult(group,1,0) == 0))
+ goto fail;
+ if (perfmon_getTimeOfGroup(group) == 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonlastresult_noinit()
+{
+ double result = perfmon_getLastResult(0,0,0);
+ if (result != 0)
+ goto fail;
+ return 1;
+fail:
+ return 0;
+}
+
+int test_perfmonlastresult_noadd()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ double result = perfmon_getLastResult(0,0,0);
+ if (result != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonlastresult_nosetup()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ double result = perfmon_getLastResult(group,0,0);
+ if (result != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonlastresult_nostart()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ ret = perfmon_setupCounters(group);
+ if (ret != 0)
+ goto fail;
+ double result = perfmon_getLastResult(group,0,0);
+ if (result != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonlastresult_nostop()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ ret = perfmon_setupCounters(group);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_startCounters();
+ if (ret != 0)
+ goto fail;
+ double result = perfmon_getLastResult(group,0,0);
+ if (result != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonlastresult()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ ret = perfmon_setupCounters(group);
+ if (ret != 0)
+ goto fail;
+
+ ret = perfmon_startCounters();
+ if (ret != 0)
+ goto fail;
+ sleep(1);
+ ret = perfmon_stopCounters();
+ if (ret != 0)
+ goto fail;
+ if ((perfmon_getLastResult(group,0,0) == 0)||(perfmon_getLastResult(group,1,0) == 0))
+ goto fail;
+ if (perfmon_getLastResult(group,0,0) != perfmon_getResult(group,0,0))
+ goto fail;
+ if (perfmon_getTimeOfGroup(group) == 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonmetric_noinit()
+{
+ double result = perfmon_getMetric(0,0,0);
+ if (result != 0)
+ goto fail;
+ return 1;
+fail:
+ return 0;
+}
+
+int test_perfmonmetric_noadd()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ double result = perfmon_getMetric(0,0,0);
+ if (result != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonmetric_nosetup()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ double result = perfmon_getMetric(group,0,0);
+ if (result != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonmetric_nostart()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ ret = perfmon_setupCounters(group);
+ if (ret != 0)
+ goto fail;
+ double result = perfmon_getMetric(group,0,0);
+ if (result != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonmetric_nostop()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ ret = perfmon_setupCounters(group);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_startCounters();
+ if (ret != 0)
+ goto fail;
+ double result = perfmon_getMetric(group,0,0);
+ if (result != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonmetric_ok()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(perfgroup_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ ret = perfmon_setupCounters(group);
+ if (ret != 0)
+ goto fail;
+
+ ret = perfmon_startCounters();
+ if (ret != 0)
+ goto fail;
+ sleep(1);
+ ret = perfmon_stopCounters();
+ if (ret != 0)
+ goto fail;
+ if ((perfmon_getMetric(group,0,0) == 0)||(perfmon_getMetric(group,1,0) == 0))
+ goto fail;
+ if (perfmon_getTimeOfGroup(group) == 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonlastmetric_noinit()
+{
+ double result = perfmon_getLastMetric(0,0,0);
+ if (result != 0)
+ goto fail;
+ return 1;
+fail:
+ return 0;
+}
+
+int test_perfmonlastmetric_noadd()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ double result = perfmon_getLastMetric(0,0,0);
+ if (result != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonlastmetric_nosetup()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ double result = perfmon_getLastMetric(group,0,0);
+ if (result != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonlastmetric_nostart()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ ret = perfmon_setupCounters(group);
+ if (ret != 0)
+ goto fail;
+ double result = perfmon_getLastMetric(group,0,0);
+ if (result != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonlastmetric_nostop()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(eventset_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ ret = perfmon_setupCounters(group);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_startCounters();
+ if (ret != 0)
+ goto fail;
+ double result = perfmon_getLastMetric(group,0,0);
+ if (result != 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+int test_perfmonlastmetric_ok()
+{
+ CpuInfo_t cpuinfo;
+ int cpu = 0;
+ int group;
+ topology_init();
+ cpuinfo = get_cpuInfo();
+ if (cpuinfo->isIntel == 0)
+ {
+ topology_finalize();
+ return 1;
+ }
+ int ret = perfmon_init(1, &cpu);
+ if (ret != 0)
+ goto fail;
+ ret = perfmon_addEventSet(perfgroup_ok);
+ if (ret != 0)
+ goto fail;
+ group = ret;
+ ret = perfmon_setupCounters(group);
+ if (ret != 0)
+ goto fail;
+
+ ret = perfmon_startCounters();
+ if (ret != 0)
+ goto fail;
+ sleep(1);
+ ret = perfmon_stopCounters();
+ if (ret != 0)
+ goto fail;
+ if ((perfmon_getLastMetric(group,0,0) == 0)||(perfmon_getLastMetric(group,1,0) == 0))
+ goto fail;
+ if (perfmon_getLastMetric(group,0,0) != perfmon_getMetric(group,0,0))
+ goto fail;
+ if (perfmon_getLastMetric(group,1,0) != perfmon_getMetric(group,1,0))
+ goto fail;
+ if (perfmon_getTimeOfGroup(group) == 0)
+ goto fail;
+ perfmon_finalize();
+ topology_finalize();
+ return 1;
+fail:
+ perfmon_finalize();
+ topology_finalize();
+ return 0;
+}
+
+
+
+int test_timerinit()
+{
+ timer_init();
+ uint64_t clock = timer_getCpuClock();
+ if (clock == 0)
+ goto fail;
+ timer_finalize();
+ return 1;
+fail:
+ timer_finalize();
+ return 0;
+}
+
+int test_timerfinalize()
+{
+ timer_finalize();
+ return 1;
+}
+
+int test_timerprint_noinit()
+{
+ TimerData timer;
+ timer_reset(&timer);
+ double time = timer_print(&timer);
+ if (time != 0)
+ goto fail;
+ return 1;
+fail:
+ return 0;
+}
+
+int test_timerprint()
+{
+ TimerData timer;
+ timer_reset(&timer);
+ timer_init();
+ double time = timer_print(&timer);
+ if (time != 0)
+ goto fail;
+ uint64_t cycles = timer_printCycles(&timer);
+ if (cycles != 0)
+ goto fail;
+ timer_finalize();
+ return 1;
+fail:
+ timer_finalize();
+ return 0;
+}
+
+int test_timerprint_start()
+{
+ TimerData timer;
+ timer_reset(&timer);
+ timer_init();
+ timer_start(&timer);
+ double time = timer_print(&timer);
+ if (time == 0)
+ goto fail;
+ uint64_t cycles = timer_printCycles(&timer);
+ if (cycles == 0)
+ goto fail;
+ timer_finalize();
+ return 1;
+fail:
+ timer_finalize();
+ return 0;
+}
+
+int test_timerprint_stop()
+{
+ TimerData timer;
+ timer_init();
+ timer_reset(&timer);
+ timer_start(&timer);
+ timer_stop(&timer);
+ double time = timer_print(&timer);
+ if (time > 1)
+ goto fail;
+ if (time == 0)
+ goto fail;
+ uint64_t cycles = timer_printCycles(&timer);
+ if (cycles == 0)
+ goto fail;
+ if (cycles > timer_getCpuClock())
+ goto fail;
+ timer_finalize();
+ return 1;
+fail:
+ timer_finalize();
+ return 0;
+}
+
+int test_timercpuclock_noinit()
+{
+ uint64_t cyc = timer_getCpuClock();
+ if (cyc != 0)
+ return 0;
+ return 1;
+}
+
+int test_timercpuclock()
+{
+ timer_init();
+ uint64_t cyc = timer_getCpuClock();
+ if (cyc == 0)
+ goto fail;
+ timer_finalize();
+ return 1;
+fail:
+ timer_finalize();
+ return 0;
+}
+
+int test_timerbaseline_noinit()
+{
+ uint64_t cyc = timer_getBaseline();
+ if (cyc != 0)
+ return 0;
+ return 1;
+}
+
+int test_timerbaseline()
+{
+ timer_init();
+ uint64_t cyc = timer_getBaseline();
+ if (cyc == 0)
+ goto fail;
+ timer_finalize();
+ return 1;
+fail:
+ timer_finalize();
+ return 0;
+}
+
+int test_timersleep_noinit()
+{
+ timer_sleep(1E4);
+ return 1;
+}
+
+int test_timersleep()
+{
+ timer_init();
+ TimerData timer;
+ timer_start(&timer);
+ timer_sleep(1E6);
+ timer_stop(&timer);
+ if (timer_print(&timer) < 0.9E6*1E-6)
+ {
+ printf("Sleeping too short. timer is %f instead of 1 s\n", timer_print(&timer));
+ goto fail;
+ }
+ if (timer_print(&timer) > 1.1E6*1E-6)
+ {
+ printf("Sleeping too long. timer is %f instead of 1 s\n", 2E6*1E-6, timer_print(&timer));
+ goto fail;
+ }
+ timer_finalize();
+ return 1;
+fail:
+ timer_finalize();
+ return 0;
+}
+
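+/*
+ * Registry of all unit tests: each entry pairs a human-readable description
+ * with the test function and its expected return value. main() below walks
+ * the list in order and aborts on the first test whose result differs.
+ */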
+static test testlist[] = {
+ {"Test configuration initialization", test_initconfig, 1},
+ {"Enable configuration for following tests", enable_configuration, 1},
+ {"Test setting of access mode", test_hpmmode, 1},
+ {"Test access initialization", test_hpminit, 1},
+ {"Test adding CPU to access module", test_hpmaddthread, 1},
+ {"Disable configuration", disable_configuration, 1},
+ {"Test perfmon initialization without topology information", test_perfmoninit_faulty, 1},
+ {"Test topology module initialization", test_topologyinit, 1},
+ {"Test NUMA module initialization", test_numainit, 1},
+ {"Test affinity module initialization", test_affinityinit, 1},
+ {"Test perfmon initialization with topology information", test_perfmoninit_valid, 1},
+ {"Test adding event sets to perfmon module", test_perfmonaddeventset, 1},
+ {"Test adding event sets to perfmon module without initialization of perfmon", test_perfmonaddeventset_noinit, 1},
+ {"Test setting up an event set", test_perfmonsetup, 1},
+ {"Test switching between event sets", test_perfmonswitch, 1},
+ {"Test starting an event set", test_perfmonstart, 1},
+ {"Test setting up an event set without initialization", test_perfmonsetup_noinit, 1},
+ {"Test starting an event set without initialization", test_perfmonstart_noinit, 1},
+ {"Test setting up an event set without adding one", test_perfmonsetup_noadd, 1},
+ {"Test getting all performance groups", test_perfmongetgroups, 1},
+ {"Test setting up a custom event set and test group handling", test_perfmoncustomgroup, 1},
+ {"Test setting up a valid performance group and test group handling", test_perfmonperfgroup_ok, 1},
+ {"Test setting up a invalid performance group and test group handling", test_perfmonperfgroup_fail, 1},
+ {"Test starting an event set without adding one", test_perfmonstart_noadd, 1},
+ {"Test stopping an event set", test_perfmonstop, 1},
+ {"Test stopping an event set without initialization", test_perfmonstop_noinit, 1},
+ {"Test stopping an event set without adding one", test_perfmonstop_noadd, 1},
+ {"Test stopping an event set without setting one up", test_perfmonstop_nosetup, 1},
+ {"Test stopping an event set without starting one", test_perfmonstop_nostart, 1},
+ {"Test perfmon finalization", test_perfmonfinalize, 1},
+ {"Test perfmon result without initialization", test_perfmonresult_noinit, 1},
+ {"Test perfmon result without adding one", test_perfmonresult_noadd, 1},
+ {"Test perfmon result without setting up one", test_perfmonresult_nosetup, 1},
+ {"Test perfmon result without starting", test_perfmonresult_nostart, 1},
+ {"Test perfmon result without stopping", test_perfmonresult_nostop, 1},
+ {"Test perfmon result", test_perfmonresult, 1},
+ {"Test perfmon last result without initialization", test_perfmonlastresult_noinit, 1},
+ {"Test perfmon last result without adding one", test_perfmonlastresult_noadd, 1},
+ {"Test perfmon last result without setting up one", test_perfmonlastresult_nosetup, 1},
+ {"Test perfmon last result without starting", test_perfmonlastresult_nostart, 1},
+ {"Test perfmon last result without stopping", test_perfmonlastresult_nostop, 1},
+ {"Test perfmon last result", test_perfmonlastresult, 1},
+ {"Test initialization of timer module", test_timerinit, 1},
+ {"Test printing time without initialization", test_timerprint_noinit, 1},
+ {"Test printing time", test_timerprint, 1},
+ {"Test timer module finalization", test_timerfinalize, 1},
+ {"Test printing time for started clock", test_timerprint_start, 1},
+ {"Test printing time for started/stopped clock", test_timerprint_stop, 1},
+ {"Test reading cpu clock without initialization", test_timercpuclock_noinit, 1},
+ {"Test reading cpu clock", test_timercpuclock, 1},
+ {"Test reading baseline without initialization", test_timerbaseline_noinit, 1},
+ {"Test reading baseline", test_timerbaseline, 1},
+ {"Test sleeping with timer module without initialization", test_timersleep_noinit, 1},
+ {"Test sleeping with timer module", test_timersleep, 1},
+ {"Test perfmon metric without initialization", test_perfmonmetric_noinit, 1},
+ {"Test perfmon metric without adding one", test_perfmonmetric_noadd, 1},
+ {"Test perfmon metric without setting up one", test_perfmonmetric_nosetup, 1},
+ {"Test perfmon metric without starting", test_perfmonmetric_nostart, 1},
+ {"Test perfmon metric without stopping", test_perfmonmetric_nostop, 1},
+ {"Test perfmon metric", test_perfmonmetric_ok, 1},
+ {"Test perfmon last metric without initialization", test_perfmonlastmetric_noinit, 1},
+ {"Test perfmon last metric without adding one", test_perfmonlastmetric_noadd, 1},
+ {"Test perfmon last metric without setting up one", test_perfmonlastmetric_nosetup, 1},
+ {"Test perfmon last metric without starting", test_perfmonlastmetric_nostart, 1},
+ {"Test perfmon last metric without stopping", test_perfmonlastmetric_nostop, 1},
+ {"Test perfmon last metric", test_perfmonlastmetric_ok, 1},
+ {"Test cpustring with logical input", test_cpustring_logical, 1},
+ {"Test cpustring with physical input", test_cpustring_physical, 1},
+ {"Test cpustring with expression input", test_cpustring_expression, 1},
+ {"Test cpustring with scatter input", test_cpustring_scatter, 1},
+ {"Test cpustring with combined input", test_cpustring_combined, 1},
+ {NULL, NULL, 0},
+};
+
+int main()
+{
+ int i = 0;
+ //fclose(stderr);
+ if (verbose > 0) perfmon_setVerbosity(3);
+ while (testlist[i].testfunc != NULL)
+ {
+ printf("%s:\t", testlist[i].testname);
+ if (verbose > 0) printf("\n");
+ if (testlist[i].testfunc() != testlist[i].result)
+ {
+ printf("FAILED\n");
+ return 1;
+ }
+ printf("OK\n");
+ i++;
+ }
+ printf("All tests completed successfully.\n");
+ return 0;
+}
diff --git a/test/test-msr-access.c b/test/test-msr-access.c
new file mode 100644
index 0000000..1fea1ec
--- /dev/null
+++ b/test/test-msr-access.c
@@ -0,0 +1,101 @@
+
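+/*
+ * Standalone check for raw MSR access: verifies that /dev/cpu/0/msr is
+ * readable and writable, then reads and writes 8 bytes at register offset
+ * 0x38D (on Intel CPUs this is IA32_FIXED_CTR_CTRL). Requires the msr kernel
+ * module and sufficient permissions on the device file.
+ */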
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+static char msr_name[] = "/dev/cpu/0/msr";
+static int msr_fd;
+
+int check_msr()
+{
+ if (access(msr_name, R_OK|W_OK))
+ {
+ fprintf(stderr,"Unable to access MSR device %s: %s\n", msr_name, strerror(errno));
+ return 1;
+ }
+ return 0;
+}
+
+int open_msr()
+{
+ msr_fd = open(msr_name, O_RDWR);
+ if (msr_fd < 0)
+ {
+ fprintf(stderr,"Cannot open MSR device %s: %s\n", msr_name, strerror(errno));
+ return 1;
+ }
+ return 0;
+}
+
+int close_msr()
+{
+ if (msr_fd > 0)
+ {
+ close(msr_fd);
+ }
+ return 0;
+}
+
+int read_msr()
+{
+ ssize_t ret;
+ uint64_t data = 0;
+ uint32_t reg = 0x38D;
+ if (msr_fd > 0)
+ {
+ ret = pread(msr_fd, &data, sizeof(uint64_t), reg);
+ if (ret < 0)
+ {
+ fprintf(stderr, "Cannot read register 0x%x at MSR %s: %s\n", reg, msr_name, strerror(errno));
+ return 1;
+ }
+ else if (ret != sizeof(uint64_t))
+ {
+ fprintf(stderr, "Incomplete read on register 0x%x at MSR %s: Only %lu bytes\n", reg, msr_name, ret);
+ return 1;
+ }
+ return 0;
+ }
+ return 1;
+}
+
+int write_msr()
+{
+ ssize_t ret;
+ uint64_t data = 0;
+ uint32_t reg = 0x38D;
+ if (msr_fd > 0)
+ {
+ ret = pwrite(msr_fd, &data, sizeof(uint64_t), reg);
+ if (ret < 0)
+ {
+ fprintf(stderr, "Cannot write register 0x%x at MSR %s: %s\n", reg, msr_name, strerror(errno));
+ return 1;
+ }
+ else if (ret != sizeof(uint64_t))
+ {
+ fprintf(stderr, "Incomplete read on register 0x%x at MSR %s: Only %lu bytes\n", reg, msr_name, ret);
+ return 1;
+ }
+ return 0;
+ }
+ return 1;
+}
+
+int main()
+{
+ int ret = 0;
+ if (check_msr()) return 1;
+ if (open_msr()) return 1;
+ if (read_msr()) return 1;
+ if (write_msr()) return 1;
+ if (close_msr()) return 1;
+ printf("All OK!\n");
+ return 0;
+}
diff --git a/test/testTBB.cc b/test/testTBB.cc
new file mode 100644
index 0000000..887400f
--- /dev/null
+++ b/test/testTBB.cc
@@ -0,0 +1,67 @@
+/*
+ File: testTBB.cc
+ Author: timday (stackoverflow)
+ Source: http://stackoverflow.com/questions/10607215/simplest-tbb-example
+
+ Extended by Thomas Roehl to add LIKWID Marker API calls and to print each
+ thread's CPU instead of 'n'
+*/
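+/*
+ Note: the LIKWID_MARKER_* macros below are only active when this file is
+ compiled with -DLIKWID_PERFMON and linked against the LIKWID library
+ (-llikwid) and TBB (-ltbb); the instrumented binary is then typically run
+ under likwid-perfctr with the -m switch so the marked regions are measured.
+*/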
+
+#include "tbb/blocked_range.h"
+#include "tbb/parallel_for.h"
+#include "tbb/task_scheduler_init.h"
+#include <iostream>
+#include <vector>
+
+// Added by Thomas Roehl
+#include <sched.h>
+#include <likwid.h>
+
+
+struct mytask {
+ mytask(size_t n)
+ :_n(n)
+ {}
+ void operator()() {
+
+ for (int i=0;i<10000000;++i) {} // Deliberately run slow
+ std::cerr << "[" << sched_getcpu() << "]";
+
+ }
+ size_t _n;
+};
+
+struct executor
+{
+ executor(std::vector<mytask>& t)
+ :_tasks(t)
+ {}
+ executor(executor& e,tbb::split)
+ :_tasks(e._tasks)
+ {}
+
+ void operator()(const tbb::blocked_range<size_t>& r) const {
+ LIKWID_MARKER_START("TBB");
+ for (size_t i=r.begin();i!=r.end();++i)
+ _tasks[i]();
+ LIKWID_MARKER_STOP("TBB");
+ }
+
+ std::vector<mytask>& _tasks;
+};
+
+int main(int,char**) {
+
+ tbb::task_scheduler_init init; // Automatic number of threads
+
+ LIKWID_MARKER_INIT;
+ std::vector<mytask> tasks;
+ for (int i=0;i<1000;++i)
+ tasks.push_back(mytask(i));
+
+ executor exec(tasks);
+ tbb::parallel_for(tbb::blocked_range<size_t>(0,tasks.size()),exec);
+ std::cerr << std::endl;
+ LIKWID_MARKER_CLOSE;
+ return 0;
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/likwid/likwid.git