Commit ea99fa9f authored by Cédric VALENSI's avatar Cédric VALENSI

Sync

parent e3f93d20
......@@ -74,6 +74,7 @@ ENDFOREACH(ARCH)
## ------------------------------------------------------------------##
SET(BASE_C_FLAGS "")
MESSAGE("-- Using C compiler ${CMAKE_C_COMPILER}")
# Use DEBUG variable to activate or not debug macros for MAQAO
IF ("x${DEBUG}" STREQUAL "xtrue")
......@@ -273,10 +274,6 @@ ELSE ()
SET (DISTCLEAN_LUA distclean_luajit)
ENDIF ()
# Default IS_GIT to false when it is unset or expands to an empty string.
# The "X" prefix on both sides keeps the comparison well-formed for empty values.
IF ("X${IS_GIT}" STREQUAL "X")
SET(IS_GIT false)
ENDIF ()
# Fall back to a placeholder text when VERSION_HASH is unset or empty.
IF( "X${VERSION_HASH}" STREQUAL "X" )
SET(VERSION_HASH "Version not available")
ENDIF ()
......
......@@ -15,8 +15,9 @@ If you choose the original lua package (and not luajit) then you will also need:
##### Installation #####
MAQAO is a cmake based project.
Go into the MAQAO folder
Get into the MAQAO folder and create a "build" folder.
If no "build" directory is present, create it
$ mkdir build
Go into the build folder:
$ cd build
Then run the following commands:
$ cmake ..
......@@ -32,15 +33,15 @@ Generated files are available trough a web browser. The LUA API documentation is
DeveloperGuide.html and the C API documentation is called CoreDeveloperGuide.html. Both
are located in MAQAO/doc
##### Running MAQAO tools #####
You can launch a MAQAO tool with the following command:
$ maqao <tool> [args]
For instance to list a binary's functions:
##### Running MAQAO modules #####
A MAQAO module can be launched with the following command:
$ maqao <module> [args]
For instance to list the functions in the binary /path_to/my_binary:
$ maqao analyze --list-functions /path_to/my_binary
The -h option provides help for a given tool.
The -h option provides help for a given module.
To execute user-defined MAQAO/Lua scripts use:
To execute user-defined MAQAO/Lua scripts, use:
$ maqao <lua-script> [args]
Note that MAQAO scripts are Lua scripts that can use the MAQAO Lua API extensions (see documentation for more information)
......
......@@ -22,8 +22,10 @@ If you choose the original lua package (and not luajit) then you will also need:
# Installation
MAQAO is a cmake based project.
Get into the MAQAO folder and create a "build" folder.
Go into the MAQAO folder
If no "build" directory is present, create it
` >$ mkdir build`
Go into the build folder:
` >$ cd build`
Then run the following commands:
` >$ cmake ..`
......@@ -40,20 +42,19 @@ Generated files are available trough a web browser. The LUA API documentation is
DeveloperGuide.html and the C API documentation is called CoreDeveloperGuide.html. Both
are located in MAQAO/doc
# Running MAQAO tools
You can launch a MAQAO tool with the following command:
` >$ maqao <tool> [args] `
For instance to list a binary's functions:
# Running MAQAO modules
A MAQAO module can be launched with the following command:
` >$ maqao <module> [args] `
For instance to list the functions in the binary /path_to/my_binary:
` >$ maqao analyze --list-functions /path_to/my_binary`
The -h option provides help for a given tool.
The -h option provides help for a given module.
To execute user-defined MAQAO/Lua scripts use:
To execute user-defined MAQAO/Lua scripts, use:
` >$ maqao <lua-script> [args] `
Note that MAQAO scripts are Lua script that can use the MAQAO Lua API extensions (see documentation for more information).
Note that MAQAO scripts are Lua scripts that can use the MAQAO Lua API extensions (see documentation for more information)
For general help on MAQAO:
......
all:
pdflatex manual.tex
\section{Function call characterization}
VPROF can instrument a binary to make it dump the parameters and the returned
values of a function call.
\subsection{Selecting function calls}
To select function calls, you have to use the following command:
\begin{center}
\texttt{--calls=[NAME-REGS,]*}
\end{center}
NAME is the name of the function (its symbol). REGS is a specification of the
registers to dump. Currently registers are specified as a bitset:
\begin{tabular}{|c|c|}
\hline
Bit & Register \\
\hline
0 & RAX \\
1 & RCX \\
2 & RDX \\
3 & RBX \\
4 & RBP \\
5 & RSI \\
6 & RDI \\
7 & R8 \\
8 & R9 \\
9 & R10 \\
10 & R11 \\
11 & R12 \\
12 & R13 \\
13 & R14 \\
14 & R15 \\
15 & FLAGS \\
16 & XMM0\_high \\
16 & XMM0\_low \\
17 & XMM1\_high \\
17 & XMM1\_low \\
18 & XMM2\_high \\
18 & XMM2\_low \\
19 & XMM3\_high \\
19 & XMM3\_low \\
20 & XMM4\_high \\
20 & XMM4\_low \\
21 & XMM5\_high \\
21 & XMM5\_low \\
22 & XMM6\_high \\
22 & XMM6\_low \\
23 & XMM7\_high \\
23 & XMM7\_low \\
24 & XMM8\_high \\
24 & XMM8\_low \\
25 & XMM9\_high \\
25 & XMM9\_low \\
26 & XMM10\_high \\
26 & XMM10\_low \\
27 & XMM11\_high \\
27 & XMM11\_low \\
28 & XMM12\_high \\
28 & XMM12\_low \\
29 & XMM13\_high \\
29 & XMM13\_low \\
30 & XMM14\_high \\
30 & XMM14\_low \\
31 & XMM15\_high \\
31 & XMM15\_low \\
32 & Returned RAX \\
33 & Returned XMM0\_high \\
33 & Returned XMM0\_low \\
\hline
\end{tabular}
\section{Introduction}
MAQAO Value Profiler (VPROF) is a tool to characterize elements of your applications
such as loops and function calls. It can be used to understand the behavior of
your programs or to determine how to specialize some of their parts to improve
performance.
\subsection{Overview}
VPROF is composed of three phases:
\begin{enumerate}
\item Instrumentation
\item Execution
\item Reporting
\end{enumerate}
The three phases can be executed independently by reusing results of the
previous phases. In order to reuse results, you must specify a project (i.e. a
folder) into which results are stored and read from.
The basic command line for this is:
\begin{center}
\texttt{maqao vprof BIN --project=PROJECT}
\end{center}
The structure of the project directory is described in Figure~\ref{fig:project}.
\begin{figure*}
\begin{description}
\item[PROJECT/BIN] \hfill \\ original binary
\item[PROJECT/config] \hfill \\ project configuration (binary name, checksum, etc.)
\item[PROJECT/latest.conf] \hfill \\
latest configuration
\item[PROJECT/vprof] \hfill \\
vprof specific files (the project directory is designed to support
other MAQAO modules)
\item[PROJECT/vprof/instruNNNN] \hfill \\
instrumentation directory where NNNN is the instrumentation number.
Can be safely deleted to remove an instrumentation
\item[PROJECT/vprof/instruNNNN/instru.conf] \hfill \\
Configuration of the instrumentation (list of key=value)
\item[PROJECT/vprof/instruNNNN/BIN] \hfill \\
Patched binary for this instrumentation
\item[PROJECT/vprof/instruNNNN/runRRRR] \hfill \\
Execution directory where RRRR is the execution number
\item[PROJECT/vprof/instruNNNN/runRRRR/run.conf] \hfill \\
Properties of the execution
\item[PROJECT/vprof/instruNNNN/runRRRR/env] \hfill \\
Execution environment
\item[PROJECT/vprof/instruNNNN/runRRRR/output] \hfill \\
STDOUT and STDERR of the executed application
\item[PROJECT/vprof/instruNNNN/runRRRR/host-HOST] \hfill \\
File dumped on a specific host (HOST is the hostname)
\item[PROJECT/vprof/instruNNNN/runRRRR/host-HOST/pid-PID] \hfill \\
File dumped by a specific process (PID is the process identifier)
\item[PROJECT/vprof/instruNNNN/runRRRR/host-HOST/pid-PID/loopLLLL.*] \hfill \\
File dumped for a specific loop (LLLL is the loop ID in the original
binary)
\begin{description}
\item[loopLLLL.stats] Loop general statistics: minimum, maximum, average for
cycles, iterations and cycles/iteration. Buckets for
cycles/iteration.
\item[loopLLLL.iterations] Iteration count for all the instances
(optional)
\item[loopLLLL.cycles] Cycle measurements for all the instances
(optional)
\item[loopLLLL.paths] Counters for the number of traversals of the
links between two blocks of the loop (optional)
\end{description}
\item[PROJECT/vprof/instruNNNN/runRRRR/host-HOST/pid-PID/FCT.calls] \hfill \\
Calls to the function FCT (with the dumped register values)
\end{description}
\caption{Project directory structure}
\label{fig:project}
\end{figure*}
\subsection{Configuration}
The current configuration of VPROF is printed before its execution. To only
display it without performing the execution, use the \texttt{--dry} command.
To reuse the latest configuration of a project, use \texttt{--latest}. The
configuration is overridden by the parameters passed on the command line.
\subsection{Mode selection}
VPROF currently supports several modes:
\begin{itemize}
\item Original: bench the original program
\item Loops: instrument loops
\item Function calls: instrument function calls
\end{itemize}
Depending on what you want to characterize, you have to select one of them with
the "mode" command:
\begin{center}
\texttt{--mode=[loops,calls,original]}
\end{center}
If no mode is specified, the "loops" mode is selected by default. The next two
sections describe each mode in detail.
\subsection{Execution}
VPROF controls the execution of the instrumented binary. This allows it to
configure the execution (through environment variables) to store results into
the appropriate directory and to dump the program output into a file for
debugging purposes.
To enable the execution phase, you can use the "run" option:
\begin{center}
\texttt{--run=[cycles,iterations,none]*}
\end{center}
By default, VPROF selects the metrics that have been selected in the
instrumentation phase.
\subsubsection{Application parameters}
If your application requires specific command-line parameters, you can customize
the command executed by VPROF with the "run-cmd" option. You can also use this
option to execute your program through a third-party application (e.g. mpirun).
In the following example, VPROF replaces \{MAQAO\_BIN\} with the application
path. Hence the program is executed through "mpirun" and with the "arg1"
parameter.
\begin{center}
\texttt{--run-cmd="mpirun -n 12 \{MAQAO\_BIN\} --arg1"}
\end{center}
Alternatively, you can specify command prefix and suffix independently. If you
want to specify a prefix to the command, use the following parameter:
\begin{center}
\texttt{--run-cmd-prefix="mpirun -n 12"}
\end{center}
If you want to specify parameters to the command, use "--" before the name of
your binary and its parameters:
\begin{center}
\texttt{... -- mybinary --arg1 --arg2}
\end{center}
\subsubsection{Selecting a previous execution}
If you want to select the results from a previous execution (i.e. without
performing it again), you can select it with its unique identifier. Each
instrumentation directory contains one "runXXXX" directory per execution where
XXXX is the execution identifier.
To select the execution, use the following parameter:
\begin{center}
\texttt{--run-id=XXXX}\\
\end{center}
Note that you can specify "latest" instead of a specific identifier. It will
select the last execution.
In the multi-pass case, you can select two executions (one for cycles, one for
iterations) with the following parameters:
\begin{center}
\texttt{--run-cycle-id=XXXX}\\
\texttt{--run-iter-id=YYYY}
\end{center}
\subsection{Instrumentation}
VPROF can instrument a binary to make it dump the metrics for the selected
loops. You can control the process in different ways.
To enable the instrumentation and to select the metrics you are interested in, you
can use the "instrument" command-line parameter:
\begin{center}
\texttt{--instrument=[cycles,iterations,none]*}
\end{center}
\subsubsection{Multi-pass mode}
To measure the number of iterations, VPROF inserts some code executed at each
iteration. If you want to measure both cycles and iterations at the same time,
the inserted code will be counted into the reported cycles! While the inserted
code is often negligible, you can use the multi-pass mode to avoid this: two
instrumentations and two
executions of the program are performed (one for the cycles, one for the
iterations) and the results of both are associated. For this to work, you must
be sure that your code is executed deterministically.
Use the following parameter to enable the multi-pass mode:
\begin{center}
\texttt{--multi-pass}
\end{center}
Warning: this option forces the "store-instances" option that uses much more
memory: to compute cycles per iteration, we must store all the cycles and
iterations values instead of statistics such as average, min, max, etc.
\subsubsection{Serialization}
Intel processors use out-of-order execution of the instructions. This can be a
problem when we measure cycles: some instructions fetched before the portion of
code we measure can be included in the measure; some of the instructions we want
to measure can be excluded from the measure. To avoid this, it is possible to
use serializing instructions before and after the code we want to measure. In
VPROF, we use RDTSC and RDTSCP according to Intel's white paper\footnote{How
to benchmark code execution times on Intel IA-32 and IA-64 instruction set
architectures}.
Use "serializing-timers" option (or "s") to enable this feature:
\begin{center}
\texttt{--serializing-timers}
\end{center}
Note that some applications can take longer to execute with this option.
\subsubsection{Storage}
Binaries instrumented with VPROF generate values (iterations, cycles, etc.) that
are later processed by VPROF to generate reports. By default, instrumented
programs keep the values in memory and write them in a file at the end of their
execution. This avoids performing costly IOs during program execution that could
change the behavior of the program.
It is undesirable, however, to have too much data in memory, because it can make
the system use swap memory. If you want VPROF to directly store data on disk,
you can use the "storage" option with the "disk" parameter.
\begin{center}
\texttt{--storage=[disk,memory]}
\end{center}
By default, it is not necessary to store metrics per instance: average, minimal
and maximal values can be computed on the fly. In some cases, however, we want
to store the cycles and iterations per instance. To do that, use the
"store-instances" option:
\begin{center}
\texttt{--store-instances}
\end{center}
This option is enforced when you use the multi-pass mode (in order to compute
cycles per iteration), when you want to compute advanced statistics that
cannot be computed on the fly (see "full-stats" reporting mode) or when you
want to dump these values (see "instances" report-mode option).
Warning: be careful when you use memory storage and the "store-instances" option: the
latter requires much more memory.
\subsubsection{Selecting a previous instrumentation}
If you want to reuse an existing instrumentation for the execution or the
reporting phase, you can select it with its identifier. Identifiers are given
when the instrumentation is done and can be found in the project directory:
"myproject/vprof/instruXXXX" where XXXX is the ID of the instrumentation.
Use the following parameters to select the instrumentations:
\begin{center}
\texttt{--instru-cycle-id=XXXX}\\
\texttt{--instru-iter-id=YYYY}
\end{center}
\section{Loop characterization}
A loop is a set of basic blocks that are executed repeatedly (partial or
conditional execution is possible). Each loop has the following metrics:
\begin{itemize}
\item The number of instances: how many times the loop is entered (and
exited)
\end{itemize}
\noindent Each loop instance has the following metrics:
\begin{itemize}
\item The number of iterations: how many times the set of basic blocks is
executed
\item Duration: the time it took (in cycles) to execute the whole instance
\end{itemize}
\subsection{Probes}
To measure the duration of a loop, VPROF inserts some code between each
predecessor of the loop entry blocks and the loop entry blocks; similarly, it
inserts some code between exit blocks and their successors.
Iteration counting is a little bit trickier. Depending on the detected loop
pattern, the insertion will be different.
\begin{enumerate}
\item First, if VPROF detects a loop with a single entry and a single exit,
it detects the block evaluating the exit condition. It tries to find an
instruction before which it can insert an increment instruction (e.g. INC
in x86 ISA) without causing noticeable side effects (e.g. on x86 the detected
instruction will overwrite flags modified by the INC instruction).
\item If it cannot find a correct instruction, it inserts an increment
instruction surrounded with context saving instructions to avoid causing
noticeable side effects. These instructions are usually costly (e.g.
PUSHF/POPF on x86).
\item If the loop pattern is not recognized, backedges are instrumented.
Backedges are control-flow edges from a block of the loop towards an entry block of
the loop.
\end{enumerate}
In the first two cases, the number of iterations is precisely determined. In the
last case, it can be wrong if the loop is too convoluted. However, the first two
cases cover most of the innermost hottest loops.
\subsection{Loop selection}
In order to select the loops you are interested in characterizing, you must use
the command-line parameter "loop-id" (or "lid"). You can specify several loops
at once:
\begin{center}
\texttt{--loop-id=5,487,96,87}
\end{center}
Two parented loops cannot be instrumented at the same time, otherwise the
measured number of cycles would be inaccurate: the outer loop would also
measure the number of cycles taken to dump data on disk, etc.
By default, VPROF stops if it detects that two selected loops are parented. It
is possible, however, to let the process continue with a subset of the selected
loops by using the "allow-invalid-loops" option. In this case, for two selected
parented loops, the outer one will be selected.
\begin{center}
\texttt{--allow-invalid-loops}
\end{center}
\subsection{Loop paths}
VPROF can report paths that are heavily used in loops with the
\texttt{--enable-loop-paths} option. For every pair of consecutive blocks in a
loop, it adds a counter between both blocks that is incremented when the path is
taken.
\subsection{Reporting}
The reporting phase consists in computing statistics and generating a report for
the user. Several report formats are supported as well as different kinds of
statistics.
\subsubsection{Report modes}
To enable the report phase, use the following option with any parameter except
"none":
\begin{center}
\texttt{--report-mode=}\\
\texttt{[none,stats,full-stats,instances,mpi]}
\end{center}
The available report modes are as follows:
\begin{itemize}
\item stats: simple statistics that can be computed on the fly (average,
minimal, maximal, etc.)
\item full-stats: statistics that require all instances (median, quartiles,
etc.)
\item instances: dump cycles and iterations for all instances (useful to
parse with your own scripts for instance)
\item mpi: statistics for several processes
\end{itemize}
By default, VPROF generates a report with the "stats" mode.\\
\noindent\textbf{MPI mode}
By default, VPROF selects the first process it finds to generate its report. To
select another process, use the following options to select the host name and
the process identifier:
\begin{center}
\texttt{--report-host=HOST}\\
\texttt{--report-pid=XXXX}
\end{center}
VPROF dumps results into "host-HOST/pid-XXXX" directories in the execution
directory.\\
\noindent\textbf{Instances mode}
When the "instances" mode is selected, the number of instances can be too large
for the report to be useful. To select only a few instances, use the
"filter-instances" option:
\begin{center}
\texttt{--filter-instances=[N,last,middle]}
\end{center}
This option accepts a comma-separated list of instance numbers. In addition, the
following keywords can be used as instance numbers:
\begin{itemize}
\item last: the last instance
\item middle: the (last/2) instance
\end{itemize}
\subsubsection{Output path}
By default the report is dumped on the standard output. To write it in a file,
use the "output-path" (or "op") option:
\begin{center}
\texttt{--output-path=my\_report.html}
\end{center}
\subsubsection{Output format}
To select the output format, use the "output-format" (or "of") option:
\begin{center}
\texttt{--output-format=[text,csv,html,arff]}
\end{center}
The output formats are:
\begin{itemize}
\item text: Markdown text format
\item csv: comma-separated-values
\item html: HTML output with fancy charts
\item arff: attribute-relation file format, similar to CSV but with typed fields. This format is recognized
by Weka data-mining software.
\end{itemize}
The default output format is "text".
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Large Colored Title Article
% LaTeX Template
% Version 1.1 (25/11/12)
%
% This template has been downloaded from:
% http://www.LaTeXTemplates.com
%
% Original author:
% Frits Wenneker (http://www.howtotex.com)
%
% License:
% CC BY-NC-SA 3.0 (http://creativecommons.org/licenses/by-nc-sa/3.0/)
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%----------------------------------------------------------------------------------------
% PACKAGES AND OTHER DOCUMENT CONFIGURATIONS
%----------------------------------------------------------------------------------------
\documentclass[DIV=calc, paper=a4, fontsize=11pt, twocolumn]{scrartcl} % A4 paper and 11pt font size
\usepackage{lipsum} % Used for inserting dummy 'Lorem ipsum' text into the template
\usepackage[english]{babel} % English language/hyphenation
\usepackage[protrusion=true,expansion=true]{microtype} % Better typography
\usepackage{amsmath,amsfonts,amsthm} % Math packages
\usepackage[svgnames]{xcolor} % Enabling colors by their 'svgnames'
\usepackage[hang, small,labelfont=bf,up,textfont=it,up]{caption} % Custom captions under/above floats in tables or figures
\usepackage{booktabs} % Horizontal rules in tables
\usepackage{fix-cm} % Custom font sizes - used for the initial letter in the document
\usepackage{sectsty} % Enables custom section titles
\allsectionsfont{\usefont{OT1}{phv}{b}{n}} % Change the font of all section commands
\usepackage{fancyhdr} % Needed to define custom headers/footers
\pagestyle{fancy} % Enables the custom headers/footers
\usepackage{lastpage} % Used to determine the number of pages in the document (for "Page X of Total")
% Headers - all currently empty
\lhead{}
\chead{}
\rhead{}
% Footers
\lfoot{}
\cfoot{}
\rfoot{\footnotesize Page \thepage\ of \pageref{LastPage}} % "Page 1 of 2"
\renewcommand{\headrulewidth}{0.0pt} % No header rule
\renewcommand{\footrulewidth}{0.4pt} % Thin footer rule
\usepackage{lettrine} % Package to accentuate the first letter of the text
\newcommand{\initial}[1]{ % Defines the command and style for the first letter
\lettrine[lines=3,lhang=0.3,nindent=0em]{
\color{DarkGoldenrod}
{\textsf{#1}}}{}}
%----------------------------------------------------------------------------------------
% TITLE SECTION
%----------------------------------------------------------------------------------------
\usepackage{titling} % Allows custom title configuration
\newcommand{\HorRule}{\color{DarkGoldenrod} \rule{\linewidth}{1pt}} % Defines the gold horizontal rule around the title
\pretitle{\vspace{-30pt} \begin{flushleft} \HorRule \fontsize{45}{45} \usefont{OT1}{phv}{b}{n} \color{DarkRed} \selectfont} % Horizontal rule before the title
\title{MAQAO Value Profiler} % Your article title
\posttitle{\par\end{flushleft}\vskip 0.5em} % Whitespace under the title
\preauthor{\begin{flushleft}\large \lineskip 0.5em \usefont{OT1}{phv}{b}{sl} \color{DarkRed}} % Author font configuration
\author{Sylvain HENRY, } % Your name
\postauthor{\footnotesize \usefont{OT1}{phv}{m}{sl} \color{Black} % Configuration for the institution name
University of Versailles % Your institution
\par\end{flushleft}\HorRule} % Horizontal rule after the title
\date{} % Add a date here if you would like one to appear underneath the title block
%----------------------------------------------------------------------------------------
\begin{document}
\maketitle % Print the title
\thispagestyle{fancy} % Enabling the custom headers/footers for the first page
%----------------------------------------------------------------------------------------
% ABSTRACT
%----------------------------------------------------------------------------------------
% The first character should be within \initial{}
\initial{V}\textbf{alue profiling consists in characterizing application
elements (loops, functions, etc.) with values such as the number of iterations,
the number of cycles, the calling parameters. In this article, we present the MAQAO
Value Profiler module.}
\tableofcontents
\vfill
\newpage
\input{intro.tex}
\input{loop_intro.tex}
\input{loop_instru.tex}
\input{loop_exec.tex}
\input{loop_report.tex}
\input{call_intro.tex}
\end{document}
......@@ -3,32 +3,32 @@
# Sets the list of architectures to be handled by MAQAO, separated by ';'.
# Possible values: x86_64, k1om, ia32. Default value: "x86_64"
SET(ARCHS x86_64;k1om)
#SET(ARCHS x86_64)
# The optimisation level (higher is more optimised).
# Possible values: 0 / 1 / 2 (default) / 3
SET(OPTIM_LVL 0)
#SET(OPTIM_LVL 2)
# Set the Lua interpreter.
# Possible values: lua / luajit (default)
SET(LUA luajit)
#SET(LUA luajit)
# Specifies whether the binary must be stripped.